# BoxOfficeMojo Functions

In [5]:
## Basic stuff
%load_ext autoreload
%autoreload
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("""<style>div.output_area{max-height:10000px;overflow:scroll;}</style>"""))


## Python Version
import sys
print("Python: {0}".format(sys.version))


## Install
import re
from time import sleep
from timeUtils import clock, elapsed
from ioUtils import saveFile, getFile
from fsUtils import setDir, isDir, mkDir, setFile, isFile, setSubFile
from searchUtils import findSubPatternExt
from webUtils import getWebData, getHTML

import datetime as dt
start = dt.datetime.now()
print("Notebook Last Run Initiated: "+str(start))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Python: 3.6.6 |Anaconda custom (64-bit)| (default, Jun 28 2018, 11:07:29) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
Notebook Last Run Initiated: 2018-11-14 19:56:04.154831


In [2]:
# Set Global Params
moviesDir = "/Users/tgadfort/Documents/code/movies/data"
def getMovieDir():
    return moviesDir

def getBoxOfficeDir():
    dirname = setDir(getMovieDir(), "boxoffice.com")
    if not isDir(dirname): mkDir(dirname)
    return dirname

##   getBoxOfficeMojoWeekendResults
##   parseBoxOfficeMojoResults()
##   mergeBoxOfficeMojoResults()
##   processBoxOfficeMojo()

# Get BoxOfficeMojo Files

In [None]:
def getBoxOfficeMojoWeekendResult(year, week, outdir):
    yname = str(year)
    if week < 10:
        wname = "0"+str(week)
    else:
        wname = str(week)
        
    url="http://www.boxofficemojo.com/weekend/chart/?yr="+yname+"&wknd="+wname+"&p=.htm"
    savename = setFile(outdir, yname+"-"+wname+".p")
    if isFile(savename): return
    getWebData(base=url, savename=savename, useSafari=False)
    sleep(2)

    

def getBoxOfficeMojoWeekendResults(startYear = 1982, endYear = 1983):
    outdir = setDir(getBoxOfficeDir(), "data")
    if not isDir(outdir): mkDir(outdir)
    years  = range(int(startYear), int(endYear)+1)
    months = range(1,53)
    for year in years:
        for month in months:
            getBoxOfficeMojoWeekendResult(year, month, outdir)

In [None]:
getBoxOfficeMojoWeekendResults()

# Parse BoxOffice Mojo Files

In [6]:
def parseBoxOfficeMojo(ifile):
    htmldata = getFile(ifile)
    bsdata   = getHTML(htmldata)
    tbl = None
    for table in bsdata.findAll("table"):
        if tbl:
            break
        for tr in table.findAll("tr"):
            if len(tr) >= 10:
                tbl = table
                break
            else:
                break
        
    #print len(tbl)
    keys = []
    data = []
    for i,tr in enumerate(tbl):
        vals = []
        if i == 0:
            for j,td in enumerate(tr.findAll("td")):
                for ref in td.findAll("a"):
                    key = ref.string
                    keys.append(key)
        else:
            if len(tr) <= 1: continue
            #print "\n\n\nNext...."
            #print tr
            #print "  tr-->",tr,'\t',len(tr)
            #print i,tr,len(data)
            for j,td in enumerate(tr.findAll("td")):
                if td.string == None:
                    continue
                try:
                    if re.search("TOTAL \((\d+) MOVIES\)", td.string):
                        break
                except:
                    print(j,td.string)
                    raise()
                key = keys[j]
                val = td.string
                vals.append(val)
                #print j,'\t',keys[j],'\t',td.string
            if len(vals) == 0: break
            if len(vals) != len(keys):
                print("Mismatch with keys/data")
                print(len(keys),'\t',keys)
                print(len(vals),'\t',vals)
                break
            else:
                data.append(vals)

    
    print("Found",len(data),"movies from",ifile            )
    return data
            

def parseBoxOfficeMojoResults(startYear = 1982, endYear = 2017):
    outdir   = getBoxOfficeDir()
    if endYear == None: endYear = startYear
    years    = range(int(startYear), int(endYear)+1)
    for year in years:
        retval = []
        files  = findSubPatternExt(outdir, "data", pattern=str(year), ext=".p")
        for ifile in files:
            result = parseBoxOfficeMojo(ifile)
            retval.append(result)

        savename = setSubFile(outdir, "results", str(year)+".json")
        print("Saving",len(retval),"weekends of movie data to",savename)
        save(savename, retval)

In [7]:
parseBoxOfficeMojoResults(1982, 1982)





Found 12 movies from /Users/tgadfort/Documents/code/movies/data/boxoffice.com/data/1982-13.p
Found 17 movies from /Users/tgadfort/Documents/code/movies/data/boxoffice.com/data/1982-03.p
Found 8 movies from /Users/tgadfort/Documents/code/movies/data/boxoffice.com/data/1982-09.p
Found 12 movies from /Users/tgadfort/Documents/code/movies/data/boxoffice.com/data/1982-07.p
Found 13 movies from /Users/tgadfort/Documents/code/movies/data/boxoffice.com/data/1982-14.p
Found 10 movies from /Users/tgadfort/Documents/code/movies/data/boxoffice.com/data/1982-10.p
Found 14 movies from /Users/tgadfort/Documents/code/movies/data/boxoffice.com/data/1982-04.p
Found 12 movies from /Users/tgadfort/Documents/code/movies/data/boxoffice.com/data/1982-15.p
Found 14 movies from /Users/tgadfort/Documents/code/movies/data/boxoffice.com/data/1982-11.p


EOFError: 

# Merge BoxOfficeMojo Files

In [None]:
def mergeBoxOfficeMojoResults():
    outdir = getBoxOfficeDir()
    retval = {}
    files  = findSubExt(outdir, "results", ext=".json")
    for ifile in files:
        year = getBaseFilename(ifile)
        data = get(ifile)
        retval[year] = data
              
    savename = setFile(outdir, "results.json")
    print "Saving",len(retval),"years of movie data to",savename
    save(savename, retval)

# Process BoxOfficeMojo

In [None]:




def processBoxOfficeMojo():
    outdir   = getBoxOfficeDir()
    savename = setFile(outdir, "results.json")
    
    data = get(savename)
    movies = {}
    yearlyData = {}
    for i,year in enumerate(data.keys()):
        movies[year] = {}
        ydata = data[year]
        for wdata in ydata:
            for mdata in wdata:
                movie  = mdata[2]
                retval = search("\((\d+)\)",movie)
                if retval:
                    stryear  = retval.group()
                    movie = movie.replace(stryear, "").strip()

                gross  = convertCurrency(mdata[9])
                weekly = convertCurrency(mdata[4])
                money  = max(gross, weekly)
                if movies[year].get(movie) == None:
                    movies[year][movie] = money
                else:                    
                    movies[year][movie] = max(money, movies[year][movie])

        yearlyData[year] = sorted(movies[year].items(), key=operator.itemgetter(1), reverse=True)
        print "---->",year,"<----"
        for item in yearlyData[year][:25]:
            print item
        print '\n'
        
    savename = setFile(outdir, "boxofficemojo.json")
    print "Saving",len(yearlyData),"yearly results to",savename
    save(savename, yearlyData)