# Wikipedia Functions

In [1]:
## Basic stuff
%load_ext autoreload
%autoreload
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("""<style>div.output_area{max-height:10000px;overflow:scroll;}</style>"""))

## Python Version
import sys
print("Python: {0}".format(sys.version))

from wikifilm import wikifilm
from wikipedia import wikipedia

from timeUtils import clock, elapsed
import datetime as dt
start = dt.datetime.now()
print("Notebook Last Run Initiated: "+str(start))

Python: 3.6.6 |Anaconda custom (64-bit)| (default, Jun 28 2018, 11:07:29) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
Notebook Last Run Initiated: 2019-01-01 20:23:17.394120


In [2]:
film = wikifilm()
#film.getWikiFilmYearlyData()
#film.processWikiFilmYearlyData()

In [127]:
import re
from time import sleep
from timeUtils import clock, elapsed
from ioUtils import saveFile, getFile
from fsUtils import setDir, isDir, mkDir, setFile, isFile, setSubFile
from fileUtils import getBaseFilename
from searchUtils import findSubPatternExt, findPatternExt, findExt
from strUtils import convertCurrency
from webUtils import getWebData, getHTML
from movieDB import movieDB
from os import getcwd
import operator


##############################################################################################################################
# Box Office 
##############################################################################################################################
class GoldenGlobes(movieDB):
    def __init__(self, basedir=None):
        self.name = "GoldenGlobes"
        movieDB.__init__(self, dbdir=self.name)
    
    
    ###########################################################################################################################
    # Get GoldenGlobes Files
    ###########################################################################################################################
    def downloadGoldenGlobesCategoryData(self, category, outdir, debug=False):

        url  = "https://en.wikipedia.org/wiki/Golden_Globe_Award_for_{0}".format(category)
        savename = setFile(outdir, category+".p")
        if isFile(savename): return
        if debug:
            print("Downloading {0}".format(url))
        getWebData(base=url, savename=savename, useSafari=False)
        sleep(1)


    def getGoldenGlobesCategoryData(self, debug=False):
        outdir = self.getDataDir()
        if debug:
            print("Data Directory: {0}".format(outdir))
        if not isDir(outdir): mkDir(outdir)

        categories = ["Best_Motion_Picture_-_Drama", "Best_Motion_Picture_-_Musical_or_Comedy",
                      "Best_Animated_Feature_Film", "Best_Foreign_Language_Film"]
        for category in categories:
            self.downloadGoldenGlobesCategoryData(category, outdir, debug)
                

                
    
    
    ###########################################################################################################################
    # Parse Box Office Weekend Files
    ###########################################################################################################################  
    def parseGoldenGlobesFilmData(self, table, category, debug=False):
        filmdata = {}
        
        ths = table.findAll("th")
        ths = [x.text for x in ths if x is not None]
        ths = [x.replace("\n", "") for x in ths]
        
        print(ths)
        
        trs  = table.findAll("tr")
        year = None
        pbs  = None
        for i,tr in enumerate(trs[1:]):            
            
            
            tds = tr.findAll("td")
            if len(tds) == 1:
                continue

            
            bs  = len(tr.findAll("b"))
            
            ## Check for new year
            if bs > 1 and pbs == 0:
                try:
                    year = tds[0].text
                    year = int(year)
                    tds  = tds[1:]
                    
                except:
                    raise ValueError("Could not find year in {0}".format(tds[0]))            

            pbs = bs
            tds = [x.text for x in tds]
            tds = [x.replace("\n", "") for x in tds]
            tds = [x.strip() for x in tds]                
            tds.insert(0, year)

            if tds[0] is None:
                tds  = tds[1:]
                try:
                    year = int(tds[0])
                except:
                    raise ValueError("Could not find year in {0}".format(tds[0]))      
                
            if len(tds) + 1 == len(ths):
                tds.insert(2, tds[1])
                
                
            #print(i,year,'\t',len(tds),'\t',len(ths),'\t',tds[0],'\t',tds[1],'\t',tds[2])
            #continue

            #continue

                
            try:
                row = dict(zip(ths, tds))
            except:
                raise ValueError("Could not zip: [{0}], [{1}]".format(ths, tds))

            
            if row.get("Film") is None:
                try:
                    row["Film"] = "{0} ({1})".format(row["English title"], row["Original title"])
                except:
                    raise ValueError("Could not create film name: {0}".format(row))
                
            if filmdata.get(year) is None:
                filmdata[year] = {}
            if filmdata[year].get(category) is None:
                filmdata[year][category] = []

            try:
                movie = row["Film"]
            except:
                raise ValueError("Cannot find movie in {0}".format(row))

            filmdata[year][category].append(movie)
            

            if debug:
                print("{0: <10}{1: <20}{2}".format(year,category,movie))
                    
        return filmdata



    def parseGoldenGlobesCategoryData(self, ifile, category, debug = False):
        htmldata = getFile(ifile)
        bsdata   = getHTML(htmldata)
            
        data   = {}
        done   = False
        tables = bsdata.findAll("table", {"class": "wikitable"})
        if debug:
            print("  Found {0} tables".format(len(tables)))
        for table in tables:
            yeardata = self.parseGoldenGlobesFilmData(table, category, debug=False)
            data = {**data, **yeardata}
        
        for year,yearData in data.items():
            for category in yearData.keys():
                data[year][category] = list(set(data[year][category]))
        
        return data



    def processGoldenGlobesCategoryData(self, debug=False):
        outdir = self.getDataDir()
        files = findExt(outdir, ext="*.p")

        from collections import OrderedDict
        movies = OrderedDict()
        for ifile in files:
            
            if debug:
                print("Processing {0}".format(ifile))
            category = getBaseFilename(ifile)
            results  = self.parseGoldenGlobesCategoryData(ifile, category, debug=debug)
            
            if len(results) == 0:
                raise ValueError("No results for {0}".format(ifile))
                

            for year,yearData in results.items():
                for category,categoryData in yearData.items():
                    if movies.get(year) is None:
                        movies[year] = []
                    for movie in categoryData:
                        movies[year].append(movie)

        for year in movies.keys():
            movies[year] = list(set(movies[year]))
            yearlyMovies = movies[year]
            movies[year] = []
            for movie in yearlyMovies:
                movies[year].append([movie,10])

            print(movies[year])
                
        savename = setFile(self.getResultsDir(), "{0}.json".format(self.name))
        print("Saving {0} Years of GoldenGlobes Data to {1}".format(len(movies), savename))
        saveFile(savename, movies)
        #yamldata.saveYaml(savename, movies)   

# Get/Parse/Merge/Process Wikipedia Data

In [128]:
gg = GoldenGlobes()
_, _ = clock("Last Run")

Current Time is Sat Jan 12, 2019 18:58:44 for Last Run


In [129]:
gg.getGoldenGlobesCategoryData(debug=True)

Data Directory: /Users/tgadfort/Documents/code/movies/GoldenGlobes/data


In [130]:
gg.processGoldenGlobesCategoryData(debug=True)

Processing /Users/tgadfort/Documents/code/movies/GoldenGlobes/data/Best_Foreign_Language_Film.p
  Found 6 tables
['Year', 'English title', 'Original title', 'Director', 'Country']
['Year', 'English title', 'Original title', 'Director', 'Country']
['Year', 'English title', 'Original title', 'Director', 'Country']
['Year', 'English title', 'Original title', 'Director', 'Country']
['Year', 'English title', 'Original title', 'Director', 'Country']
['Year', 'English title', 'Original title', 'Director', 'Country']
Processing /Users/tgadfort/Documents/code/movies/GoldenGlobes/data/Best_Motion_Picture_-_Drama.p
  Found 8 tables
['Year', 'Film', 'Director[4]', 'Producer/s[5]']
['Year', 'Film', 'Director[4]', 'Producer/s[5]']
['Year', 'Film', 'Director[4]', 'Producer/s[5]']
['Year', 'Film', 'Director[4]', 'Producer/s[5]']
['Year', 'Film', 'Director[4]', 'Producer/s[5]']
['Year', 'Film', 'Director[4]', 'Producer/s[5]']
['Year', 'Film', 'Director[4]', 'Producer/s[5]']


ValueError: Could not find year in 2000[10]