# Combine Movie Functions

In [5]:
## Basic stuff
%load_ext autoreload
%autoreload
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("""<style>div.output_area{max-height:10000px;overflow:scroll;}</style>"""))

## Python Version
import sys
print("Python: {0}".format(sys.version))

from wikipedia import wikipedia
from wikifilm import wikifilm
from oscar import oscars
from razzies import razzies
from boxofficemojo import boxofficemojo
from rottentomatoes import rottentomatoes
from ultimatemovierankings import ultimatemovierankings 
#from combine import combine


import datetime as dt
start = dt.datetime.now()
print("Notebook Last Run Initiated: "+str(start))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Python: 3.6.6 |Anaconda custom (64-bit)| (default, Jun 28 2018, 11:07:29) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
Notebook Last Run Initiated: 2019-01-01 20:06:31.498268


# Get/Parse/Merge/Process Rotten Tomatoes Data

In [37]:
import re
from time import sleep
from collections import OrderedDict
from timeUtils import clock, elapsed
from ioUtils import saveFile, getFile
from fsUtils import setDir, isDir, mkDir, setFile, isFile, setSubFile
from fileUtils import getBaseFilename
from searchUtils import findSubPatternExt, findPatternExt, findExt
from strUtils import convertCurrency
from webUtils import getWebData, getHTML
from numpy import repeat
from movieDB import movieDB
from os import getcwd
import operator
from movieRenames import manualRenames



##############################################################################################################################
# Combine Movies
##############################################################################################################################
class combine(movieDB):
    def __init__(self, basedir=None):
        self.name = "combine"
        movieDB.__init__(self, dbdir=self.name)
        
        self.sources           = set()
        self.movieSource       = {}
        self.movieSourceData   = {}
        self.movieSourceMovies = {}
        self.movieSourceYears  = {}
        self.movieSourceVal    = {}
    
        self.years = []
        
        self.keepIMAX = False
        
        self.movies = None
       
    
    def setMovieData(self, key, source, val):
        self.sources.add(key)
        self.movieSource[key]       = source
        print(key)
        print(source)
        print(self.movieSource[key])
        self.movieSourceData[key]   = None
        self.movieSourceMovies[key] = None
        self.movieSourceYears[key]  = None
        self.movieSourceVal[key]    = val

    
    def setOscarData(self, source, val=None):
        key = "Oscar"
        self.setMovieData(key, source, val)
       
    def setRazziesData(self, source, val=None):
        key = "Razzies"
        self.setMovieData(key, source, val)
       
    def setWikiFilmData(self, source, val=None):
        key = "Wiki Film"
        self.setMovieData(key, source, val)
       
    def setUltimateMovieRankingsData(self, source, val=None):
        key = "Ultimate Movie Rankings"
        self.setMovieData(key, source, val)
       
    def setRottenTomatoesData(self, source, val=None):
        key = "Rotten Tomatoes"
        self.setMovieData(key, source, val)
       
    def setBoxOfficeMojoData(self, source, val=None):
        key = "Box Office Mojo"
        self.setMovieData(key, source, val)
        
        
    
    
    def getData(self):
        years = []
        for key in self.sources:
            resultsDir  = self.movieSource[key].getResultsDir()
            resultsName = self.movieSource[key].name
            filename = setFile(resultsDir, "{0}.json".format(resultsName))
            if isFile(filename):
                self.movieSourceData[key]  = getFile(filename)
                self.movieSourceYears[key] = list(self.movieSourceData[key].keys())
                print("Found {0} Years of {1} Movies".format(len(self.movieSourceYears[key]), key))
                years = years + self.movieSourceYears[key]
            else:
                raise ValueError("There is not results file: {0}".format(filename))
                
        
        self.years = sorted(list(set(years)))
        print("Found Data Between {0} and {1}".format(min(self.years), max(self.years)))
        self.years = years


    
    def getYearlyMovies(self, data, year, name, minval, debug=False):
        movies = []
        if data.get(year) is not None:
            if minval is not None:
                movies = [x[0] for x in data[year] if x[1] >= minval]
            else:
                movies = [x[0] for x in data[year]]
                
            if debug:
                print("  {0}  {1: <20}: {2}/{3}".format(year, name, len(movies), len(data[year])))
        else:
            if debug:
                print("  {0}  {1: <20}: None".format(year, name))
        return movies
        
        
    def saveCorrections(self, debug=True):
        savename = setFile(self.getDataDir(), "corr.yaml")
        corrData = getFile(savename)

        try:
            savename = setFile(self.getDataDir(), "saved.yaml")
            savedData = getFile(savename)
        except:
            savedData = {}

        for movie,corrs in corrData.items():
            if savedData.get(movie) is None:
                if debug:
                    print("Adding {0}".format(movie))
                savedData[movie] = corrs
            else:
                newSaved = list(set(savedData[movie] + corrs))
                if len(newSaved) != len(savedData[movie]):
                    print("Adding new corrections to {0}".format(movie))
                savedData[movie] = newSaved

        savename = setFile(self.getDataDir(), "saved.yaml")
        saveFile(idata=savedData, ifile=savename, debug=debug)        
        
            
    def mergeMovies(self, debug=False):
        verydebug=False
        yearlyMovies = OrderedDict()
        movies = OrderedDict()
        
        repData   = {}
        savename  = setFile(self.getDataDir(), "saved.yaml")
        savedData = getFile(savename)
        for corrMovie,corrs in savedData.items():
            for corr in corrs:
                repData[corr] = corrMovie
                       
        keys = {}
        for key in self.sources:
            keys[key] = [self.movieSourceData[key], self.movieSourceVal[key]]
            
        
        for year in self.years:             
            keyMovies = {}
            for key,keydata in keys.items():
                keyfunc = keydata[0]
                keyVal  = keydata[1]
                keyMovies[key] = self.getYearlyMovies(keyfunc, year, key, keyVal, debug=verydebug)
                keyMovies[key] = [manualRenames(x, int(year), self.keepIMAX) for x in keyMovies[key]]
                keyMovies[key] = dict(zip(keyMovies[key], repeat(key, len(keyMovies[key]))))
         
            
            
            ###### Merge The Movies
            for key,keysData in keyMovies.items():
                for movie,name in keysData.items():
                    if repData.get(movie):
                        movie = repData[movie]
                    key = "{0} [{1}]".format(movie, year)
                    if movies.get(key) is None:
                        movies[key] = name

                   
       

        if debug:
            print("Found {0} movies".format(len(movies)))




        ### Start with Oscar Movies (remove from other categories)
        
        removes = []
        ordering = ["Oscar", "Rotten Tomatoes", "Razzies", "Ultimate Movie Rankings", "Box Office Mojo", "Wiki Film"]
        for key,name in movies.items():
            #print(key)
            movie = key[:-7]
            year  = key[-5:-1]
            
            #if movie.find("*") != -1: print(key,name)
            
            
            for dy in [1, -1, 2, -2]:
                test   = "{0} [{1}]".format(movie, int(year)+dy)
                result = movies.get(test)
                if result is not None:
                    #print("   <---- {0}".format(test))
                    if ordering.index(result) > ordering.index(name):
                        if verydebug:
                            print("Removing {0}: {1} because it is already listed as {2}: {3}".format(test,result,key,name))
                        removes.append(test)

        for key in removes:
            try:
                del movies[key]
            except:
                print("Could not remove {0}".format(key))
                
        if debug:
            print("There are {0} final movies".format(len(movies)))
            
        self.movies = movies
        
        savename = setFile(self.getResultsDir(), "movies.json")
        saveFile(idata=movies, ifile=savename, debug=True)






In [38]:
%load_ext autoreload
%autoreload

comb = combine()
comb.setOscarData(oscars(wikipedia()), 10)
comb.setWikiFilmData(wikifilm(), 1)
comb.setRazziesData(razzies(), 1)
comb.setRottenTomatoesData(rottentomatoes(), 90)
comb.setBoxOfficeMojoData(boxofficemojo(), 5e6)
comb.setUltimateMovieRankingsData(ultimatemovierankings(), 90)
comb.getData()
comb.saveCorrections(debug=True)
comb.mergeMovies(debug=True)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Oscar
<oscar.oscars object at 0x1a1acb15f8>
<oscar.oscars object at 0x1a1acb15f8>
Wiki Film
<wikifilm.wikifilm object at 0x1a1acb1048>
<wikifilm.wikifilm object at 0x1a1acb1048>
Razzies
<razzies.razzies object at 0x1a1acb1dd8>
<razzies.razzies object at 0x1a1acb1dd8>
Rotten Tomatoes
<rottentomatoes.rottentomatoes object at 0x1a1acb1208>
<rottentomatoes.rottentomatoes object at 0x1a1acb1208>
Box Office Mojo
<boxofficemojo.boxofficemojo object at 0x1a1acb13c8>
<boxofficemojo.boxofficemojo object at 0x1a1acb13c8>
Ultimate Movie Rankings
<ultimatemovierankings.ultimatemovierankings object at 0x1a1acb1358>
<ultimatemovierankings.ultimatemovierankings object at 0x1a1acb1358>
Found 67 Years of Ultimate Movie Rankings Movies
Found 38 Years of Razzies Movies
Found 97 Years of Wiki Film Movies
Found 93 Years of Rotten Tomatoes Movies
Found 84 Years of Oscar Movies
Found 36 Years of Box Office Mojo Movies
Foun

# Find Corrections

In [42]:
data = {}
from searchUtils import findNearest

savename = setFile(comb.getDataDir(), "saved.yaml")
savedData = getFile(savename)

repData = {}
for corrMovie,corrs in savedData.items():
    for corr in corrs:
        repData[corr] = corrMovie

movielist = [x[:-7] for x,name in movies.items() if name in ["Oscar", 'Rotten Tomatoes', 'Box Office']]
for i,movie in enumerate(movielist):
    if repData.get(movie):
        #print([movielist[i]],' -> ',[repData[movie]])
        movielist[i] = repData[movie]

cutoff = 0.9
#for key,name in movies.items():
#    movie = key[:-7]
for im,movie in enumerate(movielist):
    if movie.find("IMAX") != -1:
        continue
    if movie.find("re-issue") != -1:
        continue

    if im % 100 == 0:
        print(im,'/',len(movielist))
    if movie.endswith(' '):        
        if data.get(movie[:-1]) is None:
            print(len(data),'\t',movie[:-1])
            data[movie[:-1]] = set()
            data[movie[:-1]].add(movie)
        tmpList = list(filter(lambda a: a != movie[:-1], movielist))
        results = findNearest(movie[:-1], tmpList, 3, cutoff)
        if len(results) > 0:
            for value in results:
                data[movie[:-1]].add(value)
    else:
        tmpList = list(filter(lambda a: a != movie, movielist))    
        results = findNearest(movie, tmpList, 3, cutoff)
        if len(results) > 0:
            if data.get(movie) is None:
                print(len(data),'\t',movie)
                data[movie] = set()
            for value in results:
                data[movie].add(value)
                
    if len(data) > 300:
        break
        
for k in data.keys():
    data[k] = list(data[k])

savename = setFile(comb.getDataDir(), "corr.yaml")
saveFile(idata=data, ifile=savename, debug=True)

0 / 18945
0 	 Men In Black
1 	 The Informer
2 	 Gold Diggers Of 1935
3 	 Thanks A Million
100 / 18945
4 	 Romeo And Juliet
5 	 Gold Diggers Of 1937
6 	 Popular Science J-6-2
7 	 That Girl From Paris
8 	 A Damsel In Distress
9 	 Popular Science J-7-1
200 / 18945
10 	 Bringing Up Baby
300 / 18945
11 	 The Letter
12 	 The Mark Of Zorro
13 	 Hit Parade Of 1941
14 	 Citizen Kane
15 	 Dr. Jekyll And Mr. Hyde
400 / 18945
16 	 Tanks A Million
17 	 The Son Of Monte Cristo
18 	 Beauty And The Beach
19 	 The Tanks Are Coming
20 	 The Night Before Christmas
500 / 18945
21 	 Hit Parade Of 1943
600 / 18945
700 / 18945
22 	 Life With Feathers
800 / 18945
23 	 The Killers
24 	 The Stranger
25 	 Night And Day
26 	 Life With Father


KeyboardInterrupt: 