In [494]:
%%writefile IMDB_Scrape.py

from imdb import IMDb
import numpy as np
import pandas as pd
import datetime

# Column order of the scraped frame; _build_row must emit values in this order.
_IMDB_COLUMNS = [
    'Title', 'Link_IMDB', 'Starring_IMDB', 'Genres_IMDB', 'Runtime_IMDB',
    'Countries_IMDB', 'Languages_IMDB', 'Aspect_IMDB', 'Budget_IMDB',
    'FirstWkendGross_IMDB', 'WorldWideGross_IMDB', 'USFilmRating_IMDB',
    'IMDB_Rating', 'IMDB_Votes', 'Directors_IMDB', 'Writers_IMDB',
    'Producers_IMDB', 'Cinematographers_IMDB', 'ProductionCos_IMDB',
    'DistributionCos_IMDB', 'IMDB_Plot', 'Company_Plot', 'FirstPlot_IMDB',
    'Synopsis_IMDB',
]

# Errors that mean "this field is absent or not shaped as expected" for one
# title. TypeError also covers the no-match case, where the movie is None.
# Anything else (KeyboardInterrupt, programming errors) is left to propagate.
_FIELD_ERRORS = (KeyError, IndexError, AttributeError, TypeError)


def _find_movie(ia, title, year):
    """Return the first search hit for *title* that is a movie from *year*, else None."""
    for hit in ia.search_movie(title):
        keys = hit.keys()
        if 'episode of' in keys:
            continue
        hit_title = hit['title'].upper()
        if 'SPOILER' in hit_title or 'REVIEW' in hit_title:
            continue
        if 'year' not in keys or 'kind' not in keys:
            continue
        if str(hit['year']) == str(year) and hit['kind'] == 'movie':
            return hit
    return None


def _get(movie, key, default):
    """Return ``movie[key]``, or *default* when the field is unavailable."""
    try:
        return movie[key]
    except _FIELD_ERRORS:
        return default


def _leading_part(container, key, sep):
    """Return the text of ``container[key]`` before the first *sep*, or NaN."""
    try:
        return container[key].split(sep)[0]
    except _FIELD_ERRORS:
        return np.nan


def _person_names(movie, key, limit):
    """Return up to *limit* names from the person list stored under *key*."""
    try:
        return [person['name'] for person in movie[key] if person.keys()][:limit]
    except _FIELD_ERRORS:
        return []


def _company_names(movie, key):
    """Return the distinct company names under *key*, in first-seen order."""
    try:
        return list(dict.fromkeys(company['name'] for company in movie[key]))
    except _FIELD_ERRORS:
        return []


def _us_certificate(movie):
    """Return the rating part of the last United States certificate, or NaN."""
    rating = np.nan
    try:
        for cert in movie['certificates']:
            if 'UNITED STATES' in cert.upper():
                rating = cert.split(':')[-1]
    except _FIELD_ERRORS:
        # Keep whatever was found before the malformed entry.
        pass
    return rating


def _plots(movie, company_names):
    """Return ``(imdb_plot, company_plots, first_plot)`` from ``movie['plot']``.

    Each plot entry is split on '::'; the first part is the text and the last
    part is treated as its author. A plot is credited to IMDB when the author
    contains 'IMDB', and to a company when the author is a substring of that
    company's name (once per matching company).
    """
    imdb_plot = np.nan
    company_plots = []
    first_plot = np.nan
    try:
        plots = movie['plot']
        for entry in plots:
            parts = entry.split('::')
            text, author = parts[0], parts[-1].upper()
            if 'IMDB' in author:
                imdb_plot = text
            company_plots.extend(
                text for name in company_names if author in name.upper()
            )
        first_plot = plots[0].split('::')[0]
    except _FIELD_ERRORS:
        # Keep whatever was collected before the missing/malformed entry.
        pass
    return imdb_plot, company_plots, first_plot


def _build_row(title, link, movie):
    """Return one row of values, ordered like ``_IMDB_COLUMNS``.

    *movie* may be None (no IMDB match); every lookup then falls back to its
    default, giving a row of NaN / empty-list placeholders.
    """
    try:
        runtime = movie['runtimes'][0]
    except _FIELD_ERRORS:
        runtime = np.nan

    box_office = _get(movie, 'box office', np.nan)
    production_cos = _company_names(movie, 'production companies')
    distribution_cos = _company_names(movie, 'distributors')
    imdb_plot, company_plots, first_plot = _plots(
        movie, distribution_cos + production_cos
    )

    return [
        title,
        link,
        _person_names(movie, 'cast', 20),
        _get(movie, 'genres', []),
        runtime,
        _get(movie, 'countries', []),
        _get(movie, 'languages', []),
        _leading_part(movie, 'aspect ratio', ' ('),
        _leading_part(box_office, 'Budget', ' ('),
        _leading_part(box_office, 'Opening Weekend United States', ', '),
        _leading_part(box_office, 'Cumulative Worldwide Gross', ', '),
        _us_certificate(movie),
        _get(movie, 'rating', np.nan),
        _get(movie, 'votes', np.nan),
        _person_names(movie, 'directors', 5),
        _person_names(movie, 'writers', 5),
        _person_names(movie, 'producers', 5),
        _person_names(movie, 'cinematographers', 5),
        production_cos,
        distribution_cos,
        imdb_plot,
        company_plots,
        first_plot,
        _get(movie, 'synopsis', np.nan),
    ]


def scrape_IMDB(df):
    """Look up every distinct title of *df* on IMDB and merge the results in.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Title' column and a 'Release' column whose values are
        'mm/dd/YYYY' strings (or NaN when unknown). For a repeated title the
        first row's release date is used.

    Returns
    -------
    pandas.DataFrame
        ``df`` inner-merged on 'Title' with the scraped ``*_IMDB`` columns.
        Titles without a usable release date are not scraped and therefore
        drop out of the merge. Titles with no matching IMDB movie keep their
        rows, with NaN / empty-list placeholders in the scraped columns.

    Notes
    -----
    A search hit matches when it is not an episode, its title contains neither
    "SPOILER" nor "REVIEW", its kind is 'movie' and its year equals the
    release year. Progress is printed to stdout.
    """
    ia = IMDb()

    rows = []
    # `title in some_series` tests the index, not the values, so already
    # scraped titles are tracked in a set instead.
    seen_titles = set()
    no_matches = []

    for movie_name, release in zip(df['Title'], df['Release']):
        if movie_name in seen_titles:
            continue
        seen_titles.add(movie_name)

        if pd.isna(release) or str(release) == 'nan':
            print('{} - no release year'.format(movie_name))
            no_matches.append(movie_name)
            continue
        try:
            movie_year = datetime.datetime.strptime(str(release), '%m/%d/%Y').year
        except ValueError:
            # One malformed date should not abort the whole scrape.
            print('{} - unparseable release date {!r}'.format(movie_name, release))
            no_matches.append(movie_name)
            continue
        print("\n{} - {}".format(movie_name, movie_year))

        # Resolve the match afresh for every title so that data from a
        # previously scraped movie can never leak into this row.
        hit = _find_movie(ia, movie_name, movie_year)
        if hit is None:
            print('{} - no IMDB match'.format(movie_name))
            no_matches.append(movie_name)
            movie = None
            imdb_link = np.nan
        else:
            movie = ia.get_movie(hit.movieID)
            imdb_link = ia.get_imdbURL(hit)
            print(imdb_link)

        rows.append(_build_row(movie_name, imdb_link, movie))

    if no_matches:
        print('\nNo IMDB data for {} title(s): {}'.format(len(no_matches), no_matches))

    # Build the scraped frame once instead of growing it row by row.
    scraped_df = pd.DataFrame(rows, columns=_IMDB_COLUMNS)

    # Merge the ratings df with the scraped df.
    return pd.merge(df, scraped_df, on='Title', how='inner')

Overwriting IMDB_Scrape.py
