In [1]:
from time import time
from multiprocessing import Pool, cpu_count
import tqdm  # show multi-processing progress bar

import pandas as pd

import mojo_api

# for auto reload a module after editing.
%load_ext autoreload
%autoreload 2

In [2]:
# Fetch the yearly movie list for every year in the window, cache each year
# to its own CSV file, and combine all years into a single frame `df`.
year_start = 2014
year_end = 2019
year_frames = []
# NOTE(review): the range deliberately starts one year before `year_start`;
# presumably so titles already listed in the previous year can be told apart
# from new ones downstream — confirm.
for year in range(year_start - 1, year_end + 1):
    year_df = mojo_api.get_movie_list(year)
    year_df.to_csv('./data/_movie_list_{}.csv'.format(year), index=False)
    print("Finished collecting movie list of {}, movie number: {}".format(year, year_df.shape[0]))
    year_frames.append(year_df)
# Concatenate once: growing a frame with pd.concat inside the loop re-copies
# all previously collected rows on every iteration and starts from an empty
# placeholder frame, which pandas treats specially when inferring dtypes.
df = pd.concat(year_frames)

Finished collecting movie list of 2013, movie number: 802
Finished collecting movie list of 2014, movie number: 808
Finished collecting movie list of 2015, movie number: 803
Finished collecting movie list of 2016, movie number: 811
Finished collecting movie list of 2017, movie number: 816
Finished collecting movie list of 2018, movie number: 908
Finished collecting movie list of 2019, movie number: 833


In [4]:
# The same title can be listed in more than one year, so collapse the
# combined list to one row per title, keeping the first value seen in each
# column.
top = 150
first_per_title = {column: 'first' for column in ('Year', 'Rank', 'Title')}
df_remove_dup = (
    df.groupby('Title')
    .agg(first_per_title)
    .sort_values(['Year', 'Rank'])
)
# Keep only rows from `year_start` onwards whose rank is within `top`.
in_window = df_remove_dup['Year'] >= year_start
within_top = df_remove_dup['Rank'] <= top
df_remove_dup = df_remove_dup[in_window & within_top]
df_remove_dup.to_csv('./data/_movie_list_all.csv', index=False)

In [38]:
# Reload the de-duplicated movie list and add one empty (None) column for
# every per-movie attribute that is filled in later.
df = pd.read_csv('./data/_movie_list_all.csv')
for column in (
    'tt_id', 'rl_id', 'release_date', 'company', 'mpaa', 'genres',
    'director', 'actors', 'budget', 'bo_opening', 'bo_gross', 'imdb_score',
):
    df[column] = None

In [41]:
def add_info_chunk(df):
    """Fill the per-movie info columns of ``df`` for every row.

    For each row a ``mojo_api.Movie`` is created and populated through
    ``get_train_movie_info(title, year)`` with the row's ``Title`` and
    ``Year``; the movie's attributes are then copied into the columns of
    the same name.

    Rows are both read and written by position, so the frame may carry any
    index (it does not have to be a fresh ``0..n-1`` range). Mixing
    positional reads with label-based writes would otherwise update the
    wrong rows, or append new ones, whenever the labels differ from the
    positions.

    Args:
        df: DataFrame with ``Title`` and ``Year`` columns. It is modified
            in place; info columns that are missing are created first.

    Returns:
        The same DataFrame object, with the info columns filled in.
    """
    info_columns = (
        'tt_id', 'rl_id', 'release_date', 'company', 'mpaa', 'genres',
        'director', 'actors', 'budget', 'bo_opening', 'bo_gross', 'imdb_score',
    )
    # Positional writes need an existing destination column, so create any
    # that the caller did not provide.
    for column in info_columns:
        if column not in df.columns:
            df[column] = None
    column_positions = [df.columns.get_loc(column) for column in info_columns]

    for row in range(df.shape[0]):
        movie = mojo_api.Movie()
        movie.get_train_movie_info(df['Title'].iloc[row], df['Year'].iloc[row])
        # Column names match the Movie attribute names one-to-one.
        for column, position in zip(info_columns, column_positions):
            df.iloc[row, position] = getattr(movie, column)
    return df

# Fill in the movie details in parallel: one task per rank (1..top), each
# task holding the rows of that rank with its index reset. The elapsed
# wall-clock time is reported in minutes.
time_start = time()
total = top
rank_chunks = [
    df.loc[df['Rank'] == rank].reset_index(drop=True)
    for rank in range(1, top + 1)
]
with Pool(cpu_count()) as pool:
    # imap yields results in task order; tqdm wraps it to show progress.
    progress = tqdm.tqdm(pool.imap(add_info_chunk, rank_chunks), total=total)
    chunks = list(progress)
time_end = time()
time_run = (time_end - time_start) / 60
print("Finished. Time: {0:.1f} min.".format(time_run))





  0%|          | 0/150 [00:00<?, ?it/s][A[A[A[A



  1%|          | 1/150 [00:25<1:04:28, 25.96s/it][A[A[A[A



  3%|▎         | 5/150 [00:43<47:05, 19.48s/it]  [A[A[A[A

Finished. Time: 0.7 min.


In [44]:
# Stitch the per-rank chunks back into one frame ordered by year and rank,
# with a fresh 0..n-1 index. A single concat over the whole list avoids
# re-copying the accumulated frame once per chunk and avoids concatenating
# with an empty placeholder frame.
df_filled = (
    pd.concat(chunks)
    .sort_values(['Year', 'Rank'])
    .reset_index(drop=True)
)

In [45]:
# Preview the first 40 rows of the merged frame.
df_filled.head(40)

Unnamed: 0,Year,Rank,Title,tt_id,rl_id,release_date,company,mpaa,genres,director,actors,budget,bo_opening,bo_gross,imdb_score
0,2014,1,Guardians of the Galaxy,tt2015381,rl3177416193,2014-08-01,Walt Disney Studios Motion Pictures,PG-13,"Action,Adventure,Comedy,Sci-Fi",James Gunn,"Chris Pratt,Vin Diesel",170,94.3209,333.177,8.0
1,2014,2,The Hunger Games: Mockingjay - Part 1,tt1951265,rl4283991553,2014-11-21,Lionsgate,PG-13,"Action,Adventure,Sci-Fi,Thriller",Francis Lawrence,"Jennifer Lawrence,Josh Hutcherson",125,121.898,337.136,6.6
2,2014,3,Captain America: The Winter Soldier,tt1843866,rl3194193409,2014-04-04,Walt Disney Studios Motion Pictures,PG-13,"Action,Adventure,Sci-Fi,Thriller",Anthony Russo,"Chris Evans,Samuel L. Jackson",170,95.0237,259.767,7.7
3,2014,4,The Lego Movie,tt1490017,rl643728897,2014-02-07,Warner Bros.,PG,"Action,Adventure,Animation,Comedy,Family,Fantasy",Christopher Miller,"Chris Pratt,Will Ferrell",60,69.0503,257.761,7.7
4,2014,5,Transformers: Age of Extinction,tt2109248,rl2960623105,2014-06-27,Paramount Pictures,PG-13,"Action,Adventure,Sci-Fi",Michael Bay,"Mark Wahlberg,Nicola Peltz",210,100.038,245.439,5.6
5,2015,1,Jurassic World,tt0369610,rl2371716609,2015-06-12,Universal Pictures,PG-13,"Action,Adventure,Sci-Fi",Colin Trevorrow,"Chris Pratt,Bryce Dallas Howard",150,208.806,652.271,7.0
6,2015,2,Star Wars: Episode VII - The Force Awakens,tt2488496,rl2691925505,2015-12-18,Walt Disney Studios Motion Pictures,PG-13,"Action,Adventure,Sci-Fi",J.J. Abrams,"Daisy Ridley,John Boyega",245,247.967,936.662,7.9
7,2015,3,Avengers: Age of Ultron,tt2395427,rl675644929,2015-05-01,Walt Disney Studios Motion Pictures,PG-13,"Action,Adventure,Sci-Fi",Joss Whedon,"Robert Downey Jr.,Chris Evans",250,191.271,459.006,7.3
8,2015,4,Inside Out,tt2096673,rl4016604673,2015-06-19,Walt Disney Studios Motion Pictures,PG,"Adventure,Animation,Comedy,Drama,Family,Fantasy",Pete Docter,"Amy Poehler,Bill Hader",175,90.4403,356.462,8.1
9,2015,5,Furious 7,tt2820852,rl1045661185,2015-04-03,Universal Pictures,PG-13,"Action,Adventure,Thriller",James Wan,"Vin Diesel,Paul Walker",190,147.187,353.007,7.1


In [34]:
# Show only the rows whose Year is 2019, renumbered from zero.
df.loc[df['Year'].eq(2019)].reset_index(drop=True)

Unnamed: 0,Year,Rank,Title,tt_id,rl_id,release_date,company,mpaa,genres,director,actors,budget,bo_opening,bo_gross,imdb_score
0,2019,1,Avengers: Endgame,,,,,,,,,,,,
1,2019,2,The Lion King,,,,,,,,,,,,
2,2019,3,Toy Story 4,,,,,,,,,,,,
3,2019,4,Frozen II,,,,,,,,,,,,
4,2019,5,Captain Marvel,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124,2019,145,The Dead Don't Die,,,,,,,,,,,,
125,2019,146,Jexi,,,,,,,,,,,,
126,2019,147,Run the Race,,,,,,,,,,,,
127,2019,149,The Current War: Director's Cut,,,,,,,,,,,,


In [154]:
# NOTE(review): no visible cell defines `movie` at notebook scope (it is only
# a local inside add_info_chunk), so this probe depends on earlier kernel
# state — confirm whether it is still needed or remove it.
movie.year

2019

In [67]:
ii = 202, "Unfriended" tt4680034 not correct
ii = 445, "The Great Wall" tt7535780 not correct
ii = 766, "Greta" tt9477532

https://www.imdb.com/search/title/?title=Lone%20Survivor&release_date=2014-01-01,2014-12-31


NameError: name 'headers' is not defined

In [164]:
# Rows that have a tt_id but no budget. Both conditions are combined into a
# single mask over `df`: applying a second mask built from the unfiltered
# frame to an already-filtered frame makes pandas reindex the mask and emit
# a "Boolean Series key will be reindexed" warning.
df[df['tt_id'].notnull() & df['budget'].isnull()]

  """Entry point for launching an IPython kernel.


Unnamed: 0,Year,Rank,Title,tt_id,rl_id,release_date,company,mpaa,genres,director,actors,budget,bo_opening,bo_gross,imdb_score
9,2014,10,The Amazing Spider-Man 2,tt1872181,rl846431745,2014-05-02,Sony Pictures Releasing,PG-13,"Action,Adventure,Sci-Fi",Marc Webb,"Andrew Garfield,Andrew Garfield",,91608337,202853933,6.6
13,2014,14,The Hobbit: The Battle of the Five Armies,tt2310332,rl2354677249,2014-12-17,Warner Bros.,PG-13,"Adventure,Fantasy",Peter Jackson,"Ian McKellen,Ian McKellen",,54724334,255119788,7.4
48,2014,53,Son of God,tt3210686,rl2574288385,2014-02-28,Twentieth Century Fox,PG-13,"Biography,Drama,History",Christopher Spencer,"Diogo Morgado,Diogo Morgado",,25601865,59700064,5.7
72,2014,78,St. Vincent,tt2170593,rl1282573825,2014-10-10,The Weinstein Company,PG-13,"Comedy,Drama",Theodore Melfi,"Bill Murray,Bill Murray",,109878,44137712,7.2
75,2014,81,The Expendables 3,tt2333784,rl1448117761,2014-08-15,Lionsgate,PG-13,"Action,Adventure,Thriller",Patrick Hughes,"Sylvester Stallone,Sylvester Stallone",,15879645,39322544,6.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
776,2019,143,Brittany Runs a Marathon,tt7671064,rl705201665,2019-08-23,Amazon Studios,R,"Comedy,Drama",Paul Downs Colaizzo,"Jillian Bell,Jillian Bell",,180711,7189808,6.8
777,2019,145,The Dead Don't Die,tt8695030,rl3960047105,2019-06-14,Focus Features,R,"Comedy,Fantasy,Horror,Sci-Fi",Jim Jarmusch,"Bill Murray,Bill Murray",,2540240,6563605,5.5
778,2019,146,Jexi,tt9354944,rl3423110657,2019-10-11,Lionsgate,R,Comedy,Jon Lucas,"Adam Devine,Adam Devine",,3106730,6546159,6.1
779,2019,147,Run the Race,tt3201736,rl738625025,2019-02-22,Roadside Attractions,PG,"Drama,Sport",Chris Dowling,"Tanner Stine,Tanner Stine",,2161480,6424420,6
