In [1]:
from time import time
from multiprocessing import Pool, cpu_count
import tqdm  # show multi-processing progress bar

import pandas as pd

import mojo_api

# Automatically reload imported modules after they are edited.
%load_ext autoreload
%autoreload 2

In [2]:
# Range of release years to collect (inclusive).
year_start = 2014
year_end = 2019

# The loop starts one year before `year_start`; presumably this is so titles
# that were first listed in the previous year can be recognised by the
# de-duplication step that follows -- confirm before changing the range.
yearly_lists = []
for year in range(year_start - 1, year_end + 1):
    year_df = mojo_api.get_movie_list(year)
    # Keep a per-year snapshot on disk next to the combined list.
    year_df.to_csv('./data/_movie_list_{}.csv'.format(year), index=False)
    print("Finished collecting movie list of {}, movie number: {}".format(year, year_df.shape[0]))
    yearly_lists.append(year_df)

# Concatenate once instead of growing a DataFrame inside the loop: repeated
# concat re-copies all previous rows on every iteration and relies on
# concatenating an empty frame, which recent pandas versions deprecate.
df = pd.concat(yearly_lists)

Finished collecting movie list of 2013, movie number: 802
Finished collecting movie list of 2014, movie number: 808
Finished collecting movie list of 2015, movie number: 803
Finished collecting movie list of 2016, movie number: 811
Finished collecting movie list of 2017, movie number: 816
Finished collecting movie list of 2018, movie number: 908
Finished collecting movie list of 2019, movie number: 833


In [3]:
# The same title can be listed in two consecutive years, so collapse the
# combined list to one row per title, keeping the first Year/Rank seen.
top = 150  # keep only movies ranked within the top N of their year

first_listing = df.groupby('Title').agg({
    'Year': 'first',
    'Rank': 'first',
    'Title': 'first'
})
ordered = first_listing.sort_values(['Year', 'Rank'])

# Keep rows from `year_start` onwards whose rank is within `top`.
in_year_range = ordered['Year'] >= year_start
in_top_ranks = ordered['Rank'] <= top
df_remove_dup = ordered[in_year_range & in_top_ranks]

df_remove_dup.to_csv('./data/_movie_list_all.csv', index=False)

In [4]:
# Reload the de-duplicated movie list and add one empty (None) column for
# every piece of metadata that will be filled in per movie.
df = pd.read_csv('./data/_movie_list_all.csv')

for empty_col in (
    'tt_id',
    'rl_id',
    'release_date',
    'company',
    'mpaa',
    'genres',
    'director',
    'actors',
    'budget',
    'bo_opening',
    'bo_gross',
    'imdb_score',
):
    df[empty_col] = None

In [5]:
def add_info_chunk(df):
    """Fill the metadata columns of every row in ``df`` via ``mojo_api``.

    For each row a ``mojo_api.Movie`` is created, ``get_train_movie_info`` is
    called with that row's ``Title`` and ``Year``, and the movie's attributes
    are copied into the same-named columns of the row.

    Rows are read and written by position, so the frame may carry any index.
    (Writing with ``df.loc[i, ...]`` while reading with ``iloc[i]`` is only
    correct for a 0..n-1 index; with any other index it appends new rows
    instead of filling the existing ones.)

    Args:
        df: DataFrame with ``Title`` and ``Year`` columns. Metadata columns
            that are missing are created and initialised to ``None``.

    Returns:
        The same DataFrame object, modified in place.
    """
    fields = (
        'tt_id',
        'rl_id',
        'release_date',
        'company',
        'mpaa',
        'genres',
        'director',
        'actors',
        'budget',
        'bo_opening',
        'bo_gross',
        'imdb_score',
    )

    # Make sure every target column exists before resolving its position.
    for name in fields:
        if name not in df.columns:
            df[name] = None

    # Column positions are fixed for the duration of the loop; look them up once.
    column_pos = {name: df.columns.get_loc(name) for name in fields}

    for row in range(df.shape[0]):
        title = df['Title'].iloc[row]
        year = df['Year'].iloc[row]
        movie = mojo_api.Movie()
        movie.get_train_movie_info(title, year)
        for name, col in column_pos.items():
            df.iat[row, col] = getattr(movie, name)
    return df

# Fetch metadata in parallel: one work item per rank value (1..top), each
# holding every row of `df` with that rank, re-indexed from zero.
time_start = time()
with Pool(cpu_count()) as p:
    total = top
    rank_chunks = [
        df[df['Rank'] == rank].reset_index(drop=True)
        for rank in range(1, top + 1)
    ]
    # imap yields results in submission order; tqdm shows progress as each
    # chunk comes back.
    progress = tqdm.tqdm(p.imap(add_info_chunk, rank_chunks), total=total)
    chunks = list(progress)
time_end = time()

# Report the elapsed wall-clock time in minutes.
time_run = (time_end - time_start) / 60
print("Finished. Time: {0:.1f} min.".format(time_run))

  1%|          | 1/150 [00:19<49:24, 19.90s/it]

imdb_id not found. title: Star Wars: Episode IX - The Rise of Skywalker, year: 2019.


 16%|█▌        | 24/150 [01:52<05:29,  2.62s/it]

imdb_id not found. title: The Angry Birds Movie, year: 2016.


 17%|█▋        | 25/150 [02:04<10:59,  5.28s/it]

imdb_id not found. title: The Upside, year: 2019.


 23%|██▎       | 34/150 [02:41<08:15,  4.27s/it]

imdb_id not found. title: A Madea Family Funeral, year: 2019.


 58%|█████▊    | 87/150 [05:50<04:21,  4.15s/it]

Isle of Dogs : budget still not found on IMDB.


 61%|██████    | 91/150 [06:03<03:30,  3.57s/it]

Judy : budget still not found on IMDB.


 65%|██████▍   | 97/150 [06:21<03:23,  3.85s/it]

imdb_id not found. title: The Witch, year: 2016.


 69%|██████▊   | 103/150 [06:49<03:40,  4.69s/it]

Won't You Be My Neighbor? : budget still not found on IMDB.


 75%|███████▍  | 112/150 [07:20<02:25,  3.82s/it]

The Farewell : budget still not found on IMDB.


 75%|███████▌  | 113/150 [07:20<01:42,  2.76s/it]

The Choice : budget still not found on IMDB.


 77%|███████▋  | 116/150 [07:35<02:30,  4.44s/it]

imdb_id not found. title: The Transporter Refuelled, year: 2015.
Monkey Kingdom : budget still not found on IMDB.


 80%|████████  | 120/150 [07:52<02:12,  4.43s/it]

Megan Leavey : budget still not found on IMDB.


 84%|████████▍ | 126/150 [08:14<01:31,  3.81s/it]

RBG : budget still not found on IMDB.
Strange Magic : budget still not found on IMDB.


 85%|████████▍ | 127/150 [08:23<02:01,  5.28s/it]

The Lighthouse : budget still not found on IMDB.


 86%|████████▌ | 129/150 [08:23<01:19,  3.77s/it]

Greta : budget still not found on IMDB.
Dark Waters : budget still not found on IMDB.


 87%|████████▋ | 130/150 [08:34<01:55,  5.79s/it]

Padmaavat : budget still not found on IMDB.


 89%|████████▊ | 133/150 [08:40<01:19,  4.68s/it]

Three Identical Strangers : budget still not found on IMDB.
Hotel Mumbai : budget still not found on IMDB.


 90%|█████████ | 135/150 [08:49<01:03,  4.25s/it]

No manches Frida : budget still not found on IMDB.
The Old Man & the Gun : budget still not found on IMDB.
Free Solo : budget still not found on IMDB.
Apollo 11 : budget still not found on IMDB.
No Manches Frida 2 : budget still not found on IMDB.


 93%|█████████▎| 140/150 [09:00<00:32,  3.22s/it]

The Miracle Season : budget still not found on IMDB.


 94%|█████████▍| 141/150 [09:03<00:29,  3.25s/it]

Amy : budget still not found on IMDB.
Brittany Runs a Marathon : budget still not found on IMDB.
Bajrangi Bhaijaan : budget still not found on IMDB.
PK : budget still not found on IMDB.
Penguins : budget still not found on IMDB.


 95%|█████████▍| 142/150 [09:19<00:57,  7.19s/it]

The Dead Don't Die : budget still not found on IMDB.


 97%|█████████▋| 145/150 [09:20<00:25,  5.13s/it]

I Am Not Your Negro : budget still not found on IMDB.
Island of Lemurs: Madagascar : budget still not found on IMDB.
Run the Race : budget still not found on IMDB.


 98%|█████████▊| 147/150 [09:26<00:13,  4.39s/it]

I'll See You in My Dreams : budget still not found on IMDB.
The Wife : budget still not found on IMDB.
imdb_id not found. title: The Current War: Director's Cut, year: 2019.
Beatriz at Dinner : budget still not found on IMDB.


100%|██████████| 150/150 [09:37<00:00,  3.85s/it]

Finished. Time: 9.6 min.





In [10]:
# Spot-check a single movie in the collected metadata.
# NOTE(review): `df_filled` is not assigned in any cell above this one; it is
# only built in the later cell In [19]. On a fresh kernel this cell raises
# NameError unless that cell has been run first.
title = "The Farewell"
df_filled.loc[df_filled['Title'] == title]

Unnamed: 0,Year,Rank,Title,tt_id,rl_id,release_date,company,mpaa,genres,director,actors,budget,bo_opening,bo_gross,imdb_score
753,2019,113,The Farewell,tt8637428,rl1091143169,2019-07-12,A24,PG,"Comedy,Drama",Lulu Wang,"Shuzhen Zhao,Awkwafina",,0.355662,17.6958,7.6


In [19]:
# Stitch the per-rank chunks back into one frame with a single concat;
# growing a DataFrame with concat inside a loop re-copies all earlier rows on
# every iteration and relies on concatenating an empty frame.
df_filled = pd.concat(chunks)
df_filled = df_filled.sort_values(['Year', 'Rank'])

# Keep only movies for which both an IMDb id and a budget were collected.
df_filled = df_filled.dropna(subset=['tt_id', 'budget'])

# Reset the index after filtering so the in-memory frame is numbered 0..n-1
# without gaps (the CSV is written without the index either way).
df_filled = df_filled.reset_index(drop=True)
df_filled.to_csv('./data/metadata_2014-2019.csv', index=False)