In [1]:
import numpy as np
import pandas as pd
import requests
from tqdm import tqdm
tqdm.pandas()

In [2]:
# Read the movie metadata table (the first CSV row holds the column names)
# and swap every NaN cell for Python None in the same expression.
movies = (
    pd.read_csv("../data/drive/movies.csv", header=0)
    .replace({np.nan: None})
)
movies.head(2)

Unnamed: 0,id,title,tagline,description,genres,keywords,date,collection,runtime,revenue,...,director,cast,production_companies,production_countries,popularity,average_vote,num_votes,language,imdb_id,poster_url
0,862,Toy Story,,"Led by Woody, Andy's toys live happily in his ...","animation, comedy, family","jealousy, toy, boy, friendship, friends, rival...",1995-10-30,Toy Story Collection,81,373554000.0,...,John Lasseter,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney,...",Pixar Animation Studios,United States of America,21.9469,7.7,5415,en,tt0114709,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg
1,8844,Jumanji,Roll the dice and unleash the excitement!,When siblings Judy and Peter discover an encha...,"adventure, fantasy, family","board game, disappearance, based on children's...",1995-12-15,,104,262797000.0,...,Joe Johnston,"Robin Williams, Jonathan Hyde, Kirsten Dunst, ...","TriStar Pictures, Teitler Film, Interscope Com...",United States of America,17.0155,6.9,2413,en,tt0113497,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg


In [3]:
# Ids whose lookup failed. NOTE: this list lives in the process that calls
# movie_revenue. When movie_revenue is executed inside multiprocessing.Pool
# workers, the appends land in each worker's own copy, so the list seen by the
# notebook process is not updated by those calls.
fail_request = list()


def movie_revenue(movie_id: int, timeout: float = 30.0) -> float:
    """Fetch the revenue TMDB reports for one movie.

    Parameters
    ----------
    movie_id : int
        TMDB movie id; it is formatted into the request URL.
    timeout : float, optional
        Seconds passed to ``requests.get`` as ``timeout`` so that a stalled
        connection cannot block the caller forever.

    Returns
    -------
    float
        The ``revenue`` field of the JSON response when the request succeeds
        with HTTP 200. ``0.0`` when the request raises, the status is not 200,
        the body is not a JSON object, or ``revenue`` is absent; in all of
        those cases ``movie_id`` is appended to ``fail_request``.
    """
    import os  # local import: only needed to read the API key override

    # FIXME(security): the literal key is kept solely as a fallback so existing
    # runs keep working; set TMDB_API_KEY in the environment and remove it.
    api_key = os.environ.get("TMDB_API_KEY", "2d8e3000e80b3a5a64a5ff5ecce0ad7e")
    URL = "https://api.themoviedb.org/3/movie/{}?api_key={}".format(movie_id, api_key)

    revenue = None
    try:
        r = requests.get(url=URL, timeout=timeout)
        # Only parse the body after the status check: error responses are not
        # guaranteed to carry JSON.
        if r.status_code == 200:
            data = r.json()
            if isinstance(data, dict):
                revenue = data.get('revenue')
    except (requests.RequestException, ValueError):
        # Network failure/timeout, or a body that could not be decoded as JSON.
        revenue = None

    if revenue is None:
        fail_request.append(movie_id)
        return 0.0
    return revenue

In [4]:
#multiprocessing
from multiprocessing import  Pool
from functools import partial
from os import cpu_count
 
# Default worker count. cpu_count() may return None when the number of CPUs
# cannot be determined, so fall back to a single process in that case.
n_cpu = cpu_count() or 1
 
def _split_into_chunks(data, sections):
    """Cut ``data`` into ``sections`` consecutive chunks of near-equal length."""
    if not hasattr(data, "iloc"):
        # Not a pandas object: keep the original NumPy-based splitting.
        return np.array_split(data, sections)
    if sections < 1:
        raise ValueError("number sections must be larger than 0.")
    # Same sizing rule as np.array_split: the first `extra` chunks get one
    # additional element. Positional .iloc slicing keeps every chunk a pandas
    # object that retains its original index labels.
    base, extra = divmod(len(data), sections)
    chunks = []
    start = 0
    for position in range(sections):
        stop = start + base + (1 if position < extra else 0)
        chunks.append(data.iloc[start:stop])
        start = stop
    return chunks


def parallelize(data, func, num_of_processes=n_cpu):
    """Apply ``func`` to chunks of ``data`` in a process pool and concatenate the results.

    Parameters
    ----------
    data
        Object to split; pandas objects are sliced with ``.iloc``, anything
        else is handed to ``np.array_split``.
    func : callable
        Called once per chunk through ``Pool.map``; every return value is
        passed to ``pd.concat``.
    num_of_processes : int, optional
        Number of chunks and of worker processes. ``None`` is replaced by
        ``cpu_count()``, or 1 if that is unavailable.

    Returns
    -------
    The ``pd.concat`` of the per-chunk results, in chunk order.
    """
    if num_of_processes is None:
        num_of_processes = cpu_count() or 1
    data_split = _split_into_chunks(data, num_of_processes)
    # The context manager guarantees the workers are torn down even when
    # func raises inside pool.map; the normal path still closes and joins.
    with Pool(num_of_processes) as pool:
        results = pool.map(func, data_split)
        pool.close()
        pool.join()
    return pd.concat(results)
 
def run_on_subset(func, data_subset):
    """Run ``func`` over one chunk via its ``progress_apply`` method and return the outcome."""
    applied = data_subset.progress_apply(func)
    return applied
 
def parallelize_on_rows(data, func, num_of_processes=n_cpu):
    """Apply ``func`` element-wise to ``data`` by delegating chunked work to ``parallelize``.

    ``func`` is bound as the first argument of ``run_on_subset`` so that each
    chunk handed out by ``parallelize`` is processed with it.
    """
    chunk_worker = partial(run_on_subset, func)
    return parallelize(data, chunk_worker, num_of_processes)

In [5]:
# Look up the revenue of every movie id with movie_revenue, spreading the
# requests over worker processes.
# (single-process alternative: movies['id'].progress_apply(movie_revenue))
revenue_lookup = parallelize_on_rows(movies['id'], movie_revenue)
movies['new_revenue'] = revenue_lookup

100%|██████████| 5829/5829 [38:01<00:00,  2.56it/s]  
100%|██████████| 5829/5829 [56:51<00:00,  1.71it/s]  
100%|██████████| 5828/5828 [1:00:24<00:00,  1.61it/s]
100%|██████████| 5829/5829 [1:03:04<00:00,  1.54it/s]
100%|██████████| 5828/5828 [1:03:08<00:00,  1.54it/s]
100%|██████████| 5829/5829 [1:03:23<00:00,  1.53it/s]
100%|██████████| 5828/5828 [1:03:29<00:00,  1.53it/s]
100%|██████████| 5828/5828 [1:03:31<00:00,  1.53it/s]


In [6]:
# Show the ids recorded as failed lookups.
# NOTE(review): movie_revenue appends to this list in whichever process runs
# it. The lookups above were executed in multiprocessing.Pool workers, so
# their appends are not visible here -- an empty list does not prove that
# every request succeeded.
fail_request

[]

In [7]:
# How many more movies have a non-zero revenue after the TMDB lookup than in
# the original 'revenue' column.
# NOTE(review): missing 'revenue' values also compare as != 0 and are counted
# as non-zero here -- confirm that is intended.
len(movies[movies['new_revenue'].ne(0)]) - len(movies[movies['revenue'].ne(0)])

1120

In [8]:
# Number of movies whose fetched revenue is non-zero.
len(movies[movies['new_revenue'].ne(0)])

8661