In [48]:
import os
import numpy as np
import pandas as pd
from math import ceil
from tqdm import tqdm
from joblib import Parallel, delayed
from sklearn.preprocessing import MultiLabelBinarizer

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

In [6]:
INTERIM_DIR = '..\\data\\interim'
FINAL_DIR = '..\\data\\final'

#### Reading all data files

In [7]:
movies_df = pd.read_csv(os.path.join(INTERIM_DIR, 'movies.csv'))
title_ratings_df = pd.read_csv(os.path.join(INTERIM_DIR, 'title_ratings.csv'))
title_akas_df = pd.read_csv(os.path.join(INTERIM_DIR, 'title.akas.csv'))
title_principals_df = pd.read_csv(os.path.join(INTERIM_DIR, 'title.principals.csv'))
title_crew_df = pd.read_csv(os.path.join(INTERIM_DIR, 'title.crew.csv'))
names_df = pd.read_csv(os.path.join(INTERIM_DIR, 'names.basics.csv'))

### New Feature 1: Popularity Index of Actors/Directors/Crew

#### Popularity Index of a Person

**Description:**  
  
  1. For each person in the names_df, we have a list of titleID.
  2. Using the list of titleIDs, we can get a list of each person's ratings from the title_ratings_df.
  
**Assumption:**
  1. The main assumption is that the ratings associated with the title are related with how popular the movie is.
  2. We are going to infer the popularity of the person from the popularity of the titles, which assumes that the titles represent their popularity entirely.

**Intuition:**
  
  1. **Mean** of the list of ratings gives us an idea of how popular the person is. Even if the person generally is average, but they have done one/two movies that perform exceptionally well (according to ratings), the mean will capture that.
  2. **Median** of the list of ratings captures the general consensus of the person's performance that is how they are usually expected to perform.
  3. **Standard Deviation** of these numbers will characterize the distribution of the ratings. For Eg. if the mean of the ratings is high, and the standard deviation is low, it means that the person consistently has performed exceptionally well. Similarly if the mean is low but standard deviation is high, it means that the person has been 'average' in their performance sometimes.

In [8]:
names_df

Unnamed: 0.1,Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,0,nm0000001,Fred Astaire,1899.0,1987.0,"soundtrack,actor,miscellaneous","tt0053137,tt0031983,tt0050419,tt0072308"
1,1,nm0000002,Lauren Bacall,1924.0,2014.0,"actress,soundtrack","tt0071877,tt0038355,tt0117057,tt0037382"
2,2,nm0000003,Brigitte Bardot,1934.0,,"actress,soundtrack,music_department","tt0057345,tt0056404,tt0049189,tt0054452"
3,3,nm0000004,John Belushi,1949.0,1982.0,"actor,soundtrack,writer","tt0072562,tt0080455,tt0078723,tt0077975"
4,4,nm0000005,Ingmar Bergman,1918.0,2007.0,"writer,director,actor","tt0069467,tt0060827,tt0083922,tt0050986"
...,...,...,...,...,...,...,...
851242,851242,nm9993533,Lara Christie,,,actress,"tt0123527,tt0074965"
851243,851243,nm9993594,Adeoluwa Owu,,,"cinematographer,director,camera_department","tt13204624,tt15892144,tt16423720,tt16253232"
851244,851244,nm9993616,Ryan Mac Lennan,,,actor,tt4844148
851245,851245,nm9993650,Marcin Balcerak,,,actor,tt8739208


In [9]:
def calculate_pop_ind_parallel(names_df, title_ratings_df):
    '''
    Calculate the popularity index of a person using titles they 
    are known for in parallel.
    
    params:
        - names_df         (DataFrame): names table
        - title_ratings_df (DataFrame): ratings table
    
    returns:
        - output      (list of tuples): list of popularity indices
    '''
    name_IDs = names_df['nconst'].tolist()
    
    # Get the number of logical processors and distribute the load
    num_cores = os.cpu_count()
    names_per_core = ceil(len(name_IDs)/num_cores)
    print("names_per_core: {}".format(names_per_core))
    
    name_list_per_core = [name_IDs[i*names_per_core: (i+1)*names_per_core]
                          for i in range(num_cores)]
    
    # Execute in parallel
    output_list = Parallel(n_jobs=-1, verbose=30, max_nbytes=None)(delayed(get_pop_ind)(nameID_list, 
                                                                                        names_df.copy(), 
                                                                                        title_ratings_df.copy())
                                                                  for nameID_list in name_list_per_core)
    
    # Aggregate results from all parallel workers
    result = []
    [result.extend(list_) for list_ in output_list if list_!=[]]
    
    return result

In [10]:
def get_pop_ind(nameID_list, names_df, title_ratings_df):
    '''
    Calculate the popularity index values from the names_df and title ratings 
    
    Steps (for each nameID):
    1. Collect the titleIDs that a person is known for 
    2. Collect the ratings of these titles from the ratings table
    3. Calculate the mean and std. deviation for those ratings and return
    
    params:
        - nameID_list             (list): list of nameIDs
        - names_df           (DataFrame): names table
        - title_ratings_df   (DataFrame): ratings table
    
    returns:
        - pop_inds      (list of tuples): list of popularity indices
    '''
    
    pop_inds = []
    
    for nameID in nameID_list:
        
        pop_ind_mean, pop_ind_med, pop_ind_std = 0,0,0
        
        # get the record for nameID from names_df
        knownfor = names_df[names_df['nconst']==nameID]['knownForTitles'].iloc[0]
        
        if not pd.isna(knownfor):
            
            # get the titles that the person is known for
            titles = [title for title in knownfor.split(',') if title!='']
            # get the ratings for those titles
            ratings = title_ratings_df[title_ratings_df['tconst'].isin(titles)]['averageRating']
            
            pop_ind_mean = ratings.mean()
            pop_ind_std = ratings.std()
            pop_ind_med = ratings.median()
        
        pop_inds.append((pop_ind_mean, pop_ind_med, pop_ind_std))
    
    return pop_inds

In [11]:
popularity_index = calculate_pop_ind_parallel(names_df, title_ratings_df)

names_per_core: 53203


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 147.1min
[Parallel(n_jobs=-1)]: Done   2 out of  16 | elapsed: 154.7min remaining: 1082.9min
[Parallel(n_jobs=-1)]: Done   3 out of  16 | elapsed: 156.7min remaining: 679.2min
[Parallel(n_jobs=-1)]: Done   4 out of  16 | elapsed: 160.7min remaining: 482.2min
[Parallel(n_jobs=-1)]: Done   5 out of  16 | elapsed: 162.3min remaining: 357.0min
[Parallel(n_jobs=-1)]: Done   6 out of  16 | elapsed: 162.5min remaining: 270.8min
[Parallel(n_jobs=-1)]: Done   7 out of  16 | elapsed: 162.5min remaining: 208.9min
[Parallel(n_jobs=-1)]: Done   8 out of  16 | elapsed: 162.6min remaining: 162.6min
[Parallel(n_jobs=-1)]: Done   9 out of  16 | elapsed: 162.7min remaining: 126.5min
[Parallel(n_jobs=-1)]: Done  10 out of  16 | elapsed: 162.8min remaining: 97.7min
[Parallel(n_jobs=-1)]: Done  11 out of  16 | elapsed: 162.9min remaining: 74.0min
[Parallel(n_jobs=-1)]: Done  12

In [12]:
popularity_index[0:10]

[(7.025, 7.0, 0.12583057392117913),
 (7.4, 7.55, 0.5944184833375672),
 (6.775, 6.949999999999999, 1.0626225419530053),
 (7.300000000000001, 7.7, 1.023067283548187),
 (8.125, 8.1, 0.04999999999999982),
 (8.0, 8.0, 0.43969686527576385),
 (8.025, 7.9, 0.3304037933599836),
 (8.174999999999999, 8.25, 0.9105858919765154),
 (7.625, 7.699999999999999, 0.3862210075418824),
 (7.9, 7.9, 0.163299316185545)]

In [13]:
mean, median, std = zip(*popularity_index)

In [14]:
print(len(mean), len(median), len(std))

851247 851247 851247


#### Setting the values in the Dataframe

In [15]:
names_df['pop_ind_mean'] = list(mean)
names_df['pop_ind_median'] = list(median)
names_df['pop_ind_std'] = list(std)

In [16]:
names_df[['pop_ind_mean', 'pop_ind_median','pop_ind_std']]

Unnamed: 0,pop_ind_mean,pop_ind_median,pop_ind_std
0,7.025000,7.00,0.125831
1,7.400000,7.55,0.594418
2,6.775000,6.95,1.062623
3,7.300000,7.70,1.023067
4,8.125000,8.10,0.050000
...,...,...,...
851242,4.550000,4.55,0.636396
851243,,,
851244,6.500000,6.50,
851245,5.300000,5.30,


#### Let us test how these values behave for extremely well known actors

In [28]:
stars = ['Tom Cruise', 'Leonardo DiCaprio', 'Angelina Jolie', 'Nicole Kidman', 'Brad Pitt', 'Johnny Depp']

names_df[names_df['primaryName'].isin(stars)][['primaryName','pop_ind_mean', 'pop_ind_median','pop_ind_std']]

Unnamed: 0,primaryName,pop_ind_mean,pop_ind_median,pop_ind_std
92,Brad Pitt,7.15,7.05,0.768115
128,Tom Cruise,7.0,7.1,0.68313
135,Johnny Depp,7.25,7.2,0.58023
137,Leonardo DiCaprio,8.325,8.35,0.4272
172,Nicole Kidman,7.525,7.55,0.411299
1390,Angelina Jolie,7.1,7.15,0.547723


#### As expected, the values of the popularity index are fairly high for these actors and the standard deviation is low. This suggests consistently great performance as we had imagined.

In [29]:
names_df.to_csv(os.path.join(FINAL_DIR, 'names.basics.csv'), index=False)

### Infering the popularity of the movie from the popularity of its cast and crew

**Description:**  
1. We essentially want to estimate the popularity of a movie based on the popularity of its actors/directors/writers
2. For this, we analyze the means and standard deviations of each of the values that make up our popularity index for persons
3. These 6 values collectively will represent the popularity index of the movie.
4. By calculating the mean and standard deviation, we effectively summarize the popularity indices of its cast

**NOTE:** This again takes into account the crew and only the top 5 actors as we have filtered the title.principals table

In [30]:
def split_nameIDs(nconst_str):
    '''
    Split the nconst string by commas and filter out empty strings
    
    params:
        - nconst_str (str): String of comma separated nameIDs
    
    returns:
        - (list): List of nameIDs
    '''
    return [nameID for nameID in nconst_str.split(',') if nameID!='']

In [31]:
def get_movie_popind(titleID_list, 
                     title_principals=title_principals_df, 
                     title_crew=title_crew_df, 
                     names_df=names_df):
    '''
    Get a feature vector representing the popularity index of movie based
    on the popularity indices of its cast and crew 
    
    params:
        - titleID_list            (list): list of titleIDs
        - title_principals   (DataFrame): movie actors table
        - title_crew         (DataFrame): title crew data
        - names_df           (DataFrame): names table
    
    returns:
        - pop_indexes   (list of tuples): popularity indices of movies
    '''
    pop_indexes = []
    
    for titleID in titleID_list:
    
        pi_mean_mean, pi_mean_std = 0,0
        pi_med_mean, pi_med_std = 0,0
        pi_std_mean, pi_std_std = 0,0

        people_involved = set()

        # Adding top 5 actors associated with the movie to the set
        people_involved.update(title_principals[title_principals['tconst']==titleID]['nconst'].values)

        # Adding writers and directors
        crew = title_crew[title_crew['tconst']==titleID]

        if not pd.isna(crew['writers'].iloc[0]):
            people_involved.update(split_nameIDs(crew['writers'].iloc[0]))

        if not pd.isna(crew['directors'].iloc[0]):
            people_involved.update(split_nameIDs(crew['directors'].iloc[0]))

        # Choose the people involved in the names list
        people_involved_df = names_df[names_df['nconst'].isin(people_involved)]

        # Calculate the statistics of the people's popularity indexes
        pi_mean = people_involved_df['pop_ind_mean']
        pi_med = people_involved_df['pop_ind_median']
        pi_std = people_involved_df['pop_ind_std']

        pi_mean_mean, pi_mean_std = pi_mean.mean(), pi_mean.std()
        pi_med_mean, pi_med_std = pi_med.mean(), pi_med.std()
        pi_std_mean, pi_std_std = pi_std.mean(), pi_std.std()
        
        pop_indexes.append((pi_mean_mean, pi_mean_std,\
                           pi_med_mean, pi_med_std,\
                           pi_std_mean, pi_std_std))
    
    return pop_indexes

In [32]:
def calculate__movie_pop_ind_parallel(movies_df,
                                      title_principals=title_principals_df, 
                                      title_crew=title_crew_df, 
                                      names_df=names_df):
    '''
    Calculate the popularity index of a movie in parallel
    
    params:
        - movies_df          (DataFrame): dataset
        - title_principals   (DataFrame): movie actors table
        - title_crew         (DataFrame): title crew data
        - names_df           (DataFrame): names table
    
    returns:
        - result        (list of tuples): popularity indices of movies
    '''
    title_IDs = movies_df['tconst'].tolist()
    
    # Distribute load across CPU cores
    num_cores = os.cpu_count()
    titles_per_core = ceil(len(title_IDs)/num_cores)
    print("titles_per_core: {}".format(titles_per_core))
    
    titles_list_per_core = [title_IDs[i*titles_per_core: (i+1)*titles_per_core]
                          for i in range(num_cores)]
    
    # Execute in parallel
    output_list = Parallel(n_jobs=-1, verbose=30, max_nbytes=None)(delayed(get_movie_popind)(titleID_list,
                                                                                        title_principals_df.copy(),
                                                                                        title_crew_df.copy(),
                                                                                        names_df.copy()) 
                                                                  for titleID_list in titles_list_per_core)
    
    # Aggregate results
    result = []
    [result.extend(list_) for list_ in output_list if list_!=[]]
    
    return result

In [33]:
movie_pop_ind = calculate__movie_pop_ind_parallel(movies_df)

titles_per_core: 16558


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 54.8min
[Parallel(n_jobs=-1)]: Done   2 out of  16 | elapsed: 55.2min remaining: 386.3min
[Parallel(n_jobs=-1)]: Done   3 out of  16 | elapsed: 55.4min remaining: 239.9min
[Parallel(n_jobs=-1)]: Done   4 out of  16 | elapsed: 55.5min remaining: 166.6min
[Parallel(n_jobs=-1)]: Done   5 out of  16 | elapsed: 55.5min remaining: 122.2min
[Parallel(n_jobs=-1)]: Done   6 out of  16 | elapsed: 55.6min remaining: 92.6min
[Parallel(n_jobs=-1)]: Done   7 out of  16 | elapsed: 55.7min remaining: 71.6min
[Parallel(n_jobs=-1)]: Done   8 out of  16 | elapsed: 55.8min remaining: 55.8min
[Parallel(n_jobs=-1)]: Done   9 out of  16 | elapsed: 55.8min remaining: 43.4min
[Parallel(n_jobs=-1)]: Done  10 out of  16 | elapsed: 55.8min remaining: 33.5min
[Parallel(n_jobs=-1)]: Done  11 out of  16 | elapsed: 55.8min remaining: 25.4min
[Parallel(n_jobs=-1)]: Done  12 out of  16 | el

In [34]:
pi_mean_mean, pi_mean_std,\
pi_med_mean, pi_med_std,\
pi_std_mean, pi_std_std = zip(*movie_pop_ind)

In [35]:
variables = [pi_mean_mean, pi_mean_std,\
             pi_med_mean, pi_med_std,\
             pi_std_mean, pi_std_std]

names = ['pi_mean_mean', 'pi_mean_std',\
         'pi_med_mean', 'pi_med_std',\
         'pi_std_mean', 'pi_std_std']

pi_dict = list(zip(names, variables))

In [36]:
for (column_name, values) in pi_dict:
    
    movies_df[column_name] = values

movies_df.head()

Unnamed: 0.1,Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres,averageRating,numVotes,pi_mean_mean,pi_mean_std,pi_med_mean,pi_med_std,pi_std_mean,pi_std_std
0,1,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,70,"Action,Adventure,Biography",6.1,753,6.073333,0.303803,6.16,0.189737,0.893082,0.614934
1,2,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,90,Drama,4.6,17,5.243333,0.584713,5.33,0.643817,0.583854,0.273735
2,3,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,94,Drama,4.5,23,4.85463,0.619967,4.827778,0.627052,1.253192,0.50713
3,4,tt0000630,movie,Hamlet,Amleto,0,1908,94,Drama,3.8,24,5.138889,0.512099,5.45,0.4272,1.408882,0.346523
4,5,tt0000675,movie,Don Quijote,Don Quijote,0,1908,94,Drama,4.9,19,5.075,0.883883,5.075,0.883883,0.883883,0.35


In [37]:
na_values = dict(zip(names, [0]*len(names)))
print(na_values)
movies_df = movies_df.fillna(value=na_values)
movies_df.isnull().sum()

{'pi_mean_mean': 0, 'pi_mean_std': 0, 'pi_med_mean': 0, 'pi_med_std': 0, 'pi_std_mean': 0, 'pi_std_std': 0}


Unnamed: 0        0
tconst            0
titleType         0
primaryTitle      0
originalTitle     0
isAdult           0
startYear         0
runtimeMinutes    0
genres            0
averageRating     0
numVotes          0
pi_mean_mean      0
pi_mean_std       0
pi_med_mean       0
pi_med_std        0
pi_std_mean       0
pi_std_std        0
dtype: int64

In [38]:
movies_df.to_csv(os.path.join(FINAL_DIR, 'movies.csv'), index=False)

### Feature 2: One hot encoding of genres

**Description:**

   1. The _genres_ column has at most 3 genres associated with the movie.
   2. We transform this column into a sparse one-hot encoded vector _(since there are 28 unique genres)_ te represent each movie.
    
**Intuition:**

   1. Each vector will represent the genre combination of the movie and might turn out to be distinguishing if the model is able to associate the cast and the genres to predict the movie's average rating


In [41]:
# List all genres

genres = pd.read_csv(os.path.join(INTERIM_DIR, 'genres.csv'))['genres'].values
genres

array(['Game-Show', 'Sport', 'Animation', 'Comedy', 'History',
       'Documentary', 'Horror', 'Crime', 'War', 'Music', 'Drama',
       'Sci-Fi', 'Adult', 'Short', 'Mystery', 'Adventure', 'Biography',
       'Fantasy', 'Film-Noir', 'Family', 'Action', 'Thriller',
       'Reality-TV', 'Musical', 'News', 'Talk-Show', 'Romance', 'Western'],
      dtype=object)

In [42]:
binarizer = MultiLabelBinarizer()
binarizer.fit([list(genres)])
binarizer.classes_

array(['Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy',
       'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir',
       'Game-Show', 'History', 'Horror', 'Music', 'Musical', 'Mystery',
       'News', 'Reality-TV', 'Romance', 'Sci-Fi', 'Short', 'Sport',
       'Talk-Show', 'Thriller', 'War', 'Western'], dtype=object)

In [43]:
mllabels = binarizer.transform(movies_df['genres'].apply(lambda x: [x for x in x.split(',') if x!=[]]))

In [44]:
labels = pd.DataFrame(data=mllabels, columns=binarizer.classes_)

In [45]:
pd.concat([movies_df, labels], axis=1)

Unnamed: 0.1,Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres,averageRating,...,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
0,1,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,70,"Action,Adventure,Biography",6.1,...,0,0,0,0,0,0,0,0,0,0
1,2,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,90,Drama,4.6,...,0,0,0,0,0,0,0,0,0,0
2,3,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,94,Drama,4.5,...,0,0,0,0,0,0,0,0,0,0
3,4,tt0000630,movie,Hamlet,Amleto,0,1908,94,Drama,3.8,...,0,0,0,0,0,0,0,0,0,0
4,5,tt0000675,movie,Don Quijote,Don Quijote,0,1908,94,Drama,4.9,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
264914,275171,tt9916190,movie,Safeguard,Safeguard,0,2020,90,"Action,Adventure,Thriller",3.6,...,0,0,0,0,0,0,0,1,0,0
264915,275172,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020,84,Thriller,5.8,...,0,0,0,0,0,0,0,1,0,0
264916,275173,tt9916362,movie,Coven,Akelarre,0,2020,92,"Drama,History",6.4,...,0,0,0,0,0,0,0,0,0,0
264917,275174,tt9916428,movie,The Secret of China,Hong xing zhao yao Zhong guo,0,2019,94,"Adventure,History,War",3.8,...,0,0,0,0,0,0,0,0,1,0


In [46]:
movies_df = pd.concat([movies_df, labels], axis=1)

In [47]:
movies_df.to_csv(os.path.join(FINAL_DIR, 'movies.csv'), index=False)