# Generate a Cast/Crew Score of all IMDb Movies

This notebook generates a score for each movie in the IMDb dataset from its cast and crew (writers and directors). Despite being multithreaded, it takes a very long time to run, because the program takes an in-depth look at every actor, director, and writer involved in each IMDb-listed movie.

## Imports

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import threading
import math
import sys

## Reading, Cleaning, and Merging Data

In [None]:
# Load each IMDb dataset dump (gzipped, tab-separated) into its own DataFrame

# Every file is parsed with the same options
tsvOptions = {'compression': 'gzip', 'sep': '\t', 'low_memory': False}

name_basics = pd.read_csv("data/name.basics.tsv.gz", **tsvOptions)
title_basics_raw = pd.read_csv("data/title.basics.tsv.gz", **tsvOptions)
title_crew = pd.read_csv("data/title.crew.tsv.gz", **tsvOptions)
title_principals_raw = pd.read_csv("data/title.principals.tsv.gz", **tsvOptions)
title_ratings_raw = pd.read_csv("data/title.ratings.tsv.gz", **tsvOptions)

In [None]:
# Cleaning and merging data
#
# Produces three module-level DataFrames used by the scoring functions:
#   title_basics     - non-documentary movies with more than 100 votes, joined with their ratings
#   title_ratings    - a copy of title_basics (so it carries averageRating/numVotes per tconst)
#   title_principals - title_basics joined with the principal cast/crew rows of each title

# Looking back, this was not done properly. I should have kept things separate, but I wanted to ensure that
# each DataFrame only had the rows that met conditions applied to the others.

# Only keep titles with more than 100 votes
title_ratings = title_ratings_raw.loc[title_ratings_raw['numVotes'] > 100]

# Only keep movies
title_basics = title_basics_raw.loc[title_basics_raw['titleType'] == 'movie']

# Drop documentaries. regex=False matches the literal text, and na=False treats a missing
# genre as "not a documentary" so the negation below cannot fail on NaN values.
isDocumentary = title_basics['genres'].str.contains('Documentary', regex=False, na=False)
title_basics = title_basics.loc[~isDocumentary]

# The inner joins restrict every frame to the titles that survived the filters above
title_basics = pd.merge(title_basics, title_ratings, how='inner', on='tconst').reset_index()
title_ratings = title_basics.copy()
title_principals = pd.merge(title_basics, title_principals_raw, how='inner', on='tconst').reset_index()

In [None]:
# Running tables of the scores computed so far.
#   scores      - one row per (person, focus, role)
#   titleScores - one row per (title, focus)
scores = pd.DataFrame(columns = ['nconst', 'score', 'focus', 'role'])
titleScores = pd.DataFrame(columns = ['tconst', 'score', 'focus'])

# If some calculations have been made already, load what has been saved instead of starting from scratch.
# Only a missing, unreadable, empty, or malformed checkpoint falls back to the empty table; anything
# else (including a keyboard interrupt) is allowed to surface.
try:
    scores = pd.read_csv('personScores.csv')
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
    pass

try:
    titleScores = pd.read_csv('titleScores.csv')
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
    pass

## Functions for Generating Scores

In [None]:
# Retrieve name_basics row for a given actor

def getPerson(name = None, nconst = None):
    """Return the name_basics rows for a person.

    Args:
        name: Value to match against the 'primaryName' column. Only used when
            nconst is not given. Several people may share a name, so more than
            one row can come back.
        nconst: Value to match against the 'nconst' column. Takes precedence
            over name.

    Returns:
        The matching slice of name_basics (empty when nothing matches).
    """
    if nconst is not None:
        return name_basics.loc[name_basics['nconst'] == nconst]
    return name_basics.loc[name_basics['primaryName'] == name]

In [None]:
# Retrieve works of a given actor

def getWorks(name = None, nconst = None, role = 'a'):
    """Return the titles a person worked on in a given role.

    Scores are kept separate for people's work as actors, directors, and
    writers. This accounts for people who may be excellent at one job, but
    not so spectacular at another.

    Args:
        name: Primary name of the person; only used when nconst is not given,
            in which case the first match in name_basics is taken.
        nconst: Unique id of the person.
        role: 'a' for actor/actress, 'd' for director, 'w' for writer.

    Returns:
        For role 'a', the rows of title_principals where the person is listed
        as actor or actress. For 'd' / 'w', the rows of title_crew whose
        'directors' / 'writers' field lists the person.

    Raises:
        IndexError: if nconst is not given and name matches nobody.
        ValueError: if role is not one of 'a', 'd', 'w'.
    """
    # Local import: `re` is not among the notebook's top-level imports
    import re

    if nconst is None:
        nconst = list(getPerson(name=name)['nconst'])[0]

    # Actor/Actress
    if role == 'a':
        byPerson = title_principals.loc[title_principals['nconst'] == nconst]
        # Need to account for both actor and actress titles
        return byPerson.loc[byPerson['category'].isin(['actor', 'actress'])]

    # Director / Writer
    crewColumns = {'d': 'directors', 'w': 'writers'}
    if role not in crewColumns:
        raise ValueError("role must be 'a', 'd' or 'w', got " + repr(role))

    # Match the id as a whole entry of the list. A plain substring test would also
    # match longer ids that merely start with this one (nm0000001 vs nm00000011).
    # NOTE(review): assumes the crew fields are comma-separated id lists - confirm
    # against the title.crew data.
    wholeId = r'(?:^|,)\s*' + re.escape(nconst) + r'\s*(?:,|$)'
    credited = title_crew[crewColumns[role]].str.contains(wholeId, regex=True, na=False)
    return title_crew.loc[credited]

In [None]:
# Calculate a person's score based on film ratings and number of votes

# Serializes updates of the shared `scores` table; worker threads extend it concurrently
# and an unguarded read-modify-write can silently drop rows.
_scoresLock = threading.Lock()


def generateScore(name = None, nconst = None, role = 'a', focus='rating'):
    """Calculate a person's score from the ratings and vote counts of their films.

    The result is recorded in the global `scores` table and reused on later
    calls with the same (nconst, focus, role).

    Args:
        name: Primary name of the person; only used when nconst is not given,
            in which case the first match in name_basics is taken.
        nconst: Unique id of the person.
        role: 'a' (actor/actress), 'd' (director) or 'w' (writer); passed on
            to getWorks.
        focus: 'rating' weights each film by rating^2 * votes, 'popularity'
            by rating * votes^2. Any other value yields no film scores.

    Returns:
        The mean film score truncated to an int, or None when the person has
        no usable film (no rated title, missing data, or unknown focus).

    Raises:
        IndexError: if nconst is not given and name matches nobody.
    """
    global scores

    # Find the unique id of the person if it was not provided
    if nconst is None:
        nconst = list(name_basics.loc[name_basics['primaryName'] == name]['nconst'])[0]

    # If this exact (person, focus, role) score has already been generated, return it to save time.
    # All rows of the person are checked, not just the first one.
    cached = scores.loc[
        (scores['nconst'] == nconst) & (scores['focus'] == focus) & (scores['role'] == role),
        'score',
    ]
    if len(cached) > 0:
        value = cached.iloc[0]
        # A score saved as None comes back from CSV as NaN; report both as None
        return None if pd.isna(value) else int(value)

    works = getWorks(name=None, nconst=nconst, role=role)

    # Look up rating and votes of every work. The inner join drops works that have no
    # entry in title_ratings (indexing [0] on such an empty lookup raises IndexError),
    # while keeping one row per work so repeated credits still count repeatedly.
    wanted = works['tconst']
    ratings = title_ratings.loc[
        title_ratings['tconst'].isin(wanted),
        ['tconst', 'averageRating', 'numVotes'],
    ].drop_duplicates('tconst')
    rated = wanted.to_frame().merge(ratings, on='tconst', how='inner')

    # NaN never compares equal to anything, so missing values need an explicit check
    rated = rated.dropna(subset=['averageRating', 'numVotes'])
    rated = rated.loc[rated['numVotes'] > 100]

    rating = rated['averageRating'].astype(float).values
    votes = rated['numVotes'].astype(float).values

    # If the focus is on predicting rating, put more emphasis on film rating for score
    # If the focus is on predicting popularity, put more emphasis on votes for score
    if focus == 'rating':
        filmScores = rating * rating * votes / 10000
    elif focus == 'popularity':
        filmScores = rating * votes * votes / 1000000
    else:
        filmScores = rating[:0]

    # Overall person's score is the mean of the film scores; None when there is nothing to average
    score = int(np.mean(filmScores)) if len(filmScores) > 0 else None

    record = pd.DataFrame([{'nconst': nconst, 'score': score, 'focus': focus, 'role': role}])
    with _scoresLock:
        scores = pd.concat([scores, record], ignore_index=True)

    return score

In [None]:
# Calculate the score of a title from its cast and crew

# Serializes updates of the shared `titleScores` table; worker threads extend it concurrently
# and an unguarded read-modify-write can silently drop rows.
_titleScoresLock = threading.Lock()


def generateTitleScore(title=None, tconst=None, focus='rating'):
    """Calculate the score of a title from the scores of its cast and crew.

    The result is recorded in the global `titleScores` table and reused on
    later calls with the same (tconst, focus).

    Args:
        title: Primary title, matched case-insensitively; only used when
            tconst is not given, in which case the first match is taken.
        tconst: Unique id of the title.
        focus: 'rating' or 'popularity'; forwarded to generateScore.

    Returns:
        The mean of the principals' scores truncated to an int, with actor and
        actress scores counted double, or None when no principal has a score.

    Raises:
        IndexError: if tconst is not given and title matches nothing.
    """
    global titleScores

    # Find the unique id of the title if it has not been provided
    if tconst is None:
        tconst = list(title_basics.loc[title_basics['primaryTitle'].str.lower() == title.lower()]['tconst'])[0]

    # If this exact (title, focus) score has already been generated, return it to save time.
    # All rows of the title are checked, not just the first one.
    cached = titleScores.loc[
        (titleScores['tconst'] == tconst) & (titleScores['focus'] == focus),
        'score',
    ]
    if len(cached) > 0:
        value = cached.iloc[0]
        # A score saved as None comes back from CSV as NaN; report both as None
        return None if pd.isna(value) else int(value)

    # Retrieve the main people involved in the movie. Actors, directors, writers, and other crew.
    principals = title_principals.loc[title_principals['tconst'] == tconst]

    # category -> (role code for generateScore, weight).
    # Put a higher focus on actors, as the general population seems more
    # interested in actors than writers and directors. Other categories are ignored.
    roleWeights = {
        'actor': ('a', 2),
        'actress': ('a', 2),
        'director': ('d', 1),
        'writer': ('w', 1),
    }

    # Generate scores of principals
    principalScores = []
    for category, nconst in zip(principals['category'], principals['nconst']):
        if category not in roleWeights:
            continue
        role, weight = roleWeights[category]
        personScore = generateScore(nconst = nconst, role = role, focus = focus)
        # People without a usable score are left out instead of failing the whole title
        if personScore is None or pd.isna(personScore):
            continue
        principalScores.append(personScore * weight)

    score = int(np.mean(principalScores)) if principalScores else None

    record = pd.DataFrame([{'tconst': tconst, 'score': score, 'focus': focus}])
    with _titleScoresLock:
        titleScores = pd.concat([titleScores, record], ignore_index=True)

    return score

## Splitting up Data and Threading

In [None]:
# Calculate all title scores

def allScores(nThreads, focus='rating'):
    """Score every title in title_basics, split across worker threads.

    The rows of title_basics are cut into consecutive bins of
    ceil(rows / nThreads) rows and each bin is handed to its own thread
    running scoreRange. Fewer than nThreads threads are started when the rows
    run out first, so no bin is ever processed twice.

    Args:
        nThreads: Maximum number of worker threads.
        focus: 'rating' or 'popularity'; forwarded to scoreRange.

    Returns:
        The list of started threads. They are not joined here; call join() on
        them to wait for the run to finish.
    """
    total = title_basics.shape[0]
    if total == 0:
        return []

    binSize = math.ceil(total / nThreads)

    # Delegate a section of the data to each of the threads
    workers = []
    for i, start in enumerate(range(0, total, binSize)):
        # stop is exclusive (scoreRange slices with iloc[start:stop]), so it is exactly
        # the next bin's start; subtracting one would skip a row at every boundary.
        stop = min(start + binSize, total)
        worker = threading.Thread(target = scoreRange, args = (start, stop, i, focus))
        worker.start()
        workers.append(worker)

    return workers

In [None]:
# Calculate title scores within a supplied range of indexes. This is the function run by each thread.

# Only one thread at a time may write the checkpoint files; every worker saves to the same paths.
_saveLock = threading.Lock()


def _saveProgress(titlePath, personPath):
    """Write the current titleScores and scores tables to the given CSV paths."""
    with _saveLock:
        titleScores.to_csv(titlePath, index=False)
        scores.to_csv(personPath, index=False)


def scoreRange(start, stop, thread, focus='rating'):
    """Calculate the title scores for rows start (inclusive) to stop (exclusive) of title_basics.

    This is the function run by each worker thread. Progress is checkpointed
    while it runs: after every 10th successfully scored title the tables are
    written to disk, and every 3rd of those saves goes to the backup files
    instead of the main ones. This prevents loss of data from unexpected
    shutdowns.

    Args:
        start: Position of the first row to score.
        stop: Position one past the last row to score.
        thread: Identifier of the calling worker, used in progress output only.
        focus: 'rating' or 'popularity'; forwarded to generateTitleScore.
    """
    count = 0
    # Loop through the bin of data and calculate scores.
    for tconst in title_basics['tconst'].iloc[start:stop]:
        try:
            generateTitleScore(tconst = tconst, focus = focus)

            # Save every 10th score generated
            if count % 10 == 0:
                # Every 3rd save should be a backup.
                if count % 30 == 0:
                    _saveProgress('titleScoresBackup.csv', 'personScoresBackup.csv')
                else:
                    _saveProgress('titleScores.csv', 'personScores.csv')

                # Output the current progress as percentage of total rows calculated
                print(titleScores.shape[0]/title_basics.shape[0]*100, "from thread", thread)

            count += 1

        except KeyboardInterrupt:
            # KeyboardInterrupt is not an Exception subclass, so it needs its own handler.
            # Python only delivers it to the main thread, so this runs when scoreRange is
            # called directly rather than from a worker. Save before stopping.
            _saveProgress('titleScores.csv', 'personScores.csv')
            sys.exit()

        except Exception as e:
            # Best effort: one failing title must not end the whole bin, but the failure
            # is reported instead of being dropped silently.
            print("thread", thread, "failed on", tconst, "-", repr(e), file=sys.stderr)

    _saveProgress('titleScores.csv', 'personScores.csv')

In [None]:
# Start the scoring run with 16 threads. allScores only starts the threads and does not
# join them, so this call returns while scoring is still in progress.
allScores(16)

In [None]:
# A final save just to be sure
# NOTE(review): nothing here waits for the worker threads, so run this only once they have finished.
# to_csv returns None when it is given a file path, so tScores and pScores hold None, not the data.

tScores = titleScores.to_csv('titleScores.csv', index=False)
pScores = scores.to_csv('personScores.csv', index=False)