# Feature Generation and Cleaning

# SET FLAGS SO YOU DON'T OVERWRITE EXISTING FILES

## IF YOU'RE DOING THIS FOR SELECT FILMS, INPUT THE DATE RANGE FOR WHICH YOU WANT RELEASE DATE BASED FEATURES TO BE COMPUTED OVER, FOR EACH FILM

In [1]:
# When SELECT_FILMS is True, only titles listed in SELECT_TITLES are kept when
# the per-title frame is built, and each kept title is later expanded into one
# row per candidate release date.
SELECT_FILMS = True
SELECT_TITLES = ['Rue']

In [2]:
# STRUCTURE: {TITLE: [EARLIEST DATE, LATEST DATE]} WITH DATES IN 'YYYY-MM-DD' FORMAT
# Looked up by title in releaseDater(), which generates weekly dates between
# the two bounds.
SELECT_DATES = {'Rue': ['2019-01-01', '2019-12-31']}

In [3]:
%matplotlib inline
import pandas as pd
import numpy as np
import collections
import json
import networkx as nx
from networkx import *
from joblib import Parallel, delayed
import time

# function to flatten the titles dict
def flatten(d, parent_key='', sep='_'):
    """Flatten a nested mapping into a single-level dict.

    Keys of nested mappings are joined to their parent key with ``sep``
    (e.g. ``{'a': {'b': 1}}`` becomes ``{'a_b': 1}``). Values that are not
    mappings are copied through unchanged.

    Parameters:
        d: mapping to flatten; keys must be strings so they can be joined.
        parent_key: prefix applied to every key of ``d`` (used by recursion).
        sep: separator placed between a parent key and a child key.

    Returns:
        A new dict containing only non-mapping values.
    """
    # MutableMapping lives in collections.abc on Python 3 (the alias under
    # ``collections`` was removed in 3.10) and in ``collections`` on Python 2.
    try:
        from collections.abc import MutableMapping
    except ImportError:
        from collections import MutableMapping

    items = []
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, MutableMapping):
            items.extend(flatten(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)

# Pull in titles and people json files.

In [4]:
import os.path

def _read_json_records(path):
    """Read a file of consecutive JSON documents into a list.

    A document may span several lines: lines are accumulated until the
    buffered text parses as JSON. Blank lines between documents are skipped.

    Raises:
        ValueError: if the file ends in the middle of a document.
    """
    records = []
    buffered = ''
    with open(path) as handle:
        for line in handle:
            buffered += line
            if not buffered.strip():
                # Only whitespace so far: nothing to parse yet.
                buffered = ''
                continue
            try:
                records.append(json.loads(buffered))
            except ValueError:
                # Not yet a complete JSON value; keep accumulating lines.
                continue
            buffered = ''
    if buffered.strip():
        raise ValueError('{0} ends with an incomplete JSON value'.format(path))
    return records


# Both files are expected in the parent of the current working directory.
_DATA_DIR = os.path.dirname(os.getcwd())
titles = _read_json_records(_DATA_DIR + '/titles.json')
people = _read_json_records(_DATA_DIR + '/people.json')

In [5]:
# Every line of franchises.json is parsed as one standalone JSON document.
franchises = []
franchises_path = os.path.dirname(os.getcwd()) + '/franchises.json'
with open(franchises_path) as f:
    franchises.extend(json.loads(line) for line in f)

In [6]:
# Index the title records by their 'title' string; when two records share a
# title, the later one replaces the earlier one.
titles_dict = {}
for title in titles:
    titles_dict[title['title']] = title

In [7]:
# Index the person records by their 'name' string; when two records share a
# name, the later one replaces the earlier one.
people_dict = {}
for person in people:
    people_dict[person['name']] = person

In [8]:
# create a franchise dataframe: one row per (franchise, film) pair
franchise_dicts = [
    {'franchise': franchise['franchise'], 'film': film}
    for franchise in franchises
    for film in franchise['films']
]
franchise_df = pd.DataFrame(franchise_dicts)

# Attach grosses and release date from titles_dict. Films that are not in
# titles_dict keep missing values in these columns.
for i, entry in enumerate(franchise_dicts):
    record = titles_dict.get(entry['film'])
    if record is None:
        continue
    try:
        week_1_gross = int(record['week_1_gross'])
    except (KeyError, TypeError, ValueError, OverflowError):
        # Missing, null or non-numeric week-1 gross is treated as unknown.
        week_1_gross = np.nan
    franchise_df.at[i, 'week_1_gross'] = week_1_gross
    franchise_df.at[i, 'first_wknd_gross'] = record['first_wknd_gross']
    franchise_df.at[i, 'domestic_gross'] = record['domestic_gross']
    release = record['theatrical_release']
    if release is not None:
        # A title without a theatrical release keeps a missing release date
        # instead of failing on the '$date' lookup.
        franchise_df.at[i, 'release_date'] = pd.to_datetime(release['$date'])

In [9]:
# Create dicts to hold actor and director scores, grosses, and tenures of titles
# These are calculated by taking their mean scores across the films they were in / directed.

actors = {}
directors = {}

for person in people:
    try:
        professions = person['profession']
        # Production and distribution companies are not scored here.
        if 'production_co' in professions or 'distribution_co' in professions:
            continue

        name = person['name']
        gross_temp = 0
        dates = []
        scores = []        # accumulated across ALL of the person's films
        titles_temp = []
        for film in person['films']:
            record = titles_dict.get(film)
            if record is None:
                # Credit for a title that is not in titles_dict: skip the
                # film rather than discarding the whole person.
                continue
            titles_temp.append(film)
            if pd.notnull(record['domestic_gross']):
                gross_temp += record['domestic_gross']
            if record['theatrical_release'] is not None:
                dates.append(pd.to_datetime(record['theatrical_release']['$date']))
            film_scores = record['scores']
            if 'imdb_score' in film_scores and pd.notnull(film_scores['imdb_score']):
                # imdb_score is multiplied by 10, presumably to put it on
                # the same scale as the other scores.
                scores.append(float(film_scores['imdb_score']) * 10)
            for col in ['metascore', 'rt_critics', 'rt_users']:
                if col in film_scores and pd.notnull(film_scores[col]):
                    scores.append(float(film_scores[col]))

        # Tenure is the span in days between the first and last dated film.
        tenure = int((max(dates) - min(dates)).days) if dates else 0

        summary = {
            'score': np.mean(scores) if scores else np.nan,
            'gross': gross_temp,
            'tenure': tenure,
            'films': titles_temp,
            'dates': pd.to_datetime(dates),
        }
        if 'director' in professions:
            directors[name] = dict(summary)
        if 'actor' in professions:
            actors[name] = dict(summary)
    except (KeyError, TypeError, ValueError):
        # Malformed record: report the person and carry on with the rest.
        print(person["name"])



In [10]:
# All person names present in the people records, in file order.
names = []
for person in people:
    names.append(person['name'])

In [11]:
# Per-title release date, gross and combined score, so that director/actor
# aggregates can be restricted to films released before a given film.

films = {}
for title in titles:
    name = title['title']
    title_scores = title['scores']

    gross = None
    if 'domestic_gross' in title and pd.notnull(title['domestic_gross']):
        gross = title['domestic_gross']

    released = None
    if 'theatrical_release' in title and pd.notnull(title['theatrical_release']):
        theatrical = title['theatrical_release']
        if '$date' in theatrical and pd.notnull(theatrical['$date']):
            released = pd.to_datetime(theatrical['$date'])

    # imdb_score is multiplied by 10 before being averaged with the others.
    collected = []
    if 'imdb_score' in title_scores and pd.notnull(title_scores['imdb_score']):
        collected.append(float(title_scores['imdb_score'])*10)
    for col in ['metascore', 'rt_critics', 'rt_users']:
        if col in title_scores and pd.notnull(title_scores[col]):
            collected.append(float(title_scores[col]))

    films[name] = {
        'score': np.nanmean(collected),
        'gross': gross,
        'release_date': released,
    }



In [12]:
def _career_features(cast, people_stats, release_date):
    """Collect prior-film score, gross and tenure for each known person.

    ``people_stats`` is the ``actors`` or ``directors`` dict. People missing
    from it are skipped. Only films released strictly before
    ``release_date`` contribute to score and gross; with an unknown release
    date (None) no film qualifies and the tenure is recorded as 0.

    Returns three parallel lists: (scores, grosses, tenures).
    """
    scores, grosses, tenures = [], [], []
    for member in cast:
        # Test membership on the dict that is indexed below, so a person
        # without computed stats cannot raise KeyError.
        if member not in people_stats:
            continue
        stats = people_stats[member]
        if release_date is None:
            prior = []
        else:
            prior = [films[film_title] for film_title in stats['films']
                     if films[film_title]['release_date'] is not None
                     and films[film_title]['release_date'] < release_date]
        scores.append(np.nanmean([film['score'] for film in prior]))
        # Falsy grosses (None or 0) are dropped, as filter(None, ...) did.
        grosses.append(np.nanmean([film['gross'] for film in prior if film['gross']]))
        if release_date is not None and len(stats['dates']) > 0:
            tenures.append((release_date - stats['dates'].min()).days)
        else:
            tenures.append(0)
    return scores, grosses, tenures


def _tenure_summary(tenures):
    """Return (mean, std) of the non-zero tenures, or (0, 0) for an empty list."""
    if not tenures:
        return 0, 0
    tenures = np.array(tenures)
    nonzero = tenures[tenures.nonzero()]
    return np.mean(nonzero), np.std(nonzero)


dfs = []
for title in titles:
    if SELECT_FILMS and title['title'] not in SELECT_TITLES:
        continue
    flat = flatten(title)

    # Reset per title so a film without a release never reuses the date of
    # the previously processed film.
    release_date = None
    if title['theatrical_release']:
        release_date = pd.to_datetime(title['theatrical_release']['$date'])

    # include actor and director scores by referencing them from their dicts
    actor_score_temp, actor_gross_temp, actor_tenures_temp = _career_features(
        flat['actor'], actors, release_date)
    direc_score_temp, direc_gross_temp, direc_tenures_temp = _career_features(
        flat['director'], directors, release_date)

    actor_mean_tenure, actor_stddev_tenure = _tenure_summary(actor_tenures_temp)
    # Both director values are always defined, including when no director
    # has computed stats.
    direc_mean_tenure, direc_stddev_tenure = _tenure_summary(direc_tenures_temp)

    flat['actor_score'] = np.mean(actor_score_temp)
    flat['director_score'] = np.mean(direc_score_temp)
    flat['mean_actor_gross'] = np.mean(actor_gross_temp)
    flat['mean_director_gross'] = np.mean(direc_gross_temp)
    flat['actor_mean_tenure'] = actor_mean_tenure
    flat['actor_stddev_tenure'] = actor_stddev_tenure
    flat['direc_mean_tenure'] = direc_mean_tenure
    flat['direc_stddev_tenure'] = direc_stddev_tenure
    flat = pd.DataFrame(pd.Series(flat)).transpose()
    dfs.append(flat)

df = pd.concat(dfs)
df = df.reset_index(drop = True)



In [13]:
def releaseDater(row, SELECT_DATES):
    """Expand one film row into one row per weekly candidate release date.

    The [earliest, latest] bounds are looked up in ``SELECT_DATES`` by the
    row's 'title'. The row is repeated once per weekly date in that range
    and each copy gets its date in 'theatrical_release_$date'.
    """
    bounds = SELECT_DATES[row['title']]
    weekly_dates = pd.date_range(start = bounds[0], end = bounds[1], freq = 'W')
    single = pd.DataFrame(row).T
    expanded = pd.concat([single for _ in weekly_dates])
    expanded['theatrical_release_$date'] = weekly_dates
    return expanded

if SELECT_FILMS == True:
    # Replace every film row by its per-date expansion, then renumber rows.
    expanded_frames = []
    for _, film_row in df.iterrows():
        expanded_frames.append(releaseDater(film_row, SELECT_DATES))
    df = pd.concat(expanded_frames).reset_index(drop = True)

## create release date features


In [14]:
df['theatrical_release_$date']

0    2019-01-06
1    2019-01-13
2    2019-01-20
3    2019-01-27
4    2019-02-03
5    2019-02-10
6    2019-02-17
7    2019-02-24
8    2019-03-03
9    2019-03-10
10   2019-03-17
11   2019-03-24
12   2019-03-31
13   2019-04-07
14   2019-04-14
15   2019-04-21
16   2019-04-28
17   2019-05-05
18   2019-05-12
19   2019-05-19
20   2019-05-26
21   2019-06-02
22   2019-06-09
23   2019-06-16
24   2019-06-23
25   2019-06-30
26   2019-07-07
27   2019-07-14
28   2019-07-21
29   2019-07-28
30   2019-08-04
31   2019-08-11
32   2019-08-18
33   2019-08-25
34   2019-09-01
35   2019-09-08
36   2019-09-15
37   2019-09-22
38   2019-09-29
39   2019-10-06
40   2019-10-13
41   2019-10-20
42   2019-10-27
43   2019-11-03
44   2019-11-10
45   2019-11-17
46   2019-11-24
47   2019-12-01
48   2019-12-08
49   2019-12-15
50   2019-12-22
51   2019-12-29
Name: theatrical_release_$date, dtype: datetime64[ns]

In [15]:
# Copy the flattened release-date column into 'release_date' as datetimes.
df['release_date'] = pd.to_datetime(df['theatrical_release_$date'])

In [16]:
# takes in a film row, returns franchise-history features for that film
def filmFranchiser(row, franchise_df):
    """Compute features from earlier films of the same franchise.

    The franchise is the first one listing the row's 'title' in
    ``franchise_df``. Only franchise films with a 'release_date' strictly
    before the row's 'release_date' are used. Every feature stays 0 when the
    film is in no franchise or has no earlier franchise film.
    """
    features = pd.Series({'days_since_last_entry': 0,
                          'gross_prev_entry': 0,
                          'mean_first_wknd_gross': 0,
                          'mean_week_1_gross': 0,
                          'mean_franchise_domestic_gross': 0,
                          'num_preceding_films': 0})
    release = row['release_date']

    memberships = franchise_df[franchise_df['film'] == row['title']]
    if memberships.empty:
        return features

    franchise_name = memberships['franchise'].values[0]
    siblings = franchise_df[franchise_df['franchise'] == franchise_name]
    earlier = siblings[siblings['release_date'] < release]
    if earlier.empty:
        return features

    # Row label of the most recently released earlier film.
    latest_label = earlier['release_date'].idxmax()
    features['days_since_last_entry'] = (release - np.max(earlier['release_date'])).days
    features['gross_prev_entry'] = np.max(earlier.loc[latest_label]['domestic_gross'])
    features['mean_first_wknd_gross'] = np.mean(earlier['first_wknd_gross'])
    features['mean_week_1_gross'] = np.mean(earlier['week_1_gross'])
    features['mean_franchise_domestic_gross'] = np.mean(earlier['domestic_gross'])
    features['num_preceding_films'] = earlier.shape[0]
    return features
        
# One row of franchise-history features per row of df.
franchise_feats = df.apply(lambda film_row: filmFranchiser(film_row, franchise_df), axis = 1)

In [17]:
# Attach the franchise features to df by row index.
df = pd.merge(df, franchise_feats, left_index = True, right_index = True)

# RUN THE FOLLOWING LINE OF CODE OR ALL FEATURES THAT RELY ON RELEASE DATE WILL BREAK

In [18]:
release_dates = pd.to_datetime(df['release_date'])
df['release_date'] = release_dates
df['weekday'] = release_dates.dt.dayofweek
# Series.dt.weekofyear was removed in pandas 2.0. Reading the ISO week from
# each timestamp works on old and new pandas; missing dates stay missing.
df['week'] = release_dates.apply(lambda d: d.isocalendar()[1] if pd.notnull(d) else np.nan)
df['month'] = release_dates.dt.month
df['year'] = release_dates.dt.year

### cyclical pattern of films

In [19]:
# df_cycle = df[~pd.isnull(df['domestic_gross'])]
# df_cycle = df_cycle[~pd.isnull(df_cycle['year'])]
# df_cycle['domestic_gross'] = df_cycle['domestic_gross'].astype(int)
# df_cycle = df_cycle[df_cycle['year'] > 1989]

In [20]:
# import numpy as np
# from scipy.optimize import leastsq
# import matplotlib.pyplot as plt

# data = df_cycle.groupby(['year', 'month'])['domestic_gross'].mean()
# N = len(data) # number of data points
# t = np.linspace(0, 4* np.pi , N)

# monthly_mean = np.mean(data)
# monthly_std = 3*np.std(data)/(2**0.5)
# monthly_phase = 0

# # we'll use this to plot our first estimate. This might already be good enough for you
# # data_first_guess = guess_std*np.sin(t+guess_phase) + guess_mean

# # Define the function to optimize, in this case, we want to minimize the difference
# # between the actual data and our "guessed" parameters
# optimize_func = lambda x: x[0]*np.sin(t+x[1]) + x[2] - data
# est_std, est_phase, est_mean = leastsq(optimize_func, [monthly_std, monthly_phase, monthly_mean])[0]

# # recreate the fitted curve using the optimized parameters
# data_fit = est_std*np.sin(t+est_phase) + est_mean

In [21]:
# mean_gross_monthly = data.reset_index().drop('domestic_gross', axis = 1)
# mean_gross_monthly['cycle_pred'] = data_fit
# df = df.merge(mean_gross_monthly, left_on = ['year', 'month'], right_on = ['year', 'month'])

In [22]:
# d2 = pd.to_datetime('2016-01-01').to_datetime()
# d1 = pd.to_datetime('1990-01-01').to_datetime()

# (d1.year - d2.year)*12 + d1.month - d2.month

### holiday

In [23]:
# holiday distance variables
# the big ones: washington's birthday, easter, memorial day, july 4,
# labor day, thanksgiving, christmas, new years 
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
cal = calendar()

def holCollect(date, cal):
    """Return {rule name: days to the nearest occurrence} for each rule in ``cal``.

    Occurrences of each holiday rule are searched within seven months on
    either side of ``date``. A rule with no occurrence in that window maps
    to NaN.
    """
    window_start = date - pd.DateOffset(months = 7)
    window_end = date + pd.DateOffset(months = 7)
    distsDict = {}
    for rule in cal.rules:
        holidays = rule.dates(start_date = window_start, end_date = window_end)
        # get the distance in days between each holiday and the date in question
        distsDict[rule.name] = holDist(date, holidays)
    return distsDict

def holDist(date, holidays):
    """Return the smallest absolute distance in days from ``date`` to ``holidays``.

    Returns NaN when ``holidays`` is empty instead of letting np.min fail on
    an empty sequence.
    """
    dists = [abs(date - holiday).days for holiday in holidays]
    if not dists:
        return np.nan
    return np.min(dists)
    
# Holiday distances per release date; rows without a release date get NaN.
hol_distances = df['release_date'].apply(lambda x: np.nan if pd.isnull(x) else holCollect(x, cal))

In [24]:
# Drop the missing entries and expand each distance dict into columns.
hol_distances = hol_distances.dropna().apply(pd.Series)

In [25]:
# hol_distances = hol_distances.apply(lambda array: array if type(array) == list and len(array) == 10 else [np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan])
# hol_dists_to_save = pd.DataFrame(hol_dists_final, columns=['new_years', 'mlk_day', 'presidents_day', 'memorial_day',
#                                      'jul_4th', 'labor_day', 'columbus_day', 'veterans_day',
#                                      'thanksgiving_day', 'christmas_day'])
# df = pd.concat([df, hol_dists_to_save], axis = 1)

In [26]:
# Attach the holiday distances by row index. The default inner join keeps
# only rows that have holiday distances.
df = pd.merge(df, hol_distances, left_index = True, right_index = True)

### nunique franchises across actors

In [27]:
# nunique franchises across actors
def nUniqueFranchises(row):
    """Count distinct earlier franchise values and titles shared with the cast.

    Considers rows of the module-level ``df`` whose 'franchise' value is not
    False and whose 'release_date' is strictly before the row's. A row
    counts when at least one of the row's actors appears in its 'actor' list.

    Returns a Series with 'unique_prior_franchises' and
    'unique_prior_franchise_films'.
    """
    cast = row['actor']
    release_date = row['release_date']
    result = pd.Series({'unique_prior_franchises': 0,
                        'unique_prior_franchise_films': 0})

    # These two filters do not depend on the actor, so apply them once
    # instead of once per cast member.
    candidates = df[df['franchise'] != False]
    candidates = candidates[candidates['release_date'] < release_date]

    franchises_seen = set()
    films_seen = set()
    if not candidates.empty:
        for actor in cast:
            appears = candidates['actor'].apply(lambda x: actor in x)
            shared = candidates[appears]
            franchises_seen.update(shared['franchise'].tolist())
            films_seen.update(shared['title'].tolist())

    result['unique_prior_franchises'] = len(franchises_seen)
    result['unique_prior_franchise_films'] = len(films_seen)
    return result

n_unique_franchises = df.apply(nUniqueFranchises, axis = 1)

In [28]:
# Attach the cast franchise counts by row index.
df = pd.merge(df, n_unique_franchises, left_index = True, right_index = True)

## Add personal disposable spending

In [29]:
# def nearestDate(dates, pivot, personal_spending):
#     try: 
#         date = min(dates, key=lambda x: abs(x - pivot))
#         return personal_spending[date]
#     except:
#         return np.nan

# personal_spending = pd.read_csv('personal_spending.csv')
# personal_spending = dict(zip(pd.to_datetime((personal_spending['DATE'])), personal_spending['DSPIC96']))
# spend_dates = pd.to_datetime(personal_spending.keys())
# df['personal_spending'] = df['release_date'].apply(lambda x : nearestDate(spend_dates, x, personal_spending))

## Get number of films released in same week and month

In [30]:
# Count other films released within +/- 4 days ("week") and +/- 15 days
# ("month") of each film; the film itself is excluded by the "- 1".
for window_col, half_width in [('competition_week', 4), ('competition_month', 15)]:
    counts = df['release_date'].apply(
        lambda x, n=half_width: len(df[(df['release_date'] > x - pd.DateOffset(n)) &
                                       (df['release_date'] < x + pd.DateOffset(n))]) - 1)
    # A count of -1 means not even the film itself fell inside its window
    # (e.g. a missing release date); it is replaced by the mean valid count.
    # The result is assigned back to df because an in-place replace on a
    # column selection is not guaranteed to modify the frame.
    df[window_col] = counts.replace(-1, np.nanmean(counts.replace(-1, np.nan)))

# Fix types, missing data

In [31]:
# 'franchise' becomes a boolean flag; the aggregate columns become floats.
df['franchise'] = df['franchise'].astype(bool)
for float_col in ['mean_director_gross', 'mean_actor_gross', 'director_score', 'actor_score']:
    df[float_col] = df[float_col].astype(float)

In [32]:
# Runtimes: empty strings are read as 0, values are cast to int, and every
# 0 is then marked as missing.
runtime_as_int = df['runtime'].replace('', 0).astype(int)
df['runtime'] = runtime_as_int.replace(0, np.nan)

# Missing mean tenures are treated as 0.
for tenure_col in ['actor_mean_tenure', 'direc_mean_tenure']:
    df[tenure_col] = df[tenure_col].fillna(0)

## genres

In [33]:
# # strips whitespace in strings within list of variable sublist size recursively
# def genre_retrieval(multilist):
#     newlist = []
#     for x in multilist:
#         type_x = type(x)
#         if type_x == list:
#             newlist.append([item for item in genre_retrieval(x)])
#         elif type_x == str or type_x == unicode:
#             x = x.strip()
#             newlist.append(x)
#     return newlist

In [34]:
# get all unique genres
genres = pd.Series([val.strip() for sublist in df.genres.dropna().tolist() for val in sublist]).unique()

# Normalise the genre lists: rows without genres get the single placeholder
# genre 'N/A'; every other entry has surrounding whitespace stripped.
final_genres_list = []
for sublist in df['genres'].fillna('N/A').tolist():
    if sublist == 'N/A':
        final_genres_list.append(['N/A'])
    else:
        final_genres_list.append([string.strip() for string in sublist])

df['genres'] = final_genres_list

# One 0/1 indicator column per genre. The placeholder gets a column of its
# own when any row uses it, so rows without genres do not hit a missing
# column. Whole columns are assigned at once rather than written cell by
# cell through a column selection.
indicator_names = list(genres)
if 'N/A' not in indicator_names and any('N/A' in sublist for sublist in final_genres_list):
    indicator_names.append('N/A')
for genre in indicator_names:
    df[genre] = [1 if genre in sublist else 0 for sublist in final_genres_list]

# mean gross by genre for previous year

In [35]:
# df[df['domestic_gross'].apply(lambda x: type(x) != int)][['title', 'domestic_gross']]

In [36]:
def mean_gross_genre(film):
    """Mean, over the film's genres, of last year's mean domestic gross per genre.

    For each genre of ``film`` the mean 'domestic_gross' of rows in the
    module-level ``df`` from the previous 'year' with that genre indicator
    set is computed; the per-genre means are averaged ignoring NaN. Genres
    without an indicator column are skipped. Returns NaN when nothing can be
    averaged.
    """
    mean_grosses = []
    lastyear = df[df['year'] == film['year'] - 1]
    for genre in film['genres']:
        if genre not in df.columns:
            # No indicator column for this genre: neither the previous-year
            # nor the all-years lookup could succeed.
            continue
        try:
            mean_grosses.append(np.mean(lastyear[lastyear[genre] == 1]['domestic_gross'].dropna()))
        except (TypeError, ValueError, ZeroDivisionError):
            # Previous-year mean could not be computed; report the columns
            # and fall back to the mean over all years.
            print(df.keys())
            mean_grosses.append(np.mean(df[df[genre] == 1]['domestic_gross'].dropna()))
    if not mean_grosses:
        return np.nan
    return np.nanmean(mean_grosses)

In [37]:
# Row-wise: previous-year mean gross across each film's genres.
df['mean_gross_by_genre_prev_year'] = df.apply(mean_gross_genre, axis = 1)

### mean director genre expertise (mean number of times directors have directed a movie of that genre before this movie)

In [38]:
# def mean_direc_genre_expertise(title):
#     if title['theatrical_release']:
#         release = title['theatrical_release']['$date']
#         genre = title['genres'][0]
#         counts = []
#         for direc in title['director']:
#             films = df[df['director'].apply(lambda x: direc in x)]
#             films = films[films['theatrical_release_$date'].apply(lambda x: pd.notnull(x))]
#             films = films[films['theatrical_release_$date'].apply(lambda x: x < release)]
#             films = films[films['genres'].apply(lambda x: genre in x)]
#             if films.empty:
#                 counts.append(np.nan)
#             else:
#                 counts.append(len(films))
#         return np.nanmean(counts)
#     else:
#         return np.nan
    
# df['mean_direc_genre_expertise'] = [mean_direc_genre_expertise(title) for title in titles]
# df['mean_direc_genre_expertise'] = df['mean_direc_genre_expertise'].fillna(df['mean_direc_genre_expertise'].median())

### mean actor genre expertise (mean number of times actors have starred in a movie of that genre before this movie)

In [39]:
# def mean_act_genre_expertise(title):
#     if title['theatrical_release']:
#         release = title['theatrical_release']['$date']
#         genre = title['genres'][0]
#         counts = []
#         for act in title['actor']:
#             films = df[df['actor'].apply(lambda x: act in x)]
#             films = films[films['theatrical_release_$date'].apply(lambda x: pd.notnull(x))]
#             films = films[films['theatrical_release_$date'].apply(lambda x: x < release)]
#             films = films[films['genres'].apply(lambda x: genre in x)]
#             if films.empty:
#                 counts.append(np.nan)
#             else:
#                 counts.append(len(films))
#         return np.nanmean(counts)
#     else:
#         return np.nan
    
# df['mean_act_genre_expertise'] = [mean_act_genre_expertise(title) for title in titles]
# df['mean_act_genre_expertise'] = df['mean_act_genre_expertise'].fillna(df['mean_act_genre_expertise'].median())

## mean director gross by genre

In [40]:
def mean_direc_gross_genre(title_of_movie):
    """Mean prior same-genre domestic gross across the film's directors.

    The film is looked up in ``titles_dict``. For each of its directors, the
    rows of the module-level ``df`` that credit the director, were released
    strictly before this film and include the film's first genre are
    averaged on 'domestic_gross'. The per-director means are then averaged
    ignoring NaN.

    Returns NaN when the film has no theatrical release, no genres, or no
    director with a qualifying earlier film.
    """
    title = titles_dict[title_of_movie]
    title_genres = title.get('genres')
    if not title['theatrical_release'] or not title_genres:
        return np.nan

    release = pd.to_datetime(title['theatrical_release']['$date'])
    # df['genres'] holds whitespace-stripped names, so the raw genre must be
    # stripped too or it never matches.
    genre = title_genres[0].strip()

    # Masks are built on the full frame (never on an emptied intermediate
    # frame); the date/genre part is the same for every director.
    eligible = (df['release_date'].notnull() &
                (df['release_date'] < release) &
                df['genres'].apply(lambda x: genre in x))

    grosses = []
    for direc in title['director']:
        credited = df['director'].apply(lambda x: direc in x)
        prior_grosses = df.loc[eligible & credited, 'domestic_gross'].dropna()
        if prior_grosses.empty:
            grosses.append(np.nan)
        else:
            grosses.append(np.nanmean(prior_grosses))
    if not grosses:
        return np.nan
    return np.nanmean(grosses)

df['mean_direc_gross_genre'] = Parallel(n_jobs=4)(delayed(mean_direc_gross_genre)(title_of_movie) for title_of_movie in df['title'].tolist())
df['mean_direc_gross_genre'] = df['mean_direc_gross_genre'].fillna(df['mean_direc_gross_genre'].median())

## mean actor gross by genre

In [41]:
def mean_act_gross_genre(title_of_movie):
    """Mean prior same-genre domestic gross across the film's actors.

    The film is looked up in ``titles_dict``. For each of its actors, the
    rows of the module-level ``df`` that credit the actor, were released
    strictly before this film and include the film's first genre are
    averaged on 'domestic_gross'. The per-actor means are then averaged
    ignoring NaN.

    Returns NaN when the film has no theatrical release, no genres, or no
    actor with a qualifying earlier film.
    """
    title = titles_dict[title_of_movie]
    title_genres = title.get('genres')
    if not title['theatrical_release'] or not title_genres:
        return np.nan

    release = pd.to_datetime(title['theatrical_release']['$date'])
    # df['genres'] holds whitespace-stripped names, so the raw genre must be
    # stripped too or it never matches.
    genre = title_genres[0].strip()

    # Masks are built on the full frame (never on an emptied intermediate
    # frame); the date/genre part is the same for every actor.
    eligible = (df['release_date'].notnull() &
                (df['release_date'] < release) &
                df['genres'].apply(lambda x: genre in x))

    grosses = []
    for act in title['actor']:
        credited = df['actor'].apply(lambda x: act in x)
        prior_grosses = df.loc[eligible & credited, 'domestic_gross'].dropna()
        if prior_grosses.empty:
            grosses.append(np.nan)
        else:
            grosses.append(np.nanmean(prior_grosses))
    if not grosses:
        return np.nan
    return np.nanmean(grosses)

df['mean_act_gross_genre'] = Parallel(n_jobs=4)(delayed(mean_act_gross_genre)(title_of_movie) for title_of_movie in df['title'].tolist())
df['mean_act_gross_genre'] = df['mean_act_gross_genre'].fillna(df['mean_act_gross_genre'].median())

### mean actor-director collaboration frequency

In [42]:
def mean_act_direc_collab_freq(title_of_movie):
    """Mean number of earlier films shared by each actor/director pair.

    The film is looked up in ``titles_dict``. For every actor, the number of
    rows in the module-level ``df`` released strictly before this film that
    credit both the actor and a given director is counted per director and
    averaged; the per-actor averages are then averaged.

    Returns NaN when the film has no theatrical release.
    """
    title = titles_dict[title_of_movie]
    if not title['theatrical_release']:
        return np.nan
    release = pd.to_datetime(title['theatrical_release']['$date'])

    # Masks are built on the full frame (never on an emptied intermediate
    # frame). The date and director masks are the same for every actor, so
    # they are computed once.
    earlier = df['release_date'].notnull() & (df['release_date'] < release)
    director_masks = [earlier & df['director'].apply(lambda x, d=direc: d in x)
                      for direc in title['director']]

    collabs = []
    for act in title['actor']:
        with_actor = df['actor'].apply(lambda x: act in x)
        act_collabs = [int((mask & with_actor).sum()) for mask in director_masks]
        collabs.append(np.nanmean(act_collabs))
    return np.nanmean(collabs)

df['mean_act_direc_collab_freq'] = Parallel(n_jobs=4)(delayed(mean_act_direc_collab_freq)(title_of_movie) for title_of_movie in df['title'].tolist())
df['mean_act_direc_collab_freq'] = df['mean_act_direc_collab_freq'].fillna(df['mean_act_direc_collab_freq'].median())

### distributor production company collab freq

In [43]:
def distributorProdcoCollabFreq(title_of_movie):
    """Mean number of earlier films shared by each distributor/producer pair.

    The film is looked up in ``titles_dict``. For every distribution
    company, the number of rows in the module-level ``df`` released strictly
    before this film that credit both that distributor and a given
    production company is counted per production company and averaged; the
    per-distributor averages are then averaged.

    Returns NaN when the film has no theatrical release.
    """
    title = titles_dict[title_of_movie]
    if not title['theatrical_release']:
        return np.nan
    release = pd.to_datetime(title['theatrical_release']['$date'])

    # Masks are built on the full frame (never on an emptied intermediate
    # frame). The date and production-company masks are the same for every
    # distributor, so they are computed once.
    earlier = df['release_date'].notnull() & (df['release_date'] < release)
    prod_masks = [earlier & df['production_co'].apply(lambda x, p=prod_co: p in x)
                  for prod_co in title['production_co']]

    collabs = []
    for distrib_co in title['distribution_co']:
        with_distributor = df['distribution_co'].apply(lambda x: distrib_co in x)
        distrib_cos_collabs = [int((mask & with_distributor).sum()) for mask in prod_masks]
        collabs.append(np.nanmean(distrib_cos_collabs))
    return np.nanmean(collabs)

df['mean_distrib_prodco_collab_freq'] = Parallel(n_jobs=4)(delayed(distributorProdcoCollabFreq)(title_of_movie) for title_of_movie in df['title'].tolist())
df['mean_distrib_prodco_collab_freq'] = df['mean_distrib_prodco_collab_freq'].fillna(df['mean_distrib_prodco_collab_freq'].median())


### in-genre competition during the week of the film's release: count, budget

In [44]:
def igcPrevFollow(row):
    """In-genre competition in the 8 days before and after a film's release.

    Uses the module-level ``df``. A competitor is a row whose 'genres'
    contain this row's first genre and whose 'release_date' lies within 8
    days before (exclusive of the release day) or after it. Returns counts
    for both windows, the mean production budget for both, and the mean
    first-weekend gross for the preceding window; a NaN mean is reported
    as 0.
    """
    release_date = row['release_date']
    top_genre = row['genres'][0]
    span = pd.DateOffset(8)
    released = df['release_date']

    before = df[(released >= release_date - span) &
                (released < release_date) &
                (df['genres'].apply(lambda x: top_genre in x))]
    after = df[(released <= release_date + span) &
               (released > release_date) &
               (df['genres'].apply(lambda x: top_genre in x))]

    before_budget = before['production_budget'].mean()
    before_first_wknd = before['first_wknd_gross'].mean()
    after_budget = after['production_budget'].mean()

    # np.nanmax([value, 0]) turns a NaN mean (no competitors) into 0.
    return pd.Series({'in_genre_competition_prev_week': before.shape[0],
                      'in_genre_competition_following_week': after.shape[0],
                      'in_genre_competition_prev_wk_grosses': np.nanmax([before_first_wknd, 0]),
                      'in_genre_competition_budgets_prev_week': np.nanmax([before_budget, 0]),
                      'in_genre_competition_budgets_following_week': np.nanmax([after_budget, 0])})


igc_feats = df.apply(igcPrevFollow, axis = 1)
df = pd.merge(df, igc_feats, left_index = True, right_index = True)

### Mean, stddev, count of films previously released by distcos, prodcos, writers

In [45]:
def aggPrevFilms(df, colname, entities, release_date, min_film_count, result):
    """Aggregate the grosses of earlier films credited to any of ``entities``.

    Parameters:
        df: frame with 'release_date', 'domestic_gross', 'title' and a
            ``colname`` column holding a collection of entity names per row.
        colname: column to search; also the prefix of the result keys.
        entities: entity names to look for.
        release_date: only rows released strictly before this date count.
        min_film_count: the aggregates are written only when more than this
            many distinct earlier titles are found.
        result: Series updated in place with ``<colname>_mean_gross``,
            ``<colname>_stddev_gross`` and ``<colname>_prev_count``.

    Returns:
        ``result``; unchanged when too few earlier films are found.
    """
    if len(entities) == 0:
        return result

    earlier = df['release_date'] < release_date
    # Collected across ALL entities (not reset per entity), so every
    # entity's earlier films contribute to the aggregates.
    prev_frames = []
    for entity in entities:
        credited = df[colname].apply(lambda x: entity in x)
        prev_films = df[credited & earlier]
        if not prev_films.empty:
            prev_frames.append(prev_films[['domestic_gross', 'title']])
    if not prev_frames:
        return result

    # A film credited to several of the entities is counted once.
    prev_grosses = pd.concat(prev_frames).drop_duplicates()
    if prev_grosses['title'].nunique() > min_film_count:
        result[colname + '_mean_gross'] = prev_grosses['domestic_gross'].mean()
        result[colname + '_stddev_gross'] = prev_grosses['domestic_gross'].std()
        result[colname + '_prev_count'] = prev_grosses['title'].nunique()
    return result

def distProdWriterFeats(row, df):
    """Track-record features for a film's distributors, producers and writers.

    For each of the 'distribution_co', 'production_co' and 'screenwriter'
    columns, aggPrevFilms is asked for the mean gross, stddev of gross and
    count of the films those entities released before this film's release
    date.  No minimum film count is imposed (threshold 0).

    Parameters
    ----------
    row : one film (a row of `df`).
    df : the full film table searched for earlier releases.

    Returns
    -------
    Series with `<column>_mean_gross`, `<column>_stddev_gross` and
    `<column>_prev_count` for the three columns; entries with no history (or
    an undefined stddev) are 0.
    """
    release_date = row['release_date']
    entity_columns = ('distribution_co', 'production_co', 'screenwriter')
    stat_suffixes = ('_mean_gross', '_stddev_gross', '_prev_count')
    # Start from float zeros: the statistics written later are floats, and
    # assigning them into an integer Series would force a dtype upcast.
    result = pd.Series({col + suffix: 0.0
                        for col in entity_columns
                        for suffix in stat_suffixes})
    for col in entity_columns:
        result = aggPrevFilms(df, col, row[col], release_date, 0, result)
    return result.fillna(0)

# One row of track-record features per film; df is passed through as the
# table that distProdWriterFeats searches for earlier releases.
distco_prodco_writer_feats = df.apply(distProdWriterFeats, args = (df,), axis = 1)

In [46]:
# Attach the track-record feature columns to df by joining on the row index.
df = df.merge(distco_prodco_writer_feats, left_index = True, right_index = True)

### Percentage of production studios' previous films that are franchises / adaptations



In [47]:
def prevProdCoFranchisesAdpts(row, df):
    """Share of a film's production companies' earlier films that were
    franchise entries / adaptations.

    For every production company on the film, the fraction of its films
    released strictly before this film that are franchises (and, separately,
    adaptations) is computed; the feature is the mean of those fractions over
    the companies that have any earlier film.

    Parameters
    ----------
    row : one film (a row of `df`).
    df : the full film table, with list-valued 'production_co' plus
        'release_date', 'franchise' and 'adaptation'.

    Returns
    -------
    Series with 'prod_co_prev_franchise_films' and 'prod_co_prev_adpts';
    both are 0.0 when no production company has an earlier film.
    """
    release_date = row['release_date']
    prod_cos = row['production_co']
    result = pd.Series({'prod_co_prev_franchise_films': 0.0,
                        'prod_co_prev_adpts': 0.0})
    franchise_shares = []
    adaptation_shares = []
    for prod_co in prod_cos:
        prev_films = df[(df['production_co'].apply(lambda x, prod_co=prod_co: prod_co in x)) &
                        (df['release_date'] < release_date)]
        if prev_films.empty:
            continue
        num_prev = float(prev_films.shape[0])
        num_franch = prev_films['franchise'].apply(lambda x: x in (True, 1, 'True')).sum()
        # NOTE(review): anything other than False / 'False' counts as an
        # adaptation, which includes missing values -- confirm that is intended.
        num_adpt = prev_films['adaptation'].apply(lambda x: x not in (False, 'False')).sum()
        franchise_shares.append(num_franch / num_prev)
        adaptation_shares.append(num_adpt / num_prev)
    # One company with history is enough; requiring two would zero the
    # feature for every film that has a single production company.
    if franchise_shares:
        result['prod_co_prev_franchise_films'] = np.nanmean(franchise_shares)
        result['prod_co_prev_adpts'] = np.nanmean(adaptation_shares)
    return result

# One row of franchise/adaptation share features per film; df is passed
# through as the table searched for each production company's earlier films.
prodco_franch_adpt_feats = df.apply(prevProdCoFranchisesAdpts, args = (df,), axis =1)

In [48]:
# Attach the franchise/adaptation share columns to df by joining on the row index.
df = df.merge(prodco_franch_adpt_feats, left_index = True, right_index = True)

# Age related features

In [49]:
# efficiency of career: mean across actors of (tenure / age) * log(mean_gross)

def ageFeatFunc(row):
    """Age and career-efficiency features for a film's cast.

    Reads the module-level `people_dict` (actor name -> person record) and
    `df` (the film table).  Ages are measured in days between the actor's
    birthday and the film's release date.  An actor's efficiency is
    (tenure / age) * log(mean domestic gross of their earlier films), where
    tenure is the number of days since their earliest earlier film and the
    log term is floored at 0.

    Parameters
    ----------
    row : one film, with 'release_date' and a list-valued 'actor'.

    Returns
    -------
    Series with 'actors_mean_age', 'actors_max_age', 'actors_stddev_age' and
    'career_efficiency'.  Everything is 0 when the film has no actors, no
    release date, or no actor with a known birthday.
    """
    release_date = row['release_date']
    actors = row['actor']
    result = pd.Series({'actors_mean_age': 0,
                        'actors_max_age': 0,
                        'actors_stddev_age': 0.0,
                        'career_efficiency': 0.0})
    if actors == [] or pd.isnull(release_date):
        return result

    ages = []
    efficiencies = []
    for actor in actors:
        # Skip actors that are unknown or have no usable birthday.  Only the
        # lookup errors are caught so that real problems (e.g. people_dict
        # not being defined) are not silently turned into all-zero features.
        try:
            birthday = people_dict[actor]['birthday']['$date']
        except (KeyError, TypeError):
            continue
        if pd.isnull(birthday):
            continue
        age = (release_date - pd.to_datetime(birthday)).days
        ages.append(age)

        actor_films = people_dict[actor].get('films') or ()
        prev_films = df[(df['release_date'] < release_date) &
                        (df['title'].apply(lambda x: x in actor_films))]
        # age == 0 would divide by zero below; such an actor has no tenure.
        if prev_films.empty or age == 0:
            continue
        # Convert into a fresh Series instead of assigning back onto a slice of df.
        first_release = pd.to_datetime(prev_films['release_date']).min()
        tenure = (release_date - first_release).days
        mean_gross = prev_films['domestic_gross'].mean()
        # log floored at 0; a missing or non-positive mean gross also gives 0.
        log_gross = np.log(mean_gross) if mean_gross > 1 else 0.0
        efficiencies.append((tenure / float(age)) * log_gross)

    if not ages:
        return result
    result['actors_mean_age'] = np.mean(ages)
    result['actors_max_age'] = np.max(ages)
    result['actors_stddev_age'] = np.std(ages)
    if efficiencies:
        result['career_efficiency'] = np.nanmax((np.nanmean(efficiencies), 0))
    return result

# Run ageFeatFunc for every film across 4 joblib workers, stack the returned
# Series into a DataFrame (one row per film, in iteration order) and join it
# onto df by row index.
# NOTE(review): the stacked frame presumably gets a fresh 0..n-1 index, so the
# index join assumes df still has a default integer index -- confirm.
result = pd.DataFrame(Parallel(n_jobs=4)(delayed(ageFeatFunc)(row) for _, row in list(df.iterrows())))
df = df.merge(result, left_index = True, right_index = True)

# Diversity related features

In [50]:
def typecastScore(all_films, people=None):
    """Fraction of the given casts in which White actors are not a majority.

    Each cast is scored 1.0 when at most half of its actors with a known race
    are 'White' (this includes casts with no White actor at all) and 0.0
    otherwise.  Casts in which no actor has a known race are ignored.

    Parameters
    ----------
    all_films : iterable of casts, each a list of actor names.
    people : optional mapping of actor name -> person record holding a 'race'
        entry.  Defaults to the module-level `people_dict`.

    Returns
    -------
    Mean of the per-cast scores, or NaN when no cast could be scored.
    """
    if people is None:
        people = people_dict
    typecast_scores = []
    for actor_list in all_films:
        races = [people[actor]['race'] for actor in actor_list
                 if actor in people and 'race' in people[actor]]
        if not races:
            continue
        white_count = sum(1 for race in races if race == 'White')
        white_share = white_count / float(len(races))
        typecast_scores.append(1.0 if white_share <= .5 else 0.0)
    if not typecast_scores:
        # Nothing could be scored; callers drop null scores.
        return np.nan
    return np.mean(typecast_scores)

In [51]:
from collections import Counter
def diversityFeats(row):
    """Race / gender make-up features for a film's cast and directors.

    Reads the module-level `people_dict` (name -> person record) and `df`
    (the film table), and calls typecastScore on the casts of each actor's
    earlier films.

    Parameters
    ----------
    row : one film, with list-valued 'actor' and 'director' and a
        'release_date'.

    Returns
    -------
    Series with:
      nunique_races   number of distinct races among actors with a known race
                      (0 when none is known)
      percent_white   fraction of those actors that are 'White' (default 1.0)
      percent_male    fraction of actors with a known gender that are 'Male'
                      (default .6)
      director_white  True when every director with a known race is 'White';
                      stays True when the film lists no director, becomes
                      False when directors are listed but none has a known race
      director_male   True when at least one director is 'Male' and female
                      directors do not outnumber male ones; same no-director /
                      no-information behaviour as director_white
      typecast_score  mean over actors of typecastScore for their earlier
                      films (default .5)
    """
    actors = row['actor']
    directors = row['director']
    release_date = row['release_date']
    nunique_races = 0
    # Fraction, on the same 0-1 scale as the computed value below.
    percent_white = 1.0
    percent_male = .6
    director_white = True
    director_male = True
    mean_typecast_score = .5

    typecast_scores = []
    act_races = []
    act_genders = []
    for actor in actors:
        person = people_dict.get(actor)
        if person is None:
            continue
        if 'race' in person:
            act_races.append(person['race'])
            # for all prev films, a 0 or 1 for whether majority race is nonwhite
            # mean across prev films
            actor_films = person.get('films') or ()
            prev_casts = df[(df['release_date'] < release_date) &
                            (df['title'].apply(lambda x: x in actor_films))]['actor']
            if not prev_casts.empty:
                typecast_scores.append(typecastScore(prev_casts.tolist()))
        if 'gender' in person:
            act_genders.append(person['gender'])
    if act_races:
        nunique_races = len(set(act_races))
        percent_white = act_races.count('White') / float(len(act_races))
    if act_genders:
        percent_male = act_genders.count('Male') / float(len(act_genders))

    if len(directors) > 0:
        direc_races = []
        direc_genders = []
        for director in directors:
            # Unknown directors are treated like known ones without any
            # race / gender information instead of raising KeyError.
            person = people_dict.get(director, {})
            if 'race' in person:
                direc_races.append(person['race'])
            if 'gender' in person:
                direc_genders.append(person['gender'])
        n_male = direc_genders.count('Male')
        n_female = direc_genders.count('Female')
        director_male = n_male > 0 and n_female <= n_male
        director_white = set(direc_races) == set(['White'])

    # Drop null scores first, so that an all-null list keeps the default
    # instead of overwriting it with NaN.
    typecast_scores = [score for score in typecast_scores if pd.notnull(score)]
    if typecast_scores:
        mean_typecast_score = np.mean(typecast_scores)

    return pd.Series({'nunique_races': nunique_races,
                      'percent_white': percent_white,
                      'percent_male': percent_male,
                      'director_white': director_white,
                      'director_male': director_male,
                      'typecast_score': mean_typecast_score})


# Run diversityFeats for every film across 4 joblib workers, stack the
# returned Series into a DataFrame (one row per film, in iteration order) and
# join it onto df by row index.
# NOTE(review): the stacked frame presumably gets a fresh 0..n-1 index, so the
# index join assumes df still has a default integer index -- confirm.
result = pd.DataFrame(Parallel(n_jobs=4)(delayed(diversityFeats)(row) for _, row in list(df.iterrows())))
df = df.merge(result, left_index = True, right_index = True)

Drop bad columns

In [52]:
# Remove the stray 'international_gross_$numberLong' column; frames that do
# not have it are left as they are.
df = df.drop('international_gross_$numberLong', axis = 1, errors = 'ignore')

In [53]:
# Write the feature table to disk; the SELECT_FILMS flag picks the target
# file so a select-films run does not overwrite the regression data set.
df.to_csv('df_release_dating.csv' if SELECT_FILMS == True else 'df_regr.csv',
          index = False, encoding = 'utf-8')

# Time Intensive Features

In [54]:
import ast


def _parse_list_cell(cell):
    """Turn a list that was stringified by to_csv back into a real list.

    The cell text is the repr of a Python list, so it is parsed as a literal.
    That keeps names containing commas intact, removes the quote characters
    and maps '[]' to an empty list.  Missing cells become an empty list.
    Text that is not a valid list literal falls back to a plain comma split
    with the quotes stripped.
    """
    if isinstance(cell, list):
        return cell
    if pd.isnull(cell):
        return []
    try:
        parsed = ast.literal_eval(cell)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    pieces = (ele.strip().strip('\'"') for ele in cell[1:-1].split(','))
    return [ele for ele in pieces if ele]


if SELECT_FILMS == True:
    df = pd.read_csv('df_release_dating.csv')
else:
    df = pd.read_csv('df_regr.csv')
# though genres and actors are lists originally, they're then stored as strings
# this loop changes them back into lists
for col in ['genres', 'actor', 'director', 'production_co', 'distribution_co']:
    df[col] = df[col].apply(_parse_list_cell)
df['release_date'] = pd.to_datetime(df['release_date'])

### average genre expertise (AGE), weighted AGE and cast novelty

In [55]:
# if running this notebook top to bottom, genres is already defined, 
# if not, here they are 
# Every distinct genre that occurs in any film's genre list.
genres = list({gen for film_genres in df['genres'].tolist() for gen in film_genres})

In [56]:
# for each film, for each actor in it
# AGE: across-actor mean of genres vector multiplied by the proportion of the number of times 
# the actor previously starred in films of genres of the given film
# WAGE: AGE weighted by gross
# cast novelty: log gross / WAGE

def AGE_WAGE_NOVELTY(idx, row):
    """Genre-expertise features for a film's cast.

    Reads the module-level `df` (the film table).  For every actor with at
    least one film released strictly before this one:

      AGE      sum over this film's genres of the share of the actor's
               earlier genre credits that fall in that genre
      WAGE     AGE * log(total domestic gross of the actor's earlier films)
      novelty  log(total domestic gross) / (AGE + 1)

    The log term is taken as 0 when the total gross is not positive, so the
    features stay finite.

    Parameters
    ----------
    idx : row label; accepted for the caller's convenience and not used.
    row : one film, with list-valued 'actor' and 'genres' and a
        'release_date'.

    Returns
    -------
    Series with the mean and std of AGE ('average_genre_expertise',
    'std_genre_expertise'), the mean and std of WAGE
    ('weighted_average_genre_expertise',
    'std_weighted_average_genre_expertise') and the maximum novelty
    ('cast_novelty').  All are 0 when the film has no actors or no actor has
    an earlier film.
    """
    actors = row['actor']
    AGEs = []
    WAGEs = []
    cast_novelties = []

    if len(actors) > 0:
        release_date = row['release_date']
        # how many times each genre is listed on this film
        film_genre_counts = {}
        for genre in row['genres']:
            film_genre_counts[genre] = film_genre_counts.get(genre, 0) + 1

        for actor in actors:
            # get previous films by this actor
            prev_films = df[(df['release_date'] < release_date) &
                            (df['actor'].apply(lambda x, actor=actor: actor in x))]
            if prev_films.empty:
                continue
            # count the genre credits of the actor's previous films
            actor_genre_counts = {}
            for film_genres in prev_films['genres'].tolist():
                for genre in film_genres:
                    actor_genre_counts[genre] = actor_genre_counts.get(genre, 0) + 1
            total_credits = sum(actor_genre_counts.values())
            if total_credits > 0:
                factor = 1.0 / total_credits
                AGE = sum(count * (actor_genre_counts.get(genre, 0) * factor)
                          for genre, count in film_genre_counts.items())
            else:
                # earlier films carry no genre at all: no measurable expertise
                AGE = 0.0
            AGEs.append(AGE)
            total_gross = prev_films['domestic_gross'].sum()
            log_gross = np.log(total_gross) if total_gross > 0 else 0.0
            WAGEs.append(log_gross * AGE)
            cast_novelties.append(log_gross / (AGE + 1))

    return pd.Series({'average_genre_expertise': np.mean(AGEs) if AGEs else 0,
                      'std_genre_expertise': np.std(AGEs) if AGEs else 0,
                      'weighted_average_genre_expertise': np.mean(WAGEs) if WAGEs else 0,
                      'std_weighted_average_genre_expertise': np.std(WAGEs) if WAGEs else 0,
                      'cast_novelty': np.max(cast_novelties) if cast_novelties else 0})
    
# Run AGE_WAGE_NOVELTY for every film across 4 joblib workers, stack the
# returned Series into a DataFrame (one row per film, in iteration order) and
# left-join it onto df by row index.  df was just read back with read_csv
# without an index column, so both sides are indexed by row position.
result = Parallel(n_jobs=4)(delayed(AGE_WAGE_NOVELTY)(idx, row) for idx, row in list(df.iterrows()))
df = df.merge(pd.DataFrame(result), how='left', left_index=True, right_index=True)

In [57]:
# Write the table, now including the genre-expertise features, back to the
# file selected by the SELECT_FILMS flag.
df.to_csv('df_release_dating.csv' if SELECT_FILMS == True else 'df_regr.csv',
          index = False, encoding = 'utf-8')

# Other code

cleaning out duplicate people

In [58]:
# from fuzzywuzzy import fuzz
# names1 = [person['name'].lower() for person in people]
# dupe_suggestions = []
# seen_names = []
# for name1 in names1:
#     seen_names.append(name1)
#     names2 = names1[:]
#     for seen_name in seen_names:
#         names2.remove(seen_name)
#     overlap_ratio = np.array([fuzz.token_set_ratio(name1, name2) for name2 in names2])
#     if any(overlap_ratio > 90):
#         max_overlap_ratio_idxs = overlap_ratio.argsort()[-3:][::-1]
#         dupe_suggestions.append({name1:np.take(names2,max_overlap_ratio_idxs)})