In [8]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [9]:
directors_full = pd.read_pickle('../data/pickles/director_writer_nodrop.p')
movies = pickle.load(open('../data/pickles/movies.p', 'rb'))
characters = pickle.load(open('../data/pickles/characters.p', 'rb'))
characters_movies = pd.merge(characters[['movie_wiki_id', 'actor_name', 'combined_birth', 'combined_gender']], movies[['wikipedia_id', 'title', 'combined_release_year', 'combined_runtime', 'combined_box_office', 'averageRating_imdb', 'numVotes_imdb']], left_on='movie_wiki_id', right_on='wikipedia_id', how='left')

In [10]:
df = movies[['title', 'averageRating_imdb', 'numVotes_imdb', 'combined_box_office']].copy()

df['numVotes_imdb'] = np.log(df.numVotes_imdb)
df['combined_box_office'] = np.log(df.combined_box_office)

df['numVotes_imdb'] =(df.numVotes_imdb - df.numVotes_imdb.mean()) / df.numVotes_imdb.std()
df['combined_box_office'] =(df.combined_box_office - df.combined_box_office.mean()) / df.combined_box_office.std()
df['averageRating_imdb'] =(df.averageRating_imdb - df.averageRating_imdb.mean()) / df.averageRating_imdb.std()

In [11]:
df['metric'] = df.numVotes_imdb * np.abs(df.averageRating_imdb)

In [12]:
temp = pd.merge(characters_movies[["actor_name","combined_gender","combined_birth","combined_release_year","title"]].dropna().drop_duplicates(), df[["metric","title"]].drop_duplicates(subset=["title"]), left_on="title", right_on="title", validate="many_to_one")
temp["age"] = temp["combined_release_year"]-temp["combined_birth"]
temp.head()

Unnamed: 0,actor_name,combined_gender,combined_birth,combined_release_year,title,metric,age
0,Wanda De Jesus,female,1958.0,2001.0,Ghosts of Mars,2.360202,43.0
1,Natasha Henstridge,female,1974.0,2001.0,Ghosts of Mars,2.360202,27.0
2,Ice Cube,male,1969.0,2001.0,Ghosts of Mars,2.360202,32.0
3,Jason Statham,male,1967.0,2001.0,Ghosts of Mars,2.360202,34.0
4,Clea DuVall,female,1977.0,2001.0,Ghosts of Mars,2.360202,24.0


Average metric for all actors:

In [13]:
temp.groupby("actor_name").mean("metric").sort_values(by = "metric", ascending = False).dropna().head(20)["metric"]

actor_name
Morgana King                9.158800
Victor Rendina              9.158800
Tom Rosqui                  8.649312
Alexandra Astin             8.532330
Paul Norell                 8.532330
Nick Steele                 8.474059
Susan Griffiths             8.329917
Chandler Lindauer           8.329917
Michael Gilden              8.329917
Brenda Hillhouse            8.329917
Stephen Hibbert             8.329917
Sarah McLeod                8.229758
Roman Coppola               8.139823
Richard Matheson            8.139823
Sala Baker                  8.090836
Michael Conner Humphreys    8.027604
Mark Ferguson               7.927186
Peter McKenzie              7.927186
Olivia Tennet               7.812993
Lauren Katherine Conrad     7.712191
Name: metric, dtype: float64

find the percentage of male actor in each movie \\
take the date of each movie \\
take the gender of the director \\


In [14]:
pd.DataFrame(characters_movies.groupby("title")["combined_gender"].value_counts(normalize=True)).rename(columns={"combined_gender":"percent"})

Unnamed: 0_level_0,Unnamed: 1_level_0,percent
title,combined_gender,Unnamed: 2_level_1
#1 Cheerleader Camp,female,0.666667
#1 Cheerleader Camp,male,0.333333
$,male,0.750000
$,female,0.250000
$9.99,male,0.888889
...,...,...
…All the Marbles,female,0.600000
…All the Marbles,male,0.400000
₤500 Reward,female,1.000000
春田花花同學會,male,0.800000


### Find film female actor percentage

In [15]:
new = pd.DataFrame(characters_movies.groupby("title")["combined_gender"].value_counts(normalize=True)).rename(columns={"combined_gender":"percent"})
new = new.reset_index()
newF = new.loc[new["combined_gender"] == "female"].drop(columns=["combined_gender"]).rename(columns={"percent":"f_actor_percentage"})
newM = new.loc[new["combined_gender"] == "male"].drop(columns=["combined_gender"]).rename(columns={"percent":"m_actor_percentage"})
actor_gender_percentage = pd.merge(newF, newM, left_on="title", right_on="title")
actor_gender_percentage

Unnamed: 0,title,f_actor_percentage,m_actor_percentage
0,#1 Cheerleader Camp,0.666667,0.333333
1,$,0.250000,0.750000
2,$9.99,0.111111,0.888889
3,'77,0.200000,0.800000
4,'Neath the Arizona Skies,0.428571,0.571429
...,...,...,...
44131,Ōte,0.500000,0.500000
44132,Želary,0.375000,0.625000
44133,Život sa stricem,0.333333,0.666667
44134,…All the Marbles,0.600000,0.400000


### Find actor mean age per film

In [16]:
age = temp.groupby(["title","combined_gender"]).agg("mean")
age = age.drop(columns=["combined_birth","combined_release_year","metric"])
age = age.reset_index()
ageF = age.loc[age["combined_gender"] == "female"].drop(columns=["combined_gender"]).rename(columns={"age":"f_actor_mean_age"})
ageM = age.loc[age["combined_gender"] == "male"].drop(columns=["combined_gender"]).rename(columns={"age":"m_actor_mean_age"})
actor_mean_age = pd.merge(ageF, ageM, left_on="title", right_on="title")
actor_mean_age

  age = temp.groupby(["title","combined_gender"]).agg("mean")


Unnamed: 0,title,f_actor_mean_age,m_actor_mean_age
0,#1 Cheerleader Camp,43.500000,31.000000
1,$,32.500000,44.666667
2,$9.99,36.000000,38.750000
3,'77,54.000000,40.250000
4,'Neath the Arizona Skies,5.333333,37.000000
...,...,...,...
41590,Ōte,28.000000,28.000000
41591,Želary,50.000000,47.200000
41592,Život sa stricem,26.500000,37.750000
41593,…All the Marbles,29.333333,47.500000


In [17]:
gender_stats = pd.merge(actor_gender_percentage, actor_mean_age, left_on="title", right_on="title")
gender_stats = pd.merge(df[["title","metric"]], gender_stats, left_on="title", right_on="title", how="inner")

In [20]:
gender_stats.to_pickle("./gender_stats.pkl")

### Linear Regression of metric based on f_actor_percentage, f_actor_mean_age, m_actor_mean_age

In [255]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [256]:
# Declares the model
mod = smf.ols(formula='metric ~ f_actor_percentage + f_actor_mean_age + m_actor_mean_age', data=gender_stats)
# Fits the model (find the optimal coefficients, adding a random seed ensures consistency)
np.random.seed(2)
res = mod.fit()
# Print thes summary output provided by the library.
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                 metric   R-squared:                       0.016
Model:                            OLS   Adj. R-squared:                  0.016
Method:                 Least Squares   F-statistic:                     239.6
Date:                Thu, 08 Dec 2022   Prob (F-statistic):          3.14e-154
Time:                        14:47:44   Log-Likelihood:                -61262.
No. Observations:               43359   AIC:                         1.225e+05
Df Residuals:                   43355   BIC:                         1.226e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept              0.1976      0

In [4]:
W_apperance = pickle.load(open('../data/pickles/women_appearance.p', 'rb'))

In [7]:
W_apperance

[['tt0228333', 1],
 ['tt0245916', 1],
 ['tt0094806', 2],
 ['tt0094320', 2],
 ['tt0083949', 1],
 ['tt0002894', 7],
 ['tt0120166', 2],
 ['tt0029852', 2],
 ['tt0200545', 1],
 ['tt0053719', 3],
 ['tt0021335', 1],
 ['tt0072157', 1],
 ['tt0119548', 2],
 ['tt0097499', 26],
 ['tt0278891', 1],
 ['tt0033888', 1],
 ['tt0088646', 2],
 ['tt0058331', 1],
 ['tt0061637', 2],
 ['tt0104601', 2],
 ['tt0020823', 1],
 ['tt0133122', 3],
 ['tt0892904', 3],
 ['tt0055997', 2],
 ['tt0022289', 6],
 ['tt0026167', nan],
 ['tt0053820', 2],
 ['tt0079552', 2],
 ['tt0255819', 4],
 ['tt0367546', 1],
 ['tt0255668', 2],
 ['tt0051577', 1],
 ['tt0088157', 1],
 ['tt0366182', 2],
 ['tt0033150', 1],
 ['tt1849787', 2],
 ['tt0178022', 2],
 ['tt0178356', 4],
 ['tt0099054', 2],
 ['tt0079899', 12],
 ['tt0021728', 2],
 ['tt0482499', 6],
 ['tt0153301', 6],
 ['tt0166158', 2],
 ['tt0004604', nan],
 ['tt0080801', 2],
 ['tt0008150', 1],
 ['tt0009082', 2],
 ['tt0405393', 4],
 ['tt0097670', 5],
 ['tt0255267', 2],
 ['tt0843287', 7],
 ['tt0