# The Portrayal of Actresses - What types of women are portrayed in movies?

In [None]:
import pickle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import requests
from bs4 import BeautifulSoup as bs
import re
import gender_guesser.detector as gender
import plotly.express as px
import statsmodels.api as sm
import statsmodels.formula.api as smf
from tqdm import tqdm
import plotly.graph_objects as go
%matplotlib inline

In [None]:
# Root folder of the project data and the sub-folder with the pre-computed pickles.
data_folder = 'data/'
pickle_folder = f"{data_folder}pickles/"

In [None]:
# Load the pre-processed character and movie tables.
# The context managers close the file handles deterministically instead of
# leaving them open until garbage collection.
# NOTE: unpickling can execute arbitrary code; only load trusted pickles.
with open(pickle_folder + 'characters.p', 'rb') as characters_file:
    characters = pickle.load(characters_file)
with open(pickle_folder + 'movies.p', 'rb') as movies_file:
    movies = pickle.load(movies_file)

In [None]:
def plot_percentage_number_per_year(df, title, year, *args):
    """Plot, per year, how many rows carry each feature and what share that is.

    Rows without a value in `year` are ignored. The left panel shows yearly
    row counts (overall and per feature); the right panel shows each feature
    count divided by the yearly total.

    Args:
        df: DataFrame holding the `year` column and every column named in `args`.
        title: Figure-level title.
        year: Name of the column used as the time axis.
        *args: Names of the feature columns; a row counts for a feature when
            its value in that column is not NaN.
    """
    with_year = df[df[year].notna()]
    _, (count_axis, share_axis) = plt.subplots(1, 2, figsize=(10, 5))
    plt.suptitle(title)

    n_features = len(args)
    # One colour for the yearly total plus one per feature.
    palette = sns.color_palette(n_colors=n_features + 1)

    counts = pd.DataFrame()
    shares = pd.DataFrame()
    counts['total_per_year'] = with_year[year].value_counts()
    for feature in args:
        has_feature = with_year[feature].notna()
        counts[feature] = with_year.loc[has_feature, year].value_counts().astype(float)
        shares[feature] = counts[feature] / counts['total_per_year']
    counts.index = counts.index.astype(float)
    shares.index = shares.index.astype(float)

    # The share plot skips the first colour so each feature keeps the same
    # colour in both panels.
    sns.lineplot(data=counts, ax=count_axis, palette=palette[-n_features - 1:], dashes=False)
    sns.lineplot(data=shares, ax=share_axis, palette=palette[-n_features:], dashes=False)

    count_axis.set_title('Number of characters with specific feature')
    count_axis.set_xlabel('Years')
    count_axis.set_ylabel('Number of characters')

    share_axis.set_title('Percentage of characters with specific feature')
    share_axis.set_xlabel('Years')
    share_axis.set_ylabel('Percentage of characters with feature')
    plt.show()

## Completeness of our data

We will briefly analyse the temporal completeness of our data

In [None]:
# Completeness of gender, ethnicity and birth information per release year.
completeness_features = ('combined_gender', 'combined_ethnicity', 'combined_birth')
plot_percentage_number_per_year(
    characters,
    'Characters',
    'combined_release_year',
    *completeness_features,
)

Since we want to do an analysis over time, the plot above shows the completeness of actor gender, ethnicity and birth over the years. The percentage of characters with a specific feature (right graph) gets really noisy at both ends of our year spectrum. This is due to the small number of movies in those time periods. Overall, the gender and birth information is around 80% complete across all years. Ethnicity, on the other hand, evolves significantly from older to newer films, and stays at lower completeness levels.

In [None]:
# Box plot (narrow, top) and histogram (bottom) of the movie release years.
release_years = movies["combined_release_year"]
_, ax = plt.subplots(2, 1, gridspec_kw={'height_ratios': [1, 6]})
ax1 = sns.boxplot(x=release_years, ax=ax[0])
ax2 = release_years.hist(bins=release_years.nunique(), ax=ax[1])

ax[0].set_title("Distribution of films over the years")
ax[0].set_xlabel("")
ax[1].set_xlabel("Years")
ax[1].set_ylabel("Count")

# Number and share of movies released outside the [old_tresh, young_tresh] window.
old_tresh = 1915
young_tresh = 2011
n_movies = len(movies)

old = len(movies.loc[release_years < old_tresh])
old_ratio = (old/n_movies)*100
young = len(movies.loc[release_years > young_tresh])
young_ratio = (young/n_movies)*100

print(f"There are {old} movies that released before {old_tresh} ({old_ratio:.2f}% of total movies)\n"
      f"There are {young} movies that released after {young_tresh} ({young_ratio:.2f}% of total movies)")

Here we can see the distribution of films over the years. It is skewed towards more recent years. Only a few films were released before 1915 or after 2011: 1.69% and 2% respectively.

## How are presence and age evolving year by year?
The following analysis will be conducted using the dataset that was created by merging information coming from the CMU dataset, IMDB and Wikidata.


### Presence of women as actors

In [None]:
# Split the character rows by the recorded actor gender.
gender_col = characters["combined_gender"]
male_char = characters.loc[gender_col == "male"]
female_char = characters.loc[gender_col == "female"]
# Known gender that is neither "male" nor "female".
non_bin = characters.loc[gender_col.notna() & (gender_col != "female") & (gender_col != "male")]
remaining = characters.loc[gender_col.isnull()]

# Distinct actor names in each group.
unique_male, unique_female, unique_non_bin, unique_remaining = (
    subset["actor_name"].nunique()
    for subset in (male_char, female_char, non_bin, remaining)
)

print(f"There are {unique_male} male actors in our dataset for a total of {len(male_char)} appearances = an average of {len(male_char)/unique_male:.2f} movies per man \n"
      f"There are {unique_female} female actors in our dataset for a total of {len(female_char)} appearances = an average of {len(female_char)/unique_female:.2f} movies per woman\n"
      f"There are {unique_non_bin} non-binary people in our dataset for a total of {len(non_bin)} appearances\n"
      f"We are missing the gender of {unique_remaining} actors gender infos for a total of {len(remaining)} appearances = an average of {len(remaining)/unique_remaining:.2f} movies per person")

For this first analysis we are going to leave non-binary people out, as they represent a small fraction of our dataset. In order to complete the dataset even more, we could try to infer the gender of actors based on their name, by using the gender_guesser package for example. This would of course yield some incorrect results for some of the actors, but overall the accuracy should be good enough. These non-gendered actors seem to have on average fewer films per actor (1.18); we think it's because they are less important actors, and that is also why we did not find their gender in our databases.

In addition, we see that the numbers of female and male actors are very different. We have 2 hypotheses to explain this difference:
* There are actually fewer women among the actors of the film industry
* Actresses occupy less important roles, so they might not get credited enough and end up not being present in the different datasets we used.

Let's see how this unbalance evolved year by year.

In [None]:
_, ax = plt.subplots(1, 2, figsize=(10, 5))

# Keep only the appearances whose actor gender is "female" or "male".
df = characters.loc[characters["combined_gender"].isin(["female", "male"])]
ax1 = sns.histplot(data=df, stat="count", multiple="stack", x="combined_release_year", hue="combined_gender", ax=ax[0])

def find_ratio(df):
    """Return the fraction of rows in `df` whose `combined_gender` is "male"."""
    n_male = len(df.loc[df["combined_gender"] == "male"])
    return n_male/len(df)
male_v_female_repres = df.groupby("combined_release_year").apply(find_ratio)

# 7-year rolling mean of the yearly male share.
ax2 = sns.lineplot(male_v_female_repres.rolling(7).mean(), ax=ax[1])
ax2.set_ylim([0, 1])

ax1.set_title('Repartition of female and male appearance in movies per year')
ax1.set_xlabel('Years')
ax1.set_ylabel('Number of actors')

ax2.set_title('Percentage of male representation per year')
ax2.set_xlabel('Years')
ax2.set_ylabel('Percentage of male actors')
plt.show()

In [None]:
# Overall share of male appearances, as a percentage.
n_male_rows = len(df.loc[df["combined_gender"] == "male"])
val = (n_male_rows/len(df))*100
print(f"The average percentage of men across all the years in the total number of actors is {val:.2f}%")

We can see from the graph that the percentage of men vs women is roughly the same across the years, except in the most recent and oldest films, where we have quite a bit of noise due to the lack of data. It seems that, independently of the release year of the film, there are always more male actors present, or at least cited: this value stays in the neighborhood of the 66% average without any big jumps, except at both ends of the spectrum, which is again due to the lack of data.

### Actresses' age over the years

First, a bit about the completeness of our data.

In [None]:
# Split the male/female appearances by whether the actor's birth is known.
has_birth = df["combined_birth"].notna()
df_missing_births = df.loc[~has_birth]

# Age at release = release year - birth, computed column-wise rather than with
# a Python-level `apply(axis=1)`, which is far slower and returns an oddly
# shaped result when the frame is empty.
df_births = df.loc[has_birth].assign(
    age=lambda births: births["combined_release_year"] - births["combined_birth"]
)

unique_birth = df_births["actor_name"].nunique()

print(f"We have {unique_birth} actors who have a male/female gender and an age\n"
      f"this represemts {len(df_births)} entries\n"
      f"we are missing the birthdate of {len(df_missing_births)} male/female actors")

We remove the actors whose age is not strictly between 0 and 100.

In [None]:
# Keep plausible ages (strictly between 0 and 100) ...
plausible_age = (df_births["age"] > 0) & (df_births["age"] < 100)
# ... and drop the movies before 1910 and from 2020 on, since we do not have enough data in those years.
well_covered_year = (df_births["combined_release_year"] > 1909) & (df_births["combined_release_year"] < 2020)
df_births = df_births.loc[plausible_age & well_covered_year]

Let's look at the overall age difference

In [None]:
# Display the filtered frame (the bare last expression renders it).
df_births

In [None]:
# Label every row with its decade, e.g. 1994 -> "1990 - 1999".
# Computed for all rows at once: the previous `range(min, max)` loop stopped
# one year short, so rows from the latest release year never got a label.
# `assign` also avoids writing into a slice of another frame.
decade_start = (df_births["combined_release_year"] // 10 * 10).astype(int)
df_births = df_births.assign(
    group_year=decade_start.astype(str) + " - " + (decade_start + 9).astype(str)
)

In [None]:
# Mean age per decade and gender (bars are ordered by decade label).
births_by_decade = df_births.sort_values(by="group_year")
ax = sns.barplot(data=births_by_decade, x="group_year", y="age", hue="combined_gender")
plt.xticks(rotation=90)
plt.title("Men vs women age in movies by decade")
ax.set(xlabel="Decade", ylabel="Average age")
plt.show()

In [None]:
# Mean age per decade for each gender, and the male-minus-female gap.
is_male = df_births["combined_gender"] == "male"
is_female = df_births["combined_gender"] == "female"
avg_men_age = df_births.loc[is_male].groupby("group_year")["age"].mean()
avg_women_age = df_births.loc[is_female].groupby("group_year")["age"].mean()
# Decades missing for either gender yield NaN and are dropped.
avg_diff = (avg_men_age - avg_women_age).dropna()

In [None]:
# Turn the Series into a frame with `group_year` as a regular column for plotting.
# NOTE(review): not idempotent; re-running this cell adds a further index column.
avg_diff = pd.DataFrame(avg_diff).reset_index()

In [None]:
# Male-minus-female mean age gap for each decade.
ax = sns.barplot(data=avg_diff, x="group_year", y="age")
plt.xticks(rotation=90)
plt.title("Average age difference between men and women actors")
ax.set(xlabel="Decade", ylabel="Average age difference")
plt.show()

In [None]:
from scipy.stats import ttest_ind, ttest_rel

In [None]:
# Ages of all female and male appearances.
women_age = df_births.loc[df_births["combined_gender"] == "female", "age"]
men_age = df_births.loc[df_births["combined_gender"] == "male", "age"]

# Independent two-sample t-test on the two age samples.
print(ttest_ind(women_age, men_age))

mean_f, median_f = women_age.mean(), women_age.median()
mean_m, median_m = men_age.mean(), men_age.median()

diff_mean = mean_m - mean_f
diff_median = median_m - median_f

print(f"The average woman age on screen is {mean_f:.2f} and the average men age is {mean_m:.2f}, for a difference of {diff_mean:.2f} years")

Based on the results of the t-test, and the p-value being 0, we can say with great confidence that men and women definitely do not have the same age on screen.

In [None]:
# Age distribution of the appearances, one box per gender.
ax = sns.boxplot(data=df_births, x="age", y="combined_gender")
plt.title("Age comparison between men and women actors")
ax.set(xlabel="Age", ylabel="Gender")
plt.show()

From the plot above, we can see that there is an overall difference between male and female actors' age. The median age is different and the female age is skewed towards younger ages (other visualisation in following plots).

In [None]:
# Per-gender subsets of the age table.
births_male = df_births.loc[df_births["combined_gender"] == "male"]
births_female = df_births.loc[df_births["combined_gender"] == "female"]

In [None]:
# One hexbin joint plot (release year vs. age) per gender; the female plot is
# drawn in pink, the male plot keeps the default colour.
jointplot_specs = (
    (births_male, {}, "Jointplot of male actors age and distribution over time", 'Male actor age'),
    (births_female, {"color": "pink"}, "Jointplot of female actors age and distribution over time", 'Female actor age'),
)
for subset, colour_kwargs, figure_title, age_label in jointplot_specs:
    p = sns.jointplot(data=subset, x="combined_release_year", y="age", kind="hex", **colour_kwargs)
    p.fig.suptitle(figure_title)
    p.fig.tight_layout()
    # Leave room above the axes for the figure title.
    p.fig.subplots_adjust(top=0.95)

    plt.xlabel('Movie release year')
    plt.ylabel(age_label)

    plt.show()

The distribution of women's ages seems to be more skewed towards younger ages than men's. Men's ages are more spread out.

Let's see how this difference evolves over time.

In [None]:
# Mean age per release year for each gender, and the male-minus-female gap.
avg_men_age = births_male["age"].groupby(births_male["combined_release_year"]).mean()
avg_women_age = births_female["age"].groupby(births_female["combined_release_year"]).mean()
# Years missing for either gender yield NaN and are dropped.
avg_diff = (avg_men_age - avg_women_age).dropna()

In [None]:
_, ax = plt.subplots(1, 2, figsize=(10, 5))

# Left: yearly mean age per gender. Right: 5-year rolling mean of the gap.
ax1 = sns.lineplot(avg_men_age, label="Men average age", ax=ax[0])
ax2 = sns.lineplot(avg_women_age, label="Women average age", ax=ax[0])
ax3 = sns.lineplot(avg_diff.rolling(5).mean(), ax=ax[1])

ax2.set(
    ylim=[0, 80],
    title="Yearly averaged age for men and women",
    xlabel="Movie release Year",
    ylabel="Age",
)
ax3.set(
    ylim=[0, 20],
    title="Average age difference between men and women",
    xlabel="Movie release Year",
    ylabel="Age difference",
)

plt.show()

Since the noise is substantial, we cut the extreme years for better visualisation.

In [None]:
# Restrict the yearly averages to 1920-2010 (inclusive), where data is dense.
# The bounds are not called `min`/`max`: those names would shadow the Python
# builtins for every cell executed afterwards.
year_min = 1920
year_max = 2010
kept_years = np.arange(year_min, year_max + 1)
avg_men_age = avg_men_age.loc[avg_men_age.index.isin(kept_years)]
avg_women_age = avg_women_age.loc[avg_women_age.index.isin(kept_years)]
avg_diff = (avg_men_age - avg_women_age).dropna()

In [None]:
_, (age_axis, diff_axis) = plt.subplots(1, 2, figsize=(10, 5))
ax = (age_axis, diff_axis)

# Same two panels as before, now on the trimmed year range.
ax1 = sns.lineplot(avg_men_age, ax=age_axis, label="Men average age")
ax2 = sns.lineplot(avg_women_age, ax=age_axis, label="Women average age")
ax3 = sns.lineplot(avg_diff.rolling(5).mean(), ax=diff_axis)

age_axis.set_ylim([0, 80])
age_axis.set_title("Yearly averaged age for men and women")
age_axis.set_xlabel("Movie release Year")
age_axis.set_ylabel("Age")

diff_axis.set_ylim([0, 20])
diff_axis.set_title("Average age difference between men and women")
diff_axis.set_xlabel("Movie release Year")
diff_axis.set_ylabel("Age difference")

plt.show()

Male actors seem to be older overall than female actors, but it looks like the difference gets a bit smaller when coming closer to the 2000s.

## Actresses and genre

In [None]:
# Attach the movie-level columns to every character row (characters without a
# matching movie are kept).
merged_character = characters.merge(
    movies,
    how='left',
    left_on='movie_freebase_id',
    right_on='freebase_id_cmu',
)

In [None]:
# For each gender, the share of its (appearance, genre) pairs falling in each
# genre. A movie with several comma-separated genres counts once per genre;
# appearances without a genre stay in the denominator, as before.
genre_shares = []
for gender_label in ('male', 'female'):
    genres = (
        merged_character.loc[merged_character.combined_gender == gender_label]
        .genres_imdb.str.split(',')
        .explode()
    )
    share = (
        (genres.value_counts() / len(genres))
        .rename('genre')
        # Name the genre column explicitly: the name `reset_index` gives it
        # by default differs between pandas versions ('index' before 2.0).
        .rename_axis('index')
        .reset_index()
    )
    share['gender'] = gender_label
    genre_shares.append(share)

df = pd.concat(genre_shares, ignore_index=True)

fig, ax = plt.subplots(figsize=(10,10))

ax = sns.barplot(data=df, x='index', y='genre', hue='gender')
plt.xticks(rotation=90)
ax.set_title("Male vs female presence in movie genre")
ax.set_xlabel('Genre')
ax.set_ylabel('Percent of total gender in that genre')
plt.show()

This graph shows that females appear more in Drama, Comedy and Romance compared to men, who appear more in Action, Crime and Adventure.

## Are females equally represented in leading and minor roles?

In order to find the presence of women in major/minor roles we are considering their appearance in the credits of a film. The order of the credits can be considered as importance in the film, as first cast members will be the main actors, while following are less important. This order can be found on the "Full Cast & Crew" page of nearly all movies present on IMDB.

In [None]:
# Number of movies with a non-missing IMDb id.
n_movies_with_imdb_id = movies["IMDB_id"].notna().sum()
print(f"We have the imdb id of {n_movies_with_imdb_id} movies")

We will get the index of the first woman cited in the credits, and see how this evolves over the years.

In [None]:
# Name-based gender detector, created once at module level and read as a
# global by get_gender_list instead of being rebuilt on every call.
d = gender.Detector() #we initialize it outside the multiprocessing to gain time

In [None]:
def find_first_woman_index(gender_list, gender = "female"):
    """Return the position of the first entry guessed as `gender`.

    Both the exact label (e.g. "female") and its "mostly_" variant
    (e.g. "mostly_female") count as a match.

    Args:
        gender_list: Sequence of gender labels, in credit order.
        gender: Label to look for; defaults to "female".

    Returns:
        The 0-based index of the first match, or NaN when there is none.
    """
    accepted = (gender, "mostly_" + gender)
    # A plain scan replaces the two `list.index` calls wrapped in bare
    # `except:` clauses, which also hid unrelated errors.
    return next(
        (position for position, guess in enumerate(gender_list) if guess in accepted),
        np.nan,
    )

In [None]:
def get_gender_list(imdb_id, gender = "female", timeout = 30):
    """Scrape a movie's IMDb full-credits page and rank the first `gender` credit.

    The first word of every cast picture's alt text is passed to the
    module-level detector `d`; the resulting labels are in the order of the
    cast table.

    Args:
        imdb_id: IMDb title id used to build the page URL.
        gender: Label to look for ("female" by default).
        timeout: Seconds to wait for IMDb before `requests` raises, so a
            stalled connection cannot block the scraping run indefinitely.

    Returns:
        `[imdb_id, rank]`, where rank is the 1-based position of the first
        matching credit (NaN when there is none), or `[imdb_id, "no-cast"]`
        when the page has no cast table.
    """
    r = requests.get(f"https://www.imdb.com/title/{imdb_id}/fullcredits?ref_=tt_ov_st_sm", timeout=timeout)
    soup = bs(r.text, 'html.parser')
    table = soup.find('table', class_='cast_list')
    if table is None:
        return [imdb_id, "no-cast"] #if the page does not exist

    # Raw string: `\w` is not a valid escape in a normal string literal.
    first_name = r'img alt="[\w]+'
    m = re.findall(first_name, table.decode())
    gender_list = [d.get_gender(s.replace('img alt="', "")) for s in m] #we find the gender of the person using a gender detector package
    return [imdb_id, find_first_woman_index(gender_list, gender)+1]

gender_list gives us the gender of the actors, and the order of the list is the order they appear in the movie's credits.


As this takes multiple hours, we are going to showcase a result for the first 1000 imdb ids. The multiprocessing is run in a separate Python file, as multiprocessing on Windows doesn't work well inside notebooks.

In [None]:
# First 1000 non-missing IMDb ids, used to showcase the credit scraping.
imdb_id = movies.IMDB_id.dropna().tolist()[:1000]
total_len = len(imdb_id)

#The multiprocessing was run in src/mutliproc.py

In [None]:
# Load the scraped first-credit ranks (one pickle per gender); the context
# managers make sure the files are closed again.
with open(pickle_folder+"women_appearance.p","rb") as women_file:
    women_credit_rank = pd.DataFrame.from_records(pickle.load(women_file)).rename({0:"IMDB_id",1:"women_credit_rank"}, axis=1)
with open(pickle_folder+"men_appearance.p","rb") as men_file:
    men_credit_rank = pd.DataFrame.from_records(pickle.load(men_file)).rename({0:"IMDB_id",1:"men_credit_rank"}, axis=1)

# Attach both ranks to the movies and keep those with a usable male rank.
merged = movies.merge(women_credit_rank, on="IMDB_id").merge(men_credit_rank, on="IMDB_id")
merged = merged.loc[(merged["men_credit_rank"].notna()) & (merged["men_credit_rank"] != "no-cast")].copy()

# Decade label, e.g. 1994 -> "1990 - 1999"; rows without a release year keep NaN.
# Computed for all rows at once: the previous `range(min, max)` loop left the
# latest release year unlabelled and crashed when no year was available.
decade_start = (merged["combined_release_year"].dropna() // 10 * 10).astype(int)
merged["group_year"] = decade_start.astype(str) + " - " + (decade_start + 9).astype(str)

In [None]:
# Display the scraped female first-credit ranks.
women_credit_rank

In [None]:
# Long format: one row per (movie, gender) holding that gender's first-credit rank.
df = pd.DataFrame({
    'appearance': merged.women_credit_rank,
    'gender': 'Female',
    'group_year': merged["group_year"],
})
df2 = pd.DataFrame({
    'appearance': merged.men_credit_rank,
    'gender': 'Male',
    'group_year': merged["group_year"],
})
df_final = pd.concat([df, df2])

In [None]:
# Mean first-credit rank per decade and gender.
ranks_by_decade = df_final.sort_values(by="group_year")
ax = sns.barplot(data=ranks_by_decade, x="group_year", y="appearance", hue="gender")
plt.xticks(rotation=90)
plt.title("Men vs women position of first appearance in credits of movie, averaged over decades")
ax.set(xlabel="Decade", ylabel="Average first appearance index in credits")
plt.show()

We can see on this plot that on average women are cited later in the credits compared to their male counterparts, meaning that they have less main roles.

In [None]:
# Reduce each frame to its rank column, keeping only usable ranks
# (neither missing nor the "no-cast" marker).
men_has_rank = men_credit_rank["men_credit_rank"].notna() & (men_credit_rank["men_credit_rank"] != "no-cast")
men_credit_rank = men_credit_rank.loc[men_has_rank, "men_credit_rank"]
women_has_rank = women_credit_rank["women_credit_rank"].notna() & (women_credit_rank["women_credit_rank"] != "no-cast")
women_credit_rank = women_credit_rank.loc[women_has_rank, "women_credit_rank"]

# NEW ANALYSIS -----------------------------------------

Combining all the above analyses to create two logistic regressions that say how likely the film is to be action or romance based only on actor gender data (percentage of female actors, percentage of male actors, age of female actors, age of male actors, appearance of men in the credits, appearance of women in the credits, male lead (1st in the credits), female lead).

## Adding scraped data about actor gender ranking in movies credits

Newly scraped data with all movies with imdB id

In [None]:
# how do we deal with movies that have no women/no men ?
# Load the re-scraped ranks (both genders in one pickle); the context manager
# closes the file handle again.
with open(pickle_folder+"women_appearance.p","rb") as credit_file:
    credit_rank = pd.DataFrame.from_records(pickle.load(credit_file)).rename({0:"IMDB_id",1:"women_credit_rank",2:"men_credit_rank"}, axis=1)
credit_rank = credit_rank.loc[(credit_rank["women_credit_rank"] != "no-cast") & (credit_rank["women_credit_rank"].notna()) & (credit_rank["men_credit_rank"].notna()) & (credit_rank["men_credit_rank"] != "no-cast")] #we remove the Nan and the no cast movies

We discard the movies where neither or both genders have rank 1 (this means that the gender detector didn't do its job properly).

In [None]:
def f(women_credit_rank, men_credit_rank):
    """Return which gender holds the first credit of a movie.

    Args:
        women_credit_rank: 1-based rank of the first female credit.
        men_credit_rank: 1-based rank of the first male credit.

    Returns:
        "female" or "male" when exactly one of the ranks is 1; NaN when both
        or neither are 1, so that the row can be dropped later.
    """
    women_first = women_credit_rank == 1
    men_first = men_credit_rank == 1
    if women_first == men_first: #issue at least on of them should be 1 but not both
        # `np.nan` rather than the `np.NaN` alias, which NumPy 2.0 removed.
        return np.nan
    return "female" if women_first else "male"

# Gender of the first-credited actor for every movie (NaN when ambiguous).
credit_rank["lead"] = credit_rank[["women_credit_rank", "men_credit_rank"]].apply(lambda ranks: f(*ranks), axis = 1)

In [None]:
# Drop the movies whose lead gender could not be determined.
credit_rank = credit_rank.loc[credit_rank["lead"].notna()]

In [None]:
def is_lead(lead):
    """Return 1 when `lead` equals "female", and 0 for anything else."""
    return 1 if lead == "female" else 0
# 0/1 indicator column: is the first-credited actor a woman?
credit_rank["is_woman_lead"] = credit_rank["lead"].map(is_lead)

In [None]:
# Cast BOTH rank columns to float. Only the women's rank was cast before, so
# the men's rank stayed object-typed and was later mixed with floats when the
# two are stacked into one `appearance` column.
credit_rank = credit_rank.astype({"women_credit_rank": float, "men_credit_rank": float})

In [None]:
# Movies with credit ranks, restricted to release years 1931-2019.
merged = movies.merge(credit_rank, on="IMDB_id")
has_male_rank = merged["men_credit_rank"].notna() & (merged["men_credit_rank"] != "no-cast")
in_period = (merged["combined_release_year"] > 1930) & (merged["combined_release_year"] < 2020)
merged = merged.loc[has_male_rank & in_period]
# Decade label such as "1990s".
merged = merged.assign(
    group_year=((merged["combined_release_year"]//10)*10).astype(int).astype(str)+"s"
)

In [None]:
# Long format: one row per (movie, gender) holding that gender's first-credit rank.
rank_columns = {'Female': 'women_credit_rank', 'Male': 'men_credit_rank'}
long_frames = {
    label: pd.DataFrame({
        'appearance': merged[rank_column],
        'gender': label,
        'group_year': merged["group_year"],
    })
    for label, rank_column in rank_columns.items()
}
df = long_frames['Female']
df2 = long_frames['Male']
df_final = pd.concat([df, df2])

In [None]:
# Mean first-credit rank per decade and gender, on the full scraped data.
ax = sns.barplot(
    data=df_final.sort_values(by="group_year"),
    x="group_year",
    y="appearance",
    hue="gender",
)
ax.set_xlabel("Decade")
ax.set_ylabel("Average first appearance index in credits")
plt.title("Men vs women position of first appearance in credits of movie, averaged over decades")
plt.xticks(rotation=90)
plt.show()

In [None]:
# merged.genres_imdb = merged.genres_imdb.str.split(',') # separate all movies os each line has a genre, but each movie is still represented in all his genres
# merged = merged.explode("genres_imdb")

In [None]:
from scipy.stats import bootstrap

In [None]:
genders = ["Female","Male"]
decades = ["1930s","1940s","1950s","1960s","1970s","1980s","1990s","2000s","2010s"]

# Bootstrap confidence interval of the mean first-credit rank for every gender
# and decade, stored as distances below/above the mean (the format plotly
# expects for asymmetric error bars).
# - One loop over both genders replaces two copy-pasted loops.
# - The loop variable is `decade`, not `d`: `d` is the name of the gender
#   detector created earlier in the notebook and must not be overwritten.
# - A seeded generator makes the intervals reproducible across runs.
bootstrap_rng = np.random.default_rng(0)
error_bars = {}
for g in genders:
    lower, upper = [], []
    for decade in decades:
        data = df_final.loc[(df_final["gender"] == g) & (df_final["group_year"] == decade)]["appearance"].values.tolist()
        moyenne = np.mean(data)
        ci_l, ci_u = bootstrap((data,), np.mean, random_state=bootstrap_rng).confidence_interval
        lower.append(moyenne-ci_l)
        upper.append(ci_u-moyenne)
    error_bars[g] = (lower, upper)

lower_f, upper_f = error_bars["Female"]
lower_m, upper_m = error_bars["Male"]


In [None]:
fig = go.Figure()

# One trace per gender: mean first-credit rank per decade with bootstrap error bars.
for gender_label, upper, lower in (("Male", upper_m, lower_m), ("Female", upper_f, lower_f)):
    gender_rows = df_final.loc[df_final["gender"] == gender_label]
    # Average only the numeric `appearance` column (a frame-wide mean also hits
    # the string `gender` column, which recent pandas versions reject), and
    # reindex on `decades` so every value lines up with its x label.
    mean_rank = (
        pd.to_numeric(gender_rows["appearance"])
        .groupby(gender_rows["group_year"])
        .mean()
        .reindex(decades)
    )
    fig.add_trace(go.Scatter(y=mean_rank.tolist(), x=decades, name=gender_label,
                            error_y=dict(type='data', array=upper, arrayminus=lower)))

fig.update_layout(title_text='Average first appearance in the credits by gender by decade')
# y axis limited to ranks between 1 and 3.5
fig.update_yaxes(range=[1, 3.5])
# labels
fig.update_xaxes(title_text='Decade')
fig.update_yaxes(title_text='Average first appearnce in the credits')

# title of legend
fig.update_layout(legend_title_text='Gender')


fig.show()


# save fig to html
fig.write_html('docs/_includes/first_appearance_per_decade_per_gender.html')

In [None]:
# Table of the mean first-credit rank for every (gender, decade) pair.
df_final.groupby(["gender","group_year"]).mean().reset_index()

In [None]:
# Same means as a line chart, one coloured line per gender.
mean_rank_by_gender_decade = df_final.groupby(["gender","group_year"]).mean().reset_index()
gender_colours = {"Female": '#EF553B', "Male":'#636EFA'}
fig = px.line(
    mean_rank_by_gender_decade,
    x="group_year",
    y="appearance",
    color="gender",
    color_discrete_map=gender_colours,
    markers=True,
)
fig.update_traces(textposition="bottom right")
fig.show()

# Percent of female actors

In [None]:
# Character rows enriched with title, release year and metric of their movie.
character_columns = ['movie_wiki_id', 'actor_name', 'combined_birth', 'combined_gender']
movie_columns = ['wikipedia_id', 'title', 'combined_release_year',  'metric']
characters_movies = characters[character_columns].merge(
    movies[movie_columns],
    left_on='movie_wiki_id',
    right_on='wikipedia_id',
    how='left',
)
# Actor age in the release year.
characters_movies = characters_movies.assign(
    age=characters_movies["combined_release_year"]-characters_movies["combined_birth"]
)

### Find the percentage of male and female actors in each movie

In [None]:
# Share of each gender among the gendered cast of every title.
# The Series is named "percent" explicitly: the name `value_counts` gives it
# changed in pandas 2.0, so renaming the old default column name silently did
# nothing there and the percentage columns below were never created.
# NOTE(review): grouping by title pools different movies sharing a title.
new = (
    characters_movies.groupby("title")["combined_gender"]
    .value_counts(normalize=True)
    .rename("percent")
    .reset_index()
)
newF = new.loc[new["combined_gender"] == "female"].drop(columns=["combined_gender"]).rename(columns={"percent":"f_actor_percentage"})
newM = new.loc[new["combined_gender"] == "male"].drop(columns=["combined_gender"]).rename(columns={"percent":"m_actor_percentage"})
# Inner merge: only titles with at least one woman AND one man are kept.
actor_gender_percentage = pd.merge(newF, newM, on="title")
#actor_gender_percentage.head()

In [None]:
# Movie-level table: metric, genres, year and cast gender shares, without
# missing values and restricted to release years 1931-2019.
stats_columns = ["title","metric","genres_imdb","combined_release_year","IMDB_id"]
gender_stats = movies[stats_columns].merge(actor_gender_percentage, on="title", how="inner").dropna()
in_period = (gender_stats["combined_release_year"] > 1930) & (gender_stats["combined_release_year"] < 2020)
gender_stats = gender_stats.loc[in_period]
#gender_stats.head()

## Evolution of the metric over the centuries

In [None]:
# Century label such as "1900s".
century_start = (gender_stats["combined_release_year"]//100)*100
gender_stats["century"] = century_start.astype(int).astype(str)+"s"
# 10-point bucket of the female cast share, e.g. "30.0-39.0%".
bucket_low = (gender_stats["f_actor_percentage"]*100//10)*10
gender_stats["percent_share"] = bucket_low.apply(str)+"-"+(bucket_low+9).apply(str)+"%"

In [114]:
# Number of movies in every female-share bucket, buckets sorted by label.
fig = px.histogram(gender_stats, x="percent_share").update_xaxes(categoryorder='category ascending')
fig.show()

In [None]:
# Metric distribution per female-share bucket, one box colour per century.
fig = px.box(gender_stats, x="percent_share", y="metric", points = False, color = "century")

# Mean metric per (bucket, century). Only `metric` is averaged: a frame-wide
# `mean()` also hits the string columns, which recent pandas versions reject.
data = gender_stats.groupby(["percent_share","century"])["metric"].mean().reset_index()
for century in ("2000s", "1900s"):
    century_means = data.loc[data["century"] == century]
    fig.add_trace(go.Scatter(x=century_means["percent_share"], y=century_means["metric"], name=f"mean metric {century}"))
fig.update_xaxes(categoryorder='category ascending')
fig.show()

## Evolution of the metric over the genres

In [None]:
# Split the comma-separated genres and give every genre its own row, so each
# movie is represented once in each of its genres.
gender_stats = gender_stats.assign(
    genres_imdb=gender_stats["genres_imdb"].str.split(',')
).explode("genres_imdb")

In [None]:
# Genres drawn in the per-genre figure.
top_10 = [
    'Drama', 'Comedy', 'Romance', 'Action', 'Sci-Fi',
    'Adventure', 'Crime', 'Thriller', 'Horror', 'Family',
    'Mystery', 'Fantasy', 'Animation',
]

# Genres shown by default; the others are hidden until picked in the legend.
special_genres = ['Action', 'Adventure', 'Comedy', 'Drama', 'Romance', 'Sci-Fi']

In [None]:
# Mean of the numeric columns per (genre, female-share bucket).
# `numeric_only=True` keeps the old behaviour of skipping the string columns;
# without it recent pandas versions raise on them.
data = gender_stats.groupby(["genres_imdb","percent_share"]).mean(numeric_only=True).reset_index()
data

In [None]:
# show the percentage of female characters per decade for each genre in a line plot using plotly go
# when hovering over a line, show the percentage - genre name
import plotly.graph_objects as go

fig = go.Figure()

for genre in top_10:
    temp_data = data.loc[data["genres_imdb"] == genre]
    fig.add_trace(go.Scatter(x=temp_data["percent_share"], y=temp_data["metric"], name=genre, hovertemplate='%{y:.2f} - ' + genre, visible=True if genre in special_genres else 'legendonly') )


# plotly.offline.iplot(dict_of_fig, filename='basic-bar')

# # export to html
# plotly.offline.plot(dict_of_fig, filename='../docs/_includes/directors_percentage.html')

# add title
fig.update_layout(title_text='Average movie score by percent female by genre')
# axis between 0 and 1
#fig.update_yaxes(range=[-1, 1])
# labels 
fig.update_xaxes(title_text='Percent of females in the actors')
fig.update_yaxes(title_text='Average movie metric')

# title of legend
fig.update_layout(legend_title_text='Genre')

fig.show()


# save fig to html
# fig.write_html('../docs/_includes/percentage_per_genre_per_decade.html')

In [None]:
# OLS trendline of the metric against the female cast share, per century.
fig = px.scatter(gender_stats, x="f_actor_percentage", y="metric", color="century", trendline="ols")
# Keep only the trendline traces and drop the scatter points.
trendline_traces = [trace for trace in fig.data if trace.mode == "lines"]
fig.data = trendline_traces
fig.update_traces(showlegend=True)
fig.show()

## Regression

In [None]:
# Combine the cast gender shares with the credit-rank features of every movie.
master_gender = gender_stats.merge(credit_rank, on="IMDB_id")

In [None]:
# `master_gender` is built from the genre-exploded `gender_stats`, so it holds
# one row per (movie, genre). The formula uses no genre term, so the repeated
# rows would only duplicate observations and understate the standard errors.
# Fit on one row per movie instead.
# NOTE(review): assumes IMDB_id identifies a movie uniquely; confirm.
regression_data = master_gender.drop_duplicates(subset="IMDB_id")
# Declares the model
mod = smf.ols(formula='metric ~ f_actor_percentage + women_credit_rank + is_woman_lead', data=regression_data)
# Fits the model (find the optimal coefficients; the seed is kept from the original cell)
np.random.seed(2)
res = mod.fit()
# Print the summary output provided by the library.
print(res.summary())

# Actor impact score average male vs female

In [None]:
# Actor metric of male/female appearances released 1931-2019, with a decade
# label such as "1990s"; rows with any missing value are dropped.
is_binary_gender = (characters["combined_gender"] == "male") | (characters["combined_gender"] == "female")
in_period = (characters["combined_release_year"] > 1930) & (characters["combined_release_year"] < 2020)
temp = characters.loc[is_binary_gender & in_period, ["actor_metric","combined_release_year","combined_gender"]]
temp = temp.assign(
    decade=((temp["combined_release_year"]//10)*10).astype(int).astype(str)+"s"
).dropna()

In [None]:
genders = ["female","male"]
decades = ["1930s","1940s","1950s","1960s","1970s","1980s","1990s","2000s","2010s"]

# Bootstrap confidence interval of the mean actor metric for every gender and
# decade, stored as distances below/above the mean (asymmetric error bars).
# - One loop over both genders replaces two copy-pasted loops.
# - The loop variable is `decade`, not `d`: `d` is the name of the gender
#   detector created earlier in the notebook and must not be overwritten.
# - A seeded generator makes the intervals reproducible across runs.
# - Batch sizes are the ones the original cell used per gender (presumably
#   chosen to bound memory; confirm).
batch_sizes = {"female": 5, "male": 1}
bootstrap_rng = np.random.default_rng(0)
error_bars = {}
for g in genders:
    lower, upper = [], []
    for decade in tqdm(decades):
        data = temp.loc[(temp["combined_gender"] == g) & (temp["decade"] == decade)]["actor_metric"].values.tolist()
        moyenne = np.mean(data)
        ci_l, ci_u = bootstrap((data,), np.mean, n_resamples=1000, batch=batch_sizes[g], random_state=bootstrap_rng).confidence_interval
        lower.append(moyenne-ci_l)
        upper.append(ci_u-moyenne)
    error_bars[g] = (lower, upper)

lower_f, upper_f = error_bars["female"]
lower_m, upper_m = error_bars["male"]



In [None]:
import plotly.graph_objects as go
fig = go.Figure()

# One trace per gender: mean actor metric per decade with bootstrap error bars.
for gender_label, upper, lower in (("male", upper_m, lower_m), ("female", upper_f, lower_f)):
    gender_rows = temp.loc[temp["combined_gender"] == gender_label]
    # Average only `actor_metric` (a frame-wide mean also hits the string
    # `combined_gender` column, which recent pandas versions reject), and
    # reindex on `decades` so every value lines up with its x label.
    mean_metric = gender_rows.groupby("decade")["actor_metric"].mean().reindex(decades)
    fig.add_trace(go.Scatter(y=mean_metric.tolist(), x=decades, name=gender_label,
                            error_y=dict(type='data', array=upper, arrayminus=lower)))

# Title and y label now describe the plotted quantity; they had been copied
# unchanged from the credits-rank figure.
fig.update_layout(title_text='Average actor impact score by gender by decade')
# labels
fig.update_xaxes(title_text='Decade')
fig.update_yaxes(title_text='Average actor impact score')

# title of legend
fig.update_layout(legend_title_text='Gender')


fig.show()


# save fig to html
fig.write_html('docs/_includes/actor_impact_score_per_gender.html')

Men actually play in more films than women.

In [None]:
# Average number of movie rows per actor name, computed separately for each gender.
# count() tallies non-null `movie_wiki_id` rows within each actor group.
avg_movies_by_gender = {
    gender: (
        characters_movies.loc[characters_movies["combined_gender"] == gender]
        .groupby("actor_name")["movie_wiki_id"]
        .count()
        .mean()
    )
    for gender in ("female", "male")
}
women_avg_movies_cnt = avg_movies_by_gender["female"]
men_avg_movies_cnt = avg_movies_by_gender["male"]

In [None]:
# Report the two per-gender averages computed in the previous cell.
print(f"women play on average in {women_avg_movies_cnt} movies and men play in average in {men_avg_movies_cnt} movies")

In [None]:
# Display the movies DataFrame for inspection.
movies

In [None]:
# Attach each character's movie genres (left join on the Wikipedia movie id), keep the
# rows whose release year is strictly between 1930 and 2020, drop rows with missing
# values and label every remaining row with its decade.
characters_movies = (
    characters[["actor_metric", "movie_wiki_id", "combined_release_year"]]
    .merge(
        movies[["genres_imdb", "wikipedia_id"]],
        left_on="movie_wiki_id",
        right_on="wikipedia_id",
        how="left",
    )
    .loc[:, ["actor_metric", "combined_release_year", "genres_imdb"]]
    .loc[lambda df: (df["combined_release_year"] > 1930) & (df["combined_release_year"] < 2020)]
    .dropna()
    .assign(decade=lambda df: ((df["combined_release_year"] // 10) * 10).astype(int).astype(str) + "s")
)

In [111]:
# Turn the comma-separated genre string into a list and explode it, so that each row
# carries a single genre (a movie is still represented once per genre it belongs to).
characters_movies["genres_imdb"] = characters_movies["genres_imdb"].str.split(",")
characters_movies = characters_movies.explode("genres_imdb")

# Column-wise means for every (decade, genre) pair, with the group keys back as columns.
decade_genre_groups = characters_movies.groupby(["decade", "genres_imdb"])
data = decade_genre_groups.mean().reset_index()

In [113]:
# Line plot of the average `actor_metric` per decade, one trace per genre in `top_10`.
# Hovering over a line shows "<value> - <genre>". Genres that are not in
# `special_genres` start hidden and can be toggled from the legend.
fig = go.Figure()

for genre in top_10:
    temp_data = data.loc[data["genres_imdb"] == genre]
    fig.add_trace(go.Scatter(
        x=temp_data["decade"],
        y=temp_data["actor_metric"],
        name=genre,
        hovertemplate='%{y:.2f} - ' + genre,
        visible=True if genre in special_genres else 'legendonly',
    ))

fig.update_layout(
    title_text='Average person impact score by genre by decade',
    legend_title_text='Genre',
)
fig.update_xaxes(title_text='Decade')
fig.update_yaxes(title_text='Average person impact score')

fig.show()

# save fig to html
# fig.write_html('../docs/_includes/percentage_per_genre_per_decade.html')

In [None]:
# Load the pickled actors table and plot every actor's `actor_metric`, coloured by gender.
# pd.read_pickle opens and closes the file itself, whereas pickle.load(open(...))
# never closed the file handle.
df_actors = pd.read_pickle("./src/actors_with_image.p")
fig = px.bar(df_actors, y='actor_metric', x='actor_name', color="combined_gender")
# Keep the bars in the row order of df_actors.
fig.update_layout(xaxis={'categoryorder': 'array', 'categoryarray': df_actors.actor_name})

fig.show()

# ADDITIONAL ANALYSIS ------------------- Classification of film genre based on gender variables

We want an `is_woman_lead` "boolean" (0 or 1) to use as a feature in the random forest (RF).

In [None]:
def is_lead(lead):
    """Return 1 when `lead` equals the string "female", otherwise 0."""
    return 1 if lead == "female" else 0
# Encode the `lead` column as a 0/1 indicator, one element at a time.
credit_rank["is_woman_lead"] = credit_rank["lead"].map(is_lead)

In [None]:
# Display credit_rank for inspection.
credit_rank

In [None]:
# Load the `gender_stats.pkl` pickle from the pickle folder.
# The context manager closes the file handle, which a bare open() call would leave open.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
with open(pickle_folder + 'gender_stats.pkl', 'rb') as gender_stats_file:
    master_gender = pickle.load(gender_stats_file)

In [None]:
# Display master_gender for inspection.
master_gender

In [None]:
# Inner-join the credit-rank columns onto the gender statistics by IMDb id.
# Re-running this cell merges credit_rank a second time, so run it once per load.
master_gender = master_gender.merge(credit_rank, on="IMDB_id")

In [None]:
# Split the comma-separated genre string into a list and explode it: every movie
# ends up with one row per genre it belongs to.
master_gender["genres_imdb"] = master_gender["genres_imdb"].str.split(",")
master_gender = master_gender.explode("genres_imdb")

# TODO: change genre to a number in [0, Ngenres]

In [None]:
# Drop the rows that have no genre.
master_gender = master_gender.dropna(subset=["genres_imdb"])

In [None]:
# Drop the rows whose genre is the literal two-character string \N
# (presumably the source's placeholder for a missing value — confirm).
master_gender = master_gender.loc[master_gender["genres_imdb"] != "\\N"]

We need to encode each film genre as a number for the random forest.

In [None]:
# Give every distinct genre an integer id, numbered in the order unique() returns them.
# unique() is evaluated once here instead of being recomputed on every loop iteration.
mapping = {genre: number for number, genre in enumerate(master_gender["genres_imdb"].unique())}

In [None]:
# Display the genre -> number mapping.
mapping

In [None]:
# Look up the integer id of each row's genre (None when the genre is not in `mapping`).
master_gender["genre_number"] = master_gender["genres_imdb"].map(mapping.get)

In [None]:
# Save the prepared frame as "master_gender.p", relative to the current working directory.
master_gender.to_pickle("master_gender.p")

## Creation of a random forest for classification of film genre based on gender-related data

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split

In [None]:
# Display master_gender for inspection before building the classifier.
master_gender

Create the RF

In [None]:
# Features: the gender-related columns of each row; target: the encoded genre number.
# NOTE(review): master_gender holds one row per (movie, genre) pair after the explode
# in an earlier cell, so the same feature values can appear with several different
# labels — confirm this is the intended problem set-up.
feature_columns = [
    "f_actor_percentage", "m_actor_percentage",
    "f_actor_mean_age", "m_actor_mean_age",
    "women_credit_rank", "men_credit_rank",
    "is_woman_lead",
]
X_train, X_test, y_train, y_test = train_test_split(
    master_gender[feature_columns], master_gender["genre_number"],
    test_size=0.3, random_state=42)
# Seed the forest as well as the split: without random_state the fitted trees, and
# therefore the reported scores, change on every run.
clf = RandomForestClassifier(random_state=42)
clf.fit(X=X_train, y=y_train)
y_test_pred = clf.predict(X_test)

Evaluate the RF

In [None]:
# Weighted-average precision and F1 of the test-set predictions.
precision, F1 = (
    scorer(y_test, y_test_pred, average="weighted")
    for scorer in (precision_score, f1_score)
)

print(f"precision of the classifier is {precision}, F1 score is {F1}")

The results are pretty bad.

## Let's try logistic regression on only Action and Romance movies

### First Action movies

We need to add two "boolean" (1 or 0) columns to the DataFrame: `is_action` and `is_romance`.

In [None]:
# Split the genre string into a list of genres.
# NOTE(review): genres_imdb was already split and exploded in an earlier cell, so each
# value is presumably a single genre here and this yields one-element lists — confirm.
master_gender.genres_imdb = master_gender.genres_imdb.str.split(',')

In [None]:
# Display master_gender to check the re-split genre column.
master_gender

In [None]:
def is_genre(genres, genre_trgt ="Action"):
    """Return 1 if `genre_trgt` is one of `genres`, otherwise 0.

    Parameters
    ----------
    genres : iterable of str
        The genres of one movie.
    genre_trgt : str, default "Action"
        The genre to look for.

    Returns
    -------
    int
        1 when any element of `genres` equals `genre_trgt`, else 0 (also for an
        empty iterable).
    """
    # Scan every genre: returning 0 from inside the loop would only ever test the
    # first element and miss a match in any later position.
    return 1 if any(genre == genre_trgt for genre in genres) else 0

In [None]:
# 0/1 indicator columns for the two genres of interest, computed per row from the genre list.
genre_lists = master_gender["genres_imdb"]
master_gender["is_action"] = genre_lists.map(is_genre)
master_gender["is_romance"] = genre_lists.map(lambda genres: is_genre(genres, genre_trgt="Romance"))

In [None]:
# Display master_gender to check the new is_action / is_romance columns.
master_gender

In [None]:
import statsmodels.api as sm

In [None]:
# Logistic regression of the `is_action` indicator on the gender-related features.
# statsmodels' array interface does not add an intercept on its own, so the constant
# column is added explicitly; without it the model is forced through the origin.
Xtrain = sm.add_constant(
    master_gender[["f_actor_percentage", "f_actor_mean_age", "m_actor_mean_age", "is_woman_lead"]]
)
ytrain = master_gender["is_action"]

log_reg = sm.Logit(ytrain, Xtrain).fit()

print(log_reg.summary())

In [None]:
# Logistic regression of the `is_romance` indicator on the gender-related features.
# statsmodels' array interface does not add an intercept on its own, so the constant
# column is added explicitly; without it the model is forced through the origin.
Xtrain = sm.add_constant(
    master_gender[["f_actor_percentage", "f_actor_mean_age", "m_actor_mean_age", "is_woman_lead"]]
)
ytrain = master_gender["is_romance"]

log_reg = sm.Logit(ytrain, Xtrain).fit()

print(log_reg.summary())

In [None]:
# One row per actor name (first occurrence kept), ordered by actor_metric from highest
# to lowest; keep the 100 top rows that have a metric.
actor_columns = ["actor_name", "combined_gender", "actor_metric", "wikidata_actor_id"]
df_actors = (
    characters.drop_duplicates(subset="actor_name")
    .sort_values("actor_metric", ascending=False)
    .loc[:, actor_columns]
    .dropna(subset="actor_metric")
    .head(100)
)

In [None]:
# Display the selected actors.
df_actors

In [None]:
import hashlib



In [None]:
def _commons_image_url(file_name):
    """Build the upload.wikimedia.org URL from a Commons file name."""
    file_name = file_name.replace(" ", "_")
    # The two directory levels of the URL are the first one and first two hex digits
    # of the MD5 of the underscored file name.
    digest = hashlib.md5(bytes(file_name, encoding='utf-8')).hexdigest()
    return "https://upload.wikimedia.org/wikipedia/commons/" + digest[0] + "/" + digest[:2] + "/" + file_name


def get_image_link(id):
    """Look up the image (property P18) of a Wikidata entity and return its URL.

    Parameters
    ----------
    id : str
        Wikidata entity id, inserted into the Special:EntityData URL.
        (The name shadows the builtin `id`; it is kept so existing callers still work.)

    Returns
    -------
    str
        The image URL when the entity has a P18 claim with a value.
    None
        When the entity has no usable P18 claim.
    list
        `[id, status_code]` when the HTTP request does not succeed (the status code
        is also printed).
    """
    # A timeout keeps a stalled connection from blocking the notebook forever.
    response = requests.get(f'https://www.wikidata.org/wiki/Special:EntityData/{id}.json', timeout=30)
    if not response.ok:
        print(response.status_code)
        return [id, response.status_code]

    entities = response.json()['entities']
    # Use the first entity key of the answer rather than `id` itself
    # (presumably the two can differ, e.g. for redirected ids — confirm).
    key = list(entities.keys())[0]
    image_claims = entities[key].get('claims', {}).get("P18")
    if not image_claims:
        # No image claim on this entity: report "no image" instead of raising KeyError.
        return None
    try:
        file_name = image_claims[0]['mainsnak']['datavalue']['value']
    except KeyError:
        # The claim exists but carries no value.
        return None
    return _commons_image_url(file_name)

In [None]:
# Resolve the image link of every actor from its Wikidata id (one HTTP request per row).
df_actors["img_link"] = df_actors["wikidata_actor_id"].map(get_image_link)

In [None]:
# Save the actors table (now including img_link) to disk, presumably so the image
# look-ups do not have to be repeated.
# NOTE(review): this writes "actors_with_image.p" relative to the working directory, while
# the plotting cells load "./src/actors_with_image.p" — confirm both point to the same file.
df_actors.to_pickle("actors_with_image.p")

In [None]:
# Load the pickled actors table and plot every actor's `actor_metric`, coloured by gender.
# pd.read_pickle opens and closes the file itself, whereas pickle.load(open(...))
# never closed the file handle.
df_actors = pd.read_pickle("./src/actors_with_image.p")
fig = px.bar(df_actors, y='actor_metric', x='actor_name', color="combined_gender")
# Keep the bars in the row order of df_actors.
fig.update_layout(xaxis={'categoryorder': 'array', 'categoryarray': df_actors.actor_name})

fig.show()