# This whole notebook will be data analysis, which most will be contributed to my written piece

In [249]:
%load_ext rpy2.ipython
%load_ext autoreload
%autoreload 2

%matplotlib inline  
from matplotlib import rcParams
rcParams['figure.figsize'] = (16, 100)

import warnings
from rpy2.rinterface import RRuntimeWarning
warnings.filterwarnings("ignore") # Ignore all warnings
# warnings.filterwarnings("ignore", category=RRuntimeWarning) # Show some warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [250]:
import pandas as pd

## import the female list I curated

In [251]:
df_w = pd.read_csv('females_list_bbc.csv')

### convert ratings, nominations, wins to numerics

In [252]:
df_w['rating'] = pd.to_numeric(df_w['rating'], errors='coerce')

In [253]:
df_w['wins'] = pd.to_numeric(df_w['wins'], errors='coerce')

In [254]:
df_w['nominations'] = pd.to_numeric(df_w['nominations'], errors='coerce')

### the average rating for films directed by females

In [255]:
df_w['rating'].median()

7.3

### the number of awards won on average

In [256]:
df_w['wins'].median()

12.0

### the number of nominations won on average

In [257]:
df_w['nominations'].median()

15.0

In [259]:
df_w['genres'] = df_w['genres'].str.strip("[]").str.replace("'", "").str.split(", ")


### separate genres

In [260]:
df_exploded = df_w.explode('genres')

In [261]:
genre_counts = df_exploded.groupby(['genres']).size().reset_index(name='Count')

### make a smaller dataframe that only contains genre statistics

In [262]:
genre_counts = genre_counts[genre_counts['Count'] > 0]

In [263]:
genre_counts

Unnamed: 0,genres,Count
0,Action,2
1,Action Epic,1
2,Adventure,4
3,Adventure Epic,1
4,Artificial Intelligence,1
5,Baseball,1
6,Biography,9
7,Body Horror,1
8,Body Swap Comedy,1
9,Comedy,17


In [264]:
genre_counts['genres'] = genre_counts['genres'].astype(str)  # Force string conversion


### put granular genres into bigger categories

In [265]:
parent_genres = {
    'Documentary': ['documentary', 'docu', 'biography'],
    'Comedy': ['comedy', 'humor', 'satire', 'dark comedy'],
    'Psychological Drama' : ['psychological drama'],
    'Horror': ['horror', 'terror', 'monster', 'slasher', 'ghost'],
    'Sports': ['sport', 'ball', 'baseball', 'athlete', 'fitness'],
    'Romance': ['romance', 'romantic', 'love', 'steamy'],
    'Thriller': ['thriller', 'suspense', 'crime'],
    'Coming-of-Age': ['coming-of-age', 'teen', 'youth', 'adolescent'],
    'Mystery': ['mystery', 'whodunit', 'enigma'],
    'Crime': ['crime', 'heist', 'gangster', 'mafia', 'noir'],
    'Sci-Fi': ['sci-fi', 'science fiction', 'space opera', 'cyberpunk', 'artificial', 'fantasy'],
    'Family': ['family'],
    'History': ['history'],
    'War' : ['war'],
    'Action': ['action', 'adventure', 'epic', 'heroic'],
    'Animation': ['anim'],
    'Other': []
}

# wrote a function that applies each individual/niche genre to the parent categories 
def map_to_parent(genre):
    if pd.isna(genre):
        return 'Other'
    genre_lower = str(genre).lower().strip()
    for parent, keywords in parent_genres.items():
        if any(keyword in genre_lower for keyword in keywords):  
            return parent
    return 'Other'

# Apply mapping
genre_counts['Broad_Genre'] = genre_counts['genres'].apply(map_to_parent)

# Aggregate
df_w_agg = genre_counts.groupby([ 'Broad_Genre'], as_index=False)['Count'].sum()

In [266]:
genre_counts

Unnamed: 0,genres,Count,Broad_Genre
0,Action,2,Action
1,Action Epic,1,Action
2,Adventure,4,Action
3,Adventure Epic,1,Action
4,Artificial Intelligence,1,Sci-Fi
5,Baseball,1,Sports
6,Biography,9,Documentary
7,Body Horror,1,Horror
8,Body Swap Comedy,1,Comedy
9,Comedy,17,Comedy


In [267]:
# sort vaalues by count
df_sorted = df_w_agg.sort_values(['Count'], ascending=False)

In [268]:
df_sorted

Unnamed: 0,Broad_Genre,Count
10,Other,122
2,Comedy,38
15,Thriller,32
5,Documentary,32
12,Romance,31
3,Coming-of-Age,25
13,Sci-Fi,16
0,Action,13
8,Horror,13
11,Psychological Drama,11


In [269]:
df_sorted['Gender'] = 'Female' 


## import the list of 100 greatest film (filtered only to male-directed work)

In [271]:
df_all = pd.read_csv('list_all_bbc.csv')

### filter filmd directed by men only

In [272]:
female_directors = [
    "Maren Ade", "Agnès Varda", "Claire Denis", 
    "Kathryn Bigelow", "Andrea Arnold", 
    "Lucrecia Martel", "Sarah Polley"
]

In [273]:
df_all = df_all[
    ~df_all["director"].isin(female_directors) & 
    ~df_all["director"].str.contains(",| and ", regex=True, na=False)
]

### convert ratings, nominations, wins to numerics

In [274]:
df_all['rating'] = pd.to_numeric(df_all['rating'], errors='coerce')

In [275]:
df_w['wins'] = pd.to_numeric(df_w['wins'], errors='coerce')

In [276]:
df_w['nominations'] = pd.to_numeric(df_w['nominations'], errors='coerce')

### the average rating 

In [277]:
df_all['rating'].median()

7.8

### the number of awards won on average

In [246]:
df_all['wins'].median()

50.5

### the number of nominations on average

In [247]:
df_all['nominations'].median()

72.5

# linear regression: does the a higher number of awards won imply a higher rating?

## prepare smaller dataframes

In [None]:
new_df = pd.DataFrame({
    'females': ['females'] * len(df_w),  # Repeat "females" for each row
    'rating': df_w['rating'],
    'wins': df_w['wins'],
    'director':df_w['director']
})

In [248]:
new_df_all = pd.DataFrame({
    'males': ['males'] * len(df_all),  
    'rating': df_all['rating'],
    'wins': df_all['wins'],
    'director': df_all['director']
})

## combined  two together

In [30]:
female_df = new_df.rename(columns={"females": "gender"})
male_df = new_df_all.rename(columns={"males": "gender"})

In [31]:
combined_df = pd.concat([female_df, male_df], ignore_index=True)

In [281]:
combined_df

Unnamed: 0,gender,rating,wins,director
0,females,7.0,29.0,Lisa Cholodenko
1,females,6.4,7.0,Joanna Hogg
2,females,6.3,4.0,Sofia Coppola
3,females,7.2,4.0,Márta Mészáros
4,females,5.9,,Chantal Akerman
5,females,6.9,,Maya Deren
6,females,7.3,,Chantal Akerman
7,females,6.8,22.0,Andrea Arnold
8,females,6.9,25.0,Julia Ducournau
9,females,6.9,1.0,Claire Denis


In [283]:
%%R -i combined_df

model <- lm(wins ~ rating, data = combined_df
           )

# Print summary of the model
summary(model)


Call:
lm(formula = wins ~ rating, data = combined_df)

Residuals:
   Min     1Q Median     3Q    Max 
-62.00 -27.16 -10.09  17.67 177.92 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -250.092     39.792  -6.285 2.91e-09 ***
rating        39.158      5.295   7.396 7.15e-12 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 41.36 on 162 degrees of freedom
  (24 Beobachtungen als fehlend gelöscht)
Multiple R-squared:  0.2524,	Adjusted R-squared:  0.2478 
F-statistic:  54.7 on 1 and 162 DF,  p-value: 7.149e-12



In [284]:
df_all['Number_of_Genres'] = df_all['genres'].str.strip("[]").str.split(", ").apply(len)

## separate genres for male-directed films

In [285]:
df_all['genres'] = df_all['genres'].str.strip("[]").str.replace("'", "").str.split(", ")


In [287]:
df_exploded_all = df_all.explode('genres')

## make a separate but smaller dataframe

In [288]:
genre_counts_all = df_exploded_all.groupby(['genres']).size().reset_index(name='Count')

In [289]:
genre_counts_all = genre_counts_all[genre_counts_all['Count'] > 0]


In [290]:
genre_counts_all['genres'] = genre_counts_all['genres'].astype(str)  # Force string conversion


In [291]:
genre_counts_all

Unnamed: 0,genres,Count
0,Action,6
1,Action Epic,5
2,Adventure,13
3,Adventure Epic,5
4,Alien Invasion,1
5,Animal Adventure,2
6,Animation,5
7,Anime,1
8,Artificial Intelligence,3
9,Biography,9


## apply the "map individual/niche genre to broader categories" function to this list

In [293]:
# Apply mapping
genre_counts_all['Broad_Genre'] = genre_counts_all['genres'].apply(map_to_parent)

# Aggregate
df_agg = genre_counts_all.groupby(['Broad_Genre'], as_index=False)['Count'].sum()

In [294]:
df_agg

Unnamed: 0,Broad_Genre,Count
0,Action,51
1,Animation,11
2,Comedy,31
3,Coming-of-Age,15
4,Crime,3
5,Documentary,16
6,Family,5
7,History,6
8,Horror,7
9,Mystery,11


In [295]:
# assign the gender to this dataframe

df_agg['Gender'] = 'Male' 

## combined dataframes for both genders to test if there's gender difference in what type of stories they tell

In [296]:
df_datawrap = pd.concat([df_sorted, df_agg], ignore_index=True)

In [298]:
df_pivoted = df_datawrap.pivot_table(
    index=['Broad_Genre'], 
    columns='Gender',              
    values='Count',               
    fill_value=0                  
).reset_index()                    

df_pivoted.columns = ['Broad_Genre', 'Female', 'Male']


In [299]:
df_pivoted

Unnamed: 0,Broad_Genre,Female,Male
0,Action,13.0,51.0
1,Animation,1.0,11.0
2,Comedy,38.0,31.0
3,Coming-of-Age,25.0,15.0
4,Crime,4.0,3.0
5,Documentary,32.0,16.0
6,Family,1.0,5.0
7,History,9.0,6.0
8,Horror,13.0,7.0
9,Mystery,7.0,11.0


In [300]:
genre_totals = df_pivoted.groupby('Broad_Genre')[['Female', 'Male']].sum().reset_index()

In [302]:
# Calculate total Female and Male movies (across all genres)
total_female = genre_totals['Female'].sum()
total_male = genre_totals['Male'].sum()

# Add percentage columns
genre_totals['Female_Share (%)'] = (genre_totals['Female'] / total_female) * 100
genre_totals['Male_Share (%)'] = (genre_totals['Male'] / total_male) * 100

# Round to 2 decimal places
genre_totals['Female_Share (%)'] = genre_totals['Female_Share (%)'].round(2)
genre_totals['Male_Share (%)'] = genre_totals['Male_Share (%)'].round(2)

In [303]:
genre_totals

Unnamed: 0,Broad_Genre,Female,Male,Female_Share (%),Male_Share (%)
0,Action,13.0,51.0,3.57,10.92
1,Animation,1.0,11.0,0.27,2.36
2,Comedy,38.0,31.0,10.44,6.64
3,Coming-of-Age,25.0,15.0,6.87,3.21
4,Crime,4.0,3.0,1.1,0.64
5,Documentary,32.0,16.0,8.79,3.43
6,Family,1.0,5.0,0.27,1.07
7,History,9.0,6.0,2.47,1.28
8,Horror,13.0,7.0,3.57,1.5
9,Mystery,7.0,11.0,1.92,2.36


In [304]:
genre_totals.to_csv('genre_share.csv', index=False)

# THIS PART DID NOT CONTRIBUTE TO MY STORY - Analysis for Rotten Tomatoes List

In [110]:
df_rt = pd.read_csv('rotten.csv')

## 1. convert all statistics into numerics

In [111]:
df_rt['Year'] = pd.to_numeric(df_rt['Year'], errors='coerce').astype('Int64')

In [112]:
df_rt['Rating'] = pd.to_numeric(df_rt['Rating'], errors='coerce')

## 2. how many genred does each film is tagged?

In [113]:
df_rt['Number_of_Genres'] = df_rt['Genres'].str.strip("[]").str.split(", ").apply(len)


In [114]:
df_rt

Unnamed: 0,Title,Year,Director,Link,Rating,Wins,Nominations,Genres,Number_of_Genres
0,The Substance,2024,Coralie Fargeat,https://www.imdb.com/title/tt17526714/,7.3,143.0,280.0,"['Body Horror', 'Dark Comedy', 'Monster Horror...",8
1,My Old Ass,2024,Megan Park,https://www.imdb.com/title/tt18559464/,6.9,9.0,35.0,"['Coming-of-Age', 'Comedy', 'Drama', 'Romance']",4
2,Love Lies Bleeding,2024,Rose Glass,https://www.imdb.com/title/tt19637052/,6.6,6.0,53.0,"['Dark Romance', 'Drug Crime', 'Erotic Thrille...",10
3,The Fire Inside,2024,Rachel Morrison,https://www.imdb.com/title/tt6133444/,6.7,5.0,17.0,"['Boxing', 'Docudrama', 'Biography', 'Drama', ...",5
4,The Devil's Bath,2024,Veronika Franz,https://www.imdb.com/title/tt29141112/,6.6,15.0,10.0,"['Folk Horror', 'Period Drama', 'Drama', 'Hist...",6
...,...,...,...,...,...,...,...,...,...
283,Monsoon Wedding,2001,Mira Nair,https://www.imdb.com/title/tt0265343/,7.3,7.0,12.0,"['Feel-Good Romance', 'Romantic Comedy', 'Come...",5
284,Promises,2001,Justine Shapiro,https://www.imdb.com/title/tt0282864/,8.3,15.0,5.0,['Documentary'],1
285,The Taste of Others,2000,Agnès Jaoui,https://www.imdb.com/title/tt0216787/,7.2,16.0,12.0,"['Comedy', 'Drama', 'Romance']",3
286,The Gleaners and I,2000,Agnès Varda,https://www.imdb.com/title/tt0247380/,7.7,16.0,3.0,['Documentary'],1


## 3. divide the data by eras

In [115]:
bins = [1999, 2010, 2020, 2025]
labels = ['1999-2010', '2010-2020', '2020-2024']

# Add 'Era' column
df_rt['Era'] = pd.cut(
    df_rt['Year'],
    bins=bins,
    labels=labels,
    right=False
)

In [116]:
df_rt

Unnamed: 0,Title,Year,Director,Link,Rating,Wins,Nominations,Genres,Number_of_Genres,Era
0,The Substance,2024,Coralie Fargeat,https://www.imdb.com/title/tt17526714/,7.3,143.0,280.0,"['Body Horror', 'Dark Comedy', 'Monster Horror...",8,2020-2024
1,My Old Ass,2024,Megan Park,https://www.imdb.com/title/tt18559464/,6.9,9.0,35.0,"['Coming-of-Age', 'Comedy', 'Drama', 'Romance']",4,2020-2024
2,Love Lies Bleeding,2024,Rose Glass,https://www.imdb.com/title/tt19637052/,6.6,6.0,53.0,"['Dark Romance', 'Drug Crime', 'Erotic Thrille...",10,2020-2024
3,The Fire Inside,2024,Rachel Morrison,https://www.imdb.com/title/tt6133444/,6.7,5.0,17.0,"['Boxing', 'Docudrama', 'Biography', 'Drama', ...",5,2020-2024
4,The Devil's Bath,2024,Veronika Franz,https://www.imdb.com/title/tt29141112/,6.6,15.0,10.0,"['Folk Horror', 'Period Drama', 'Drama', 'Hist...",6,2020-2024
...,...,...,...,...,...,...,...,...,...,...
283,Monsoon Wedding,2001,Mira Nair,https://www.imdb.com/title/tt0265343/,7.3,7.0,12.0,"['Feel-Good Romance', 'Romantic Comedy', 'Come...",5,1999-2010
284,Promises,2001,Justine Shapiro,https://www.imdb.com/title/tt0282864/,8.3,15.0,5.0,['Documentary'],1,1999-2010
285,The Taste of Others,2000,Agnès Jaoui,https://www.imdb.com/title/tt0216787/,7.2,16.0,12.0,"['Comedy', 'Drama', 'Romance']",3,1999-2010
286,The Gleaners and I,2000,Agnès Varda,https://www.imdb.com/title/tt0247380/,7.7,16.0,3.0,['Documentary'],1,1999-2010
