In [102]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from tqdm import tqdm

## Analyzing the Impact of Female Directors and Writers

To answer what impact female directors and writers have we will take our director data from WikiData and supplemented CMU movie data to see underlying trends that may be present as well as understanding the impact of female writers and directors on a movie's IMDB rating.

### Merging and Cleaning
First we will merge our two relevent datasets together.

In [64]:
movies_full = pd.read_pickle('./data/pickles/movies.p')
directors_full = pd.read_pickle('./data/pickles/director_writer_nodrop.p')

print(movies_full.columns)
print(directors_full.columns)

Index(['title', 'primaryTitle', 'originalTitle', 'wikipedia_id',
       'freebase_id_cmu', 'freebase_id_wd', 'wikidata_id', 'new_wikidata_id',
       'IMDB_id', 'mojo_id', 'release_date_cmu', 'release_date_wd',
       'release_year_cmu', 'release_year_wd', 'release_year_imdb',
       'combined_release_year', 'runtime_cmu', 'runtime_wd', 'runtime_imdb',
       'combined_runtime', 'box_office_cmu', 'box_office_wd', 'languages',
       'countries', 'averageRating_imdb', 'numVotes_imdb', 'genres_cmu',
       'genres_imdb', 'directors', 'writers'],
      dtype='object')
Index(['title', 'movie_wiki_id', 'movie_wikidata_id', 'movie_freebase_id',
       'movie_imdb_id', 'role', 'imdb_id', 'wikidata_id', 'name', 'gender',
       'birth', 'height', 'ethnicity'],
      dtype='object')


There are many columns that are not relevant to the analysis to be done and therefore we will create a new DataFrame for both the movies and directors that have the columns to be analyzed.

In [65]:
movies = movies_full[[
    'title',
    'freebase_id_cmu',
    'combined_release_year',
    'combined_runtime',
    'averageRating_imdb',
    'numVotes_imdb',
    'genres_cmu'
]].copy()

print(f'The length of the dataset is {len(movies)}, the number of freebase IDs is {len(movies.freebase_id_cmu.notna())}')
movies.genres_cmu = movies.genres_cmu.apply(lambda x: eval(x))
movies.head()

The length of the dataset is 81741, the number of freebase IDs is 81741


Unnamed: 0,title,freebase_id_cmu,combined_release_year,combined_runtime,averageRating_imdb,numVotes_imdb,genres_cmu
0,Ghosts of Mars,/m/03vyhn,2001.0,98.0,4.9,55061.0,"{'/m/01jfsb': 'Thriller', '/m/06n90': 'Science..."
1,Getting Away with Murder: The JonBenét Ramsey ...,/m/08yl5d,2000.0,95.0,5.9,66.0,"{'/m/02n4kr': 'Mystery', '/m/03bxz7': 'Biograp..."
2,Brun bitter,/m/0crgdbh,1988.0,83.0,5.7,40.0,"{'/m/0lsxr': 'Crime Fiction', '/m/07s9rl0': 'D..."
3,White Of The Eye,/m/0285_cd,1987.0,110.0,6.1,2537.0,"{'/m/01jfsb': 'Thriller', '/m/0glj9q': 'Erotic..."
4,A Woman in Flames,/m/01mrr1,1983.0,106.0,6.0,587.0,{'/m/07s9rl0': 'Drama'}


In [66]:
directors = directors_full[[
    'title',
    'movie_freebase_id',
    'movie_imdb_id',
    'role',
    'name',
    'imdb_id',
    'wikidata_id',
    'gender',
    'birth',
    'height',
    'ethnicity'
]]
directors.head()

Unnamed: 0,title,movie_freebase_id,movie_imdb_id,role,name,imdb_id,wikidata_id,gender,birth,height,ethnicity
0,Ghosts of Mars,/m/03vyhn,tt0228333,director,John Carpenter,nm0000118,Q95008,male,1948-01-16,,
1,Getting Away with Murder: The JonBenét Ramsey ...,/m/08yl5d,tt0245916,director,,nm1740285,,,,,
2,Brun bitter,/m/0crgdbh,tt0094806,director,Sølve Skagen,nm0803751,Q7666470,male,1945-02-17,,
3,White Of The Eye,/m/0285_cd,tt0094320,director,Donald Cammell,nm0131910,Q975488,male,1934-01-17,,
4,A Woman in Flames,/m/01mrr1,tt0083949,director,Robert van Ackeren,nm0885554,Q88104,male,1946-12-22,,


The data will be merged on the movie freebase ID

In [67]:
directors_movies_merge = pd.merge(left=directors, right=movies, how='left', 
                                    left_on='movie_freebase_id', right_on='freebase_id_cmu')

directors_movies_merge.head()

Unnamed: 0,title_x,movie_freebase_id,movie_imdb_id,role,name,imdb_id,wikidata_id,gender,birth,height,ethnicity,title_y,freebase_id_cmu,combined_release_year,combined_runtime,averageRating_imdb,numVotes_imdb,genres_cmu
0,Ghosts of Mars,/m/03vyhn,tt0228333,director,John Carpenter,nm0000118,Q95008,male,1948-01-16,,,Ghosts of Mars,/m/03vyhn,2001.0,98.0,4.9,55061.0,"{'/m/01jfsb': 'Thriller', '/m/06n90': 'Science..."
1,Getting Away with Murder: The JonBenét Ramsey ...,/m/08yl5d,tt0245916,director,,nm1740285,,,,,,Getting Away with Murder: The JonBenét Ramsey ...,/m/08yl5d,2000.0,95.0,5.9,66.0,"{'/m/02n4kr': 'Mystery', '/m/03bxz7': 'Biograp..."
2,Brun bitter,/m/0crgdbh,tt0094806,director,Sølve Skagen,nm0803751,Q7666470,male,1945-02-17,,,Brun bitter,/m/0crgdbh,1988.0,83.0,5.7,40.0,"{'/m/0lsxr': 'Crime Fiction', '/m/07s9rl0': 'D..."
3,White Of The Eye,/m/0285_cd,tt0094320,director,Donald Cammell,nm0131910,Q975488,male,1934-01-17,,,White Of The Eye,/m/0285_cd,1987.0,110.0,6.1,2537.0,"{'/m/01jfsb': 'Thriller', '/m/0glj9q': 'Erotic..."
4,A Woman in Flames,/m/01mrr1,tt0083949,director,Robert van Ackeren,nm0885554,Q88104,male,1946-12-22,,,A Woman in Flames,/m/01mrr1,1983.0,106.0,6.0,587.0,{'/m/07s9rl0': 'Drama'}


After merging the dataset will need some cleaning before it can be analyzed.

In [68]:
print(all(directors_movies_merge.title_x == directors_movies_merge.title_y)) #are title_x and title_y are exactly the same?
directors_movies = directors_movies_merge[[
    'title_x',
    'combined_release_year',
    'combined_runtime',
    'averageRating_imdb',
    'numVotes_imdb',
    'genres_cmu',
    'movie_freebase_id',
    'movie_imdb_id',
    'role',
    'name',
    'gender',
    'birth',
    'height',
    'ethnicity'
]].copy()

directors_movies.head()

True


Unnamed: 0,title_x,combined_release_year,combined_runtime,averageRating_imdb,numVotes_imdb,genres_cmu,movie_freebase_id,movie_imdb_id,role,name,gender,birth,height,ethnicity
0,Ghosts of Mars,2001.0,98.0,4.9,55061.0,"{'/m/01jfsb': 'Thriller', '/m/06n90': 'Science...",/m/03vyhn,tt0228333,director,John Carpenter,male,1948-01-16,,
1,Getting Away with Murder: The JonBenét Ramsey ...,2000.0,95.0,5.9,66.0,"{'/m/02n4kr': 'Mystery', '/m/03bxz7': 'Biograp...",/m/08yl5d,tt0245916,director,,,,,
2,Brun bitter,1988.0,83.0,5.7,40.0,"{'/m/0lsxr': 'Crime Fiction', '/m/07s9rl0': 'D...",/m/0crgdbh,tt0094806,director,Sølve Skagen,male,1945-02-17,,
3,White Of The Eye,1987.0,110.0,6.1,2537.0,"{'/m/01jfsb': 'Thriller', '/m/0glj9q': 'Erotic...",/m/0285_cd,tt0094320,director,Donald Cammell,male,1934-01-17,,
4,A Woman in Flames,1983.0,106.0,6.0,587.0,{'/m/07s9rl0': 'Drama'},/m/01mrr1,tt0083949,director,Robert van Ackeren,male,1946-12-22,,


The birth dates are stored as strings. For our analysis we will be using the birth year to obtain an "age at movie release" column so that we can also see how the success of directors and writers may change over their career.

In [69]:
directors_movies['birth_year'] = directors_movies.birth.apply(lambda x: int(x.split('-')[0]) if isinstance(x,str) else x)
directors_movies.head()

Unnamed: 0,title_x,combined_release_year,combined_runtime,averageRating_imdb,numVotes_imdb,genres_cmu,movie_freebase_id,movie_imdb_id,role,name,gender,birth,height,ethnicity,birth_year
0,Ghosts of Mars,2001.0,98.0,4.9,55061.0,"{'/m/01jfsb': 'Thriller', '/m/06n90': 'Science...",/m/03vyhn,tt0228333,director,John Carpenter,male,1948-01-16,,,1948.0
1,Getting Away with Murder: The JonBenét Ramsey ...,2000.0,95.0,5.9,66.0,"{'/m/02n4kr': 'Mystery', '/m/03bxz7': 'Biograp...",/m/08yl5d,tt0245916,director,,,,,,
2,Brun bitter,1988.0,83.0,5.7,40.0,"{'/m/0lsxr': 'Crime Fiction', '/m/07s9rl0': 'D...",/m/0crgdbh,tt0094806,director,Sølve Skagen,male,1945-02-17,,,1945.0
3,White Of The Eye,1987.0,110.0,6.1,2537.0,"{'/m/01jfsb': 'Thriller', '/m/0glj9q': 'Erotic...",/m/0285_cd,tt0094320,director,Donald Cammell,male,1934-01-17,,,1934.0
4,A Woman in Flames,1983.0,106.0,6.0,587.0,{'/m/07s9rl0': 'Drama'},/m/01mrr1,tt0083949,director,Robert van Ackeren,male,1946-12-22,,,1946.0


In [70]:
print(f'There are {len(directors_movies[directors_movies.combined_release_year.isna()])} NaN values in the combined release year column')
print(f'There are {len(directors_movies[directors_movies.birth_year.isna()])} NaN values in the birth year column')

There are 10672 NaN values in the combined release year column
There are 93750 NaN values in the birth year column


In [71]:
#create a column to store age at release values
directors_movies['age_at_release'] = np.nan

#fill in with actual values of age at release year if there is both a release year and a birth year
for indx, row in directors_movies.loc[directors_movies.combined_release_year.notna() & directors_movies.birth_year.notna()].iterrows():
    directors_movies.loc[indx,'age_at_release'] = row.combined_release_year- row.birth_year

directors_movies.head()

Unnamed: 0,title_x,combined_release_year,combined_runtime,averageRating_imdb,numVotes_imdb,genres_cmu,movie_freebase_id,movie_imdb_id,role,name,gender,birth,height,ethnicity,birth_year,age_at_release
0,Ghosts of Mars,2001.0,98.0,4.9,55061.0,"{'/m/01jfsb': 'Thriller', '/m/06n90': 'Science...",/m/03vyhn,tt0228333,director,John Carpenter,male,1948-01-16,,,1948.0,53.0
1,Getting Away with Murder: The JonBenét Ramsey ...,2000.0,95.0,5.9,66.0,"{'/m/02n4kr': 'Mystery', '/m/03bxz7': 'Biograp...",/m/08yl5d,tt0245916,director,,,,,,,
2,Brun bitter,1988.0,83.0,5.7,40.0,"{'/m/0lsxr': 'Crime Fiction', '/m/07s9rl0': 'D...",/m/0crgdbh,tt0094806,director,Sølve Skagen,male,1945-02-17,,,1945.0,43.0
3,White Of The Eye,1987.0,110.0,6.1,2537.0,"{'/m/01jfsb': 'Thriller', '/m/0glj9q': 'Erotic...",/m/0285_cd,tt0094320,director,Donald Cammell,male,1934-01-17,,,1934.0,53.0
4,A Woman in Flames,1983.0,106.0,6.0,587.0,{'/m/07s9rl0': 'Drama'},/m/01mrr1,tt0083949,director,Robert van Ackeren,male,1946-12-22,,,1946.0,37.0


The genre data is currently stored as a dictionary with the the freebase ID as the key and the genre as the value. Let's change this to just a list of genres per movie.

In [72]:
print(f'There are {len(directors_movies[directors_movies.genres_cmu.isna()])} NaN values in the genres_cmu column')

There are 0 NaN values in the genres_cmu column


In [73]:
directors_movies.genres_cmu = directors_movies.genres_cmu.apply(lambda x: list(x.values()))

Lastly for the ease of analysis some of the column names will be changed/shortened.

In [74]:
directors_movies.rename(mapper={
    'title_x': 'title',
    'combined_runtime':'runtime',
    'combined_release_year':'release_year',
    'averageRating_imdb':'rating',
    'genres_cmu':'genres',
    'numVotes_imdb':'num_votes'    
},inplace=True, axis=1)
directors_movies.head()

Unnamed: 0,title,release_year,runtime,rating,num_votes,genres,movie_freebase_id,movie_imdb_id,role,name,gender,birth,height,ethnicity,birth_year,age_at_release
0,Ghosts of Mars,2001.0,98.0,4.9,55061.0,"[Thriller, Science Fiction, Horror, Adventure,...",/m/03vyhn,tt0228333,director,John Carpenter,male,1948-01-16,,,1948.0,53.0
1,Getting Away with Murder: The JonBenét Ramsey ...,2000.0,95.0,5.9,66.0,"[Mystery, Biographical film, Drama, Crime Drama]",/m/08yl5d,tt0245916,director,,,,,,,
2,Brun bitter,1988.0,83.0,5.7,40.0,"[Crime Fiction, Drama]",/m/0crgdbh,tt0094806,director,Sølve Skagen,male,1945-02-17,,,1945.0,43.0
3,White Of The Eye,1987.0,110.0,6.1,2537.0,"[Thriller, Erotic thriller, Psychological thri...",/m/0285_cd,tt0094320,director,Donald Cammell,male,1934-01-17,,,1934.0,53.0
4,A Woman in Flames,1983.0,106.0,6.0,587.0,[Drama],/m/01mrr1,tt0083949,director,Robert van Ackeren,male,1946-12-22,,,1946.0,37.0


### Analysis

Before getting too deep into the analysis it should be understood what is in the data and what is missing from the data. We can first understand what director data was able to be collected. how many movies have complete director and writer data, how many movies have at least one director or writer present and how many directors and writers are present for each gender.

In [85]:
dirs = directors_movies[directors_movies.role == 'director']
writers = directors_movies[directors_movies.role == 'writer']
dir_num_tot = len(dirs)
wri_num_tot = len(writers)
dir_num_filled = len(dirs[dirs.name.notna()])
wri_num_filled = len(writers[writers.name.notna()])

print(f'Out of {len(directors_movies)} rows of data:')
print(f'There are {len(dirs)} number of directors and {len(writers)} number of writers.')
print(f'Out of all of the directors {len(dirs[dirs.name.notna()])} have a name value, {len(dirs[dirs.gender.notna()])} have a gender value, and {len(dirs[dirs.birth_year.notna()])} have a birth year.')
print(f'Out of all of the writers {len(writers[writers.name.notna()])} have a name value, {len(writers[writers.gender.notna()])} have a gender value, and {len(writers[writers.birth_year.notna()])} have a birth year.')
print(f'There are {dirs.name.nunique()} unique directors in the dataset and {writers.name.nunique()} unique writers.')

Out of 250745 rows of data:
There are 86474 number of directors and 164271 number of writers.
Out of all of the directors 73584 have a name value, 70281 have a gender value, and 66801 have a birth year.
Out of all of the writers 97458 have a name value, 95083 have a gender value, and 90194 have a birth year.
There are 21161 unique directors in the dataset and 25916 unique writers.


Let's see how many movies are present that have a complete set of directors and writers that have at least gender information

In [104]:
movie_groups = directors_movies.groupby('movie_freebase_id')

for fb_id, movie in tqdm(movie_groups):
    all_data = True
    indices = []
    for indx, row in movie.iterrows():
        indices.append(indx)
        if row.gender != 'male' and row.gender!= 'female':
            all_data = False
    directors_movies.loc[indices,'all_crew'] = all_data
    

100%|██████████| 74200/74200 [00:43<00:00, 1709.83it/s]


In [114]:
full_crew = directors_movies[directors_movies.all_crew == True]
print(f'Out of {directors_movies.movie_freebase_id.nunique()}, there are {directors_movies[directors_movies.gender.notna()].movie_freebase_id.nunique()} movies that have at least one director or writer with gender data')
print(f'and there are {full_crew.movie_freebase_id.nunique()} movies with full crew gender data.')
print(f'This is means that {round(directors_movies[directors_movies.gender.notna()].movie_freebase_id.nunique()/directors_movies.movie_freebase_id.nunique(),4)*100}% have at least one point of gender data and {100*round(full_crew.movie_freebase_id.nunique()/directors_movies.movie_freebase_id.nunique(),4)}% of the movies have full crew gender data.')

Out of 74200, there are 66755 movies that have at least one director or writer with gender data
and there are 27382 movies with full crew gender data.
This is means that 89.97% have at least one point of gender data and 36.9% of the movies have full crew gender data.


From the above analysis we can see that in general there is a lot of data available for analysis on gender and still a significant portion of data available with gender data on the full crew.

Now that we have an understanding of the number of data points that are present. the data can start to be analyzed for trends.