In [1]:
from collections import defaultdict
import requests
import time, os

import numpy as np
import pandas as pd

import json, urllib3, string
from bs4 import BeautifulSoup

import pickle
from sqlalchemy import create_engine

from itertools import product
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
from IPython.display import Image
import matplotlib.pylab as pylab

import re

import gender_guesser.detector as gender

import statsmodels.api as sm
from scipy.stats import chisquare
from sklearn.model_selection import train_test_split

## Import Data

### Import data from Bechdel Test Movie List

Description of data found here: http://bechdeltest.com/api/v1/doc

In [2]:
%%time
all_movies_response = requests.get('http://bechdeltest.com/api/v1/getAllMovies')

Wall time: 2.41 s


In [3]:
# Decode the JSON body of the API response and load it into a DataFrame.
all_movies_payload = all_movies_response.json()
bechdel_movies = pd.DataFrame(all_movies_payload)

In [4]:
# Peek at five randomly chosen rows (unseeded, so the rows differ between runs).
bechdel_movies.sample(5)

Unnamed: 0,id,rating,title,year,imdbid
3284,8114,3,East Is East,1999,166175
2084,1203,3,Overboard,1987,93693
1179,6807,2,Gamera vs. Viras,1968,63000
8178,8669,2,Rings,2017,498381
6403,3864,3,Strike Witches - The Movie,2012,2322603


In [5]:
# Derive a 'tt'-prefixed id column (the format used as `tconst` in the IMDb tables)
# on a new frame, leaving `bechdel_movies` itself untouched.
bechdel_movies_mod = bechdel_movies.assign(
    imdbid_tt='tt' + bechdel_movies['imdbid']
)
bechdel_movies_mod.head()

Unnamed: 0,id,rating,title,year,imdbid,imdbid_tt
0,8040,0,Roundhay Garden Scene,1888,392728,tt0392728
1,5433,0,Pauvre Pierrot,1892,3,tt0000003
2,6200,0,"Execution of Mary, Queen of Scots, The",1895,132134,tt0132134
3,5444,0,Tables Turned on the Gardener,1895,14,tt0000014
4,5406,0,Une nuit terrible,1896,131,tt0000131


### Import data from IMDB

Description of data found here: https://www.imdb.com/interfaces/

Note: importing these files takes a long time.

Download the data from https://datasets.imdbws.com/, update the file paths below to match your download location, and unzip the archives here:

In [6]:
# One-time setup, kept commented out: uncomment these lines to decompress the
# downloaded IMDb archives with gzip. Edit the paths to your own download folder first.
# !gzip -d "C:\Users\wal12\Documents\Metis\Project 3\imdb-data\name.basics.tsv.gz"
# !gzip -d "C:\Users\wal12\Documents\Metis\Project 3\imdb-data\title.principals.tsv.gz"
# !gzip -d "C:\Users\wal12\Documents\Metis\Project 3\imdb-data\title.crew.tsv.gz"
# !gzip -d "C:\Users\wal12\Documents\Metis\Project 3\imdb-data\title.basics.tsv.gz"
# !gzip -d "C:\Users\wal12\Documents\Metis\Project 3\imdb-data\title.akas.tsv.gz"

In [7]:
data_dir = "C:\\Users\\wal12\\Documents\\Metis\\Project 3\\imdb-data\\" #change this directory to where you unzipped the files


def _read_imdb_tsv(filename):
    """Load one tab-separated IMDb file from ``data_dir`` into a DataFrame.

    ``low_memory=False`` makes pandas infer each column's dtype from the whole
    file instead of chunk by chunk, which avoids the mixed-type ``DtypeWarning``
    that pandas can emit on large files. The trade-off is higher peak memory
    while parsing.
    """
    return pd.read_csv(data_dir + filename, sep='\t', low_memory=False)


title_crew = _read_imdb_tsv('title.crew.tsv')
title_principals = _read_imdb_tsv('title.principals.tsv')
name_basics = _read_imdb_tsv('name.basics.tsv')
title_basics = _read_imdb_tsv('title.basics.tsv')
title_akas = _read_imdb_tsv('title.akas.tsv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Describe Bechdel Data

In [8]:
# Quick summary of the Bechdel data: count of rows with year >= 1950,
# plus the shape and column names of the frame carrying the 'tt' id.
n_movies_1950_on = len(bechdel_movies.loc[bechdel_movies['year'] >= 1950])
print(f"Number of movies after 1950: {n_movies_1950_on}")
print(f"DataFrame shape: {bechdel_movies_mod.shape}")
print(f"Available columns: {bechdel_movies_mod.columns}")

Number of movies after 1950: 8205
DataFrame shape: (8796, 6)
Available columns: Index(['id', 'rating', 'title', 'year', 'imdbid', 'imdbid_tt'], dtype='object')


In [9]:
from varname import nameof

# The imported tables, and (in the same order) the variable names they are bound to.
datasets = [
    bechdel_movies,
    title_crew,
    title_basics,
    title_principals,
    title_akas,
    name_basics,
]
dataset_names = [
    nameof(bechdel_movies),
    nameof(title_crew),
    nameof(title_basics),
    nameof(title_principals),
    nameof(title_akas),
    nameof(name_basics),
]
datasets_zip = list(zip(datasets, dataset_names))

# Header line followed by one dataset name per line.
print("List of data imported:", *dataset_names, sep="\n")

List of data imported:
bechdel_movies
title_crew
title_basics
title_principals
title_akas
name_basics


## Data Processing

In [10]:
%time

merge = bechdel_movies_mod.merge(title_crew,how='inner',left_on='imdbid_tt',right_on='tconst')
# merge = merge.merge(title_principals,how='inner',left_on='imdbid_tt',right_on='tconst')
merge.head()

Wall time: 0 ns


Unnamed: 0,id,rating,title,year,imdbid,imdbid_tt,tconst,directors,writers
0,8040,0,Roundhay Garden Scene,1888,392728,tt0392728,tt0392728,nm1284117,\N
1,5433,0,Pauvre Pierrot,1892,3,tt0000003,tt0000003,nm0721526,\N
2,6200,0,"Execution of Mary, Queen of Scots, The",1895,132134,tt0132134,tt0132134,nm0163632,\N
3,5444,0,Tables Turned on the Gardener,1895,14,tt0000014,tt0000014,"nm0525910,nm0349785",\N
4,5406,0,Une nuit terrible,1896,131,tt0000131,tt0000131,nm0617588,\N


In [None]:
# Join the Bechdel movies to the IMDb principals table on the title id.
merge_principals = bechdel_movies_mod.merge(
    title_principals,
    how='inner',
    left_on='imdbid_tt',
    right_on='tconst',
)

# Look up each person's name via their `nconst` id.
principals_names = merge_principals.merge(
    name_basics[['nconst', 'primaryName']],
    how='inner',
    on='nconst',
)

# Raw string so `\S` reaches the regex engine without an invalid-escape warning.
# The pattern captures the leading run of non-space characters that is followed
# by a space, so a name containing no space yields NaN.
principals_names['firstName'] = principals_names['primaryName'].str.extract(r'(^\S+) ')

# Keep only the job categories listed here.
kept_categories = ['actor', 'writer', 'actress', 'producer', 'director', 'editor']
principals_names = principals_names[principals_names['category'].isin(kept_categories)]
principals_names.head()

In [None]:
# merge[merge['imdbid_tt']=='tt0084745']

In [None]:
# merge_principals[merge_principals['imdbid_tt']=='tt0084745']

Process merged director data 

In [None]:
# One row per (movie, director id): split the comma-separated id string into a
# list and explode it. This keeps the movie id next to every director id without
# building a wide intermediate frame or juggling MultiIndex levels.
id_directors = (
    merge[['imdbid_tt', 'directors']]
    .assign(directors=lambda frame: frame['directors'].str.split(','))
    .explode('directors')
    .reset_index(drop=True)
)

# Resolve director ids to names; the inner join drops ids with no match in name_basics.
id_directors = id_directors.merge(
    name_basics[['nconst', 'primaryName']],
    how='inner',
    left_on='directors',
    right_on='nconst',
)

# Raw string so `\S` reaches the regex engine without an invalid-escape warning.
# Names containing no space yield NaN because the pattern requires a trailing space.
id_directors['firstName'] = id_directors['primaryName'].str.extract(r'(^\S+) ')
# id_directors['gender'] = id_directors['firstName'].apply(d.get_gender)

id_directors.head()

Process merged writer data

In [None]:
# One row per (movie, writer id): split the comma-separated id string into a
# list and explode it. This keeps the movie id next to every writer id without
# building a wide intermediate frame or juggling MultiIndex levels.
id_writers = (
    merge[['imdbid_tt', 'writers']]
    .assign(writers=lambda frame: frame['writers'].str.split(','))
    .explode('writers')
    .reset_index(drop=True)
)

# Resolve writer ids to names; the inner join drops ids with no match in name_basics
# (presumably including the '\N' placeholder seen in the merged output — confirm).
id_writers = id_writers.merge(
    name_basics[['nconst', 'primaryName']],
    how='inner',
    left_on='writers',
    right_on='nconst',
)

# Raw string so `\S` reaches the regex engine without an invalid-escape warning.
# Names containing no space yield NaN because the pattern requires a trailing space.
id_writers['firstName'] = id_writers['primaryName'].str.extract(r'(^\S+) ')
# id_writers['gender'] = id_writers['firstName'].apply(d.get_gender)

id_writers.head()

Generate dataset names

In [None]:
from varname import nameof

# The processed tables, and (in the same order) the variable names they are bound to.
datasets = [merge, principals_names, id_directors, id_writers]
datasets_names = [
    nameof(merge),
    nameof(principals_names),
    nameof(id_directors),
    nameof(id_writers),
]
datasets_zip = list(zip(datasets, datasets_names))

# Map each variable name to its DataFrame.
datasets_dict = {label: frame for frame, label in datasets_zip}

datasets_dict.keys()

Save the processed datasets to a pickle file

In [18]:
# Write the name -> DataFrame mapping to disk using the highest pickle protocol available.
with open('datasets_dict.pickle', mode='wb') as pickle_file:
    pickle.dump(datasets_dict, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)