In [1]:
from collections import defaultdict
import requests
import time, os

import numpy as np
import pandas as pd

import json, urllib3, string
from bs4 import BeautifulSoup

import pickle
from sqlalchemy import create_engine

from itertools import product
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
from IPython.display import Image
import matplotlib.pylab as pylab

import re

import gender_guesser.detector as gender

import statsmodels.api as sm
from scipy.stats import chisquare
from sklearn.model_selection import train_test_split

## Import Data

### Import data from Bechdel Test Movie List

Description of data found here: http://bechdeltest.com/api/v1/doc

In [2]:
%%time
all_movies_response = requests.get('http://bechdeltest.com/api/v1/getAllMovies')

Wall time: 2.52 s


In [3]:
# Turn the decoded JSON payload of the API response into a DataFrame.
bechdel_movies = pd.DataFrame(data=all_movies_response.json())

In [4]:
# Preview five random rows; the fixed seed keeps the preview identical across re-runs.
bechdel_movies.sample(n=5, random_state=42)

Unnamed: 0,rating,year,title,id,imdbid
7397,3,2015,Emelie,7993,4503598
3951,0,2003,"Last Samurai, The",1404,325710
3217,2,1999,A Midsummer Night&#39;s Dream,1135,140379
7429,3,2015,Meadowland,6767,3529656
348,3,1939,"Rules of the Game, The",3089,31885


In [5]:
# Work on a separate frame so the raw API data stays untouched, and add an
# 'imdbid_tt' column holding the imdbid prefixed with 'tt'.
bechdel_movies_mod = bechdel_movies.assign(imdbid_tt='tt' + bechdel_movies['imdbid'])
bechdel_movies_mod.head()

Unnamed: 0,rating,year,title,id,imdbid,imdbid_tt
0,0,1888,Roundhay Garden Scene,8040,392728,tt0392728
1,0,1892,Pauvre Pierrot,5433,3,tt0000003
2,0,1895,"Execution of Mary, Queen of Scots, The",6200,132134,tt0132134
3,0,1895,Tables Turned on the Gardener,5444,14,tt0000014
4,0,1896,Une nuit terrible,5406,131,tt0000131


### Import data from IMDB

Description of data found here: https://www.imdb.com/interfaces/

Note: this data takes a long time to import.

Download the data files from https://datasets.imdbws.com/, update the file paths below to match your machine, and unzip the files by uncommenting the commands in the next cell:

In [6]:
# One-time setup (disabled): uncomment these lines to decompress the downloaded
# IMDb .tsv.gz files with gzip. Adjust each path to where the files were saved.
# !gzip -d "C:\Users\wal12\Documents\Metis\Project 3\imdb-data\name.basics.tsv.gz"
# !gzip -d "C:\Users\wal12\Documents\Metis\Project 3\imdb-data\title.principals.tsv.gz"
# !gzip -d "C:\Users\wal12\Documents\Metis\Project 3\imdb-data\title.crew.tsv.gz"
# !gzip -d "C:\Users\wal12\Documents\Metis\Project 3\imdb-data\title.basics.tsv.gz"
# !gzip -d "C:\Users\wal12\Documents\Metis\Project 3\imdb-data\title.akas.tsv.gz"

In [7]:
# Directory containing the unzipped IMDb TSV files. Override it by setting the
# IMDB_DATA_DIR environment variable; otherwise the original path is used.
data_dir = os.environ.get(
    'IMDB_DATA_DIR',
    "C:\\Users\\wal12\\Documents\\Metis\\Project 3\\imdb-data\\",
)


def _read_imdb_tsv(filename):
    """Read one tab-separated IMDb file from ``data_dir`` into a DataFrame.

    ``os.path.join`` is used so ``data_dir`` works with or without a trailing
    separator. ``low_memory=False`` makes pandas infer each column's dtype
    from the whole file instead of chunk by chunk (avoiding mixed-type
    columns), at the cost of higher peak memory while parsing.
    """
    return pd.read_csv(os.path.join(data_dir, filename), sep='\t', low_memory=False)


title_crew = _read_imdb_tsv('title.crew.tsv')
title_principals = _read_imdb_tsv('title.principals.tsv')
name_basics = _read_imdb_tsv('name.basics.tsv')
title_basics = _read_imdb_tsv('title.basics.tsv')
title_akas = _read_imdb_tsv('title.akas.tsv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Describe Bechdel Data

In [9]:
# Quick summary of the Bechdel data: how many rows have year >= 1950,
# plus the shape and column names of the modified frame.
from_1950_mask = bechdel_movies['year'] >= 1950
print("Number of movies after 1950:", int(from_1950_mask.sum()))
print("DataFrame shape:", bechdel_movies_mod.shape)
print("Available columns:", bechdel_movies_mod.columns)

Number of movies after 1950: 8205
DataFrame shape: (8796, 6)
Available columns: Index(['rating', 'year', 'title', 'id', 'imdbid', 'imdbid_tt'], dtype='object')


In [10]:
from varname import nameof

# Every frame loaded so far, in import order.
datasets = [
    bechdel_movies,
    title_crew,
    title_basics,
    title_principals,
    title_akas,
    name_basics,
]
# nameof() is called with each variable written out explicitly, one call per frame,
# so the resulting labels are the variable names themselves.
dataset_names = [
    nameof(bechdel_movies),
    nameof(title_crew),
    nameof(title_basics),
    nameof(title_principals),
    nameof(title_akas),
    nameof(name_basics),
]
datasets_zip = list(zip(datasets, dataset_names))

print("List of data imported:")
print("\n".join(dataset_names))

List of data imported:
bechdel_movies
title_crew
title_basics
title_principals
title_akas
name_basics


## Data Processing

In [11]:
%time

merge = bechdel_movies_mod.merge(title_crew,how='inner',left_on='imdbid_tt',right_on='tconst')
# merge = merge.merge(title_principals,how='inner',left_on='imdbid_tt',right_on='tconst')
merge.head()

Wall time: 0 ns


Unnamed: 0,rating,year,title,id,imdbid,imdbid_tt,tconst,directors,writers
0,0,1888,Roundhay Garden Scene,8040,392728,tt0392728,tt0392728,nm1284117,\N
1,0,1892,Pauvre Pierrot,5433,3,tt0000003,tt0000003,nm0721526,\N
2,0,1895,"Execution of Mary, Queen of Scots, The",6200,132134,tt0132134,tt0132134,nm0163632,\N
3,0,1895,Tables Turned on the Gardener,5444,14,tt0000014,tt0000014,"nm0525910,nm0349785",\N
4,0,1896,Une nuit terrible,5406,131,tt0000131,tt0000131,nm0617588,\N


In [12]:
# Join Bechdel titles to their IMDb principal rows, then attach each person's name.
merge_principals = bechdel_movies_mod.merge(title_principals, how='inner', left_on='imdbid_tt', right_on='tconst')
principals_all = merge_principals.merge(name_basics[['nconst', 'primaryName']], how='inner', on='nconst')

# Keep only the listed categories. Filtering first means the regex runs on fewer
# rows, and .copy() makes the result an independent frame so adding columns
# to it does not trigger SettingWithCopyWarning.
kept_categories = ['actor', 'writer', 'actress', 'producer', 'director', 'editor']
principals_names = principals_all[principals_all['category'].isin(kept_categories)].copy()

# First whitespace-delimited token of the name. The pattern requires a trailing
# space, so single-word names yield NaN. The raw string avoids the invalid
# escape sequence warning that '\S' triggers in a normal string literal.
principals_names['firstName'] = principals_names['primaryName'].str.extract(r'(^\S+) ', expand=False)
principals_names.head()

Unnamed: 0,rating,year,title,id,imdbid,imdbid_tt,tconst,ordering,nconst,category,job,characters,primaryName,firstName
4,0,1888,Roundhay Garden Scene,8040,392728,tt0392728,tt0392728,5,nm1284117,director,\N,\N,Louis Aimé Augustin Le Prince,Louis
5,0,1892,Pauvre Pierrot,5433,3,tt0000003,tt0000003,1,nm0721526,director,\N,\N,Émile Reynaud,Émile
6,0,1892,Pauvre Pierrot,5433,3,tt0000003,tt0000003,2,nm1770680,producer,producer,\N,Julien Pappé,Julien
8,0,1892,Pauvre Pierrot,5433,3,tt0000003,tt0000003,4,nm5442200,editor,\N,\N,Tamara Pappé,Tamara
9,0,1895,"Execution of Mary, Queen of Scots, The",6200,132134,tt0132134,tt0132134,1,nm0858405,actor,\N,"[""Queen Mary""]",Robert Thomae,Robert


In [13]:
# Disabled spot-check: show the rows of `merge` for one specific imdbid_tt.
# merge[merge['imdbid_tt']=='tt0084745']

In [14]:
# Disabled spot-check: show the rows of `merge_principals` for one specific imdbid_tt.
# merge_principals[merge_principals['imdbid_tt']=='tt0084745']

Process merged director data 

In [15]:
# Reshape to one row per (movie, director): split the comma-separated ids into
# columns, then stack the columns back into a long Series.
directors_wide = pd.DataFrame(merge['directors'].str.split(',').tolist(), index=merge['imdbid_tt'])
id_directors = (
    directors_wide.stack()
    # Drop the positional level added by stack(). The previous
    # reset_index([0, 'imdbid_tt']) named the same level twice and only
    # discarded this one as a side effect.
    .reset_index(level=1, drop=True)
    .reset_index()
)
id_directors.columns = ['imdbid_tt', 'directors']

# Attach each director's name; the inner join drops ids with no match in name_basics.
id_directors = id_directors.merge(name_basics[['nconst', 'primaryName']], how='inner', left_on='directors', right_on='nconst')
# First whitespace-delimited token of the name (NaN for single-word names, since
# the pattern requires a trailing space). The raw string avoids the invalid
# escape sequence warning that '\S' triggers in a normal string literal.
id_directors['firstName'] = id_directors['primaryName'].str.extract(r'(^\S+) ', expand=False)
# id_directors['gender'] = id_directors['firstName'].apply(d.get_gender)

id_directors.head()

Unnamed: 0,imdbid_tt,directors,nconst,primaryName,firstName
0,tt0392728,nm1284117,nm1284117,Louis Aimé Augustin Le Prince,Louis
1,tt0000003,nm0721526,nm0721526,Émile Reynaud,Émile
2,tt0132134,nm0163632,nm0163632,Alfred Clark,Alfred
3,tt0000014,nm0525910,nm0525910,Louis Lumière,Louis
4,tt0000012,nm0525910,nm0525910,Louis Lumière,Louis


Process merged writer data

In [16]:
# Reshape to one row per (movie, writer): split the comma-separated ids into
# columns, then stack the columns back into a long Series.
writers_wide = pd.DataFrame(merge['writers'].str.split(',').tolist(), index=merge['imdbid_tt'])
id_writers = (
    writers_wide.stack()
    # Drop the positional level added by stack(). The previous
    # reset_index([0, 'imdbid_tt']) named the same level twice and only
    # discarded this one as a side effect.
    .reset_index(level=1, drop=True)
    .reset_index()
)
id_writers.columns = ['imdbid_tt', 'writers']

# Attach each writer's name; the inner join drops ids with no match in name_basics.
id_writers = id_writers.merge(name_basics[['nconst', 'primaryName']], how='inner', left_on='writers', right_on='nconst')
# First whitespace-delimited token of the name (NaN for single-word names, since
# the pattern requires a trailing space). The raw string avoids the invalid
# escape sequence warning that '\S' triggers in a normal string literal.
id_writers['firstName'] = id_writers['primaryName'].str.extract(r'(^\S+) ', expand=False)
# id_writers['gender'] = id_writers['firstName'].apply(d.get_gender)

id_writers.head()

Unnamed: 0,imdbid_tt,writers,nconst,primaryName,firstName
0,tt0223341,nm0349785,nm0349785,Alice Guy,Alice
1,tt0000091,nm0617588,nm0617588,Georges Méliès,Georges
2,tt0000211,nm0617588,nm0617588,Georges Méliès,Georges
3,tt0131934,nm0617588,nm0617588,Georges Méliès,Georges
4,tt0000417,nm0617588,nm0617588,Georges Méliès,Georges


Generate dataset names and collect the processed DataFrames in a dictionary keyed by name

In [17]:
from varname import nameof

# The processed frames, paired with their own variable names.
datasets = [merge, principals_names, id_directors, id_writers]
datasets_names = [
    nameof(merge),
    nameof(principals_names),
    nameof(id_directors),
    nameof(id_writers),
]
datasets_zip = list(zip(datasets, datasets_names))

# Map each name to its frame.
datasets_dict = {label: frame for frame, label in datasets_zip}

datasets_dict.keys()

dict_keys(['merge', 'principals_names', 'id_directors', 'id_writers'])

Pickle the dictionary of processed datasets to disk

In [18]:
# Serialise the name -> DataFrame mapping to disk with the highest pickle protocol available.
with open('datasets_dict.pickle', 'wb') as pickle_file:
    pickle.dump(datasets_dict, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)