In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np
import os
import csv
import requests
import json

# Import SQL Alchemy
from sqlalchemy import create_engine

# Import and establish Base for which classes will be constructed 
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()

# Import modules to declare columns and column data types
from sqlalchemy import Column, Integer, String, Float

import sqlite3

In [2]:
# Files to Load
# https://data.world/data-society/imdb-5000-movie-dataset
movies_to_load = os.path.join("data", "movie_metadata.csv")

In [3]:
movies_df = pd.read_csv(movies_to_load, encoding="UTF-8")
# Add escapechar; commas in title escaped with \

In [4]:
movies_df.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [5]:
# Nonbreaking space at end of titles; removed by regex
movies_df = movies_df.replace(' ','', regex=True)

In [22]:
# http://www.imdb.com/title/tt0499549/?ref_=fn_tt_tt_1
movies_df['tid'] = movies_df['movie_imdb_link'].replace('http://www.imdb.com/title/','', regex=True)
movies_df['tid'] = movies_df['tid'].replace('/\?ref_=fn_tt_tt_1','', regex=True)

In [29]:
movies_df.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,tid
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,tt0499549
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,tt0449088
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,tt2379713
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000,tt1345836
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,12.0,7.1,,0,tt5289954


In [23]:
movies_df.to_csv("movies_df.csv")

In [24]:
data = pd.read_json("http://bechdeltest.com/api/v1/getAllMovies", orient="columns")
bechdel_df = pd.DataFrame(data)

In [25]:
bechdel_df.head()

Unnamed: 0,id,imdbid,rating,title,year
0,8040,392728,0,Roundhay Garden Scene,1888
1,5433,3,0,Pauvre Pierrot,1892
2,5444,14,0,Tables Turned on the Gardener,1895
3,6200,132134,0,"Execution of Mary, Queen of Scots, The",1895
4,6199,12,0,"Arrival of a Train, The",1896


In [26]:
# Add prefix tt to id to merge with movies df
bechdel_df['tid'] = 'tt' + bechdel_df['imdbid'].astype(str)

In [27]:
bechdel_df.head()


Unnamed: 0,id,imdbid,rating,title,year,tid
0,8040,392728,0,Roundhay Garden Scene,1888,tt0392728
1,5433,3,0,Pauvre Pierrot,1892,tt0000003
2,5444,14,0,Tables Turned on the Gardener,1895,tt0000014
3,6200,132134,0,"Execution of Mary, Queen of Scots, The",1895,tt0132134
4,6199,12,0,"Arrival of a Train, The",1896,tt0000012


In [28]:
truncatedBechdel_df = bechdel_df[['tid','rating']].copy()
truncatedBechdel_df.columns = ['tid','bechdelRating']
truncatedBechdel_df.head()

Unnamed: 0,tid,bechdelRating
0,tt0392728,0
1,tt0000003,0
2,tt0000014,0
3,tt0132134,0
4,tt0000012,0


In [30]:
# Merge to add bechdel rating on tid
movies_merged_df = pd.merge(movies_df, truncatedBechdel_df, on="tid", how="left")
movies_merged_df.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,tid,bechdelRating
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,tt0499549,2.0
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,tt0449088,3.0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,tt2379713,1.0
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000,tt1345836,1.0
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,12.0,7.1,,0,tt5289954,


In [31]:
movies_merged_df.to_csv("movies_merged_df.csv")

In [35]:
# Create the MoviesBechdel class
class MoviesBechdel(Base):
    __tablename__ = 'movies_bechdel', 
    id = Column(Integer, primary_key=True)
    color = Column(String(255))
    director_name = Column(String(255))
    num_critic_for_reviews = Column(Integer)
    duration = Column(Integer)
    director_facebook_likes = Column(Integer)
    actor_3_facebook_likes = Column(Integer)
    actor_2_name = Column(String(255))
    actor_1_facebook_likes = Column(Integer)
    gross = Column(Integer)
    genres = Column(String(255))
    actor_1_name = Column(String(255))
    movie_title = Column(String(255))
    num_voted_users = Column(Integer)
    cast_total_facebook_likes = Column(Integer)
    actor_3_name = Column(String(255))
    facenumber_in_poster = Column(Integer)
    plot_keywords = Column(String(255))
    movie_imdb_link = Column(String(255))
    num_user_for_reviews = Column(Integer)
    language = Column(String(255))
    country = Column(String(255))
    content_rating = Column(String(255))
    budget = Column(Integer)
    title_year = Column(Integer)
    actor_2_facebook_likes = Column(Integer)
    imdb_score = Column(Float)
    aspect_ratio = Column(Float)
    movie_facebook_likes = Column(Integer)
    tid = Column(String(255))
    bechdelRating = Column(Integer)

In [36]:
# Create a connection to a SQLite database
engine = create_engine('sqlite:///moviesBechdel.sqlite')

In [37]:
# Create the bechdel movies table within the database
Base.metadata.create_all(engine)

In [38]:
# To push the objects made and query the server we use a Session object
from sqlalchemy.orm import Session
session = Session(bind=engine)

In [40]:
conn = sqlite3.connect('moviesBechdel.sqlite')

In [41]:
movies_merged_df.to_sql('movies_bechdel', conn, if_exists='append', index=False)

In [42]:
session.commit()

In [44]:
dataRead = pd.read_sql("SELECT * FROM movies_bechdel", conn)

In [45]:
dataRead.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,tid,bechdelRating
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,tt0499549,2.0
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,tt0449088,3.0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,tt2379713,1.0
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000,tt1345836,1.0
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,12.0,7.1,,0,tt5289954,
