# Regression Open-Ended Project

-----

# Previous Notebooks

- Web Scraping
- Cleaning data
- Exploratory Data Analysis

In [1]:
import warnings
warnings.filterwarnings('ignore')

# Feature Engineering

In [2]:
import pandas as pd
import numpy as np
import pickle

In [3]:
ebert_imdb_df = pickle.load(open('data/ebert_imdb_df_v1.pkl', 'rb'))

In [4]:
ebert_imdb_df.head(2)

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Genre_List,Stars_List,Director,Country,Release_Date,EbertStars,Year,URL,Rating,Runtime,Review
0,A Cure for Wellness,6.6,1646.0,"Directed by Gore Verbinski. With Dane DeHaan,...",47.0,16.0,90.0,"Mystery, Thriller","Dane DeHaan,, Jason Isaacs,, Mia Goth",Gore Verbinski,USA,2017-02-17,2.0,2017.0,/reviews/a-cure-for-wellness-2017,R,146.0,I keep forgetting the title of “A Cure for Wel...
1,Big Little Lies,,,"With Alexander Skarsgård, James Tupper, Laura ...",,,90.0,"Comedy, Crime, Drama","Alexander Skarsgård,, James Tupper,, Laura Dern",Alexander Skarsgård,,NaT,3.5,2017.0,/reviews/big-little-lies-2017,NR,420.0,"HBO’s excellent “Big Little Lies,” based on th..."


In [5]:
def check_foreign(row):
    try:
        country = row['Country']

        if country in ['USA', 'UK', 'Canada']:
            return 0
        else:
            return 1
    except:
        return np.nan
    
def user_critic_ratio(row):
    try:
        ratio = row['User_Review_Count'] / row['Critic_Review_Count']
        return ratio
    except:
        return np.nan
    
def description_length(row):
    try:
        length = len(row['Description'].split())
        return length
    except:
        return np.nan
    
def review_length(row):
    try:
        length = len(row['Review'].split())
        return length
    except:
        return np.nan
    
def convert_season(row):
    try:
        day = row['Release_Date'].timetuple().tm_yday
        
        spring = range(80, 172)
        summer = range(172, 264)
        fall = range(264, 355)

        if day in spring:
            season = 'Spring'
        elif day in summer:
            season = 'Summer'
        elif day in fall:
            season = 'Fall'
        else:
            season = 'Winter'
            
        return season
    except:
        return np.nan

In [6]:
ebert_imdb_df['Foreign'] = ebert_imdb_df.apply(lambda x: check_foreign(x), 1)
ebert_imdb_df['UC_Ratio'] = ebert_imdb_df.apply(lambda x: user_critic_ratio(x), 1)
ebert_imdb_df['Description_Len'] = ebert_imdb_df.apply(lambda x: description_length(x), 1)
ebert_imdb_df['Review_Len'] = ebert_imdb_df.apply(lambda x: review_length(x), 1)
ebert_imdb_df['Season'] = ebert_imdb_df.apply(lambda x: convert_season(x), 1)

## Convert MPAA Rating and Season to Numeric

In [7]:
ebert_imdb_df.head()

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Genre_List,Stars_List,Director,...,Year,URL,Rating,Runtime,Review,Foreign,UC_Ratio,Description_Len,Review_Len,Season
0,A Cure for Wellness,6.6,1646.0,"Directed by Gore Verbinski. With Dane DeHaan,...",47.0,16.0,90.0,"Mystery, Thriller","Dane DeHaan,, Jason Isaacs,, Mia Goth",Gore Verbinski,...,2017.0,/reviews/a-cure-for-wellness-2017,R,146.0,I keep forgetting the title of “A Cure for Wel...,0,0.177778,52,1096,Winter
1,Big Little Lies,,,"With Alexander Skarsgård, James Tupper, Laura ...",,,90.0,"Comedy, Crime, Drama","Alexander Skarsgård,, James Tupper,, Laura Dern",Alexander Skarsgård,...,2017.0,/reviews/big-little-lies-2017,NR,420.0,"HBO’s excellent “Big Little Lies,” based on th...",1,,28,1091,
2,The Great Wall,6.3,14568.0,"Directed by Yimou Zhang. With Matt Damon, Tia...",42.0,96.0,152.0,"Action, Adventure, Fantasy","Matt Damon,, Tian Jing,, Willem Dafoe",Yimou Zhang,...,2017.0,/reviews/the-great-wall-2017,PG-13,103.0,"Chinese/American co-produced action-fantasy ""T...",0,0.631579,36,820,Winter
3,American Fable,6.3,145.0,Directed by Anne Hamilton. With Peyton Kenned...,57.0,1.0,10.0,Thriller,"Peyton Kennedy,, Richard Schiff,, Kip Pardue",Anne Hamilton,...,2017.0,/reviews/american-fable-2017,NR,96.0,11-year-old Gitty (Peyton Kennedy) listens as ...,0,0.1,82,1115,Winter
4,Lovesong,6.4,153.0,"Directed by So Yong Kim. With Riley Keough, J...",74.0,3.0,14.0,Drama,"Riley Keough,, Jena Malone,, Jessie Ok Gray",So Yong Kim,...,2017.0,/reviews/lovesong-2017,NR,85.0,Whether you find “Lovesong” refreshingly coy a...,0,0.214286,27,958,Winter


In [8]:
ebert_imdb_df['Rating'].unique()

array(['R', 'NR', 'PG-13', '', 'PG', 'G', 'NC-17', 'Unrated', 'TV', 'PG13',
       'Not rated', 'No MPAA rating', 'PG-13&#8206;', 'No rating',
       'No MPAA Rating', '.', 'g PG-13', 'R,', ': R', 'PG- 13', 'X'], dtype=object)

In [9]:
mpaa_fix = {'': 'Unrated',
            'TV': 'Unrated',
            'NR': 'Unrated',
            'Not rated': 'Unrated',
            'No MPAA rating': 'Unrated',
            'No rating': 'Unrated',
            'No MPAA Rating': 'Unrated',
            '.': 'Unrated',
            'PG13': 'PG-13',
            'PG-13&#8206;': 'PG-13',
            'g PG-13': 'PG-13',
            'PG- 13': 'PG-13',
            'R,': 'R',
            ': R': 'R',
            'X': 'NC-17'}

In [10]:
for i, rating in ebert_imdb_df['Rating'].iteritems():
    if rating in mpaa_fix.keys():
        better_name = mpaa_fix.get(rating)
        ebert_imdb_df.set_value(i, 'Rating', better_name)
        
ebert_imdb_df['Rating'].unique()

array(['R', 'Unrated', 'PG-13', 'PG', 'G', 'NC-17'], dtype=object)

In [11]:
df_rating = pd.get_dummies(ebert_imdb_df['Rating'])
df_season = pd.get_dummies(ebert_imdb_df['Season'])
ebert_imdb_df = pd.concat([ebert_imdb_df, df_season, df_rating], axis=1)

In [12]:
ebert_imdb_df.columns

Index(['Title', 'IMDB_Rating', 'Rating_Count', 'Description', 'Metascore',
       'User_Review_Count', 'Critic_Review_Count', 'Genre_List', 'Stars_List',
       'Director', 'Country', 'Release_Date', 'EbertStars', 'Year', 'URL',
       'Rating', 'Runtime', 'Review', 'Foreign', 'UC_Ratio', 'Description_Len',
       'Review_Len', 'Season', 'Fall', 'Spring', 'Summer', 'Winter', 'G',
       'NC-17', 'PG', 'PG-13', 'R', 'Unrated'],
      dtype='object')

In [13]:
ebert_imdb_df.head(3)

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Genre_List,Stars_List,Director,...,Fall,Spring,Summer,Winter,G,NC-17,PG,PG-13,R,Unrated
0,A Cure for Wellness,6.6,1646.0,"Directed by Gore Verbinski. With Dane DeHaan,...",47.0,16.0,90.0,"Mystery, Thriller","Dane DeHaan,, Jason Isaacs,, Mia Goth",Gore Verbinski,...,0,0,0,1,0,0,0,0,1,0
1,Big Little Lies,,,"With Alexander Skarsgård, James Tupper, Laura ...",,,90.0,"Comedy, Crime, Drama","Alexander Skarsgård,, James Tupper,, Laura Dern",Alexander Skarsgård,...,0,0,0,0,0,0,0,0,0,1
2,The Great Wall,6.3,14568.0,"Directed by Yimou Zhang. With Matt Damon, Tia...",42.0,96.0,152.0,"Action, Adventure, Fantasy","Matt Damon,, Tian Jing,, Willem Dafoe",Yimou Zhang,...,0,0,0,1,0,0,0,1,0,0


In [14]:
ebert_imdb_df.shape

(9494, 33)

In [15]:
pickle.dump(ebert_imdb_df, open('data/ebert_imdb_df_v2.pkl', 'wb'))

# Convert Genres to Numerical

In [16]:
ebert_imdb_df = pickle.load(open('data/ebert_imdb_df_v2.pkl', 'rb'))

In [17]:
genre_array = [genre_list.split(",") for genre_list in ebert_imdb_df['Genre_List']]
unique_genres = {genre.strip() for genres in genre_array for genre in genres}
unique_genres.discard('')
unique_genres

{'Action',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Film-Noir',
 'Game-Show',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'News',
 'Reality-TV',
 'Romance',
 'Sci-Fi',
 'Short',
 'Sport',
 'Talk-Show',
 'Thriller',
 'War',
 'Western'}

In [18]:
for genre in sorted(unique_genres):
    ebert_imdb_df[genre] = np.zeros(len(ebert_imdb_df), dtype=int)

In [19]:
pd.set_option("display.max_columns", 150)
ebert_imdb_df.head()

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Genre_List,Stars_List,Director,Country,Release_Date,EbertStars,Year,URL,Rating,Runtime,Review,Foreign,UC_Ratio,Description_Len,Review_Len,Season,Fall,Spring,Summer,Winter,G,NC-17,PG,PG-13,R,Unrated,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
0,A Cure for Wellness,6.6,1646.0,"Directed by Gore Verbinski. With Dane DeHaan,...",47.0,16.0,90.0,"Mystery, Thriller","Dane DeHaan,, Jason Isaacs,, Mia Goth",Gore Verbinski,USA,2017-02-17,2.0,2017.0,/reviews/a-cure-for-wellness-2017,R,146.0,I keep forgetting the title of “A Cure for Wel...,0,0.177778,52,1096,Winter,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Big Little Lies,,,"With Alexander Skarsgård, James Tupper, Laura ...",,,90.0,"Comedy, Crime, Drama","Alexander Skarsgård,, James Tupper,, Laura Dern",Alexander Skarsgård,,NaT,3.5,2017.0,/reviews/big-little-lies-2017,Unrated,420.0,"HBO’s excellent “Big Little Lies,” based on th...",1,,28,1091,,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,The Great Wall,6.3,14568.0,"Directed by Yimou Zhang. With Matt Damon, Tia...",42.0,96.0,152.0,"Action, Adventure, Fantasy","Matt Damon,, Tian Jing,, Willem Dafoe",Yimou Zhang,USA,2017-02-17,3.0,2017.0,/reviews/the-great-wall-2017,PG-13,103.0,"Chinese/American co-produced action-fantasy ""T...",0,0.631579,36,820,Winter,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,American Fable,6.3,145.0,Directed by Anne Hamilton. With Peyton Kenned...,57.0,1.0,10.0,Thriller,"Peyton Kennedy,, Richard Schiff,, Kip Pardue",Anne Hamilton,USA,2017-02-17,3.0,2017.0,/reviews/american-fable-2017,Unrated,96.0,11-year-old Gitty (Peyton Kennedy) listens as ...,0,0.1,82,1115,Winter,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Lovesong,6.4,153.0,"Directed by So Yong Kim. With Riley Keough, J...",74.0,3.0,14.0,Drama,"Riley Keough,, Jena Malone,, Jessie Ok Gray",So Yong Kim,USA,2017-02-17,3.0,2017.0,/reviews/lovesong-2017,Unrated,85.0,Whether you find “Lovesong” refreshingly coy a...,0,0.214286,27,958,Winter,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [20]:
# fill the genres with 1's
for idx, row in ebert_imdb_df.iterrows():
    for genre in row['Genre_List'].split(", "):
        if genre != '':
            ebert_imdb_df.ix[idx, genre] = 1

In [21]:
ebert_imdb_df.head()

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Genre_List,Stars_List,Director,Country,Release_Date,EbertStars,Year,URL,Rating,Runtime,Review,Foreign,UC_Ratio,Description_Len,Review_Len,Season,Fall,Spring,Summer,Winter,G,NC-17,PG,PG-13,R,Unrated,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
0,A Cure for Wellness,6.6,1646.0,"Directed by Gore Verbinski. With Dane DeHaan,...",47.0,16.0,90.0,"Mystery, Thriller","Dane DeHaan,, Jason Isaacs,, Mia Goth",Gore Verbinski,USA,2017-02-17,2.0,2017.0,/reviews/a-cure-for-wellness-2017,R,146.0,I keep forgetting the title of “A Cure for Wel...,0,0.177778,52,1096,Winter,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
1,Big Little Lies,,,"With Alexander Skarsgård, James Tupper, Laura ...",,,90.0,"Comedy, Crime, Drama","Alexander Skarsgård,, James Tupper,, Laura Dern",Alexander Skarsgård,,NaT,3.5,2017.0,/reviews/big-little-lies-2017,Unrated,420.0,"HBO’s excellent “Big Little Lies,” based on th...",1,,28,1091,,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,The Great Wall,6.3,14568.0,"Directed by Yimou Zhang. With Matt Damon, Tia...",42.0,96.0,152.0,"Action, Adventure, Fantasy","Matt Damon,, Tian Jing,, Willem Dafoe",Yimou Zhang,USA,2017-02-17,3.0,2017.0,/reviews/the-great-wall-2017,PG-13,103.0,"Chinese/American co-produced action-fantasy ""T...",0,0.631579,36,820,Winter,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,American Fable,6.3,145.0,Directed by Anne Hamilton. With Peyton Kenned...,57.0,1.0,10.0,Thriller,"Peyton Kennedy,, Richard Schiff,, Kip Pardue",Anne Hamilton,USA,2017-02-17,3.0,2017.0,/reviews/american-fable-2017,Unrated,96.0,11-year-old Gitty (Peyton Kennedy) listens as ...,0,0.1,82,1115,Winter,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,Lovesong,6.4,153.0,"Directed by So Yong Kim. With Riley Keough, J...",74.0,3.0,14.0,Drama,"Riley Keough,, Jena Malone,, Jessie Ok Gray",So Yong Kim,USA,2017-02-17,3.0,2017.0,/reviews/lovesong-2017,Unrated,85.0,Whether you find “Lovesong” refreshingly coy a...,0,0.214286,27,958,Winter,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [22]:
ebert_imdb_df = ebert_imdb_df.drop(['Genre_List'], axis='columns')

In [23]:
pickle.dump(ebert_imdb_df, open('data/ebert_imdb_df_v3.pkl', 'wb'))
pickle.dump(unique_genres, open('data/unique_genres.pkl', 'wb'))

## Convert Directors to Numerical

In [24]:
ebert_imdb_df = pickle.load(open('data/ebert_imdb_df_v3.pkl', 'rb'))

In [25]:
print(len(ebert_imdb_df.Director.unique()))
print((ebert_imdb_df.Director.value_counts() >= 2).value_counts()[1])

4437
1542


Directors that appear in a single movie don't add predictive value. As we can see above, removing them from the dataset will let us reduce the number of directors dummy variables from over 4400 to under 1600.

In [26]:
series = ebert_imdb_df['Director'].value_counts() >= 2
relevant_directors = series[series].index.values

In [27]:
for director in sorted(relevant_directors):
    ebert_imdb_df[director] = np.zeros(len(ebert_imdb_df), dtype=int)

In [28]:
ebert_imdb_df.head(2)

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Stars_List,Director,Country,Release_Date,EbertStars,Year,URL,Rating,Runtime,Review,Foreign,UC_Ratio,Description_Len,Review_Len,Season,Fall,Spring,Summer,Winter,G,NC-17,PG,PG-13,R,Unrated,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,Unnamed: 60,Aaron Blaise,Aaron Katz,Abbas Kiarostami,Abdellatif Kechiche,Abel Ferrara,Adam F. Goldberg,Adam Green,Adam McKay,Adam Rapp,Adam Shankman,Adam Wingard,Adrian Lyne,Adrian Shergold,Adrianne Palicki,Adrián García Bogliano,...,Tommy Lee Jones,Tommy O'Haver,Tommy Wirkola,Tomás Gutiérrez Alea,Toni Myers,Tony Bill,Tony Gilroy,Tony Goldwyn,Tony Kaye,Tony Richardson,Tony Scott,Tran Anh Hung,Travis Cluff,Travis Zariwny,Trey Parker,Tyler Perry,Udayan Prasad,Uli Edel,Ulrich Seidl,Ulu Grosbard,Vicente Amorim,Victor Nunez,Victor Salva,Vikram Gandhi,Vincent Gallo,Vincent Paronnaud,Vincent Ward,Vincente Minnelli,Vincenzo Natali,Vitaliy Manskiy,Vittorio De Sica,Volker Schlöndorff,W.D. Hogan,Wai-Keung Lau,Wallace Wolodarsky,Walt Becker,Walter Hill,Walter Salles,Warren Beatty,Wayne Blair,Wayne Kramer,Wayne Wang,Werner Herzog,Wes Anderson,Wes Ball,Wes Craven,Whit Stillman,Will Finn,Will Gluck,William A. Graham,William Dear,William Eubank,William Friedkin,William Gazecki,William Girdler,Wilson Yip,Wim Wenders,Wolfgang Petersen,Woo-Ping Yuen,Woody Allen,Xavier Dolan,Xavier Giannoli,Xavier Koller,Yang Zhang,Yimou Zhang,Yorgos Lanthimos,Yvan Attal,Yves Robert,Zach Braff,Zack Snyder,Zalman King,Zhangke Jia,Zoe R. Cassavetes,Álex de la Iglesia,Éric Rohmer
0,A Cure for Wellness,6.6,1646.0,"Directed by Gore Verbinski. With Dane DeHaan,...",47.0,16.0,90.0,"Dane DeHaan,, Jason Isaacs,, Mia Goth",Gore Verbinski,USA,2017-02-17,2.0,2017.0,/reviews/a-cure-for-wellness-2017,R,146.0,I keep forgetting the title of “A Cure for Wel...,0,0.177778,52,1096,Winter,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Big Little Lies,,,"With Alexander Skarsgård, James Tupper, Laura ...",,,90.0,"Alexander Skarsgård,, James Tupper,, Laura Dern",Alexander Skarsgård,,NaT,3.5,2017.0,/reviews/big-little-lies-2017,Unrated,420.0,"HBO’s excellent “Big Little Lies,” based on th...",1,,28,1091,,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [29]:
# fill the directors with 1's
for idx, row in ebert_imdb_df.iterrows():
    director = row['Director']
    if director != '' and director in relevant_directors:
        ebert_imdb_df.ix[idx, director] = 1

In [30]:
ebert_imdb_df.head(2)

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Stars_List,Director,Country,Release_Date,EbertStars,Year,URL,Rating,Runtime,Review,Foreign,UC_Ratio,Description_Len,Review_Len,Season,Fall,Spring,Summer,Winter,G,NC-17,PG,PG-13,R,Unrated,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,Unnamed: 60,Aaron Blaise,Aaron Katz,Abbas Kiarostami,Abdellatif Kechiche,Abel Ferrara,Adam F. Goldberg,Adam Green,Adam McKay,Adam Rapp,Adam Shankman,Adam Wingard,Adrian Lyne,Adrian Shergold,Adrianne Palicki,Adrián García Bogliano,...,Tommy Lee Jones,Tommy O'Haver,Tommy Wirkola,Tomás Gutiérrez Alea,Toni Myers,Tony Bill,Tony Gilroy,Tony Goldwyn,Tony Kaye,Tony Richardson,Tony Scott,Tran Anh Hung,Travis Cluff,Travis Zariwny,Trey Parker,Tyler Perry,Udayan Prasad,Uli Edel,Ulrich Seidl,Ulu Grosbard,Vicente Amorim,Victor Nunez,Victor Salva,Vikram Gandhi,Vincent Gallo,Vincent Paronnaud,Vincent Ward,Vincente Minnelli,Vincenzo Natali,Vitaliy Manskiy,Vittorio De Sica,Volker Schlöndorff,W.D. Hogan,Wai-Keung Lau,Wallace Wolodarsky,Walt Becker,Walter Hill,Walter Salles,Warren Beatty,Wayne Blair,Wayne Kramer,Wayne Wang,Werner Herzog,Wes Anderson,Wes Ball,Wes Craven,Whit Stillman,Will Finn,Will Gluck,William A. Graham,William Dear,William Eubank,William Friedkin,William Gazecki,William Girdler,Wilson Yip,Wim Wenders,Wolfgang Petersen,Woo-Ping Yuen,Woody Allen,Xavier Dolan,Xavier Giannoli,Xavier Koller,Yang Zhang,Yimou Zhang,Yorgos Lanthimos,Yvan Attal,Yves Robert,Zach Braff,Zack Snyder,Zalman King,Zhangke Jia,Zoe R. Cassavetes,Álex de la Iglesia,Éric Rohmer
0,A Cure for Wellness,6.6,1646.0,"Directed by Gore Verbinski. With Dane DeHaan,...",47.0,16.0,90.0,"Dane DeHaan,, Jason Isaacs,, Mia Goth",Gore Verbinski,USA,2017-02-17,2.0,2017.0,/reviews/a-cure-for-wellness-2017,R,146.0,I keep forgetting the title of “A Cure for Wel...,0,0.177778,52,1096,Winter,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Big Little Lies,,,"With Alexander Skarsgård, James Tupper, Laura ...",,,90.0,"Alexander Skarsgård,, James Tupper,, Laura Dern",Alexander Skarsgård,,NaT,3.5,2017.0,/reviews/big-little-lies-2017,Unrated,420.0,"HBO’s excellent “Big Little Lies,” based on th...",1,,28,1091,,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [31]:
ebert_imdb_df[ebert_imdb_df.Director == 'Gore Verbinski']['Gore Verbinski']

0       1
2223    1
2960    1
4212    1
4880    1
5503    1
Name: Gore Verbinski, dtype: int64

In [32]:
ebert_imdb_df = ebert_imdb_df.drop(['Director'], axis='columns')

In [33]:
pickle.dump(ebert_imdb_df, open('data/ebert_imdb_df_v4.pkl', 'wb'))

## Convert Actors to Numerical

In [34]:
ebert_imdb_df = pickle.load(open('data/ebert_imdb_df_v4.pkl', 'rb'))

In [35]:
stars_array = [stars_list.split(",,") for stars_list in ebert_imdb_df['Stars_List']]
unique_stars = {star.strip() for stars in stars_array for star in stars}
unique_stars.discard('')
unique_stars = pd.Series(sorted(unique_stars))

In [36]:
# count how many appearances each star has
stars_appearances  = dict.fromkeys(unique_stars, 0)

for idx, row in ebert_imdb_df.iterrows():
    for star in row['Stars_List'].split(",, "):
        if star != '':
            stars_appearances[star] += 1

stars_appearances

{'Nils Jørgen Kaalstad': 1,
 'Tim Daly': 1,
 "Kevin O'Morrison": 1,
 'Dee Dee Ramone': 1,
 'Mascha Gonska': 1,
 'María Telón': 1,
 'Rufus': 2,
 'Aiden Turner': 1,
 'Spanky Taylor': 1,
 'Gad Elmaleh': 1,
 'Ricardo Pereira': 1,
 'Trevor Morgan': 3,
 'Kathryn McCormick': 1,
 'Giuseppe Battiston': 5,
 'Monica Bellucci': 10,
 'Hugo Stiglitz': 1,
 'David Damas': 1,
 'Paul Dillon': 1,
 'Tina Turner': 1,
 'Dianna Agron': 2,
 'Tara Fitzgerald': 3,
 'Daniel Day-Lewis': 12,
 'Shawn Driscoll': 1,
 'Paris Hilton': 2,
 'Micheline Lanctôt': 1,
 'Genya Chernaiev': 1,
 'Amanda Crew': 1,
 'Winifred Wagner': 1,
 'Nobuo Kaneko': 1,
 'Louise Bourgoin': 1,
 'Daan Schuurmans': 1,
 'Alyssa Milano': 1,
 'Gene Bervoets': 1,
 'Josh Hartnett': 13,
 'Mahalia Jackson': 1,
 'Jason Scott Lee': 4,
 'Bipasha Basu': 1,
 'Laetitia Casta': 1,
 'Ebru Ceylan': 1,
 'Gene Cross': 1,
 'Eve Lindley': 1,
 'Kate Dickie': 3,
 'Delois Barrett Campbell': 1,
 'Zitao Huang': 1,
 'Kevin Rivera': 1,
 'Tadanobu Asano': 4,
 'Alexis Bledel

In [37]:
stars_appearances_df = pd.DataFrame.from_dict(stars_appearances, orient='index')
stars_appearances_df.columns = ['Appearances']
stars_appearances_df

Unnamed: 0,Appearances
Nils Jørgen Kaalstad,1
Tim Daly,1
Kevin O'Morrison,1
Dee Dee Ramone,1
Mascha Gonska,1
María Telón,1
Rufus,2
Aiden Turner,1
Spanky Taylor,1
Gad Elmaleh,1


In [38]:
print(len(stars_appearances_df))
print(len(stars_appearances_df[stars_appearances_df.Appearances >= 2]))

11113
3341


As with directors, stars that only appear in a single movie in the dataset don't provide predictive value. Removing them allows us to reduce the number of stars dummy variables from over 11 thousand to under 3400.

In [39]:
relevant_actors = stars_appearances_df[stars_appearances_df.Appearances >= 2].index.values
relevant_actors

array(['Rufus', 'Trevor Morgan', 'Giuseppe Battiston', ...,
       'Greta Scacchi', 'David Spade', 'Adam Trese'], dtype=object)

In [40]:
for actor in sorted(relevant_actors):
    ebert_imdb_df[actor] = np.zeros(len(ebert_imdb_df), dtype=int)

In [41]:
ebert_imdb_df.head(2)

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Stars_List,Country,Release_Date,EbertStars,Year,URL,Rating,Runtime,Review,Foreign,UC_Ratio,Description_Len,Review_Len,Season,Fall,Spring,Summer,Winter,G,NC-17,PG,PG-13,R,Unrated,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,Unnamed: 59,Aaron Blaise,Aaron Katz,Abbas Kiarostami,Abdellatif Kechiche,Abel Ferrara,Adam F. Goldberg,Adam Green,Adam McKay,Adam Rapp,Adam Shankman,Adam Wingard,Adrian Lyne,Adrian Shergold,Adrianne Palicki,Adrián García Bogliano,Agnieszka Holland,...,Whoopi Goldberg,Wiley Wiggins,Wilford Brimley,Will Arnett,Will Ferrell,Will Forte,Will Oldham,Will Patton,Will Poulter,Will Smith,Will Yun Lee,Willa Fitzgerald,Willem Dafoe,William Atherton,William Baldwin,William Daniels,William Fichtner,William Forsythe,William H. Macy,William Holden,William Hurt,William Katt,William Petersen,William Ragsdale,William Sadler,William Shatner,Willie Nelson,Winona Ryder,Wojciech Pszoniak,Wood Harris,Woody Harrelson,Wu Jiang,Xavier Samuel,Xiao Shen-Yang,Yada Beener,Yannick Bisson,Yaphet Kotto,Yasiin Bey,Yaya DaCosta,Yayan Ruhian,Yaël Abecassis,Ye Liu,Yeo-jeong Yoon,Yiftach Klein,Yoko Ono,Yoo Gong,Yu-Yong,Yui Natsukawa,Yun-Fat Chow,Yuqi Zhang,Yuriy Tsurilo,Yves Montand,Yvette Mimieux,Yûki Kudô,Zabou Breitman,Zac Efron,Zach Galifianakis,Zach Galligan,Zach Gilford,Zachary Booth,Zachary Knighton,Zachary Quinto,Zakes Mokae,Zdenek Sverák,Ziggy Marley,Zinedine Soualem,Ziyi Zhang,Zoe Kazan,Zoe Saldana,Zoey Deutch,Zooey Deschanel,Zoë Kravitz,Zulay Henao,Élodie Bouchez,Émilie Dequenne
0,A Cure for Wellness,6.6,1646.0,"Directed by Gore Verbinski. With Dane DeHaan,...",47.0,16.0,90.0,"Dane DeHaan,, Jason Isaacs,, Mia Goth",USA,2017-02-17,2.0,2017.0,/reviews/a-cure-for-wellness-2017,R,146.0,I keep forgetting the title of “A Cure for Wel...,0,0.177778,52,1096,Winter,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Big Little Lies,,,"With Alexander Skarsgård, James Tupper, Laura ...",,,90.0,"Alexander Skarsgård,, James Tupper,, Laura Dern",,NaT,3.5,2017.0,/reviews/big-little-lies-2017,Unrated,420.0,"HBO’s excellent “Big Little Lies,” based on th...",1,,28,1091,,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [42]:
# fill actors with 1's
for idx, row in ebert_imdb_df.iterrows():
    for actor in row['Stars_List'].split(",, "):
        if actor != '' and actor in relevant_actors:
            ebert_imdb_df.ix[idx, actor] = 1

In [43]:
ebert_imdb_df.head(2)

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Stars_List,Country,Release_Date,EbertStars,Year,URL,Rating,Runtime,Review,Foreign,UC_Ratio,Description_Len,Review_Len,Season,Fall,Spring,Summer,Winter,G,NC-17,PG,PG-13,R,Unrated,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,Unnamed: 59,Aaron Blaise,Aaron Katz,Abbas Kiarostami,Abdellatif Kechiche,Abel Ferrara,Adam F. Goldberg,Adam Green,Adam McKay,Adam Rapp,Adam Shankman,Adam Wingard,Adrian Lyne,Adrian Shergold,Adrianne Palicki,Adrián García Bogliano,Agnieszka Holland,...,Whoopi Goldberg,Wiley Wiggins,Wilford Brimley,Will Arnett,Will Ferrell,Will Forte,Will Oldham,Will Patton,Will Poulter,Will Smith,Will Yun Lee,Willa Fitzgerald,Willem Dafoe,William Atherton,William Baldwin,William Daniels,William Fichtner,William Forsythe,William H. Macy,William Holden,William Hurt,William Katt,William Petersen,William Ragsdale,William Sadler,William Shatner,Willie Nelson,Winona Ryder,Wojciech Pszoniak,Wood Harris,Woody Harrelson,Wu Jiang,Xavier Samuel,Xiao Shen-Yang,Yada Beener,Yannick Bisson,Yaphet Kotto,Yasiin Bey,Yaya DaCosta,Yayan Ruhian,Yaël Abecassis,Ye Liu,Yeo-jeong Yoon,Yiftach Klein,Yoko Ono,Yoo Gong,Yu-Yong,Yui Natsukawa,Yun-Fat Chow,Yuqi Zhang,Yuriy Tsurilo,Yves Montand,Yvette Mimieux,Yûki Kudô,Zabou Breitman,Zac Efron,Zach Galifianakis,Zach Galligan,Zach Gilford,Zachary Booth,Zachary Knighton,Zachary Quinto,Zakes Mokae,Zdenek Sverák,Ziggy Marley,Zinedine Soualem,Ziyi Zhang,Zoe Kazan,Zoe Saldana,Zoey Deutch,Zooey Deschanel,Zoë Kravitz,Zulay Henao,Élodie Bouchez,Émilie Dequenne
0,A Cure for Wellness,6.6,1646.0,"Directed by Gore Verbinski. With Dane DeHaan,...",47.0,16.0,90.0,"Dane DeHaan,, Jason Isaacs,, Mia Goth",USA,2017-02-17,2.0,2017.0,/reviews/a-cure-for-wellness-2017,R,146.0,I keep forgetting the title of “A Cure for Wel...,0,0.177778,52,1096,Winter,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Big Little Lies,,,"With Alexander Skarsgård, James Tupper, Laura ...",,,90.0,"Alexander Skarsgård,, James Tupper,, Laura Dern",,NaT,3.5,2017.0,/reviews/big-little-lies-2017,Unrated,420.0,"HBO’s excellent “Big Little Lies,” based on th...",1,,28,1091,,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [44]:
print(ebert_imdb_df.iloc[0]['Will Smith'])
print(ebert_imdb_df.iloc[0]['Dane DeHaan'])

0
1


In [45]:
ebert_imdb_df = ebert_imdb_df.drop(['Stars_List'], axis='columns')

In [46]:
pickle.dump(ebert_imdb_df, open('data/ebert_imdb_df_v5.pkl', 'wb'))

## Create buckets for decades

In [47]:
ebert_imdb_df = pickle.load(open('data/ebert_imdb_df_v5.pkl', 'rb'))

In [48]:
ebert_imdb_df = ebert_imdb_df.dropna(subset=['Release_Date', 'Year'])
ax = ebert_imdb_df.Year.hist(bins=range(1920,2020, 10), figsize=(17, 8))

In [49]:
decade_buckets = range(1920, 2020, 10)
for decade in decade_buckets:
    ebert_imdb_df[decade] = np.zeros(len(ebert_imdb_df), dtype=int)

In [50]:
# fill decades with 1's
for idx, row in ebert_imdb_df.iterrows():
    decade_idx = int((row['Year'] - 1920) // 10)
    ebert_imdb_df.ix[idx, decade_buckets[decade_idx]] = 1

In [51]:
ebert_imdb_df.head(2)

Unnamed: 0,Title,IMDB_Rating,Rating_Count,Description,Metascore,User_Review_Count,Critic_Review_Count,Country,Release_Date,EbertStars,Year,URL,Rating,Runtime,Review,Foreign,UC_Ratio,Description_Len,Review_Len,Season,Fall,Spring,Summer,Winter,G,NC-17,PG,PG-13,R,Unrated,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,Unnamed: 58,Aaron Blaise,Aaron Katz,Abbas Kiarostami,Abdellatif Kechiche,Abel Ferrara,Adam F. Goldberg,Adam Green,Adam McKay,Adam Rapp,Adam Shankman,Adam Wingard,Adrian Lyne,Adrian Shergold,Adrianne Palicki,Adrián García Bogliano,Agnieszka Holland,Agnieszka Wojtowicz-Vosloo,...,Will Yun Lee,Willa Fitzgerald,Willem Dafoe,William Atherton,William Baldwin,William Daniels,William Fichtner,William Forsythe,William H. Macy,William Holden,William Hurt,William Katt,William Petersen,William Ragsdale,William Sadler,William Shatner,Willie Nelson,Winona Ryder,Wojciech Pszoniak,Wood Harris,Woody Harrelson,Wu Jiang,Xavier Samuel,Xiao Shen-Yang,Yada Beener,Yannick Bisson,Yaphet Kotto,Yasiin Bey,Yaya DaCosta,Yayan Ruhian,Yaël Abecassis,Ye Liu,Yeo-jeong Yoon,Yiftach Klein,Yoko Ono,Yoo Gong,Yu-Yong,Yui Natsukawa,Yun-Fat Chow,Yuqi Zhang,Yuriy Tsurilo,Yves Montand,Yvette Mimieux,Yûki Kudô,Zabou Breitman,Zac Efron,Zach Galifianakis,Zach Galligan,Zach Gilford,Zachary Booth,Zachary Knighton,Zachary Quinto,Zakes Mokae,Zdenek Sverák,Ziggy Marley,Zinedine Soualem,Ziyi Zhang,Zoe Kazan,Zoe Saldana,Zoey Deutch,Zooey Deschanel,Zoë Kravitz,Zulay Henao,Élodie Bouchez,Émilie Dequenne,1920,1930,1940,1950,1960,1970,1980,1990,2000,2010
0,A Cure for Wellness,6.6,1646.0,"Directed by Gore Verbinski. With Dane DeHaan,...",47.0,16.0,90.0,USA,2017-02-17,2.0,2017.0,/reviews/a-cure-for-wellness-2017,R,146.0,I keep forgetting the title of “A Cure for Wel...,0,0.177778,52,1096,Winter,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,The Great Wall,6.3,14568.0,"Directed by Yimou Zhang. With Matt Damon, Tia...",42.0,96.0,152.0,USA,2017-02-17,3.0,2017.0,/reviews/the-great-wall-2017,PG-13,103.0,"Chinese/American co-produced action-fantasy ""T...",0,0.631579,36,820,Winter,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [52]:
pickle.dump(ebert_imdb_df, open('data/ebert_imdb_df_v6.pkl', 'wb'))

# Plan for Following Notebooks

- More Exploratory Data Analysis
- Making predictions
- Final analysis