# Box Office Revenue Prediction

## Imports

In [202]:
# Widen width of notebook
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))

In [203]:
import pandas as pd
import numpy as np


import re # regex
import ast

from datetime import datetime
from datetime import date

from sklearn import metrics
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import learning_curve, RandomizedSearchCV, train_test_split

## Data Sources
1. IMDB:
    * No. of ratings (star)
    * Avg. star rating (out of 10)
    * No. of user ratings (text reviews)
    * No. of critic ratings (meta critic)
    * Countries (truncate to just first country) - √
    * Language (truncate to just first language) - √
    * No. of languages (to be created from previous column) - √
    * Production house (truncate to just first production house) - √
    * Duration (convert to minutes/numeric from string) - √
    * Genre (need to explode to one-hot columns for 24 genres) - √
    * MPAA rating 
        * TV- ratings - reclassify - √ 
        * Not Rated --> PG-13/Unrated --> PG - many from non-English fall under these categories - √  
       
    * **Gross:** 'Y' variable (need to remove movies with alphabet-only gross, convert INR (other currencies?) to USD and 3 INR 3-digit movies):
        * **`TODO`** Adjust for inflation, based on year of release
    * **`TODO`** Release date + release region - separate out into 2 columns:
        * convert first to datetime and use as a feature (holiday - US - release/not) 

    
&nbsp; 
2. Popularity Scores:
    * Average popularity score per movie

&nbsp; 

3. Sentiment Scores (**`TODO`**):
    * AFINN score - based on IMDB user reviews pre-release
    * AFINN score - based on IMDB user reviews post-release
    

&nbsp;    
4. YouTube (**`TODO`**):
    * View count
    * Like count
    * Dislike count
    * Comment count

## Reading Excel File(s)
#### !!!!!! IMPORTANT- manually removed the unnamed serial number column from the excel sheet I'm reading below!!!!! 
#### Check that it is removed in your excel sheet also

In [281]:
# read xlsx with IMDB_ID as the indexing column (2nd column from the left, or 1st column) 
df = pd.read_excel("Final_data_sheets_updated_popularity_scores.xlsx", index_col = 0)

In [282]:
# read xlsx with IMDB_ID as the indexing column (2nd column from the left, or 1st column) 
df = pd.read_excel("Final_data_sheets_with_Features.xlsx", index_col = 0)

In [283]:
df.shape

(4351, 28)

In [284]:
df.dtypes

Name                             object
num_ratings                       int64
avg_rating                      float64
main_cast_list                   object
main_cast_links                  object
dir_list                         object
creator_list                     object
genre                            object
motion_picture_rating            object
release_date                     object
duration                         object
meta_critic_score               float64
num_user_ratings                float64
num_critic_ratings              float64
story_line                       object
others                           object
Gross                            object
Country                          object
Language                         object
Production_House                 object
viewCount                         int64
likeCount                         int64
dislikeCount                      int64
commentCount                      int64
Afinn Pre Release               float64


In [286]:
# inspect
df.head(50)

Unnamed: 0_level_0,Name,num_ratings,avg_rating,main_cast_list,main_cast_links,dir_list,creator_list,genre,motion_picture_rating,release_date,...,Language,Production_House,viewCount,likeCount,dislikeCount,commentCount,Afinn Pre Release,Afinn Post Release,Release date,Holiday
IMDB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1690470,Gekijouban Poketto monsutâ: Daiamondo & Pâru -...,1251,6.2,Ikue Ôtani|Sarah Natochenny|Wayne Grayson,/name/nm0649026/|/name/nm2516299/|/name/nm0969...,Kunihiko Yuyama,Satoshi Tajiri|Hideki Sonoda,"[Animation, Action, Family, Fantasy, Sci-Fi]",Not Rated,10 July 2010 (Japan),...,Japanese|English,"East Japan Marketing & Communications Inc., G...",0,0,0,0,0.0,0.0,2010-07-10,0
1508290,Kyatapirâ,952,6.7,Shinobu Terajima|Keigo Kasuya|Emi Masuda,/name/nm0855429/|/name/nm2486225/|/name/nm3787...,Kôji Wakamatsu,Hisako Kurosawa|Masao Adachi,"[Drama, War]",Not Rated,14 August 2010 (Japan),...,Japanese,"Skhole Co., Wakamatsu Production",0,0,0,0,0.0,-23.0,2010-08-14,0
2057455,Dalpaengee eui byeol,323,7.3,Cho Young-Chan|Kim Soon-ho|Choi Jungah,/name/nm9804862/|/name/nm9804863/|/name/nm9804...,Seung-jun Yi,,"[Documentary, Drama]",Not Rated,15 February 2014 (Japan),...,Korean,"CreativeEAST, Dalpaengee, NHK",0,0,0,0,0.0,13.0,2014-02-15,0
2077826,Gekijoban Poketto Monsuta besuto uisshu bikuti...,1152,6.1,Rica Matsumoto|Ikue Ôtani|Hideki Takahashi,/name/nm0559551/|/name/nm0649026/|/name/nm0847...,Kunihiko Yuyama,Junichi Masuda|Hideki Sonoda|Ken Sugimori|Sato...,"[Animation, Adventure, Drama, Family, Fantasy,...",Not Rated,16 July 2011 (Japan),...,Japanese|English,"Creatures, GAME FREAK, Holm Inc.",0,0,0,0,0.0,0.0,2011-07-16,0
1937133,De l'autre côté du périph,4319,5.8,Omar Sy|Laurent Lafitte|Sabrina Ouazani,/name/nm1082477/|/name/nm0480850/|/name/nm1493...,David Charhon,Eric Altmayer|Nicolas Altmayer|David Charhon|A...,"[Action, Comedy, Crime, Drama]",R,19 December 2012 (France),...,French,"Mandarin Films, Mars Films, M6 Films",0,0,0,0,0.0,0.0,2012-12-19,0
6595896,Gekijouban Poketto monsutâ: Kimi ni kimeta!,3870,6.4,Unshô Ishizuka|Rica Matsumoto|Sarah Natochenny,/name/nm0411167/|/name/nm0559551/|/name/nm2516...,Kunihiko Yuyama,Takeshi Shudo|Satoshi Tajiri|Shôji Yonemura,"[Animation, Action, Adventure, Family, Fantasy]",Not Rated,23 November 2017 (Singapore),...,Japanese,"Oriental Light and Magic (OLM), Pikachu Proje...",0,0,0,0,12.25,26.692308,2017-11-23,1
4294052,Shingeki no kyojin endo obu za wârudo,3753,4.8,Haruma Miura|Hiroki Hasegawa|Kanata Hongô,/name/nm2067218/|/name/nm4056296/|/name/nm1959...,Shinji Higuchi,Hajime Isayama|Tomohiro Machiyama|Yûsuke Watanabe,"[Action, Drama, Fantasy, Horror, Sci-Fi]",Not Rated,24 September 2015 (Singapore),...,Japanese,"Kôdansha, Licri, Nikkatsu",0,0,0,0,6.333333,0.0,2015-09-24,0
5278832,Córki dancingu,4793,6.4,Marta Mazurek|Michalina Olszanska|Kinga Preis,/name/nm4658761/|/name/nm5584702/|/name/nm0695...,Agnieszka Smoczynska,Robert Bolesto,"[Comedy, Drama, Fantasy, Horror, Musical, Thri...",Not Rated,25 December 2015 (Poland),...,Polish,Wytwórnia Filmów Dokumentalnych i Fabularnych...,0,0,0,0,0.0,0.0,2015-12-25,1
4505170,Joseon Myungtamjung: Nobui Ddal,387,6.4,Myung-Min Kim|Yeon-hee Lee|Dal-su Oh,/name/nm1047988/|/name/nm2165232/|/name/nm1367...,Suk-Yoon Kim,,"[Action, Adventure, Comedy]",G,11 February 2015 (South Korea),...,,"Bakugan Zoobles Comics, DMZ Comics, Showbox/M...",3215,8,0,1,0.0,0.0,2015-02-11,0
3809478,The Kind Words,225,6.8,Rotem Zissman-Cohen|Rassabn Abes|Roy Assaf,/name/nm1577510/|/name/nm8584239/|/name/nm2507...,Shemi Zarhin,Shemi Zarhin,Drama,Not Rated,28 May 2015 (Israel),...,French|Hebrew,"Amérique Film, Israel Film Council, Jerusalem...",6104,22,4,4,36.0,0.0,2015-05-28,0


In [287]:
df.tail(5)

Unnamed: 0_level_0,Name,num_ratings,avg_rating,main_cast_list,main_cast_links,dir_list,creator_list,genre,motion_picture_rating,release_date,...,Language,Production_House,viewCount,likeCount,dislikeCount,commentCount,Afinn Pre Release,Afinn Post Release,Release date,Holiday
IMDB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3626180,The Christmas Secret,2125,7.3,Bethany Joy Lenz|John Reardon|Susan Hogan,/name/nm0502342/|/name/nm1221622/|/name/nm0389...,Norma Bailey,Wesley Bishop|Judd Parkin|Donna Vanliere,"[Drama, Family, Romance]",TV-G,7 December 2014 TV Movie,...,English,,0,0,0,0,0.0,0.0,2014-12-07,0
5925968,Les fausses confidences,197,5.1,Isabelle Huppert|Louis Garrel|Bulle Ogier,/name/nm0001376/|/name/nm0308039/|/name/nm0644...,Luc Bondy|Marie-Louise Bischofberger,Luc Bondy|Marivaux|Geoffrey Layton,"[Comedy, Drama]",,9 March 2017 TV Movie,...,French,"Idéale Audience, ARTE France, Odéon Théâtre d...",0,0,0,0,0.0,0.0,2017-03-09,0
1482393,Beverly Hills Chihuahua 2,2901,4.4,Marcus Coloma|Erin Cahill|Susan Blakely,/name/nm1324844/|/name/nm0128657/|/name/nm0086...,Alex Zamm,Dannah Feinglass Phirman|Danielle Schneider|Je...,"[Adventure, Comedy, Family]",G,1 February 2011 Video,...,English,"Walt Disney Pictures, CounterPunch Studios, M...",0,0,0,0,0.0,0.0,2011-02-01,0
2483260,The Pirate Fairy,11127,6.7,Mae Whitman|Christina Hendricks|Tom Hiddleston,/name/nm0926165/|/name/nm0376716/|/name/nm1089...,Peggy Holmes,Jeffrey M. Howard|Kate Kondell|John Lasseter|P...,"[Animation, Adventure, Family, Fantasy]",G,13 February 2014 Video,...,English,"Prana Studios, Disneytoon Studios",0,0,0,0,0.0,0.0,2014-02-13,0
1216515,Tinker Bell and the Great Fairy Rescue,6664,7.0,Mae Whitman|Lauren Mote|Michael Sheen,/name/nm0926165/|/name/nm3454095/|/name/nm0790...,Bradley Raymond,Bob Hilgenberg|Rob Muir|Joe Ansolabehere|Paul ...,"[Animation, Adventure, Family, Fantasy]",G,21 September 2010 Video,...,English,Disneytoon Studios,0,0,0,0,0.0,0.0,2010-09-21,0


In [288]:
df.shape

(4351, 28)

##### Why are we dropping these?

In [289]:
df = df.drop(['main_cast_list', 'main_cast_links','dir_list','creator_list', 'meta_critic_score','story_line', 'others'], axis = 1)

In [290]:
df.tail(3)

Unnamed: 0_level_0,Name,num_ratings,avg_rating,genre,motion_picture_rating,release_date,duration,num_user_ratings,num_critic_ratings,Gross,...,Language,Production_House,viewCount,likeCount,dislikeCount,commentCount,Afinn Pre Release,Afinn Post Release,Release date,Holiday
IMDB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1482393,Beverly Hills Chihuahua 2,2901,4.4,"[Adventure, Comedy, Family]",G,1 February 2011 Video,1h 25min,9.0,20.0,63500000,...,English,"Walt Disney Pictures, CounterPunch Studios, M...",0,0,0,0,0.0,0.0,2011-02-01,0
2483260,The Pirate Fairy,11127,6.7,"[Animation, Adventure, Family, Fantasy]",G,13 February 2014 Video,1h 18min,25.0,66.0,67400000,...,English,"Prana Studios, Disneytoon Studios",0,0,0,0,0.0,0.0,2014-02-13,0
1216515,Tinker Bell and the Great Fairy Rescue,6664,7.0,"[Animation, Adventure, Family, Fantasy]",G,21 September 2010 Video,1h 16min,17.0,30.0,10872752,...,English,Disneytoon Studios,0,0,0,0,0.0,0.0,2010-09-21,0


In [291]:
df.shape

(4351, 21)

## Data Transformations

### 1. Check for NAs/NANs

In [292]:
df.describe()

Unnamed: 0,num_ratings,avg_rating,num_user_ratings,num_critic_ratings,viewCount,likeCount,dislikeCount,commentCount,Afinn Pre Release,Afinn Post Release,Holiday
count,4351.0,4351.0,4267.0,4267.0,4351.0,4351.0,4351.0,4351.0,4351.0,4351.0,4351.0
mean,49438.71,6.466766,117.522615,132.775486,1588189.0,10648.13,844.473684,1067.971731,8.816311,10.462413,0.033096
std,110896.0,0.960129,173.167208,140.513302,6871187.0,76847.61,7162.396242,9218.423397,12.745791,13.2217,0.178907
min,9.0,1.5,1.0,1.0,0.0,0.0,0.0,0.0,-62.0,-67.5,0.0
25%,1779.5,5.9,13.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,8299.0,6.6,43.0,77.0,0.0,0.0,0.0,0.0,5.0,8.0,0.0
75%,42562.0,7.1,145.0,192.0,171315.0,479.0,29.0,46.5,16.203297,18.125,0.0
max,1812301.0,9.3,998.0,974.0,222426800.0,3483078.0,244120.0,476931.0,125.0,106.0,1.0


In [293]:
df.shape

(4351, 21)

#### Columns with NAs
MPAA rating, num_user_ratings, num_critic_ratings, language, production_house

In [294]:
df.isnull().any()

Name                     False
num_ratings              False
avg_rating               False
genre                    False
motion_picture_rating     True
release_date             False
duration                  True
num_user_ratings          True
num_critic_ratings        True
Gross                    False
Country                  False
Language                  True
Production_House          True
viewCount                False
likeCount                False
dislikeCount             False
commentCount             False
Afinn Pre Release        False
Afinn Post Release       False
Release date             False
Holiday                  False
dtype: bool

#### How many NAs total?
Could be multiple NAs for a given row.

In [295]:
df.isnull().sum().sum()

648

In [296]:
df.isnull().sum()

Name                       0
num_ratings                0
avg_rating                 0
genre                      0
motion_picture_rating    355
release_date               0
duration                   1
num_user_ratings          84
num_critic_ratings        84
Gross                      0
Country                    0
Language                  13
Production_House         111
viewCount                  0
likeCount                  0
dislikeCount               0
commentCount               0
Afinn Pre Release          0
Afinn Post Release         0
Release date               0
Holiday                    0
dtype: int64

#### How many NAs per column?
Can it be manually fixed by finding the true value? Say, for duration of a couple of movies.

In [297]:
df.isnull().sum(axis = 0)

Name                       0
num_ratings                0
avg_rating                 0
genre                      0
motion_picture_rating    355
release_date               0
duration                   1
num_user_ratings          84
num_critic_ratings        84
Gross                      0
Country                    0
Language                  13
Production_House         111
viewCount                  0
likeCount                  0
dislikeCount               0
commentCount               0
Afinn Pre Release          0
Afinn Post Release         0
Release date               0
Holiday                    0
dtype: int64

In [298]:
df[df['duration'].isnull()]

Unnamed: 0_level_0,Name,num_ratings,avg_rating,genre,motion_picture_rating,release_date,duration,num_user_ratings,num_critic_ratings,Gross,...,Language,Production_House,viewCount,likeCount,dislikeCount,commentCount,Afinn Pre Release,Afinn Post Release,Release date,Holiday
IMDB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3861006,Xiao shi dai 3: Ci jin shi dai,619,2.7,"[Drama, Romance]",,17 July 2014 (China),,,,86900000,...,Mandarin,Le Vision Pictures (Tianjin) Co.,0,0,0,0,0.0,0.0,2014-07-17,0


This anyway has other columns as NaNs (info isn't available on IMDB anymore?),  so might as well drop the row.

#### If dropping all NAs?

In [299]:
# Lose 442 (45 rows still have NA in num_user_ratings, num_critic_ratings)
df = df.dropna(subset=['Language', 'Production_House','motion_picture_rating'])
df.shape

(3909, 21)

#### num_user_ratings and num_critic_ratings still have NAs - we will impute these later (pipelining) using median

In [300]:
df.isnull().any()

Name                     False
num_ratings              False
avg_rating               False
genre                    False
motion_picture_rating    False
release_date             False
duration                 False
num_user_ratings          True
num_critic_ratings        True
Gross                    False
Country                  False
Language                 False
Production_House         False
viewCount                False
likeCount                False
dislikeCount             False
commentCount             False
Afinn Pre Release        False
Afinn Post Release       False
Release date             False
Holiday                  False
dtype: bool

In [301]:
# number of IMDB user reviews written
df['num_user_ratings'].isnull().sum()

42

In [302]:
# metacritic rating
df['num_critic_ratings'].isnull().sum()

42

### 2. Add number of languages

In [303]:
# inspect unique values
df.Language.unique()

array(['Japanese|English', 'Japanese', 'Korean', 'French', 'Polish',
       'French|Hebrew', 'English', 'Mandarin|English|Cantonese',
       'Spanish', 'English|Hebrew', 'French|Polish|Russian',
       'Spanish|French|Dutch|Mapudungun', 'Romanian', 'Italian|English',
       'English|Portuguese', 'German|English|Romanian', 'English|Persian',
       'None', 'Korean|Japanese', 'English|French', 'Hindi',
       'English|Spanish', 'English|Esperanto',
       'English|Russian|Afrikaans', 'English|Mandarin|Cantonese',
       'English|Greek|German|Japanese', 'English|American Sign Language',
       'Telugu|Tamil|Hindi|Malayalam',
       'English|Bosnian|Arabic|Dari|Hausa|Fur',
       'English|Portuguese|Thai|Bulgarian', 'English|Punjabi',
       'Kannada|Tulu', 'Malayalam|English', 'Hebrew|French|Arabic',
       'French|English', 'Punjabi|Hindi|English', 'Mandarin|Chinese',
       'Tamil|Telugu', 'Danish', 'French|Danish|Flemish',
       'Mandarin|Cantonese|English', 'French|Latin', 'English|G

In [304]:
languages_split = df.Language.str.split(pat="|")
df['num_languages'] = languages_split.str.len()

In [305]:
# inspect
df.tail(3)

Unnamed: 0_level_0,Name,num_ratings,avg_rating,genre,motion_picture_rating,release_date,duration,num_user_ratings,num_critic_ratings,Gross,...,Production_House,viewCount,likeCount,dislikeCount,commentCount,Afinn Pre Release,Afinn Post Release,Release date,Holiday,num_languages
IMDB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1482393,Beverly Hills Chihuahua 2,2901,4.4,"[Adventure, Comedy, Family]",G,1 February 2011 Video,1h 25min,9.0,20.0,63500000,...,"Walt Disney Pictures, CounterPunch Studios, M...",0,0,0,0,0.0,0.0,2011-02-01,0,1
2483260,The Pirate Fairy,11127,6.7,"[Animation, Adventure, Family, Fantasy]",G,13 February 2014 Video,1h 18min,25.0,66.0,67400000,...,"Prana Studios, Disneytoon Studios",0,0,0,0,0.0,0.0,2014-02-13,0,1
1216515,Tinker Bell and the Great Fairy Rescue,6664,7.0,"[Animation, Adventure, Family, Fantasy]",G,21 September 2010 Video,1h 16min,17.0,30.0,10872752,...,Disneytoon Studios,0,0,0,0,0.0,0.0,2010-09-21,0,1


In [306]:
# check NaN
df['num_languages'].isnull().values.any()

False

### 3. Truncate 'Languages' and 'Country'

In [307]:
# inspect unique values
df.Country.unique()

array(['Japan', 'South Korea|Japan|Finland', 'France', 'Poland',
       'Israel|Canada', 'USA', 'UK', 'Canada', 'Hong Kong|China',
       'Mexico', 'Israel|USA', 'South Korea', 'Spain', 'France|Poland',
       'Chile|Argentina|France|Spain|USA', 'Romania|France|Belgium',
       'Italy|France',
       'Germany|Austria|Monaco|Romania|France|Switzerland',
       'UK|France|Belgium',
       'France|Belgium|Japan|Poland|Netherlands|Hungary|UK|Germany|China|Thailand|Italy|Denmark|USA',
       'USA|China', 'South Korea|USA', 'UK|USA', 'India',
       'USA|Hong Kong|China|Canada', 'USA|Japan', 'France|USA',
       'USA|Sweden', 'Canada|USA', 'Ireland|Canada|UK|USA',
       'UK|India|USA', 'USA|China|Hong Kong', 'France|Belgium',
       'Israel|France|Germany', 'UK|India|Sweden', 'China', 'USA|UK',
       'Australia', 'USA|Spain', 'Denmark', 'France|Portugal',
       'Belgium|France|Luxembourg', 'China|Japan|France', 'UK|France|USA',
       'Germany|UK', 'USA|Canada|New Zealand', 'Israel|German

In [308]:

languages_split = df.Language.str.split(pat="|").apply(lambda x: x[0])
df['Language'] = languages_split

In [309]:
countries_split = df.Country.str.split(pat="|").apply(lambda x: x[0])
df['Country'] = countries_split

In [310]:
df.tail(3)

Unnamed: 0_level_0,Name,num_ratings,avg_rating,genre,motion_picture_rating,release_date,duration,num_user_ratings,num_critic_ratings,Gross,...,Production_House,viewCount,likeCount,dislikeCount,commentCount,Afinn Pre Release,Afinn Post Release,Release date,Holiday,num_languages
IMDB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1482393,Beverly Hills Chihuahua 2,2901,4.4,"[Adventure, Comedy, Family]",G,1 February 2011 Video,1h 25min,9.0,20.0,63500000,...,"Walt Disney Pictures, CounterPunch Studios, M...",0,0,0,0,0.0,0.0,2011-02-01,0,1
2483260,The Pirate Fairy,11127,6.7,"[Animation, Adventure, Family, Fantasy]",G,13 February 2014 Video,1h 18min,25.0,66.0,67400000,...,"Prana Studios, Disneytoon Studios",0,0,0,0,0.0,0.0,2014-02-13,0,1
1216515,Tinker Bell and the Great Fairy Rescue,6664,7.0,"[Animation, Adventure, Family, Fantasy]",G,21 September 2010 Video,1h 16min,17.0,30.0,10872752,...,Disneytoon Studios,0,0,0,0,0.0,0.0,2010-09-21,0,1


In [311]:
# check both columns for NaN
print(df['Language'].isnull().values.any())
print(df['Country'].isnull().values.any())

False
False


In [312]:
# check how many unique values of each
print(len(df.Country.unique()))
print(len(df.Language.unique()))

77
61


### 4. Truncate 'Production_House'

In [313]:
# inspect unique values - 3697 of them, can't see all
df.Production_House.unique()

array([' East Japan Marketing & Communications Inc., GAME FREAK, Nintendo  ',
       ' Skhole Co., Wakamatsu Production  ',
       ' CreativeEAST, Dalpaengee, NHK  ', ...,
       ' Arte France, CB Films, Flach Film  ',
       ' Walt Disney Pictures, CounterPunch Studios, Motion Picture Corporation of America (MPCA)  ',
       ' Disneytoon Studios  '], dtype=object)

In [314]:
production_house_split = df.Production_House.str.split(pat=", ").apply(lambda x: x[0])
df['Production_House'] = production_house_split

In [315]:
# still could too unqiue of a column - 2277 unique values!
len(df.Production_House.unique())

2316

### 5. Check MPAA column and regroup
https://simple.m.wikipedia.org/wiki/Motion_Picture_Association_of_America_film_rating_system

#### **Reclassification:**
* TV-Y, TV-7, TV-G --> G
* TV-PG --> PG
* TV-14 --> PG-13
* TV-MA --> R
* Not Rated (923!) --> 
* Unrated (143) -->

In [316]:
# check NaN
df['motion_picture_rating'].isnull().values.any()

False

In [317]:
# what are the unique ratings, and how many in each category?
df.groupby('motion_picture_rating').size()

motion_picture_rating
G              39
M               1
NC-17           2
Not Rated     945
PG            362
PG-13         913
R            1420
TV-14          29
TV-G            4
TV-MA          34
TV-PG          11
TV-Y            1
TV-Y7           1
Unrated       147
dtype: int64

In [318]:
df.loc[df['motion_picture_rating'].isin(["TV-G", "TV-Y7", "TV-Y"]), 'motion_picture_rating'] = "G"
df.loc[df['motion_picture_rating'].isin(["TV-PG"]), 'motion_picture_rating'] = "PG"
df.loc[df['motion_picture_rating'].isin(["TV-14"]), 'motion_picture_rating'] = "PG-13"
df.loc[df['motion_picture_rating'].isin(["TV-MA"]), 'motion_picture_rating'] = "R"


df.loc[df['motion_picture_rating'].isin(["Unrated"]), 'motion_picture_rating'] = "PG" # mostly documentaries
df.loc[df['motion_picture_rating'].isin(["Not Rated"]), 'motion_picture_rating'] = "PG-13"

df.groupby('motion_picture_rating').size()

motion_picture_rating
G          45
M           1
NC-17       2
PG        520
PG-13    1887
R        1454
dtype: int64

In [319]:
df[df['motion_picture_rating'] == "Not Rated"]

Unnamed: 0_level_0,Name,num_ratings,avg_rating,genre,motion_picture_rating,release_date,duration,num_user_ratings,num_critic_ratings,Gross,...,Production_House,viewCount,likeCount,dislikeCount,commentCount,Afinn Pre Release,Afinn Post Release,Release date,Holiday,num_languages
IMDB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [320]:
df[df['motion_picture_rating'] == "Unrated"]

Unnamed: 0_level_0,Name,num_ratings,avg_rating,genre,motion_picture_rating,release_date,duration,num_user_ratings,num_critic_ratings,Gross,...,Production_House,viewCount,likeCount,dislikeCount,commentCount,Afinn Pre Release,Afinn Post Release,Release date,Holiday,num_languages
IMDB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [321]:
# check NaN after transforming
df['motion_picture_rating'].isnull().values.any()

False

### 6. Convert 'duration' column to time in minutes (integer)

In [322]:
def check_time(time):
    if len(time) == 1:
        if "h" in time[0]:
            new_time = 60*int(re.sub("\D", "", time[0]))
        else:
            new_time = int(re.sub("\D", "", time[0]))
    else:
        new_time = 60*int(re.sub("\D", "", time[0])) + int(re.sub("\D", "", time[1]))
    return new_time

test1 = df.duration.str.split(" ")
test2 = test1.apply(lambda x: check_time(x))

df['duration'] = test2

In [323]:
# check unique values
print(df.duration.unique())

[ 96  85  88  98  87  92 118  72 141 105 101  97 111 135  99 115 107 128
 119 116  76 103 162 100  91  80  93 156 155 112 110  94 108 129 123 132
  90  81 134 170 167 102 137 165 154 109 160 163 120 106 130  73  86 169
  95 127  89 140 104 114 113 158 176 122 121 124 117 125 173 131  71  84
 171 180 157 136 139 151 133  82  83 126 145 144 272 149 146  40 143 159
 148 201  79  78 161 152  75 153 138 147 164 142  70  68 172 166 188 187
  39 168  74 150  77 185 186 220 183  63  58  44  66 190 174 226  46  60
  69  59 334 270]


In [324]:
## check for NaNs after transforming
df['duration'].isnull().values.any()

False

### 7. Expanding 'genre' to one-hot columns

In [325]:
type(df['genre'].iloc[0]) # Need to convert string representation of list to an actual Python list to accumulate as et later

str

In [326]:
# check unique genre lists
unique_genre_lists = df['genre'].unique()
print(unique_genre_lists) 

['[Animation, Action, Family, Fantasy, Sci-Fi]' '[Drama, War]'
 '[Documentary, Drama]'
 '[Animation, Adventure, Drama, Family, Fantasy, Sci-Fi]'
 '[Action, Comedy, Crime, Drama]'
 '[Animation, Action, Adventure, Family, Fantasy]'
 '[Action, Drama, Fantasy, Horror, Sci-Fi]'
 '[Comedy, Drama, Fantasy, Horror, Musical, Thriller]' 'Drama'
 '[Documentary, Biography, History]' 'Musical'
 '[Action, Biography, Crime, Drama, Family, Fantasy]'
 '[Action, Crime, Thriller]' '[Comedy, Romance]' 'Documentary'
 '[Crime, Drama, Thriller]' '[Drama, Thriller]' '[Action, Drama]'
 '[Drama, Mystery, Romance]' '[Drama, History]'
 '[Biography, Crime, Drama, History]' '[Crime, Drama]'
 '[Comedy, Drama, Family]' '[Comedy, Drama]'
 '[Documentary, Comedy, Drama, Family, Fantasy, Romance]'
 '[Drama, Horror]' '[Comedy, Drama, Romance]' '[Drama, Mystery]'
 '[Animation, Fantasy]' '[Action, Drama, Mystery, Thriller]' 'Horror'
 '[Comedy, Music]' '[Horror, Mystery, Thriller]'
 '[Biography, Drama, Sport]' '[Comedy, Crim

In [327]:
def convert_to_list(x):
    if "[" in x:
        x = re.sub("[\[\]]", "", x)
        x = x.split(", ")
    else:
        x = x.split(" ") # split by non-existent delimiter
    return x

In [328]:
# get all unique genres available
genre_lists = df.genre.apply(lambda x: convert_to_list(x))
df.genre = genre_lists

# temp = genre_lists.tolist()
# flattened =  [y for x in temp for y in x]
# print(set(flattened))

In [329]:
type(df['genre'].iloc[0])

list

In [330]:
# add 23 new one-hot columns
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
df = df.join(pd.DataFrame(mlb.fit_transform(df.pop('genre')),
                          columns=mlb.classes_,
                          index=df.index))

In [331]:
df.tail(3)

Unnamed: 0_level_0,Name,num_ratings,avg_rating,motion_picture_rating,release_date,duration,num_user_ratings,num_critic_ratings,Gross,Country,...,Musical,Mystery,News,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
IMDB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1482393,Beverly Hills Chihuahua 2,2901,4.4,G,1 February 2011 Video,85,9.0,20.0,63500000,USA,...,0,0,0,0,0,0,0,0,0,0
2483260,The Pirate Fairy,11127,6.7,G,13 February 2014 Video,78,25.0,66.0,67400000,USA,...,0,0,0,0,0,0,0,0,0,0
1216515,Tinker Bell and the Great Fairy Rescue,6664,7.0,G,21 September 2010 Video,76,17.0,30.0,10872752,USA,...,0,0,0,0,0,0,0,0,0,0


In [332]:
# check for NaNs after transforming in all genre columns
df.shape

(3909, 44)

### 8. Release date to datetime and release location as a separate column

### 9. Cleaning up 'Gross'
Standardize currency, hard-code 3-digit movies, convert string to int/float

In [333]:
type(df['Gross'].iloc[0])

str

In [334]:
pd.set_option('display.max_row', 4000)
#df.Gross

In [335]:
# look for values which have alphabetic characters in them => not in USD and  has to be converted
df[df.Gross.str.contains(pat = "[a-zA-Z]")]

Unnamed: 0_level_0,Name,num_ratings,avg_rating,motion_picture_rating,release_date,duration,num_user_ratings,num_critic_ratings,Gross,Country,...,Musical,Mystery,News,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
IMDB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3417422,Drishyam,23724,8.8,PG-13,19 December 2013 (India),160,81.0,10.0,"INR 750,000,000",India,...,0,0,0,0,0,0,0,1,0,0
6980546,Bharat Ane Nenu,13039,8.0,PG-13,20 April 2018 (India),173,125.0,21.0,INR 206,India,...,0,0,0,0,0,0,0,0,0,0
3569782,Jigarthanda,8325,8.4,PG-13,1 August 2014 (India),171,34.0,20.0,"INR 350,000,000",India,...,0,0,0,0,0,0,0,1,0,0
5440700,Theri,11077,7.2,PG-13,14 April 2016 (India),157,54.0,12.0,"INR 1,500,000,000",India,...,0,0,0,0,0,0,0,0,0,0
2106537,Matru ki Bijlee ka Mandola,5727,5.7,PG-13,11 January 2013 (India),151,42.0,20.0,"INR 466,500,000",India,...,0,0,0,0,0,0,0,0,0,0
6878378,Vivegam,9840,5.9,PG-13,24 August 2017 (Kuwait),149,56.0,20.0,"INR 120,000,000",India,...,0,0,0,0,0,0,0,1,0,0
3848892,Baby,47710,8.0,PG-13,23 January 2015 (India),159,212.0,25.0,"INR 1,429,900,000",India,...,0,0,0,0,0,0,0,1,0,0
3320578,Veeram,6312,6.6,PG-13,10 January 2014 (India),161,39.0,3.0,"INR 1,300,000,000",India,...,0,0,0,0,0,0,0,0,0,0
6734984,Duvvada Jagannadham,2451,5.5,PG-13,23 June 2017 (India),152,10.0,5.0,INR 157,India,...,0,0,0,0,0,0,0,0,0,0
4727512,Srimanthudu,9548,7.6,PG-13,7 August 2015 (India),158,64.0,5.0,"INR 1,445,500,000",India,...,0,0,0,0,0,0,0,0,0,0


In [336]:
# remove extra whitespaces, commas:
df['Gross'] = df.Gross.apply(lambda x: re.sub("[,\s]", "", x))
df[df.Gross.str.contains(pat = "[a-zA-Z]")]

Unnamed: 0_level_0,Name,num_ratings,avg_rating,motion_picture_rating,release_date,duration,num_user_ratings,num_critic_ratings,Gross,Country,...,Musical,Mystery,News,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
IMDB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3417422,Drishyam,23724,8.8,PG-13,19 December 2013 (India),160,81.0,10.0,INR750000000,India,...,0,0,0,0,0,0,0,1,0,0
6980546,Bharat Ane Nenu,13039,8.0,PG-13,20 April 2018 (India),173,125.0,21.0,INR206,India,...,0,0,0,0,0,0,0,0,0,0
3569782,Jigarthanda,8325,8.4,PG-13,1 August 2014 (India),171,34.0,20.0,INR350000000,India,...,0,0,0,0,0,0,0,1,0,0
5440700,Theri,11077,7.2,PG-13,14 April 2016 (India),157,54.0,12.0,INR1500000000,India,...,0,0,0,0,0,0,0,0,0,0
2106537,Matru ki Bijlee ka Mandola,5727,5.7,PG-13,11 January 2013 (India),151,42.0,20.0,INR466500000,India,...,0,0,0,0,0,0,0,0,0,0
6878378,Vivegam,9840,5.9,PG-13,24 August 2017 (Kuwait),149,56.0,20.0,INR120000000,India,...,0,0,0,0,0,0,0,1,0,0
3848892,Baby,47710,8.0,PG-13,23 January 2015 (India),159,212.0,25.0,INR1429900000,India,...,0,0,0,0,0,0,0,1,0,0
3320578,Veeram,6312,6.6,PG-13,10 January 2014 (India),161,39.0,3.0,INR1300000000,India,...,0,0,0,0,0,0,0,0,0,0
6734984,Duvvada Jagannadham,2451,5.5,PG-13,23 June 2017 (India),152,10.0,5.0,INR157,India,...,0,0,0,0,0,0,0,0,0,0
4727512,Srimanthudu,9548,7.6,PG-13,7 August 2015 (India),158,64.0,5.0,INR1445500000,India,...,0,0,0,0,0,0,0,0,0,0


In [337]:
# add 7 trailing zeros for these 4:
# 6980546                 INR 206 Bharat Ane Nenu
# 3142764                 INR 130 Race Gurram
# 6734984                INR 157 Duvvada Jagannadham
# 6522546                INR 124 Spyder

gross_truncated = ["INR206", "INR130","INR157","INR124"]

df['Gross'] = df.Gross.apply(lambda x: x + "0000000" if x in gross_truncated else x)

In [338]:
# for American Satan - VND 74 cumulative worldwide gross - change to USD $226,232
# https://www.the-numbers.com/movie/American-Satan#tab=international
#df.at[5451690, 'Gross'] = "226232"

In [339]:
# Raazi - incorrectly entered as 2070 crores gross on IMDB
#df.at[7098658, 'Gross'] = "2070000000"

In [340]:
from currency_converter import CurrencyConverter
c = CurrencyConverter()

def convert_currency(x):
    if re.search('[a-zA-Z£]', x) == None:
        return float(x)
    
    split_gross = re.split('(\d+)',x)
    
    # GBP
    if(split_gross[0] == "£"):
        return (c.convert(float(split_gross[1]), 'GBP','USD'))
    
    # NPR isn't supported CurrencyConverter - hard code
    if(split_gross[0]  == "NPR"):
        return (float(split_gross[1])*0.0090)
    
    # VND isn't supported CurrencyConverter - hard code
    if(split_gross[0]  == "VND"):
        return (float(split_gross[1])*0.000043)
    
    return (c.convert(float(split_gross[1]), split_gross[0],'USD'))

In [342]:
df.tail(5)

Unnamed: 0_level_0,Name,num_ratings,avg_rating,motion_picture_rating,release_date,duration,num_user_ratings,num_critic_ratings,Gross,Country,...,Musical,Mystery,News,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
IMDB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2806908,Le métis de Dieu,243,6.5,PG-13,29 March 2013 TV Movie,96,9.0,9.0,130661,France,...,0,0,0,0,0,0,0,0,0,0
1721683,La belle endormie,640,5.7,PG,3 September 2010 TV Movie,82,4.0,41.0,28791,France,...,0,0,0,0,0,0,0,0,0,0
1482393,Beverly Hills Chihuahua 2,2901,4.4,G,1 February 2011 Video,85,9.0,20.0,63500000,USA,...,0,0,0,0,0,0,0,0,0,0
2483260,The Pirate Fairy,11127,6.7,G,13 February 2014 Video,78,25.0,66.0,67400000,USA,...,0,0,0,0,0,0,0,0,0,0
1216515,Tinker Bell and the Great Fairy Rescue,6664,7.0,G,21 September 2010 Video,76,17.0,30.0,10872752,USA,...,0,0,0,0,0,0,0,0,0,0


In [346]:
df.tail(5)['Release date']

KeyError: 'Release date'

In [266]:
# check NaN
df['Gross'].isnull().values.any()

False

In [273]:
df['Gross'].describe()

count        3909
unique       3883
top       1100000
freq            3
Name: Gross, dtype: object

In [269]:
#### Adjust gross for inflation based on the year - need inputs from Karthik for this:
def adjust_for_inflation(gross, release_date):
    if release_date.year == 2010:
        return(gross*1.152)
    elif release_date.year == 2011: 
        return(gross*1.124)
    elif release_date.year == 2012:
        return(gross*1.101)
    elif release_date.year == 2013:
        return(gross*1.087)
    elif release_date.year == 2014:
        return(gross*1.086)
    elif release_date.year == 2015:
        return(gross*1.068)
    elif release_date.year == 2016:
        return(gross*1.053)
    elif release_date.year == 2017:
        return(gross*1.032)
    else: # 2018 => just return x itself
        return gross
    

In [275]:
df['gross'] = df.apply(lambda x: adjust_for_inflation(x['Gross'], x['release_date']), axis=1)    

AttributeError: ("'str' object has no attribute 'year'", 'occurred at index 0')

In [348]:
df.head(5)['Release date'][0]

KeyError: 'Release date'

### If dividing into equal revenue ranges (instead of quintiles)

In [24]:
# def find_revenue_range(x):
#     if  0 <= x <= 588111000:
#         return 0
#     elif 588111001 <= x <= 1176222000:
#         return 1
#     elif 1176222000 <= x <= 1764333003:
#         return 2
#     elif 1764333003 <= x <= 2352444004:
#         return 3
#     else:
#         return 4
    
        
# df['gross_equal_range'] = df['Gross'].apply(lambda x: find_revenue_range(x))
 

In [29]:
# df[df['gross_equal_range'] == 3]

Unnamed: 0_level_0,Name,num_ratings,avg_rating,genre,motion_picture_rating,release_date,duration,num_user_ratings,num_critic_ratings,Gross,Country,Language,Production_House,Average_popularity_score_per_movie,num_languages,gross_equal_range
IMDB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2488496,Star Wars: Episode VII - The Force Awakens,775868,8.0,"[Action, Adventure, Fantasy, Sci-Fi]",PG-13,17 December 2015 (Singapore),136,4.0,869.0,2068224000.0,USA,English,Lucasfilm,4.859333,1,3
4154756,Avengers: Infinity War,609049,8.5,"[Action, Adventure, Sci-Fi]",PG-13,25 April 2018 (Singapore),149,3.0,560.0,2048710000.0,USA,English,Marvel Studios,15.181667,1,3


## Categorize movies by gross revenue quintile
Split movies into 5 groups by revenue, and add (one-hot?) columns for classification.

In [67]:
print(list(df.columns.values))
print(df.shape)

['Name', 'num_ratings', 'avg_rating', 'main_cast_list', 'main_cast_links', 'dir_list', 'creator_list', 'motion_picture_rating', 'release_date', 'duration', 'meta_critic_score', 'num_user_ratings', 'num_critic_ratings', 'story_line', 'others', 'Gross', 'Country', 'Language', 'Production_House', 'viewCount', 'likeCount', 'dislikeCount', 'commentCount', 'Afinn Pre Release', 'Afinn Post Release', 'Release date ', 'Holiday', 'Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Romance', 'Sci-Fi', 'Short', 'Sport', 'Thriller', 'War', 'Western']
(3909, 50)


In [68]:
df.head(5)

Unnamed: 0_level_0,Name,num_ratings,avg_rating,main_cast_list,main_cast_links,dir_list,creator_list,motion_picture_rating,release_date,duration,...,Musical,Mystery,News,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
IMDB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1690470,Gekijouban Poketto monsutâ: Daiamondo & Pâru -...,1251,6.2,Ikue Ôtani|Sarah Natochenny|Wayne Grayson,/name/nm0649026/|/name/nm2516299/|/name/nm0969...,Kunihiko Yuyama,Satoshi Tajiri|Hideki Sonoda,PG-13,10 July 2010 (Japan),1h 36min,...,0,0,0,0,1,0,0,0,0,0
1508290,Kyatapirâ,952,6.7,Shinobu Terajima|Keigo Kasuya|Emi Masuda,/name/nm0855429/|/name/nm2486225/|/name/nm3787...,Kôji Wakamatsu,Hisako Kurosawa|Masao Adachi,PG-13,14 August 2010 (Japan),1h 25min,...,0,0,0,0,0,0,0,0,1,0
2057455,Dalpaengee eui byeol,323,7.3,Cho Young-Chan|Kim Soon-ho|Choi Jungah,/name/nm9804862/|/name/nm9804863/|/name/nm9804...,Seung-jun Yi,,PG-13,15 February 2014 (Japan),1h 28min,...,0,0,0,0,0,0,0,0,0,0
2077826,Gekijoban Poketto Monsuta besuto uisshu bikuti...,1152,6.1,Rica Matsumoto|Ikue Ôtani|Hideki Takahashi,/name/nm0559551/|/name/nm0649026/|/name/nm0847...,Kunihiko Yuyama,Junichi Masuda|Hideki Sonoda|Ken Sugimori|Sato...,PG-13,16 July 2011 (Japan),1h 28min,...,0,0,0,0,1,0,0,0,0,0
1937133,De l'autre côté du périph,4319,5.8,Omar Sy|Laurent Lafitte|Sabrina Ouazani,/name/nm1082477/|/name/nm0480850/|/name/nm1493...,David Charhon,Eric Altmayer|Nicolas Altmayer|David Charhon|A...,R,19 December 2012 (France),1h 36min,...,0,0,0,0,0,0,0,0,0,0


#### Divide into quintiles based on gross revenue
This divides into 5 balanced classes.
*** Dividing into 5 based on manually selected ranges results in a very high accuracy ~97%, because it is highly imbalanced - even easiest prediction of majority class can result in this accuracy.  **

In [70]:
ret_value = pd.qcut(df['Gross'], 5, labels=["very low", "low", "medium", "high", "very high"], retbins = True)

TypeError: unsupported operand type(s) for -: 'str' and 'str'

#### Check bucket values

In [71]:
df['gross_category'] = ret_value[0]
ret_value[1]
# low ends at 3.782940e+05, medium ends at 5.823487e+06, high ends at 6.337276e+07

NameError: name 'ret_value' is not defined

In [None]:
df.groupby('gross_category').size()

In [None]:
df_sorted = df.sort_values(['Gross','gross_category'])

#### This prints the whole dataframe (all ~3k rows)! 

In [None]:
#df_sorted

## Basic Classification Model - Logistic Regression

### Join df with YouTube features, Sentiment features


In [72]:
df_cleaned = df.copy()

In [73]:
df_cleaned.head(5)

Unnamed: 0_level_0,Name,num_ratings,avg_rating,main_cast_list,main_cast_links,dir_list,creator_list,motion_picture_rating,release_date,duration,...,Musical,Mystery,News,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
IMDB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1690470,Gekijouban Poketto monsutâ: Daiamondo & Pâru -...,1251,6.2,Ikue Ôtani|Sarah Natochenny|Wayne Grayson,/name/nm0649026/|/name/nm2516299/|/name/nm0969...,Kunihiko Yuyama,Satoshi Tajiri|Hideki Sonoda,PG-13,10 July 2010 (Japan),1h 36min,...,0,0,0,0,1,0,0,0,0,0
1508290,Kyatapirâ,952,6.7,Shinobu Terajima|Keigo Kasuya|Emi Masuda,/name/nm0855429/|/name/nm2486225/|/name/nm3787...,Kôji Wakamatsu,Hisako Kurosawa|Masao Adachi,PG-13,14 August 2010 (Japan),1h 25min,...,0,0,0,0,0,0,0,0,1,0
2057455,Dalpaengee eui byeol,323,7.3,Cho Young-Chan|Kim Soon-ho|Choi Jungah,/name/nm9804862/|/name/nm9804863/|/name/nm9804...,Seung-jun Yi,,PG-13,15 February 2014 (Japan),1h 28min,...,0,0,0,0,0,0,0,0,0,0
2077826,Gekijoban Poketto Monsuta besuto uisshu bikuti...,1152,6.1,Rica Matsumoto|Ikue Ôtani|Hideki Takahashi,/name/nm0559551/|/name/nm0649026/|/name/nm0847...,Kunihiko Yuyama,Junichi Masuda|Hideki Sonoda|Ken Sugimori|Sato...,PG-13,16 July 2011 (Japan),1h 28min,...,0,0,0,0,1,0,0,0,0,0
1937133,De l'autre côté du périph,4319,5.8,Omar Sy|Laurent Lafitte|Sabrina Ouazani,/name/nm1082477/|/name/nm0480850/|/name/nm1493...,David Charhon,Eric Altmayer|Nicolas Altmayer|David Charhon|A...,R,19 December 2012 (France),1h 36min,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# read from YouTube excel file here and join the 2 dataframes for columns - viewCount,  likeCount, dislikeCount, commentCount

In [None]:
# read from sentiment features

### Dealing with categorical features
Inspect non-numeric columns:

* Country                                 object -- 61 unique - categorize as top 5 vs. others 
* Language                                object -- 77 unique - categorize as top 5 vs. others 
* Production_House                        object -- ~2000+ unique - categorize as top 5 vs. othersCheck
* motion_picture_rating                   object -- only 5 groups 
* Name                                    object -- drop, too unique, unless using to derive a text-based feature
* release_date                            object -- drop, can be used to extract weekend/not later 

In [None]:
df_cleaned.dtypes

#### Check production house split

In [None]:
df_cleaned['Production_House'].dtypes

In [None]:
df_cleaned['Production_House'].head(5)

In [None]:
# could do top 5 vs others
df_cleaned.groupby('Production_House').size().sort_values(ascending = False).head(20)

In [34]:
top_production = list(df_cleaned.groupby('Production_House').size().sort_values(ascending = False).head(5).index)

df_cleaned['Production_House'] = df_cleaned.Production_House.apply(lambda x: x if x in top_production
                                         else "Other")

In [None]:
df_cleaned.groupby('Production_House').size().sort_values(ascending = False)

#### Check language split

In [None]:
df_cleaned.groupby('Language').size().sort_values(ascending = False).head(20) # could do English, French, Hindi, Spanish, Mandarin vs. others

In [35]:
top_language = list(df_cleaned.groupby('Language').size().sort_values(ascending = False).head(5).index)

df_cleaned['Language'] = df_cleaned.Language.apply(lambda x: x if x in top_language
                                         else "Other")

In [None]:
df_cleaned.groupby('Language').size().sort_values(ascending = False)

#### Check country split

In [None]:
df_cleaned.groupby('Country').size().sort_values(ascending = False).head(20) # could do USA, UK, France, India, Canada, China vs. others

In [36]:
top_countries = list(df_cleaned.groupby('Country').size().sort_values(ascending = False).head(5).index)

df_cleaned['Country'] = df_cleaned.Country.apply(lambda x: x if x in top_countries
                                         else "Other")

In [None]:
df_cleaned.groupby('Country').size().sort_values(ascending = False)

### One-Hot Encoding 
For categorical features, and the gross_category label.

In [347]:
X = df_cleaned.drop(['gross_category', 'Gross', 'Name'], axis=1) 
# drop is NOT in-place by default, doesn't affect original DF

y = df_cleaned['gross_category'].copy()

KeyError: "['gross_category'] not found in axis"

In [79]:
X.dtypes

num_ratings                             int64
avg_rating                            float64
genre                                  object
motion_picture_rating                  object
duration                                int64
num_user_ratings                      float64
num_critic_ratings                    float64
Country                                object
Language                               object
Production_House                       object
Average_popularity_score_per_movie    float64
num_languages                           int64
gross_equal_range                       int64
dtype: object

In [80]:
#categorical_cols = ["motion_picture_rating", "Country", "Language",  "Production_House"]
X_dummies = pd.get_dummies(X)

In [None]:
X_dummies.shape

In [None]:
X_dummies.head(5)

In [81]:
le = preprocessing.LabelEncoder()
le.fit(y)

LabelEncoder()

In [None]:
list(le.classes_)

In [82]:
y_encoded = le.transform(y) 
#y_encoded = y

### Split data - train, test

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X_dummies, y_encoded, random_state=1)

NameError: name 'X_dummies' is not defined

In [184]:
imputer = SimpleImputer()
scaler = StandardScaler()
lr = LogisticRegression(multi_class = "multinomial", solver = 'newton-cg', max_iter = 3000)

pipe = Pipeline([('imputer', imputer),
                 ('scaler', scaler), 
                 ('lr', lr)])


pipe.fit(X_train, y_train) 

Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose=0)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=3000, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False))])

In [180]:
pipe.named_steps.keys()

dict_keys(['imputer', 'scaler', 'lr'])

In [185]:
# for any continuous parameters, specify a distribution instead of a list of options
param_grid = {}
param_grid['imputer__strategy'] = ["mean", "median"]
param_grid['scaler__with_mean'] = [True, False]
param_grid['scaler__with_std'] = [True, False]
param_grid['lr__C'] = [1, 0.75, 0.5] # smaller specifies stronger regularization
param_grid

{'imputer__strategy': ['mean', 'median'],
 'scaler__with_mean': [True, False],
 'scaler__with_std': [True, False],
 'lr__C': [1, 0.75, 0.5]}

In [186]:
# additional parameters are n_iter (number of searches) and random_state
rand = RandomizedSearchCV(pipe, param_grid, cv=5, scoring='accuracy', n_iter=5, random_state=1)

In [187]:
# time the randomized search
%time rand.fit(X_train, y_train)



CPU times: user 14min 32s, sys: 5.54 s, total: 14min 37s
Wall time: 7min 27s




RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose=0)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_interce...y='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False))]),
          fit_params=None, iid='warn', n_iter=5, n_jobs=None,
          param_distributions={'imputer__strategy': ['mean', 'median'], 'scaler__with_mean': [True, False], 'scaler__with_std': [True, False], 'lr__C': [1, 0.75, 0.5]},
          pre_dispatch='2*n_jobs', random_state=1, refit=True,
          return_train_score='warn', scoring='accuracy', verbose=0)

In [188]:
print(rand.best_score_) # hold-out set
print(rand.best_params_)
# print the best model found by RandomizedSearchCV
print(rand.best_estimator_)

0.5131034482758621
{'scaler__with_std': False, 'scaler__with_mean': True, 'lr__C': 1, 'imputer__strategy': 'median'}
Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbose=0)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=False)), ('lr', LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=3000, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False))])


In [189]:
# predictions on train and test data with best estimator
y_trainpred0 = rand.predict(X_train)
y_pred0 = rand.predict(X_test)

print(metrics.accuracy_score(y_test, y_pred0))
#print(metrics.f1_score(y_test, y_pred0, average='macro')) 

0.5222337125129266
0.5205962928208976


In [190]:
# train set
print(metrics.accuracy_score(y_train, y_trainpred0))
#print(metrics.f1_score(y_train, y_trainpred0, average='macro'))

0.6610344827586206
0.6604624669319147


In [221]:
# interpretation - none of these work?
#lr.summary()
# lr.intercept
lr.coef_

array([[ 1.08828396, -0.16609268,  0.04878577, ...,  0.16808627,
         0.1417818 , -0.18231989],
       [-1.14826968,  0.13146162, -0.2558423 , ..., -0.29465784,
         0.22114469,  0.12376476],
       [-2.15067419,  0.28179708, -0.18237579, ...,  0.15799099,
        -0.57826512,  0.12815282],
       [ 2.5991521 , -0.22380201,  0.88739497, ...,  0.2120307 ,
         0.22211297, -0.39524249],
       [-0.3884922 , -0.02336401, -0.49796265, ..., -0.24345013,
        -0.00677433,  0.3256448 ]])

In [220]:
# print("Train accuracy: ", pipe.score(X_train, y_train))
# print("Test accuracy: ", pipe.score(X_test, y_test))

### k-NN Classification

In [46]:
knn = KNeighborsClassifier()
pipe_knn = Pipeline([('imputer', imputer),
                 ('scaler', scaler), 
                 ('knn', knn)])
# pipeline steps are automatically assigned names by make_pipeline

NameError: name 'imputer' is not defined

In [195]:
param_grid = {}
param_grid['imputer__strategy'] = ["mean", "median"]
param_grid['scaler__with_mean'] = [True, False]
param_grid['scaler__with_std'] = [True, False]
param_grid['knn__n_neighbors'] = [15, 20, 25, 30, 10, 50] 
param_grid['knn__weights'] = ['uniform', 'distance'] 
param_grid['knn__algorithm'] = ['auto', 'ball_tree', 'kd_tree', 'brute']
param_grid

{'imputer__strategy': ['mean', 'median'],
 'scaler__with_mean': [True, False],
 'scaler__with_std': [True, False],
 'knn__n_neighbors': [15, 20, 25, 30, 10, 50],
 'knn__weights': ['uniform', 'distance'],
 'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}

In [196]:
rand_knn = RandomizedSearchCV(pipe_knn, param_grid, cv=5, scoring='accuracy', n_iter=5, random_state=1)

In [197]:
# time the randomized search
%time rand_knn.fit(X_train, y_train)

CPU times: user 3min 13s, sys: 1.99 s, total: 3min 15s
Wall time: 3min 9s


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose=0)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'))]),
          fit_params=None, iid='warn', n_iter=5, n_jobs=None,
          param_distributions={'imputer__strategy': ['mean', 'median'], 'scaler__with_mean': [True, False], 'scaler__with_std': [True, False], 'knn__n_neighbors': [15, 20, 25, 30, 10, 50], 'knn__weights': ['uniform', 'distance'], 'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']},
          pre_dispatch='2*n_jobs', random_state=1, refit=True,
          return_train_score='warn', scoring='accuracy', verbose=0)

In [198]:
print(rand_knn.best_score_) # hold-out set
print(rand_knn.best_params_)

0.4379310344827586
{'scaler__with_std': False, 'scaler__with_mean': True, 'knn__weights': 'uniform', 'knn__n_neighbors': 50, 'knn__algorithm': 'brute', 'imputer__strategy': 'mean'}


In [199]:
# print the best model found by RandomizedSearchCV
print(rand_knn.best_estimator_)

Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose=0)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=False)), ('knn', KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=50, p=2,
           weights='uniform'))])


In [203]:
# predictions on train and test data with best estimator
y_trainpred_knn = rand_knn.predict(X_train)
y_pred_knn = rand_knn.predict(X_test)

print(metrics.accuracy_score(y_test, y_pred_knn))
print(metrics.f1_score(y_test, y_pred_knn, average='macro'))

0.45191313340227507
0.4493592262021139


In [204]:
# train set
print(metrics.accuracy_score(y_train, y_trainpred_knn))
print(metrics.f1_score(y_train, y_trainpred_knn, average='macro'))

0.4748275862068966
0.4693067758301785


In [201]:
# pipe_knn.fit(X_train, y_train) # add randomized CV/grid search
# print("Train accuracy: ", pipe_knn.score(X_train, y_train))
# print("Test accuracy: ", pipe_knn.score(X_test, y_test))

### Random Forest Classification

In [206]:
rf = RandomForestClassifier(random_state=0)
pipe_rf = Pipeline([('imputer', imputer),
                 ('scaler', scaler), 
                 ('rf', rf)])


In [210]:
param_grid = {}
param_grid['imputer__strategy'] = ["mean", "median"]
param_grid['scaler__with_mean'] = [True, False]
param_grid['scaler__with_std'] = [True, False]
param_grid['rf__n_estimators'] = [50, 100, 150, 200, 300, 500]  # how many trees to use in the forest
param_grid['rf__max_depth'] = [3, 5, 7, 9] # max depth
param_grid['rf__criterion'] = ['gini', 'entropy']
param_grid['rf__max_features'] = ['auto', 'log2'] # like mtry
param_grid

{'imputer__strategy': ['mean', 'median'],
 'scaler__with_mean': [True, False],
 'scaler__with_std': [True, False],
 'rf__n_estimators': [50, 100, 150, 200, 300, 500],
 'rf__max_depth': [3, 5, 7, 9],
 'rf__criterion': ['gini', 'entropy'],
 'rf__max_features': ['auto', 'log2']}

In [211]:
rand_rf = RandomizedSearchCV(pipe_rf, param_grid, cv=5, scoring='accuracy', n_iter=5, random_state=1)

In [212]:
# time the randomized search
%time rand_rf.fit(X_train, y_train)

CPU times: user 25.1 s, sys: 930 ms, total: 26 s
Wall time: 16.9 s


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose=0)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, m...ors='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False))]),
          fit_params=None, iid='warn', n_iter=5, n_jobs=None,
          param_distributions={'imputer__strategy': ['mean', 'median'], 'scaler__with_mean': [True, False], 'scaler__with_std': [True, False], 'rf__n_estimators': [50, 100, 150, 200, 300, 500], 'rf__max_depth': [3, 5, 7, 9], 'rf__criterion': ['gini', 'entropy'], 'rf__max_features': ['auto', 'log2']},
          pre_dispatch='2*n_jobs', random_state=1, refit=True,
          return_train_score='warn', scoring='accuracy', verbo

In [213]:
print(rand_rf.best_score_) # hold-out set
print(rand_rf.best_params_)

0.4696551724137931
{'scaler__with_std': False, 'scaler__with_mean': False, 'rf__n_estimators': 500, 'rf__max_features': 'log2', 'rf__max_depth': 9, 'rf__criterion': 'entropy', 'imputer__strategy': 'median'}


In [214]:
# print the best model found by RandomizedSearchCV
print(rand_rf.best_estimator_)

Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbose=0)), ('scaler', StandardScaler(copy=True, with_mean=False, with_std=False)), ('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=...mators=500, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False))])


In [215]:
# predictions on train and test data with best estimator
y_trainpred_rf = rand_rf.predict(X_train)
y_pred_rf = rand_rf.predict(X_test)

print(metrics.accuracy_score(y_test, y_pred_rf))
print(metrics.f1_score(y_test, y_pred_rf, average='macro'))

0.49948293691830403
0.46833752264815975


In [216]:
# train set
print(metrics.accuracy_score(y_train, y_trainpred_rf))
print(metrics.f1_score(y_train, y_trainpred_rf, average='macro'))

0.636551724137931
0.6191621151260325


In [219]:
# pipe_rf.fit(X_train, y_train) # add randomized CV/grid search
# print("Train accuracy: ", pipe_rf.score(X_train, y_train))
# print("Test accuracy: ", pipe_rf.score(X_test, y_test))

In [218]:

# rf.feature_importances - cannot do this when using pipeline?

### TODO modelling
0. Finalize evaluation metric - 1-away classification accuracy could be good - https://sud3010ganesh.github.io/2018-05-29-boxofficerevenueprediction/
1. Interpreting models - e.g., for RF, visualizing feature importance - https://towardsdatascience.com/how-to-visualize-a-decision-tree-from-a-random-forest-in-python-using-scikit-learn-38ad2d75f21c
2. Choose any other candidate models - neural networks, for example (like SNAP paper) or NB
3. Ensembling models/add to hyperparameter tuning above to improve performance
4. extension - setting y-variable (gross category) by K-means clustering gross revenue 