## Model Building for Movie Data

This part mainly focuses on building models to predict movie revenue for the test data. We will use a linear regression model as a baseline and then try tree-based models.

In [1]:
# Display the value of every top-level expression in a cell, not only the last one.
# Later cells depend on this to show several results from a single cell
# (e.g. a head() preview followed by two shapes).
from IPython.core.interactiveshell import InteractiveShell
# "all" is set on the class itself, so it applies to the running shell.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import numpy as np
import pandas as pd
pd.set_option('max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot')
import datetime
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, KFold
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
import os
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import xgboost as xgb
import lightgbm as lgb
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
# from catboost import CatBoostRegressor
# import eli5
# import shap


Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_9.4.1) compiler.
This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.



In [11]:
# Read the train and test CSV files (paths are relative to the working directory)
train, test = (
    pd.read_csv(csv_path) for csv_path in ("new_train_2.csv", "new_test_2.csv")
)

# Preview the first ten training rows
train.head(10)

# (rows, columns) of each frame
train.shape
test.shape

Unnamed: 0,id,budget,original_language,original_title,overview,popularity,runtime,status,tagline,title,Keywords,cast,crew,revenue,collection,has_collection,log_budget,genres_name,n_genres,all_genres,genre_Drama,genre_Comedy,genre_Thriller,genre_Action,genre_Romance,genre_Crime,genre_Adventure,genre_Horror,genre_Science Fiction,genre_Family,genre_Fantasy,has_homepage,log_popularity,company,n_company,all_production_companies,production_company_Warner Bros.,production_company_Universal Pictures,production_company_Paramount Pictures,production_company_Twentieth Century Fox Film Corporation,production_company_Columbia Pictures,production_company_Metro-Goldwyn-Mayer (MGM),production_company_New Line Cinema,production_company_Touchstone Pictures,production_company_Walt Disney Pictures,production_company_Columbia Pictures Corporation,production_company_TriStar Pictures,production_company_Relativity Media,production_company_Canal+,production_company_United Artists,production_company_Miramax Films,production_company_Village Roadshow Pictures,production_company_Regency Enterprises,production_company_BBC Films,production_company_Dune Entertainment,production_company_Working Title Films,country,n_country,all_production_countries,production_country_United States of America,production_country_United Kingdom,production_country_France,production_country_Germany,production_country_Canada,production_country_India,production_country_Italy,production_country_Japan,production_country_Australia,production_country_Russia,production_country_Spain,production_country_China,production_country_Hong Kong,production_country_Ireland,production_country_Belgium,release_time,year,month,DOW,season,Weekend,language,n_lan,all_spoken_languages,lan_English,lan_Français,lan_Español,lan_Deutsch,lan_Pусский,lan_Italiano,lan_日本語,lan_普通话,word,word_div,Crew,n_crew,initial_crew,crew_top,crew_top_ratio,Cast,n_cast,initial_cast,cast_top,cast_top_ratio,log_revenue
0,1,14000000,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,93.0,Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651,Hot Tub Time Machine Collection,1,16.454568,['Comedy'],1,Comedy,0,1,0,0,0,0,0,0,0,0,0,0,2.024905,"['Paramount Pictures', 'United Artists', 'Metr...",3,Metro-Goldwyn-Mayer (MGM) Paramount Pictures U...,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,['United States of America'],1,United States of America,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2015-02-20,2015,2,4,4,0,['English'],1,English,1,0,0,0,0,0,0,0,"['time travel', 'sequel', 'hot tub', 'duringcr...",4,"['Kelly Cantley', 'Steve Pink', 'Josh Heald', ...",72,"['Kelly Cantley', 'Steve Pink', 'Josh Heald', ...",1,0.013889,"['Rob Corddry', 'Craig Robinson', 'Clark Duke'...",24,"['Rob Corddry', 'Craig Robinson', 'Clark Duke'...",3,0.125,16.3263
1,2,40000000,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,113.0,Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...",95149435,The Princess Diaries Collection,1,17.50439,"['Comedy', 'Drama', 'Family', 'Romance']",4,Comedy Drama Family Romance,1,1,0,0,1,0,0,0,0,1,0,0,2.224504,['Walt Disney Pictures'],1,Walt Disney Pictures,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,['United States of America'],1,United States of America,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2004-08-06,2004,8,4,2,0,['English'],1,English,1,0,0,0,0,0,0,0,"['coronation', 'duty', 'marriage', 'falling in...",4,"['Garry Marshall', 'Charles Minsky', 'John Deb...",9,"['Garry Marshall', 'Charles Minsky', 'John Deb...",2,0.222222,"['Anne Hathaway', 'Julie Andrews', 'H√©ctor El...",20,"['Anne Hathaway', 'Julie Andrews', 'H√©ctor El...",4,0.2,18.370959
2,3,3300000,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.29999,105.0,Released,The road to greatness can take you to the edge.,Whiplash,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...","[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...",13092000,,0,15.009433,['Drama'],1,Drama,1,0,0,0,0,0,0,0,0,0,0,1,4.178992,"['Bold Films', 'Blumhouse Productions', 'Right...",3,Blumhouse Productions Bold Films Right of Way ...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,['United States of America'],1,United States of America,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2014-10-10,2014,10,4,3,0,['English'],1,English,1,0,0,0,0,0,0,0,"['jazz', 'obsession', 'conservatory', 'music t...",12,"['Terri Taylor', 'Richard Henderson', 'Jeffrey...",64,"['Terri Taylor', 'Richard Henderson', 'Jeffrey...",3,0.046875,"['Miles Teller', 'J.K. Simmons', 'Melissa Beno...",51,"['Miles Teller', 'J.K. Simmons', 'Melissa Beno...",1,0.019608,16.387512
3,4,1200000,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,122.0,Released,,Kahaani,"[{'id': 10092, 'name': 'mystery'}, {'id': 1054...","[{'cast_id': 1, 'character': 'Vidya Bagchi', '...","[{'credit_id': '52fe48779251416c9108d6eb', 'de...",16000000,,0,13.997833,"['Thriller', 'Drama']",2,Drama Thriller,1,0,1,0,0,0,0,0,0,0,0,1,1.429099,,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,['India'],1,India,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2012-03-09,2012,3,4,1,0,"['English', 'हिन्दी']",2,English हिन्दी,1,0,0,0,0,0,0,0,"['mystery', 'bollywood', 'police corruption', ...",7,"['Sujoy Ghosh', 'Sujoy Ghosh', 'Sujoy Ghosh']",3,"['Sujoy Ghosh', 'Sujoy Ghosh', 'Sujoy Ghosh']",0,0.0,"['Vidya Balan', 'Nawazuddin Siddiqui', 'Paramb...",7,"['Vidya Balan', 'Nawazuddin Siddiqui', 'Paramb...",0,0.0,16.588099
4,5,0,ko,마린보이,Marine Boy is the story of a former national s...,1.14807,118.0,Released,,Marine Boy,{},"[{'cast_id': 3, 'character': 'Chun-soo', 'cred...","[{'credit_id': '52fe464b9251416c75073b43', 'de...",3923970,,0,0.0,"['Action', 'Thriller']",2,Action Thriller,0,0,1,1,0,0,0,0,0,0,0,0,0.76457,,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,['South Korea'],1,South Korea,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2009-02-05,2009,2,3,4,0,['한국어/조선말'],1,한국어/조선말,0,0,0,0,0,0,0,0,,0,"['Jong-seok Yoon', 'Jong-seok Yoon']",2,"['Jong-seok Yoon', 'Jong-seok Yoon']",0,0.0,"['Kim Kang-woo', 'Jo Jae-hyeon', 'Park Si-yeon...",4,"['Kim Kang-woo', 'Jo Jae-hyeon', 'Park Si-yeon...",0,0.0,15.182615
5,6,8000000,en,Pinocchio and the Emperor of the Night,"Pinocchio and his friends, a glow worm and a m...",0.743274,83.0,Released,,Pinocchio and the Emperor of the Night,{},"[{'cast_id': 6, 'character': 'Pinocchio (voice...","[{'credit_id': '52fe46f49251416c9106558b', 'de...",3261638,,0,15.894952,"['Animation', 'Adventure', 'Family']",3,Adventure Animation Family,0,0,0,0,0,0,1,0,0,1,0,0,0.555765,,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1987-08-06,1987,8,3,2,0,['English'],1,English,1,0,0,0,0,0,0,0,,0,"['Collodi', 'Hal Sutherland', ""Dennis O'Flaher...",11,"['Collodi', 'Hal Sutherland', ""Dennis O'Flaher...",1,0.090909,"['Scott Grimes', 'Tom Bosley', 'Rickie Lee Jon...",4,"['Scott Grimes', 'Tom Bosley', 'Rickie Lee Jon...",1,0.25,14.99774
6,7,14000000,en,The Possession,A young girl buys an antique box at a yard sal...,7.286477,92.0,Released,Fear The Demon That Doesn't Fear God,The Possession,{},"[{'cast_id': 23, 'character': 'Clyde', 'credit...","[{'credit_id': '52fe4981c3a368484e12ee29', 'de...",85446075,,0,16.454568,"['Horror', 'Thriller']",2,Horror Thriller,0,0,1,0,0,0,0,1,0,0,0,1,2.114625,"['Ghost House Pictures', 'North Box Productions']",2,Ghost House Pictures North Box Productions,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"['United States of America', 'Canada']",2,Canada United States of America,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2012-08-30,2012,8,3,2,0,['English'],1,English,1,0,0,0,0,0,0,0,,0,"['Sam Raimi', 'Nancy Nayor', 'Anders Villadsen...",77,"['Sam Raimi', 'Nancy Nayor', 'Anders Villadsen...",3,0.038961,"['Jeffrey Dean Morgan', 'Natasha Calis', 'Madi...",14,"['Jeffrey Dean Morgan', 'Natasha Calis', 'Madi...",0,0.0,18.263396
7,8,0,en,Control Room,A chronicle which provides a rare window into ...,1.949044,84.0,Released,Different channels. Different truths.,Control Room,"[{'id': 917, 'name': 'journalism'}, {'id': 163...","[{'cast_id': 2, 'character': 'Himself', 'credi...","[{'credit_id': '52fe47a69251416c750a0daf', 'de...",2586511,,0,0.0,['Documentary'],1,Documentary,0,0,0,0,0,0,0,0,0,0,0,0,1.081481,,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2004-01-15,2004,1,3,4,0,"['العربية', 'English']",2,English العربية,1,0,0,0,0,0,0,0,"['journalism', 'translation', 'television', 'm...",7,['Jehane Noujaim'],1,['Jehane Noujaim'],0,0.0,"['Samir Khader', 'Josh Rushing', 'George W. Bu...",4,"['Samir Khader', 'Josh Rushing', 'George W. Bu...",0,0.0,14.765821
8,9,0,en,Muppet Treasure Island,After telling the story of Flint's last journe...,6.902423,100.0,Released,Set sail for Muppet mayhem!,Muppet Treasure Island,"[{'id': 2041, 'name': 'island'}, {'id': 4418, ...","[{'cast_id': 1, 'character': 'Long John Silver...","[{'credit_id': '52fe43c89251416c7501deb3', 'de...",34327391,The Muppet Collection,1,0.0,"['Action', 'Comedy', 'Music', 'Family', 'Adven...",5,Action Adventure Comedy Family Music,0,1,0,1,0,0,1,0,0,1,0,0,2.067169,"['Walt Disney Pictures', 'Jim Henson Productio...",3,"Jim Henson Company, The Jim Henson Productions...",0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,['United States of America'],1,United States of America,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1996-02-16,1996,2,4,4,0,['English'],1,English,1,0,0,0,0,0,0,0,"['island', 'pirate gang', 'puppet', 'treasure ...",4,"['Brian Henson', 'Frank Oz', 'Brian Henson', '...",8,"['Brian Henson', 'Frank Oz', 'Brian Henson', '...",1,0.125,"['Tim Curry', 'Kevin Bishop', 'Jennifer Saunde...",12,"['Tim Curry', 'Kevin Bishop', 'Jennifer Saunde...",1,0.083333,17.351454
9,10,6000000,en,A Mighty Wind,"In ""A Mighty Wind"", director Christopher Guest...",4.672036,91.0,Released,"Back together for the first time, again.",A Mighty Wind,"[{'id': 11800, 'name': 'mockumentary'}, {'id':...","[{'cast_id': 24, 'character': 'Jonathan Steinb...","[{'credit_id': '52fe45609251416c750545b3', 'de...",18750246,,0,15.60727,"['Comedy', 'Music']",2,Comedy Music,0,1,0,0,0,0,0,0,0,0,0,0,1.735548,['Castle Rock Entertainment'],1,Castle Rock Entertainment,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,['United States of America'],1,United States of America,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2003-04-16,2003,4,2,1,0,['English'],1,English,1,0,0,0,0,0,0,0,"['mockumentary', 'folk singer']",2,"['Christopher Guest', 'Christopher Guest', 'Eu...",11,"['Christopher Guest', 'Christopher Guest', 'Eu...",1,0.090909,"['Bob Balaban', 'Christopher Guest', 'John Mic...",20,"['Bob Balaban', 'Christopher Guest', 'John Mic...",10,0.5,16.746717


(3000, 104)

(4398, 102)

In [12]:
# Every column name of the training frame, as a plain Python list
train.columns.tolist()

['id',
 'budget',
 'original_language',
 'original_title',
 'overview',
 'popularity',
 'runtime',
 'status',
 'tagline',
 'title',
 'Keywords',
 'cast',
 'crew',
 'revenue',
 'collection',
 'has_collection',
 'log_budget',
 'genres_name',
 'n_genres',
 'all_genres',
 'genre_Drama',
 'genre_Comedy',
 'genre_Thriller',
 'genre_Action',
 'genre_Romance',
 'genre_Crime',
 'genre_Adventure',
 'genre_Horror',
 'genre_Science Fiction',
 'genre_Family',
 'genre_Fantasy',
 'has_homepage',
 'log_popularity',
 'company',
 'n_company',
 'all_production_companies',
 'production_company_Warner Bros.',
 'production_company_Universal Pictures',
 'production_company_Paramount Pictures',
 'production_company_Twentieth Century Fox Film Corporation',
 'production_company_Columbia Pictures',
 'production_company_Metro-Goldwyn-Mayer (MGM)',
 'production_company_New Line Cinema',
 'production_company_Touchstone Pictures',
 'production_company_Walt Disney Pictures',
 'production_company_Columbia Pictures Cor

In [21]:
# Columns removed from BOTH train and test before modelling. Keeping the list in one
# place guarantees the two feature matrices are built from the same set of columns.
excluded_cols = [
    'original_language', 'original_title', 'overview', 'status', 'tagline',
    'title', 'Keywords', 'cast', 'crew', 'collection', 'log_budget', 'genres_name',
    'all_genres', 'log_popularity', 'company', 'all_production_companies', 'country',
    'all_production_countries', 'release_time', 'language', 'all_spoken_languages',
    'word', 'Crew', 'initial_crew', 'crew_top', 'Cast', 'initial_cast', 'cast_top',
]
# Target columns are dropped from train only; presumably they are absent from test
# (test has two fewer columns than train) -- verify against the CSV.
target_cols = ['log_revenue', 'revenue']

# Predictors and target for training; the target is the raw (un-logged) revenue.
X_train = train.drop(columns=excluded_cols + target_cols)
Y_train = train['revenue']

# Predictors for the test set, built from the same exclusion list.
X_test = test.drop(columns=excluded_cols)
# NOTE(review): 'id' is not in the exclusion list, so it remains a predictor in
# X_train / X_test -- confirm that using the row identifier as a feature is intended.

In [None]:
# Fix previous data problem

In [22]:
# Split the training data into a training part and a validation part.
# Since the train data does not have many observations, we keep 90% for training
# and hold out 10% for validation.
# random_state is fixed so that the split -- and therefore every validation score
# reported below -- is reproducible across kernel restarts.
# NOTE: this cell overwrites X_train / Y_train with the 90% subset, so running it a
# second time without re-running the previous cell shrinks the training set again.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X_train, Y_train, test_size=0.1, random_state=42
)

### Linear Regression Model

In [None]:
# Baseline model: ordinary least-squares linear regression fitted on the training split.
# NOTE(review): the name `linear_model` rebinds the `sklearn.linear_model` module imported
# at the top of the notebook; from this cell on it refers to the estimator instead.
linear_model = LinearRegression()
linear_model.fit(X=X_train, y=Y_train)

### Tree-Based Model

In [16]:
# First tree-based model: gradient-boosted decision trees with LightGBM.
# Hyper-parameters forwarded to LightGBM (RMSE metric, L1 regularisation,
# row/feature subsampling with a fixed bagging seed).
params = {'num_leaves': 30,
         'min_data_in_leaf': 20,
         'objective': 'regression',
         'max_depth': 5,
         'learning_rate': 0.01,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.2,
         "verbosity": -1}
# The thread count is set exactly once, via n_jobs. LightGBM treats `nthread` and
# `n_jobs` as aliases of the same `num_threads` parameter, so passing both with
# different values (4 and -1) is contradictory.
# n_estimators is only an upper bound; early stopping below picks the actual number.
model1 = lgb.LGBMRegressor(**params, n_estimators=20000, n_jobs=-1)
# Evaluate RMSE on both the training and the validation split, log every 1000 rounds,
# and stop once the validation score has not improved for 200 rounds.
# NOTE(review): the `verbose` / `early_stopping_rounds` fit arguments were removed in
# LightGBM 4.0; when upgrading, pass the equivalent callbacks instead
# (lgb.early_stopping / lgb.log_evaluation).
model1.fit(X_train, Y_train,
           eval_set=[(X_train, Y_train), (X_valid, Y_valid)], eval_metric='rmse',
           verbose=1000, early_stopping_rounds=200)

Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[547]	training's rmse: 5.42874e+07	valid_1's rmse: 7.68631e+07


LGBMRegressor(bagging_fraction=0.9, bagging_freq=1, bagging_seed=11,
       boosting='gbdt', boosting_type='gbdt', class_weight=None,
       colsample_bytree=1.0, feature_fraction=0.9, importance_type='split',
       lambda_l1=0.2, learning_rate=0.01, max_depth=5, metric='rmse',
       min_child_samples=20, min_child_weight=0.001, min_data_in_leaf=20,
       min_split_gain=0.0, n_estimators=20000, n_jobs=-1, nthread=4,
       num_leaves=30, objective='regression', random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=0, verbosity=-1)

In [None]:
# Let's look at how much each predictor contributes to the model's predictions.
# `import shap` is commented out in the notebook's import cell, so without this local
# import the cell fails with a NameError on a fresh kernel.
import shap

# Explainer for the fitted LightGBM model, with the training split passed as its data.
explainer = shap.TreeExplainer(model1, X_train)
# SHAP values computed on the training split.
shap_values = explainer.shap_values(X_train)

# Summary plot of the SHAP values against the corresponding feature values.
shap.summary_plot(shap_values, X_train)