In [1]:
import pandas as pd
# Allow up to 300 columns to be shown when a DataFrame is displayed.
pd.set_option('display.max_columns', 300)
import numpy as np
from matplotlib import pyplot as plt
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
import seaborn as sns
# Apply seaborn's "whitegrid" style to plots drawn after this point.
sns.set(style="whitegrid")

In [4]:
# Load the training set; the first CSV column becomes the row index.
data = pd.read_csv(
    "resources/movies_training.csv",
    index_col=0,
)

In [7]:
# List the available columns; 'gross' is used as the target further down.
data.columns

Index(['num_critic_for_reviews', 'duration', 'director_facebook_likes',
       'actor_3_facebook_likes', 'actor_1_facebook_likes',
       'cast_total_facebook_likes', 'facenumber_in_poster',
       'num_user_for_reviews', 'budget', 'title_year',
       'actor_2_facebook_likes', 'imdb_score', 'G', 'PG', 'PG-13', 'R',
       'gross'],
      dtype='object')

## Determine our best model 

Do we want to transform the target variable by taking the log or square root of it?

In [18]:
# Separate predictors from the response: every column except 'gross' is a
# feature, and 'gross' itself is the value to predict.
features = data.drop(columns=['gross'])
target = data['gross']


Create a train test split of our data.

In [23]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=22,
)


In [24]:
# Preview the first rows of the training features as a sanity check on the split.
X_train.head()

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,G,PG,PG-13,R
2830,157.0,125.0,448.0,93.0,273.0,529,1.0,140.0,10000000.0,2004.0,114.0,8.1,0,0,1,0
1065,142.0,102.0,148.0,135.0,8000.0,9176,4.0,343.0,48000000.0,2001.0,973.0,5.7,0,0,1,0
80,453.0,124.0,4000.0,4000.0,21000.0,48638,4.0,723.0,200000000.0,2010.0,19000.0,7.0,0,0,1,0
3764,61.0,90.0,4.0,556.0,2000.0,5202,0.0,202.0,36998505.3,2006.0,936.0,7.0,0,0,0,1
2402,195.0,98.0,72.0,566.0,1000.0,3151,2.0,485.0,18000000.0,2003.0,624.0,7.1,0,0,0,1


How do you want to handle extreme values for some of the continuous columns?

In [25]:
# Summary statistics of the training features; compare max against the
# quartiles to spot columns with extreme values.
X_train.describe()

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,G,PG,PG-13,R
count,1938.0,1938.0,1938.0,1938.0,1938.0,1938.0,1938.0,1938.0,1938.0,1938.0,1938.0,1938.0,1938.0,1938.0,1938.0,1938.0
mean,188.203818,109.404541,752.603199,898.403095,8448.017544,12749.148607,1.436691,359.930341,48480010.0,2006.297214,2316.650155,6.402425,0.0258,0.148607,0.410733,0.409701
std,124.788044,19.787551,2946.005715,2145.737933,11738.06775,17123.788297,2.077983,415.673179,74672600.0,5.543967,5270.858278,1.057827,0.158578,0.355792,0.492094,0.491905
min,3.0,42.0,0.0,0.0,0.0,0.0,0.0,5.0,15000.0,1996.0,0.0,1.6,0.0,0.0,0.0,0.0
25%,97.0,96.0,12.0,223.0,820.0,2176.0,0.0,127.25,15000000.0,2002.0,436.0,5.8,0.0,0.0,0.0,0.0
50%,160.0,106.0,65.0,471.5,2000.0,4861.5,1.0,233.0,34000000.0,2006.0,722.0,6.5,0.0,0.0,0.0,0.0
75%,248.0,119.0,234.0,735.5,14000.0,17880.5,2.0,432.0,60000000.0,2011.0,1000.0,7.1,0.0,0.0,1.0,1.0
max,813.0,280.0,23000.0,23000.0,164000.0,303717.0,31.0,5060.0,2400000000.0,2016.0,137000.0,8.9,1.0,1.0,1.0,1.0


In [26]:
# Columns flagged for extreme-value treatment. The treatment itself is not
# implemented in the cells shown here.
extreme_cols = [
    'budget',
    'cast_total_facebook_likes',
    'director_facebook_likes',
    'actor_3_facebook_likes',
]

Create any polynomial or interaction terms?

What type of scaler do we want to use?

What feature selection methods will we implement?

Fit our model

Evaluate how our model performs on the test set.

## Once we are happy with our performance and have solidified the process, let's rerun the same process and refit the model on the entire dataset.

## Now that our model has been fit on more data points, let's apply it to the holdout set.

In [15]:
# Load the holdout features; the first CSV column becomes the row index,
# matching how the training file was read.
holdout = pd.read_csv(
    "resources/movies_holdout_features.csv",
    index_col=0,
)

Remember we have to perform the same transformations on our holdout data (feature engineering, extreme values and scaling) that we performed on the original data.  

Predict using the fitted model. 

Grade our predictions.  