In [1]:
import os, pickle
import pandas as pd
import numpy as np
from sklearn.metrics import *
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Project directories, relative to the notebook's working directory.
# Built with os.path.join so the separators are correct on every OS
# (the previous hard-coded '\\' separators only resolved on Windows).
DATA_DIR = os.path.join('..', 'data', 'raw')
INTERIM_DIR = os.path.join('..', 'data', 'interim')
FINAL_DIR = os.path.join('..', 'data', 'final')
MODELS_DIR = os.path.join('..', 'models')

### Read data files

In [3]:
# Load the train and test splits from FINAL_DIR.
train_set, test_set = (
    pd.read_csv(os.path.join(FINAL_DIR, csv_name))
    for csv_name in ('train_set.csv', 'test_set.csv')
)

In [4]:
# Display the column labels of the training frame as loaded from disk.
train_set.columns

Index(['isAdult', 'startYear', 'runtimeMinutes', 'averageRating', 'numVotes',
       'pi_mean_mean', 'pi_mean_std', 'pi_med_mean', 'pi_med_std',
       'pi_std_mean', 'pi_std_std', 'Action', 'Adult', 'Adventure',
       'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama',
       'Family', 'Fantasy', 'Film-Noir', 'Game-Show', 'History', 'Horror',
       'Music', 'Musical', 'Mystery', 'News', 'Reality-TV', 'Romance',
       'Sci-Fi', 'Short', 'Sport', 'Talk-Show', 'Thriller', 'War', 'Western'],
      dtype='object')

In [5]:
TARGET_COL = 'averageRating'
# numVotes is excluded as well: in the author's opinion it cannot be used because
# this feature would not be available for future movies.
NON_FEATURE_COLS = [TARGET_COL, 'numVotes']

# Targets are kept as column vectors of shape (n_samples, 1).
train_y = train_set[TARGET_COL].values.reshape(-1, 1)
test_y = test_set[TARGET_COL].values.reshape(-1, 1)

# Build the feature matrices from copies without the non-feature columns instead of
# deleting columns in place: the loaded frames stay intact, so this cell can be
# re-run without raising a KeyError on the already-removed columns.
train_X = train_set.drop(columns=NON_FEATURE_COLS).values
test_X = test_set.drop(columns=NON_FEATURE_COLS).values

for array in [train_X, test_X, train_y, test_y]:
    print('Shape of array is : {}'.format(array.shape))

Shape of array is : (185439, 37)
Shape of array is : (79480, 37)
Shape of array is : (185439, 1)
Shape of array is : (79480, 1)


#### Fitting the XGBoost regressor 

In [6]:
def evaluate_regression_model(model,
                              X_train=train_X, y_train=train_y,
                              X_test=test_X, y_test=test_y):
    '''
    Prints the R2 score of a fitted model, first on the train split and
    then on the test split.

    The data arguments default to the notebook-level arrays that exist at
    the moment this function is defined.
    '''
    banner = "#"*50
    splits = (
        (banner+"Train set Results"+banner, X_train, y_train),
        ("\n"+banner+"Test set Results"+banner, X_test, y_test),
    )

    for header, features, targets in splits:
        print(header)
        score = r2_score(targets, model.predict(features))
        print("R2 score: {}".format(score))

    print("#"*120)
    

In [7]:
def save_model(model, filename):
    '''
    Saves a trained model to the Models directory for later use.

    model    : object to serialise with pickle
    filename : name of the file created inside MODELS_DIR
    '''
    # Create the target directory on demand so the first save on a fresh
    # checkout does not fail because the directory is missing.
    os.makedirs(MODELS_DIR, exist_ok=True)

    with open(os.path.join(MODELS_DIR, filename), 'wb') as f:
        pickle.dump(model, f)

In [8]:
# Baseline XGBoost regressor: no hyper-parameters set apart from n_jobs.
xgb_model = XGBRegressor(n_jobs=-1).fit(train_X, train_y)
evaluate_regression_model(model=xgb_model)

##################################################Train set Results##################################################
R2 score: 0.7034459079078683

##################################################Test set Results##################################################
R2 score: 0.6631668025639121
########################################################################################################################


In [9]:
# Persist the baseline XGBoost model.
save_model(model=xgb_model, filename='vanilla_xgb.pkl')

In [10]:
# Fixed random_state so the forest is reproducible across Restart & Run All;
# without it every run yields slightly different scores.
rf_model = RandomForestRegressor(n_jobs=-1, random_state=42)
# train_y is a column vector of shape (n_samples, 1); the forest expects a 1-D
# target, so flatten it explicitly rather than relying on an implicit conversion.
rf_model.fit(train_X, train_y.ravel())
evaluate_regression_model(rf_model)

##################################################Train set Results##################################################
R2 score: 0.9512853635787828

##################################################Test set Results##################################################
R2 score: 0.654231694976209
########################################################################################################################


In [11]:
# Persist the random forest (previously xgb_model was pickled under this name by mistake).
save_model(rf_model, 'vanilla_rf.pkl')

In [12]:
# Linear regression baseline evaluated on the same train/test arrays.
lr_model = LinearRegression(n_jobs=-1).fit(train_X, train_y)
evaluate_regression_model(model=lr_model)

##################################################Train set Results##################################################
R2 score: 0.5014457975664124

##################################################Test set Results##################################################
R2 score: 0.5091145805225719
########################################################################################################################


In [13]:
# Persist the linear regression baseline.
save_model(model=lr_model, filename="vanilla_lr.pkl")

### Manually changing parameters

In [14]:
# XGBoost variant with max_depth=10; rebinds xgb_model to the new estimator.
xgb_model = XGBRegressor(n_jobs=-1, max_depth=10).fit(train_X, train_y)
evaluate_regression_model(model=xgb_model)

##################################################Train set Results##################################################
R2 score: 0.807983056608621

##################################################Test set Results##################################################
R2 score: 0.6502518069212846
########################################################################################################################


In [15]:
# XGBoost variant with max_depth=4; rebinds xgb_model to the new estimator.
xgb_model = XGBRegressor(n_jobs=-1, max_depth=4).fit(train_X, train_y)
evaluate_regression_model(model=xgb_model)

##################################################Train set Results##################################################
R2 score: 0.6714511021272971

##################################################Test set Results##################################################
R2 score: 0.6590955940040311
########################################################################################################################


In [16]:
# XGBoost variant with subsample=0.8; rebinds xgb_model to the new estimator.
xgb_model = XGBRegressor(n_jobs=-1, subsample=0.8).fit(train_X, train_y)
evaluate_regression_model(model=xgb_model)

##################################################Train set Results##################################################
R2 score: 0.7019585115453526

##################################################Test set Results##################################################
R2 score: 0.6619330271533119
########################################################################################################################


## Conclusions:

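1. Our best XGBoost model achieves an R² score of about 0.66 on the test set (R² is a goodness-of-fit measure, not a classification accuracy).
2. Its R² of about 0.70 on the training set, compared with about 0.66 on the test set, suggests some overfitting.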

However, XGBoost, according to the paper describing it, is designed to be robust against overfitting.
The following options are available for further exploration:
 - Changing parameters of the estimator (worked to some extent in this notebook)
 - Artificial Neural Networks and other algorithms 
 - Other ensembles of estimators (Bagging, Boosting, Stacking)
 - Analyzing the features to check whether they are misleading the model in some way, since we have detected spurious behavior in the popularity index.