# Video Game Sales Prediction
---
## Problem Statement
(Fill in later)

### Load Libraries & Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost import XGBRegressor


In [2]:
# Load data
video_games = pd.read_csv('../data/train.csv')
video_games.head()

Unnamed: 0,name,platform,genre,publisher,developer,rating,year_of_release,na_sales,eu_sales,jp_sales,other_sales,global_sales,critic_score,critic_count,user_score,user_count
0,Warriors Orochi 3,XOne,Action,Tecmo Koei,unknown,E,2014.0,0.01,0.03,0.0,0.0,0.04,68.997119,26.440992,7.1269,163.008846
1,Shooter: Starfighter Sanvein,PS,Shooter,Midas Interactive Entertainment,unknown,E,2000.0,0.01,0.01,0.0,0.0,0.02,68.997119,26.440992,7.1269,163.008846
2,CIMA: The Enemy,GBA,Role-Playing,Marvelous Interactive,Neverland,E,2003.0,0.02,0.01,0.0,0.0,0.03,70.0,11.0,7.1269,163.008846
3,Borderlands: The Pre-Sequel,PS3,Shooter,Take-Two Interactive,2K Australia,M,2014.0,0.26,0.21,0.05,0.1,0.61,77.0,24.0,6.3,130.0
4,Destiny,XOne,Shooter,Activision,"Bungie Software, Bungie",T,2014.0,2.14,0.92,0.0,0.31,3.37,75.0,11.0,5.5,1735.0


## Modeling

### Model Preparation

In [3]:
# select model features
X = video_games.drop(columns=['jp_sales', 'other_sales', 'global_sales', 'name'])
# select model target 
y = video_games['global_sales']

# split train data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=42)

### Random Forest

In [11]:
%%time
# Random Forest pipeline
forest_pipe = Pipeline([
    ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore')),
    ('sc', StandardScaler()),
    ('rf', RandomForestRegressor())
])

# set up pipe parameters
forest_params = {
    'rf__n_estimators': np.arange(100, 1100, 100),
    'rf__max_depth': np.arange(1,11,1),
    'rf__max_features': ['sqrt', 'auto', round(np.random.uniform(0.1, 1.0), 2)]
}

# instantiate RandomizedSearch with pipe and params
forest_rs = RandomizedSearchCV(forest_pipe, forest_params, n_iter=20, cv=5, n_jobs=-1, verbose=1)

# fit RandomizedSearch model with train data
forest_rs.fit(X_train, y_train)

# print best score from best model
print('RandomForest Best Score:', forest_rs.best_score_)

# print parameters from best model
forest_rs.best_params_

Fitting 5 folds for each of 20 candidates, totalling 100 fits
RandomForest Best Score: 0.1905655092068375
CPU times: user 10min 36s, sys: 855 ms, total: 10min 37s
Wall time: 48min 5s


{'rf__n_estimators': 400, 'rf__max_features': 'auto', 'rf__max_depth': 10}

In [13]:
# score model on training data (R-squared)
forest_rs.score(X_train, y_train)

0.5790750693175881

In [14]:
# score model on validation data (R-squared)
forest_rs.score(X_val, y_val)

0.20146201475239767

In [16]:
# RMSE for Train Data
forest_preds = forest_rs.predict(X_train)
print('RMSE Train:', mean_squared_error(y_train, forest_preds, squared=False))

RMSE Train: 0.9138949432228005


In [17]:
# RMSE for Validation Data
forest_preds = forest_rs.predict(X_val)
print('RMSE Train:', mean_squared_error(y_val, forest_preds, squared=False))

RMSE Train: 1.255653451126081


Overall the tuned RandomForest is performing worse than the untuned RandomForest. Theres potential to explore values closer to the default values of RandomForest over a gridsearch. 

### XGBoost

In [12]:
%%time
# XGBoost pipeline
xgb_pipe = Pipeline([
    ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore')),
    ('sc', StandardScaler()),
    ('xgb', XGBRegressor())
])

# set up pipe parameters
xgb_params = {
    'xgb__n_estimators': np.arange(100, 1100, 100),
    'xgb__learning_rate': np.arange(0.01, 1.0, 0.01),
    'xgb__max_depth': np.arange(1,11,1),
    'xgb__subsample': np.arange(0.5, 1.0, 0.1),
    'xgb__colsample_bytree': np.arange(0.4, 1.0, 0.1),
    'xgb__colsample_bylevel': np.arange(0.4, 1.0, 0.1)
}

# instaniate Randomized Search with pipe and params
xgb_rs = RandomizedSearchCV(xgb_pipe, xgb_params, n_iter=20, cv=5, n_jobs=-1, verbose=1)

# fit Randomized Search model with train data
xgb_rs.fit(X_train, y_train)

# print best score from best model
print('XGBoost Best Score:', xgb_rs.best_score_)

# print parameters from best model
xgb_rs.best_params_

Fitting 5 folds for each of 20 candidates, totalling 100 fits
XGBoost Best Score: 0.43455558227709146
CPU times: user 35min 57s, sys: 6.6 s, total: 36min 3s
Wall time: 56min 8s


{'xgb__subsample': 0.8999999999999999,
 'xgb__n_estimators': 900,
 'xgb__max_depth': 10,
 'xgb__learning_rate': 0.15000000000000002,
 'xgb__colsample_bytree': 0.6,
 'xgb__colsample_bylevel': 0.8999999999999999}

In [18]:
# score model on training data (R-squared)
xgb_rs.score(X_train, y_train)

0.9970510178885698

In [19]:
# score model on validation data (R-squared)
xgb_rs.score(X_val, y_val)

0.3638125089510651

In [20]:
# RMSE for Train Data
xgb_preds = xgb_rs.predict(X_train)
print('RMSE Train:', mean_squared_error(y_train, xgb_preds, squared=False))

RMSE Train: 0.07649446628439566


In [21]:
# RMSE for Validation Data
xgb_preds = xgb_rs.predict(X_val)
print('RMSE Train:', mean_squared_error(y_val, xgb_preds, squared=False))

RMSE Train: 1.1207650194336796


Overall XGBoost is over fitting. It isn't performing than the baseline of 0.53. There is slightly more overfitting occurring than the untuned XGBoost.