# Video Game Sales Prediction
---
## Problem Statement
(Fill in later)

### Load Libraries & Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV


In [2]:
# Load data
video_games = pd.read_csv('../data/train.csv')
video_games.head()

Unnamed: 0,name,platform,genre,publisher,developer,rating,year_of_release,na_sales,eu_sales,jp_sales,other_sales,global_sales,critic_score,critic_count,user_score,user_count
0,Warriors Orochi 3,XOne,Action,Tecmo Koei,unknown,E,2014.0,0.01,0.03,0.0,0.0,0.04,68.997119,26.440992,7.1269,163.008846
1,Shooter: Starfighter Sanvein,PS,Shooter,Midas Interactive Entertainment,unknown,E,2000.0,0.01,0.01,0.0,0.0,0.02,68.997119,26.440992,7.1269,163.008846
2,CIMA: The Enemy,GBA,Role-Playing,Marvelous Interactive,Neverland,E,2003.0,0.02,0.01,0.0,0.0,0.03,70.0,11.0,7.1269,163.008846
3,Borderlands: The Pre-Sequel,PS3,Shooter,Take-Two Interactive,2K Australia,M,2014.0,0.26,0.21,0.05,0.1,0.61,77.0,24.0,6.3,130.0
4,Destiny,XOne,Shooter,Activision,"Bungie Software, Bungie",T,2014.0,2.14,0.92,0.0,0.31,3.37,75.0,11.0,5.5,1735.0


## Modeling

### Model Preparation

In [4]:
# select model features
X = video_games.drop(columns=['jp_sales', 'other_sales', 'global_sales', 'name'])
# select model target 
y = video_games['global_sales']

# split train data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=42)

### Random Forest Fine Tuning

In [5]:
np.arange(100, 600, 100)

array([100, 200, 300, 400, 500])

In [11]:
np.arange(3,10,2)

array([3, 5, 7, 9])

In [13]:
%%time
# Random Forest pipeline
forest_pipe = Pipeline([
    ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore')),
    ('sc', StandardScaler()),
    ('rf', RandomForestRegressor())
])

# set up pipe parameters
forest_params = {
    'rf__n_estimators': [400, 500, 600],
    'rf__max_depth': [10, 15, 20],
    'rf__max_features': ['sqrt', 'auto']
}

# instantiate RandomizedSearch with pipe and params
forest_grid = GridSearchCV(forest_pipe, forest_params, cv=5, n_jobs=-1, verbose=1)

# fit RandomizedSearch model with train data
forest_grid.fit(X_train, y_train)

# print best score from best model
print('RandomForest Best Score:', forest_grid.best_score_)

# print parameters from best model
forest_grid.best_params_

Fitting 5 folds for each of 18 candidates, totalling 90 fits
RandomForest Best Score: 0.2548281373217177
CPU times: user 21min 6s, sys: 971 ms, total: 21min 7s
Wall time: 1h 16min 49s


{'rf__max_depth': 20, 'rf__max_features': 'auto', 'rf__n_estimators': 600}

In [14]:
# score model on training data (R-squared)
forest_grid.score(X_train, y_train)

0.7256874445326937

In [15]:
# score model on validation data (R-squared)
forest_grid.score(X_val, y_val)

0.25111282319171524

In [16]:
# RMSE for Train Data
forest_preds = forest_grid.predict(X_train)
print('RMSE Train:', mean_squared_error(y_train, forest_preds, squared=False))

RMSE Train: 0.7377628673204125


In [17]:
# RMSE for Validation Data
forest_preds = forest_grid.predict(X_val)
print('RMSE Train:', mean_squared_error(y_val, forest_preds, squared=False))

RMSE Train: 1.215990555712607


### Random Forest Further Tuning

In [5]:
%%time
# Random Forest pipeline
forest_pipe = Pipeline([
    ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore')),
    ('sc', StandardScaler()),
    ('rf', RandomForestRegressor(n_jobs=12))
])

# set up pipe parameters
forest_params = {
    'rf__n_estimators': [600, 1000],
    'rf__max_depth': [100, None],
    'rf__max_features': ['auto']
}

# instantiate RandomizedSearch with pipe and params
forest_grid = GridSearchCV(forest_pipe, forest_params, cv=5, n_jobs=24, verbose=1)

# fit RandomizedSearch model with train data
forest_grid.fit(X_train, y_train)

# print best score from best model
print('RandomForest Best Score:', forest_grid.best_score_)

# print parameters from best model
forest_grid.best_params_

Fitting 5 folds for each of 4 candidates, totalling 20 fits
RandomForest Best Score: 0.3590417861812004
CPU times: user 1h 4min 37s, sys: 1.13 s, total: 1h 4min 38s
Wall time: 1h 2min 37s


{'rf__max_depth': None, 'rf__max_features': 'auto', 'rf__n_estimators': 1000}

In [6]:
# score model on training data (R-squared)
forest_grid.score(X_train, y_train)

0.9119134063622014

In [7]:
# score model on validation data (R-squared)
forest_grid.score(X_val, y_val)

0.34930892442713224

In [8]:
# RMSE for Train Data
forest_preds = forest_grid.predict(X_train)
print('RMSE Train:', mean_squared_error(y_train, forest_preds, squared=False))

RMSE Train: 0.4180698587581727


In [9]:
# RMSE for Validation Data
forest_preds = forest_grid.predict(X_val)
print('RMSE Train:', mean_squared_error(y_val, forest_preds, squared=False))

RMSE Train: 1.1334684339448633


After get best params and scores and metrics:
- create one final model with the best params
- look for most significant features
- make conclusion and recommendations

### Final Model

In [32]:
# Data pipeline
pipe = Pipeline([
    ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore')),
    ('sc', StandardScaler()),
    ('rf', RandomForestRegressor(n_jobs=-1))
])

pipe.fit(X_train, y_train)

Pipeline(steps=[('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False)),
                ('sc', StandardScaler()),
                ('rf', RandomForestRegressor(n_jobs=-1))])

In [44]:
columns = pipe.named_steps['ohe'].get_feature_names()
feature_importances = pipe.named_steps['rf'].feature_importances_
pd.Series(feature_importances, columns).sort_values(ascending=False)[:20]

x2_Nintendo             0.058298
x7_3.58                 0.032417
x6_23.2                 0.029523
x7_0.0                  0.029019
x6_29.08                0.028393
x6_26.93                0.028300
x3_Polyphony Digital    0.026840
x7_8.89                 0.024975
x3_Rockstar North       0.022875
x6_15.68                0.022521
x7_12.76                0.021970
x6_11.27                0.018685
x8_97.0                 0.012453
x7_0.02                 0.012203
x3_Infinity Ward        0.012092
x7_0.01                 0.011838
x6_9.0                  0.011694
x7_6.18                 0.011609
x7_2.26                 0.010226
x11_709.0               0.010119
dtype: float64

### Conclusion