(This notebook took ~1.5 hours to put together)

# Imports

In [1]:
import datetime
import numpy as np
import pandas as pd
from scipy.stats import mode

from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import ElasticNet

import matplotlib.pyplot as plt
import matplotlib as mpl

  from numpy.core.umath_tests import inner1d


# Pull in the Data and Perform Train/Validation Split

In [2]:
X_train = pd.read_csv('../data/X_train.csv')
X_test = pd.read_csv('../data/X_test.csv')
y_train = pd.read_csv('../data/y_train.csv')
y_test = pd.read_csv('../data/y_test.csv')

In [3]:
X_train.head()

Unnamed: 0,Rating,Required_Age,Is_Multiplayer,Accounting,Action,Adventure,Animation & Modeling,Audio Production,Casual,Design & Illustration,...,Racing,Simulation,Software Training,Sports,Strategy,Utilities,Video Production,Web Publishing,MeanPlaytime,MedianPlaytime
0,,,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1356.944444,192.5
1,87.0,0.0,0.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,62.5,62.5
2,88.0,0.0,1.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,,
3,,,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,882.25,223.5
4,96.0,0.0,1.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,15.5,15.5


In [4]:
y_train.head()

Unnamed: 0,labels
0,18.0
1,421.0
2,34.0
3,36.0
4,1.0


### Deal with NaNs

In [5]:
X_train_imputed = X_train.copy()
X_train_imputed['Rating'] = (
    X_train_imputed['Rating']
    .fillna(
        np.average(X_train_imputed.loc[~np.isnan(X_train_imputed.Rating)])
    )
)
for col in X_train_imputed.columns[1:25]:
    X_train_imputed[col] = (
        X_train_imputed[col]
        .fillna(
            mode(X_train_imputed.loc[~np.isnan(X_train_imputed[col]), col])[0][0]
        )
    )
    
X_train_imputed['MeanPlaytime'] = (
    X_train_imputed['MeanPlaytime']
    .fillna(
        np.average(X_train_imputed.loc[~np.isnan(X_train_imputed.MeanPlaytime), 'MeanPlaytime'])
    )
)
X_train_imputed['MedianPlaytime'] = (
    X_train_imputed['MedianPlaytime']
    .fillna(
        np.average(X_train_imputed.loc[~np.isnan(X_train_imputed.MedianPlaytime), 'MedianPlaytime'])
    )
)

In [6]:
X_test_imputed = X_test.copy()
X_test_imputed['Rating'] = (
    X_test_imputed['Rating']
    .fillna(
        np.average(X_test_imputed.loc[~np.isnan(X_test_imputed.Rating)])
    )
)
for col in X_test_imputed.columns[1:25]:
    X_test_imputed[col] = (
        X_test_imputed[col]
        .fillna(
            mode(X_test_imputed.loc[~np.isnan(X_test_imputed[col]), col])[0][0]
        )
    )
    
X_test_imputed['MeanPlaytime'] = (
    X_test_imputed['MeanPlaytime']
    .fillna(
        np.average(X_test_imputed.loc[~np.isnan(X_test_imputed.MeanPlaytime), 'MeanPlaytime'])
    )
)
X_test_imputed['MedianPlaytime'] = (
    X_test_imputed['MedianPlaytime']
    .fillna(
        np.average(X_test_imputed.loc[~np.isnan(X_test_imputed.MedianPlaytime), 'MedianPlaytime'])
    )
)

# Modeling

## Elastic Net Regression

In [7]:
en = ElasticNet()

cv_results = cross_validate(
    en, 
    X_train_imputed.fillna(0), 
    y_train.values.T[0], 
    cv=3, 
    scoring='neg_mean_absolute_error'
)
print("(Validation) Negative Mean Absolute Error:", cv_results['test_score'])

(Validation) Negative Mean Absolute Error: [-3293.50777121 -3432.42283431 -3773.74970426]


In [8]:
en.fit(    
    X_train_imputed.fillna(0), 
    y_train.values.T[0]
)
test_predictions = en.predict(X_test_imputed.fillna(0))
mae = metrics.mean_absolute_error(y_test.values.T[0], test_predictions)
print("(Test) Negative Mean Absolute Error:", -1 * mae)

(Test) Negative Mean Absolute Error: -5897.859789998183


## Random Forest Regressor

In [9]:
rfg = RandomForestRegressor()

cv_results = cross_validate(
    rfg, 
    X_train_imputed.fillna(0), 
    y_train.values.T[0], 
    cv=3, 
    scoring='neg_mean_absolute_error'
)
print("(Validation) Negative Mean Absolute Error:", cv_results['test_score'])

(Validation) Negative Mean Absolute Error: [-3588.04373362 -3155.55087574 -4019.04351146]


In [10]:
rfg.fit(    
    X_train_imputed.fillna(0), 
    y_train.values.T[0]
)
test_predictions = rfg.predict(X_test_imputed.fillna(0))
mae = metrics.mean_absolute_error(y_test.values.T[0], test_predictions)
print("(Test) Negative Mean Absolute Error:", -1 * mae)

(Test) Negative Mean Absolute Error: -16644.07420405255


## Gradient Boosted Regressor

In [11]:
gbr = GradientBoostingRegressor()

cv_results = cross_validate(
    gbr, 
    X_train_imputed.fillna(0), 
    y_train.values.T[0], 
    cv=3, 
    scoring='neg_mean_absolute_error'
)
print("Negative Mean Absolute Error:", cv_results['test_score'])

Negative Mean Absolute Error: [-3380.3891141  -3083.27886352 -3781.52395872]


In [12]:
gbr.fit(    
    X_train_imputed.fillna(0), 
    y_train.values.T[0]
)
test_predictions = gbr.predict(X_test_imputed.fillna(0))
mae = metrics.mean_absolute_error(y_test.values.T[0], test_predictions)
print("(Test) Negative Mean Absolute Error:", -1 * mae)

(Test) Negative Mean Absolute Error: -6720.782806195966


# Conclusions

The Elastic Net model performs best for the holdout set, so would be the model we should go with.

# Next Steps

So what are the next steps we could take for this modeling process?<br>
(1) Compare our error distribution to the overall distribution of play times to determine if our model has any real predictive power.
(2) Experiment with hyperparameter tuning.<br>
(3) Add in new features.<br>
(4) Run some experiments to see if the difference in validation and test performance is from seasonalities/shifts in the data or if it is from overfitting.<br>
(5) Train an XGBoost model.<br>
(6) Train a shallow Neural Net.