In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import xgboost as xgb

In [2]:
# Read the four train/test split files in the order Xtrain, ytrain, Xtest, ytest.
# NOTE(review): `data_path` has no trailing separator, so the resolved paths are
# '../data/interimwods_Xtrain', etc. Presumably this matches how the files were
# written upstream -- confirm before inserting a '/'.
data_path = '../data/interim'
Xtrain, ytrain, Xtest, ytest = (
    pd.read_csv(data_path + split_file)
    for split_file in ('wods_Xtrain', 'wods_ytrain', 'wods_Xtest', 'wods_ytest')
)

In [3]:
# Keep only the 'Likes' column of each label frame as the regression target.
# NOTE(review): this rebinds ytrain/ytest, so re-running this cell on its own
# indexes the already-selected objects instead of the original frames.
ytrain, ytest = ytrain['Likes'], ytest['Likes']

# Engineered dataset, loaded from the processed-data folder.
wodsDF = pd.read_csv('../data/processed/wodsEngineered.csv')

# Model Selection

In this notebook, we test several models (a random forest, linear regression, and XGBoost) and tune their hyperparameters to determine which one is best suited to our dataset.

In [4]:
# Random forest: exhaustive grid search with 5-fold cross-validation.
# Tree depths 1..29 plus None are tried.
maxDepth = [*range(1, 30), None]
paramGrid = {
    'criterion': ['squared_error', 'poisson'],
    'max_depth': maxDepth,
    'min_samples_leaf': np.arange(1, 30),
}
rf = RandomForestRegressor(random_state=42)
rf_cv = GridSearchCV(estimator=rf, param_grid=paramGrid, cv=5)
rf_cv.fit(X=Xtrain, y=ytrain)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
             param_grid={'criterion': ['squared_error', 'poisson'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                       23, 24, 25, 26, 27, 28, 29, None],
                         'min_samples_leaf': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])})

In [5]:
# Parameter combination that achieved the best mean cross-validated score for the random forest.
rf_cv.best_params_

{'criterion': 'poisson', 'max_depth': 1, 'min_samples_leaf': 5}

In [6]:
# Mean cross-validated score of the best random forest candidate. No `scoring` was
# passed to GridSearchCV, so this is the estimator's default score (R^2 for
# scikit-learn regressors).
rf_cv.best_score_

0.005892158443725282

# Linear Regression

In [7]:
# Linear regression: the only option searched is whether to fit an intercept,
# using 5-fold cross-validation.
linearParams = dict(fit_intercept=[True, False])
lr = LinearRegression()
lr_cv = GridSearchCV(estimator=lr, param_grid=linearParams, cv=5)
lr_cv.fit(X=Xtrain, y=ytrain)

GridSearchCV(cv=5, estimator=LinearRegression(),
             param_grid={'fit_intercept': [True, False]})

In [8]:
# Parameter combination that achieved the best mean cross-validated score for linear regression.
lr_cv.best_params_

{'fit_intercept': False}

In [9]:
# Mean cross-validated score of the best linear regression candidate (default
# scorer, since no `scoring` was passed to GridSearchCV).
lr_cv.best_score_

-0.5626495545736722

# XGBoost

In [10]:
# XGBoost: exhaustive grid search with 5-fold cross-validation.
# `n_jobs` / `random_state` are the documented scikit-learn-API parameters; they
# replace the deprecated `nthread` / `seed` aliases with the same values.
xgbModel = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_jobs=4,
    random_state=42,
)
XGBparam = {
    'max_depth': range(2, 10, 1),
    'n_estimators': range(60, 220, 40),
    'learning_rate': [0.1, 0.01, 0.05],
    'min_child_weight': range(0, 20),
    'subsample': [.4, .5, .6],
}
# NOTE(review): the search runs 10 parallel jobs and each model is configured
# for 4 threads, i.e. up to 40 threads at once -- confirm this suits the host.
xgb_cv = GridSearchCV(
    estimator=xgbModel,
    param_grid=XGBparam,
    n_jobs=10,
    cv=5,
    verbose=True,
)
xgb_cv.fit(Xtrain, ytrain)

Fitting 5 folds for each of 5760 candidates, totalling 28800 fits


GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    enable_categorical=False, gamma=None,
                                    gpu_id=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n...
                                    num_parallel_tree=None, predictor=None,
                                    random_state=None, reg_alpha=None,
                                    reg_lambda=None, scale_pos_weight=None,
                                 

In [11]:
# Parameter combination that achieved the best mean cross-validated score for XGBoost.
xgb_cv.best_params_

{'learning_rate': 0.01,
 'max_depth': 2,
 'min_child_weight': 3,
 'n_estimators': 180,
 'subsample': 0.6}

In [12]:
# Mean cross-validated score of the best XGBoost candidate (default scorer,
# since no `scoring` was passed to GridSearchCV).
xgb_cv.best_score_

0.0022210147127976086

In [14]:
# Test-set RMSE for the tuned XGBoost search: predict on the held-out features,
# then take the square root of the mean squared error.
xgbPred = xgb_cv.predict(Xtest)
mean_squared_error(y_true=ytest, y_pred=xgbPred) ** 0.5

287.7255437933179

In [15]:
# Test-set mean absolute error of the XGBoost predictions.
mean_absolute_error(y_true=ytest, y_pred=xgbPred)

107.72765351071185

In [17]:
# Test-set RMSE for the tuned random forest search: predict on the held-out
# features, then take the square root of the mean squared error.
rfPred = rf_cv.predict(Xtest)
mean_squared_error(y_true=ytest, y_pred=rfPred) ** 0.5

287.5297591059017

In [18]:
# Test-set mean absolute error of the random forest predictions.
mean_absolute_error(y_true=ytest, y_pred=rfPred)

118.34649200108831