In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from math import sqrt
import numpy as np
import pandas as pd

In [4]:
reduced = pd.read_csv("../data/reduced.csv")

In [5]:
reduced = reduced.drop(columns = reduced.columns[0])
reduced.head()

Unnamed: 0,scores,predscores,tmhalfsc,opphalfsc,patt,ypa,comppct,int,ratt,ypr,sacks,sackyds,fum,fuml
0,4.0,24.75,6,13,19,6.736842,0.578947,2,8,1.5,2,-11,0,0
1,6.164414,24.75,10,7,25,7.0,0.64,0,13,3.384615,1,-6,1,0
2,4.472136,17.75,13,10,11,4.272727,0.636364,0,15,3.466667,2,-10,2,0
3,2.645751,24.75,0,10,15,5.466667,0.666667,0,10,2.2,0,0,1,1
4,3.741657,25.5,7,6,21,8.333333,0.714286,1,15,2.733333,1,-7,1,1


In [6]:
base_X = reduced.drop(columns="scores")

In [7]:
base_y = reduced["scores"]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(base_X, base_y, test_size=0.25)

In [14]:
base_rf = RandomForestRegressor(n_estimators=100)
base_rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [15]:
base_rf_pred = base_rf.predict(X_test)

In [16]:
def rmse(pred, y):
    n = len(y)
    RMSE = sqrt(sum((pred - y)**2) / n)
    return RMSE

In [17]:
base_rf_RMSE = rmse(base_rf_pred, y_test)
base_rf_r2 = base_rf.score(X_test, y_test)
print(f"Baseline Random Forest Regressor RMSE: {base_rf_RMSE:.3f}")
print(f"Baseline Random Forest Regressor R^2: {base_rf_r2:.3f}")

Baseline Random Forest Regressor RMSE: 0.844
Baseline Random Forest Regressor R^2: 0.540


In [18]:
base_dt = DecisionTreeRegressor()
base_dt.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [19]:
base_dt_pred = base_dt.predict(X_test)

In [20]:
base_dt_RMSE = rmse(base_dt_pred, y_test)
base_dt_r2 = base_dt.score(X_test, y_test)
print(f"Baseline Decision Tree Regressor RMSE: {base_dt_RMSE:.3f}")
print(f"Baseline Decision Truee Regressor R^2: {base_dt_r2:.3f}")

Baseline Decision Tree Regressor RMSE: 1.151
Baseline Decision Truee Regressor R^2: 0.143


In [21]:
"""Hand Picked Variables from R:
predscores+tmhalfsc+patt+ypa+ratt+ypr+sackyds"""
manual = reduced[["scores", "predscores", "tmhalfsc", "patt", "ypa", "ratt", "ypr", "sackyds"]]

In [22]:
man_X = manual.drop(columns="scores")

In [23]:
man_y = manual["scores"]

In [24]:
man_X_train, man_X_test, man_y_train, man_y_test = train_test_split(man_X, man_y, test_size=0.25)

In [25]:
man_rf = RandomForestRegressor(n_estimators=100)
man_rf.fit(man_X_train, man_y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [26]:
man_rf_pred = man_rf.predict(man_X_test)

In [28]:
man_rf_RMSE = rmse(man_rf_pred, man_y_test)
man_rf_r2 = man_rf.score(man_X_test, man_y_test)
print(f"Manual Random Forest Regressor RMSE: {man_rf_RMSE:.3f}")
print(f"Manual Random Forest Regressor R^2: {man_rf_r2:.3f}")

Manual Random Forest Regressor RMSE: 0.821
Manual Random Forest Regressor R^2: 0.534


In [29]:
man_dt = DecisionTreeRegressor()
man_dt.fit(man_X_train, man_y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [30]:
man_dt_pred = man_dt.predict(man_X_test)

In [31]:
man_dt_RMSE = rmse(man_dt_pred, man_y_test)
man_dt_r2 = man_dt.score(man_X_test, man_y_test)
print(f"Manual Decision Tree Regressor RMSE: {man_dt_RMSE:.3f}")
print(f"Manual Decision Tree Regressor R^2: {man_dt_r2:.3f}")

Manual Decision Tree Regressor RMSE: 1.128
Manual Decision Tree Regressor R^2: 0.119


## Grid Search for Manual Random Forest Regressor

In [32]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, KFold

In [33]:
rf = RandomForestRegressor()
search = {"criterion":["mse"], "n_estimators":list(range(95, 105, 1)), 
         "max_depth":[5], "max_features":["auto"]}

In [34]:
rf_gs = GridSearchCV(rf, search, cv=10)

In [35]:
rf_gs.fit(man_X_train, man_y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'criterion': ['mse'], 'n_estimators': [95, 96, 97, 98, 99, 100, 101, 102, 103, 104], 'max_depth': [5], 'max_features': ['auto']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [36]:
best_prms = rf_gs.best_params_
for param in best_prms:
    print(param, " -> ", best_prms[param])

criterion  ->  mse
max_depth  ->  5
max_features  ->  auto
n_estimators  ->  96


In [37]:
best_rf = RandomForestRegressor(criterion="mse", max_depth=5, max_features="auto", n_estimators=97)
best_rf.fit(man_X_train, man_y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=97, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [38]:
best_rf_pred = best_rf.predict(man_X_test)

In [39]:
best_rf_RMSE = rmse(best_rf_pred, man_y_test)
best_rf_r2 = best_rf.score(man_X_test, man_y_test)
print(f"Best Random Forest Regressor RMSE: {best_rf_RMSE:.3f}")
print(f"Best Random Forest Regressor R^2: {best_rf_r2:.3f}")

Best Random Forest Regressor RMSE: 0.811
Best Random Forest Regressor R^2: 0.545
