In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd
import numpy as np

In [3]:
# Load the imputed faceoff dataset into `data`; the location is named once so it is easy to spot and change.
DATA_CSV_PATH = 'C:/Users/Tad/Documents/faceoffs/data_imputed_new.csv'
data = pd.read_csv(DATA_CSV_PATH)
print("data read")

data read


In [18]:
# Principal Component Analysis
# Reduce the numeric feature columns to 100 principal components and save them to CSV.

# Keep numeric columns only and drop every row that still contains a missing value.
data = data.select_dtypes(['number']).dropna()

# Feature matrix: everything except the two xG columns (the winning-team column is the target).
x = data.drop(
    columns=['faceoff_losing_team_xG_since_faceoff', 'faceoff_winning_team_xG_since_faceoff'],
    errors='ignore',
)
y = data['faceoff_winning_team_xG_since_faceoff']

# NOTE(review): the features are not standardized before PCA, so columns with large
# variance will dominate the components -- confirm this is intended.
# NOTE(review): PCA is fit on all rows before the train/test split, so the test rows
# influence the projection -- consider fitting on the training rows only.
# random_state pins the result when scikit-learn picks a randomized SVD solver.
pca = PCA(n_components = 100, random_state = 42)
print("pca fitting")
principal_components = pca.fit_transform(x)
print(principal_components)

# Carry the row labels of `x` onto the components. Without this the frame gets a fresh
# 0..n-1 index, and any later label-aligned join with `data` (whose index has gaps after
# dropna) would pair components with the wrong rows.
principal_components_df = pd.DataFrame(principal_components, index=x.index)
principal_components_df.to_csv("principal_components_new.csv")

pca fitting
[[ 4.97259596e+03  1.70833652e+03 -1.57724395e+03 ... -7.63164774e+00
   3.19821289e+01 -1.12290280e+01]
 [ 4.40869453e+03  1.52419538e+03  2.17256673e+03 ... -6.61638556e+00
   4.33062007e+01 -1.39907698e+01]
 [ 4.25187020e+03  1.96086799e+03 -1.36093998e+03 ... -1.84588734e+01
   9.64696894e+00 -8.61302938e+00]
 ...
 [ 1.20446786e+03  3.26507015e+03  3.07518015e+02 ...  1.97183387e+01
   3.54795547e+00  1.85534938e+01]
 [-4.87271497e+02 -6.98950537e+02 -7.26494534e+02 ...  1.27280030e+01
  -1.78135390e+01 -3.81481005e+01]
 [ 1.29348065e+03 -1.81383214e+03  1.90210819e+03 ...  1.36607628e+01
   9.76680195e+00 -6.53558105e+00]]


In [20]:
# Prep Train and Test Data
# Attach the target to the principal components, then split into train and test sets.
target_column = 'faceoff_winning_team_xG_since_faceoff'
objectives = data[target_column]

# Assign by position, not by index label. Row i of the components was computed from row i
# of `data`, but the two frames do not necessarily share an index (e.g. when `data` had
# rows dropped). A label-aligned assignment would then silently attach targets to the
# wrong rows or insert NaN. With to_numpy() a length mismatch raises instead.
principal_components_df[target_column] = objectives.to_numpy()
data_no_na = principal_components_df.dropna()

# Features are every column except the xG columns; errors='ignore' keeps this safe when
# the losing-team column is not present in the component frame.
X = data_no_na.drop(
    columns=[target_column, 'faceoff_losing_team_xG_since_faceoff'],
    errors='ignore',
)
y = data_no_na[target_column]

# Fixed random_state keeps the 67/33 split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print("completed train-test split")

completed train-test split


In [None]:
# Build initial, untuned random forest model
# Fits a default-hyperparameter forest as a baseline and reports test MSE and RMSE.
# Timestamps are printed before and after so the wall-clock cost is visible.
print(datetime.now())

# random_state makes the baseline score reproducible (the split and the search are seeded
# with 42 as well); n_jobs=-1 builds the trees on all cores, matching the search cell.
rf_initial = RandomForestRegressor(random_state=42, n_jobs=-1).fit(x_train, y_train)
print("completed RandomForestRegressor initial fitting")

prediction_initial = rf_initial.predict(x_test)
mse_initial = mean_squared_error(y_test, prediction_initial)
rmse_initial = mse_initial ** .5  # RMSE is in the same units as the target
print(mse_initial)
print(rmse_initial)
print(datetime.now())

2022-12-06 22:56:37.864961


In [None]:
# Initial Big Picture Tuning using RandomizedSearchCV
# Runs a 3-fold randomized hyperparameter search over `random_grid` on the training data.
print(datetime.now())

# Full search space, kept for reference. max_features uses 1.0 (all features) instead of
# the deprecated 'auto' alias, which newer scikit-learn releases reject for regressors.
# NOTE(review): 8000 in n_estimators sits between 600 and 1000 -- probably meant 800; confirm.
# random_grid = {
#     'bootstrap': [True, False],
#     'max_depth': [10, 25, 50, 100, 500, 1000],
#     'max_features': ['sqrt', 1.0],
#     'min_samples_leaf': [5, 10, 25, 50],
#     'min_samples_split': [5, 10, 25, 50, 100],
#     'n_estimators': [100, 200, 400, 600, 8000, 1000, 2000, 5000],
# }

# Small smoke-test grid: a single, cheap parameter combination.
random_grid = {
    'bootstrap': [True],
    'max_depth': [5],
    # 1.0 == use every feature, which is what 'auto' meant for RandomForestRegressor.
    'max_features': [1.0],
    'min_samples_leaf': [2],
    'min_samples_split': [2],
    'n_estimators': [15],
}

# Never ask for more candidates than the grid contains; otherwise scikit-learn warns and
# silently reduces the number of iterations.
grid_size = int(np.prod([len(values) for values in random_grid.values()]))
n_iter = min(10, grid_size)

# Seed the estimator too: the search's random_state only controls which parameter
# combinations are sampled, not the randomness inside each forest.
rf_tuning = RandomForestRegressor(random_state = 42)
rf_random = RandomizedSearchCV(estimator = rf_tuning, param_distributions = random_grid, n_iter = n_iter, cv = 3, verbose = 2, random_state = 42, n_jobs = -1)
rf_fit_output = rf_random.fit(x_train, y_train)

print(datetime.now())