In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd
import numpy as np

In [5]:
data = pd.read_csv('C:/Users/Tad/Documents/faceoffs/data_imputed.csv')
print("data read")

In [6]:
# Principal Component Analysis
x_cols = data.columns[23:]
print(x_cols)
x = data[x_cols].select_dtypes(['number'])
y = data['faceoff_winning_team_xG_since_faceoff']
pca = PCA(n_components = 100)
print("pca fitting")
principal_components = pca.fit_transform(x)
print(principal_components)
principal_components_df = pd.DataFrame(principal_components)
principal_components_df.to_csv("one_hundred_principal_components.csv")

data read
Index(['Win_F1', 'Win_F2', 'Win_F3', 'Win_D1', 'Win_D2', 'Lose_F1', 'Lose_F2',
       'Lose_F3', 'Lose_D1', 'Lose_D2',
       ...
       'Take_GAR_Lose_D2', 'Draw_GAR_Lose_D2', 'Off_GAR_Lose_D2',
       'Def_GAR_Lose_D2', 'Pens_GAR_Lose_D2', 'GAR_Lose_D2', 'WAR_Lose_D2',
       'SPAR_Lose_D2', 'TOI_EV_Lose_D2', 'TOI_SH_Lose_D2'],
      dtype='object', length=921)
pca fitting
[[-4.35263253e+03  2.03215141e+03 -1.40874567e+03 ... -9.17585110e+00
   4.60007741e+00  2.11879477e+00]
 [-3.07137967e+03  2.59781139e+03 -7.74033636e+02 ... -8.03291504e+00
   1.66141100e+00  4.58476928e+00]
 [-4.32316910e+03  1.76040652e+03  2.96909743e+03 ... -2.24916233e+01
  -1.17338624e+01  7.98808095e+00]
 ...
 [-1.19827945e+02 -3.89750703e+02 -3.46107725e+02 ... -4.80142585e+00
  -3.45083879e+00  1.62418568e+01]
 [ 1.63543495e+02 -5.91956201e+02 -2.98748435e+02 ...  9.56263145e+00
   4.80871156e+00  4.00089211e+00]
 [-1.56792493e+03  3.10130163e+03  5.69968687e+01 ...  5.73664749e+00
  -1.6435656

In [None]:
# Prep Train and Test Data
cols_of_interest = principal_components_df.columns
objectives = principal_components_df['faceoff_winning_team_xG_since_faceoff']
principal_components_df['faceoff_winning_team_xG_since_faceoff'] = objectives
data_no_na = principal_components_df[cols_of_interest].dropna()
X = data_no_na
y = data_no_na['faceoff_winning_team_xG_since_faceoff']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print("completed train-test split")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  principal_components_df['faceoff_winning_team_xG_since_faceoff'] = objectives


completed train-test split
completed RandomForestRegressor initial fitting
6.10010672253427e-06
0.002469839412296733
creating GridSearch
running GridSearch
2022-12-06 11:53:11.582332




KeyboardInterrupt: 

In [None]:
# Build initial, untuned random forest model
print(datetime.now())
rf_initial = RandomForestRegressor().fit(x_train, y_train)
print("completed RandomForestRegressor initial fitting")
prediction_initial = rf_initial.predict(x_test)
mse_initial = mean_squared_error(y_test, prediction_initial)
rmse_initial = mse_initial ** .5
print(mse_initial)
print(rmse_initial)
print(datetime.now())

In [None]:
# Initial Big Picture Tuning using RandomizedSearchCV
print(datetime.now())
random_grid = {
    'bootstrap': [True, False],
    'max_depth': [10, 25, 50, 100, 500, 1000],
    'max_features': ['sqrt', 'auto'],
    'min_samples_leaf': [5, 10, 25, 50],
    'min_samples_split': [5, 10, 25, 50, 100],
    'n_estimators': [100, 200, 400, 600, 8000, 1000, 2000, 5000],
}

rf_tuning = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator = rf_tuning, param_distributions = random_grid, n_iter = 100, cv = 3, verbose = 2, random_state = 42, n_jobs = -1)
rf_random.fit(x_train, y_train)

print(datetime.now())