In [60]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from joblib import dump, load
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [61]:
# Read the faceoff CSV into `data`; the next cell filters it down to complete, numeric rows.
# NOTE(review): this is an absolute, machine-specific path - consider making the data
# directory configurable so the notebook runs on other machines.
faceoff_csv_path = 'C:/Users/Tad/Documents/faceoffs/data_imputed_with_event_zone_2016_to_2018.csv'
data = pd.read_csv(faceoff_csv_path)
print("data read")

data read


In [62]:
# Principal Component Analysis
# Projects every numeric predictor onto 100 principal components and writes the
# component scores to principal_components_newest.csv.
data_all = data.dropna()
#data_all = data_all[data_all['event_zone'] == 'Off']
data = data_all.select_dtypes(['number'])

# Columns kept out of the PCA input: the two xG outcome columns, and event_zone
# (we don't want event_zone in the principal components).
excluded_columns = [
    'faceoff_losing_team_xG_since_faceoff',
    'faceoff_winning_team_xG_since_faceoff',
    'event_zone',
]
x = data.loc[:, ~data.columns.isin(excluded_columns)]
y = data['faceoff_winning_team_xG_since_faceoff']

# random_state pins the result when scikit-learn selects a randomized SVD solver,
# so the saved components can be regenerated exactly.
# NOTE(review): PCA centres its input but does not rescale it, so columns with large
# raw variance dominate the leading components - consider standardising first.
# NOTE(review): the PCA is fitted on all rows before the train/test split in the
# next cell, so test rows influence the components - confirm this is acceptable.
pca = PCA(n_components = 100, random_state = 42)
print("pca fitting")
principal_components = pca.fit_transform(x)
print(principal_components)

# Keep the row labels of `x`. dropna() above can leave gaps in the index, and a
# fresh 0..n-1 index would no longer line up with `data` when columns from `data`
# are attached to this frame by label.
principal_components_df = pd.DataFrame(principal_components, index = x.index)
principal_components_df.to_csv("principal_components_newest.csv")

pca fitting
[[-3.87663838e+03  7.64215754e+02  2.83555365e+03 ...  5.30867811e+00
  -8.54660376e+00 -1.25299812e+01]
 [-3.98792117e+03  8.42796593e+02  2.70425309e+03 ... -1.06508736e+01
  -1.79328258e+00 -1.65613936e+01]
 [-4.99695840e+03  6.96332881e+02  2.24469185e+03 ... -9.97324552e+00
  -8.93760115e-01 -1.67742471e+01]
 ...
 [-4.53634080e+02 -3.25884430e+02 -8.32333959e+02 ... -2.54542553e+01
   1.22326075e+01 -5.19838461e+00]
 [-1.28605756e+03 -1.97785370e+03  1.73194605e+03 ...  1.30259586e+01
   2.55679535e+01 -3.15952177e+01]
 [ 4.09892570e+01  2.81688319e+02 -8.37661043e+02 ...  1.87607242e+01
  -2.22754125e-01 -3.36240524e+00]]


In [63]:
# Prep Train and Test Data
# Builds the modelling table (principal components + target) and splits it 67/33.
objectives = data['faceoff_winning_team_xG_since_faceoff']

# Attach the target by POSITION, not by index label. The component rows are in the
# same order as the rows of `data`, but the two frames do not necessarily share an
# index (`data` keeps its post-dropna labels). Label alignment could therefore pair
# components with the wrong target or introduce NaN. A length mismatch raises here
# instead of being silently padded.
principal_components_df['faceoff_winning_team_xG_since_faceoff'] = objectives.to_numpy()
data_no_na = principal_components_df.dropna() # FLAG

# Features are every column except the xG outcome columns. The losing-team column is
# not present in this frame, so errors='ignore' keeps that exclusion a harmless no-op.
X = data_no_na.drop(
    columns = ['faceoff_losing_team_xG_since_faceoff', 'faceoff_winning_team_xG_since_faceoff'],
    errors = 'ignore',
)
# The principal-component scores are continuous, so they go to the model unchanged.
# One-hot encoding them would turn (nearly) every distinct float into its own
# indicator column and discard the numeric information PCA produced.
y = data_no_na['faceoff_winning_team_xG_since_faceoff']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print("completed train-test split")

completed train-test split


In [64]:
# Build initial, untuned random forest model
# Only the start timestamp is live; the baseline fit and its MSE/RMSE report below
# are currently disabled.
baseline_started_at = datetime.now()
print(baseline_started_at)
# rf_initial = RandomForestRegressor().fit(x_train, y_train)
# print("completed RandomForestRegressor initial fitting")
# prediction_initial = rf_initial.predict(x_test)
# mse_initial = mean_squared_error(y_test, prediction_initial)
# rmse_initial = mse_initial ** .5
# print(mse_initial)
# print(rmse_initial)
# print(datetime.now())

2022-12-07 03:32:28.865340


In [None]:
# Initial Big Picture Tuning using RandomizedSearchCV
# Samples n_iter hyper-parameter combinations from random_grid, scores each with
# 3-fold CV on the training split, and saves the fitted search object with joblib.
# Start/end timestamps are printed so the run time can be read off the output.
print(datetime.now())

# random_grid = { # FLAG
#     'bootstrap': [True],
#     'max_depth': [10, 25, 50, 100, 500, 1000],
#     'max_features': ['sqrt', 'log2'],
#     'min_samples_leaf': [5, 10],
#     'min_samples_split': [5, 10, 25, 50],
#     'n_estimators': [100, 250, 500, 1000],
# }
random_grid = { # FLAG
    'bootstrap': [True, False],
    'max_depth': [10, 25, 50, 100, 500, 1000],
    'max_features': ['sqrt', 'log2'],
    'min_samples_leaf': [5, 10, 25, 50],
    'min_samples_split': [5, 10, 25, 50, 100],
    'n_estimators': [100, 200, 400, 600, 800, 1000, 2000, 5000],
}

# SOLELY FOR TESTING WHETHER CODE COMPILES (FLAG)
# NOTE(review): recent scikit-learn releases no longer accept max_features='auto';
# swap it for a currently supported value before re-enabling this grid.
# random_grid = {
#     'bootstrap': [True],
#     'max_depth': [5],
#     'max_features': ['auto'],
#     'min_samples_leaf': [2],
#     'min_samples_split': [2],
#     'n_estimators': [15],
# }

# Seed the forest as well as the search. random_state on RandomizedSearchCV only
# fixes which parameter combinations are sampled; without a seed on the estimator
# the bootstrap/feature sampling inside each forest differs between runs, so the CV
# scores and the selected model would not be reproducible.
rf_tuning = RandomForestRegressor(random_state = 42)
rf_random = RandomizedSearchCV(
    estimator = rf_tuning,
    param_distributions = random_grid,
    n_iter = 10, # FLAG (n_iter)
    cv = 3,
    verbose = 2,
    random_state = 42,
    n_jobs = -1,
)
# fit() returns the search object itself, so rf_fit_output and rf_random are the
# same fitted search; both names are used by later cells.
rf_fit_output = rf_random.fit(x_train, y_train)
print("writing joblib")
dump(rf_fit_output, 'rf_randomized_search_cv_fit_off_win.joblib') #FLAG... change suffix
print(datetime.now())

In [None]:
# Disabled helper: prints the mean absolute error and a MAPE-based "accuracy"
# (100 - MAPE) for a fitted model on the given test features/labels, and returns
# that accuracy.
# NOTE(review): MAPE divides by test_labels, so any zero-valued label would give
# inf/nan - presumably why this is disabled; confirm before re-enabling.
# NOTE(review): the 'degrees' unit in the message does not look like it applies to
# this target - confirm the intended unit.
# def evaluate(model, test_features, test_labels):
#     predictions = model.predict(test_features)
#     errors = abs(predictions - test_labels)
#     mape = 100 * np.mean(errors / test_labels)
#     accuracy = 100 - mape
#     print('Model Performance')
#     print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
#     print('Accuracy = {:0.2f}%.'.format(accuracy))
#
#     return accuracy

In [None]:
# Winning estimator from the randomized search.
best_random = rf_random.best_estimator_
# random_accuracy = evaluate(best_random, x_test, y_test)

# Test-set predictions from the fitted search and their absolute errors.
predictions = rf_fit_output.predict(x_test)
errors = np.abs(predictions - y_test)
# Metric reporting is currently disabled:
# from sklearn.metrics import r2_score
# r_sq = r2_score(y_test, predictions)
# mse_grid = mean_squared_error(y_test, predictions)
# print(mse_grid)
# rmse_grid = mse_grid ** .5
# print(rmse_grid)
#
# # Display the performance metrics
# print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')
# mape = np.mean(100 * (errors / y_test))
# accuracy = 100 - np.mean(mape[np.isfinite(mape)])
# print('Accuracy:', round(accuracy, 2), '%.')
# See https://stackoverflow.com/questions/58067438/mape-mean-absolute-percentage-error-measurement-in-python-result-in-error#:~:text=mape%20%3D%20100%20*%20(errors%20%2F,')

# Copy each tuned hyper-parameter off the winning estimator under its own name;
# the refit cell passes these to a new RandomForestRegressor.
(
    best_bootstrap,
    best_max_depth,
    best_max_features,
    best_min_samples_leaf,
    best_min_samples_split,
    best_n_estimators,
) = (
    best_random.bootstrap,
    best_random.max_depth,
    best_random.max_features,
    best_random.min_samples_leaf,
    best_random.min_samples_split,
    best_random.n_estimators,
)

# The same winning settings as reported by the search, as a dict.
best_parameters = rf_fit_output.best_params_


In [None]:
# Refit a standalone forest with the hyper-parameters selected by the randomized
# search and persist it with joblib.
# random_state makes the saved model reproducible: without it every run trains a
# different forest from the same settings. n_jobs=-1 builds the trees on all cores;
# with a fixed seed it does not change the fitted model, only the wall-clock time.
# NOTE(review): RandomizedSearchCV refits best_estimator_ on the full training data
# by default (refit=True), so an equivalent fitted model is already available as
# rf_random.best_estimator_ - confirm this second fit is still needed.
rf_best_param = RandomForestRegressor(bootstrap = best_bootstrap,
                                      max_depth = best_max_depth,
                                      max_features = best_max_features,
                                      min_samples_leaf = best_min_samples_leaf,
                                      min_samples_split = best_min_samples_split,
                                      n_estimators = best_n_estimators,
                                      random_state = 42,
                                      n_jobs = -1)
# fit() returns the estimator itself, so rf_best_param_fit is the fitted rf_best_param.
rf_best_param_fit = rf_best_param.fit(x_train, y_train)

dump(rf_best_param_fit, 'rf_best_parameters_fit_off_win.joblib') # FLAG... change suffix

In [None]:
# Disabled: feature-importance plots for the refit forest (rf_best_param_fit).
# Both the helper call and the Series plot read x_train.columns, so re-enabling
# this cell requires x_train to be a pandas DataFrame.
# var_imps = rf_best_param_fit.feature_importances_
#
# def plot_feature_importance(importance,names,model_type):
#     #Create arrays from feature importance and feature names
#     feature_importance = np.array(importance)
#     feature_names = np.array(names)
#
#     #Create a DataFrame using a Dictionary
#     df ={'feature_names':feature_names,'feature_importance':feature_importance}
#     fi_df = pd.DataFrame(df)
#
#     #Sort the DataFrame in order of decreasing feature importance
#     fi_df.sort_values(by=['feature_importance'], ascending=False,inplace=True)
#
#     #Define size of bar plot
#     plt.figure(figsize=(10,8))
#     #Plot Seaborn bar chart
#     sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])
#     #Add chart labels
#     plt.title(model_type + 'FEATURE IMPORTANCE')
#     plt.xlabel('FEATURE IMPORTANCE')
#     plt.ylabel('FEATURE NAMES')
# plot_feature_importance(rf_best_param_fit.feature_importances_,x_train.columns,'RANDOM FOREST')
#
# feat_importances_series = pd.Series(var_imps, index=x_train.columns)
# feat_importances_series.nlargest(20).plot(kind='barh')