#Notebook Setup

In [3]:
import pandas as pd
import numpy as np
import math
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

#Confusion Matrix Code

In [1]:
class ConfusionMatrix():
  def __init__(self,names, performance_matrix,
               figsize=(850,800), ax_title=('Predicted labels','Real labels'), colbar=True, labels=True):
    from plotly.offline import iplot
    from plotly.graph_objs import Figure, Data, Scatter

    np.seterr(divide='ignore', invalid='ignore')

   
    real_sum = performance_matrix.sum(axis=1)
    color_matrix = np.zeros((len(names),len(names)))
    for i in range(len(names)):
      angle_sum = performance_matrix[i,i:].sum() + performance_matrix[i:,i].sum() - performance_matrix[i,i]
      color_matrix[i,i:] = np.true_divide(performance_matrix[i,i:],angle_sum)*100
      color_matrix[i:,i] = np.true_divide(performance_matrix[i:,i],angle_sum)*100
    color_matrix[-1,-1] = 0

    precision = []
    recall = []
    for i in range(len(performance_matrix[0,:-1])):
      if not np.isnan(performance_matrix[i,i]/performance_matrix[:,i].sum()):
        precision.append(performance_matrix[i,i]/performance_matrix[:,i].sum()*100)
      else:
        precision.append(0)
      if not np.isnan(performance_matrix[i,i]/performance_matrix[i,:].sum()):
        recall.append(performance_matrix[i,i]/performance_matrix[i,:].sum()*100)
      else:
        recall.append(0)

    ratios = performance_matrix[:-1,:].sum(axis=1)/performance_matrix[:-1,:].sum()
    precision = np.array([prec if ratios[i]>0 else np.nan for i,prec in enumerate(precision)])
    recall = np.array([rec if ratios[i]>0 else np.nan for i,rec in enumerate(recall)])

    trace1 = {"type": "heatmap",
              "x": names,
              "y": names, 
              "z": color_matrix,
              "colorscale": "Greys",
              "zmin": 0,
              "zmax": 100,
              "showscale": False,
            }
    if colbar:
      trace2 = {"type": "heatmap",
                "x": ["<b>Recall</b>"],
                "y": names, 
                "z": recall[...,np.newaxis],
                "colorscale": "Greens",
                "zmin": 50 if min([min(recall), min(precision)]) > 50 else 0,
                "zmax": 100,
                "showscale": True,
                "xaxis":"x2",
                "yaxis":"y1",
                'colorbar':dict(thickness=30,
                            ticklen=3, tickcolor='#003366',
                            tickfont=dict(size=16, color='#003366'))
              }

              
      trace3 = {"type": "heatmap",
                "x": names,
                "y": ["<b>Precision</b>"], 
                "z": precision[np.newaxis,...],
                "colorscale": "Greens",
                "zmin": 50 if min([min(recall), min(precision)]) > 50 else 0,
                "zmax": 100,
                "showscale": False,
                "xaxis":"x1",
                "yaxis":"y3",
              }
    else:
      trace2 = {}
      trace3 = {}
    data = [trace1,trace2,trace3]
    annotations= []

    if colbar:
      for j in range(len(names)-1):
          #Recall array annotation
          annotations.append({"x": 1,
                              "y": names[j],
                              "text": f"{int(round(recall[j]))}%" if recall[j] > 0 else '',
                              "xref": "x2", 
                              "yref": "y1",
                              "showarrow": False,
                              "font": {"color": "black",
                                      "size": 18}
                            })
      for i in range(len(names)-1):
          #Precision array annotation
          annotations.append({"x": names[i],
                              "y": -1,
                              "text": f"{int(round(precision[i]))}%" if precision[i] > 0 else '',
                              "xref": "x1", 
                              "yref": "y3",
                              "showarrow": False,
                              "font": {"color": "black",
                                      "size": 18}
                            })
    for i in range(len(names)):
        for j in range(len(names)):
            #Main matrix annotation
            annotations.append({"x": names[i],
                                "y": names[j],
                                "text": f"<b>{int(performance_matrix[j,i])}</b>" if performance_matrix[j,i] != 0 else '' ,
                                "xref": "x1", 
                                "yref": "y1",
                                "showarrow": False,
                                "font": {"color": "black" if color_matrix[j,i]<50 else 'white',
                                        "size": 20}
                              })
    layout = {"autosize": False,
              "width": figsize[0],
              "height": figsize[1],
              "margin":{"l":130,
                        "r":100,
                        "b":40,
                        "t":130,
                        #"pad": 4,
                      },
              "xaxis": {"title": {"text":f"<------------------- {ax_title[0]} ------------------->" if labels else '',
                                  "font": {"size": 19 if len(names) == 2 else 23}
                                },
                        "side": "top",
                        "tickangle": -45,
                        "automargin":True,
                        "domain": [0,0.8],
                        "showgrid": False,
                        "tickfont": {"size":16},
                      }, 
              "yaxis": {"title": {"text":f"<----------- {ax_title[1]} ----------->" if labels else '',
                                  "font": {"size": 19 if len(names) == 2 else 23}
                                },
                        "autorange": "reversed",
                        "automargin":True,
                        "domain": [0.2,1],
                        "showgrid": False,
                        "tickfont": {"size":16},
                      },
              "xaxis2":{"domain": [0.85,1],
                        "showgrid": False,
                        "side": "top",
                        "tickangle": -45,
                        "tickfont": {"size":16},
                      },
              "yaxis3":{"domain": [0,0.15],
                        "showgrid": False,
                        "tickfont": {"size":16},
                      },
              "annotations": annotations,
                }
    self.fig = Figure(data=data,
                layout=layout)
  def get_fig(self):
    return self.fig

#Load Test/Train Splits

In [3]:
feature_names = ['offender_x', 'offender_y', 'passer_x', 'passer_y', 'linesman_x',
       'linesman_y', 'last_defender_x', 'last_defender_y',
       'second_last_defender_x', 'second_last_defender_y',
       'x_last_defender_offender', 'y_last_defender_offender',
       'euclid_last_defender_offender', 'x_passer_offender',
       'y_passer_offender', 'euclid_passer_offender',
       'x_last_defender_second_defender', 'y_last_defender_second_defender',
       'euclid_last_defender_second_defender', 'x_linesman_offender',
       'y_linesman_offender', 'euclid_linesman_offender', 'x_linesman_passer',
       'y_linesman_passer', 'euclid_linesman_passer',
       'number_defender_defensive_line', 'number_attacker_defensive_line',
       'number_defender_near_offside_player',
       'number_attacker_near_offside_player', 'initiator_x_velo',
       'initiator_y_velo', 'initiator_total_velo', 'offender_x_velo',
       'offender_y_velo', 'offender_total_velo', 'last_defender_x_velo',
       'last_defender_y_velo', 'last_defender_total_velo',
       'second_defender_x_velo', 'second_defender_y_velo',
       'second_defender_total_velo', 'linesman_x_velo', 'linesman_y_velo',
       'linesman_total_velo', 'max_x_velo', 'max_y_velo', 'max_total_velo']

feature_names = ['x_last_defender_offender', 'number_attacker_defensive_line']
'''
feature_names = [
       'x_last_defender_offender', 'y_last_defender_offender','x_passer_offender',
       'y_passer_offender',
       'x_last_defender_second_defender', 'y_last_defender_second_defender',
       'x_linesman_offender',
       'y_linesman_offender', 'x_linesman_passer',
       'y_linesman_passer',
       'number_defender_defensive_line', 'number_attacker_defensive_line',
       'number_defender_near_offside_player',
       'number_attacker_near_offside_player', 'initiator_x_velo',
       'initiator_y_velo', 'offender_x_velo',
       'offender_y_velo', 'last_defender_x_velo',
       'last_defender_y_velo',
       'second_defender_x_velo', 'second_defender_y_velo',
       'linesman_x_velo',
       'max_x_velo', 'max_y_velo']

feature_names = [
              'x_last_defender_offender', 'x_passer_offender',
              'x_linesman_offender',
              'x_linesman_passer',
              'number_defender_defensive_line', 'number_attacker_defensive_line',
              'number_defender_near_offside_player',
              'number_attacker_near_offside_player', 'initiator_x_velo',
              'offender_x_velo',
              'last_defender_x_velo',
              'second_defender_x_velo',
                    'linesman_x_velo']

feature_names = [
'x_last_defender_offender', 'x_passer_offender',
'x_linesman_offender', 'initiator_x_velo',
'offender_x_velo',
'last_defender_x_velo',
'second_defender_x_velo',
'linesman_x_velo']
'''

event_labels = ['mode_difficulty_rating']

##Balanced Split

In [4]:
balanced_training = pd.read_csv('training_balanced_123.csv')
balanced_testing = pd.read_csv('testing_balanced_123.csv')

X_train_balanced = balanced_training[feature_names]
y_train_balanced = balanced_training[event_labels]

X_test_balanced = balanced_testing[feature_names]
y_test_balanced = balanced_testing[event_labels]

##Unbalanced Split

In [5]:
unbalanced_training = pd.read_csv('training_unbalanced_123.csv')
unbalanced_testing = pd.read_csv('testing_unbalanced_123.csv')

X_train_unbalanced = unbalanced_training[feature_names]
y_train_unbalanced = unbalanced_training[event_labels]

X_test_unbalanced = unbalanced_testing[feature_names]
y_test_unbalanced = unbalanced_testing[event_labels]

#Gradient Boosted Classification (Balanced)

In [None]:
model_params = {'learning_rate': [0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 1],
              'n_estimators': [10, 100, 300, 500, 800],
              'max_depth':[2, 3, 4, 5, 6, 7] 
             }

In [None]:
from joblib import parallel_backend
with parallel_backend('threading', n_jobs=4):

  gradient_boosting_output = GridSearchCV(GradientBoostingClassifier(min_samples_split=10,
                                                      min_samples_leaf=2,
                                                      max_features='sqrt', 
                                                      random_state=123
                                                      ), 
                                        param_grid=model_params, 
                                        scoring="accuracy",
                                        verbose=1,
                                          ).fit(X_train_balanced, y_train_balanced.values.ravel())

Fitting 5 folds for each of 210 candidates, totalling 1050 fits


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   49.8s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  7.2min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed: 16.5min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed: 24.5min
[Parallel(n_jobs=4)]: Done 1050 out of 1050 | elapsed: 29.2min finished


In [None]:
print(gradient_boosting_output.best_params_)

best_model = gradient_boosting_output.best_estimator_

y_pred = best_model.predict(X_test_balanced)
accuracy = accuracy_score(y_test_balanced, y_pred)

print('ACCURACY: {}'.format(accuracy))

{'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 300}
ACCURACY: 0.3741496598639456


In [None]:
c_matrix = confusion_matrix(y_test_balanced, y_pred)

print(c_matrix)

names = ['easy', 'medium','hard']
CM = ConfusionMatrix(names, c_matrix,
                     figsize=(800,800), colbar=True, labels=True)
fig = CM.get_fig()
fig.show()

[[33 16  0]
 [37 12  0]
 [26 13 10]]


#Gradient Boosted Regression (Balanced)

In [12]:
def adjust_regression(value):
  if value >= 3:
    return 3
  elif value <= 1:
    return 1
  else:
    return round(value)

In [None]:
model_params = {'learning_rate': [0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 1],
              'n_estimators': [100, 150, 200, 250, 300, 350, 400, 450, 500, 600, 700, 800],
              'max_depth':[7, 8, 9, 10, 11] 
             }

In [None]:
from joblib import parallel_backend
with parallel_backend('threading', n_jobs=4):

  gradient_boosting_reg_output = GridSearchCV(GradientBoostingRegressor(min_samples_split=10,
                                                      min_samples_leaf=2,
                                                      max_features='sqrt', 
                                                      random_state=123
                                                      ), 
                                        param_grid=model_params, 
                                        scoring="neg_root_mean_squared_error",
                                        verbose=1,
                                          ).fit(X_train_balanced, y_train_balanced.values.ravel())

Fitting 5 folds for each of 480 candidates, totalling 2400 fits


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    9.9s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  2.4min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  4.3min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:  6.4min
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:  8.7min
[Parallel(n_jobs=4)]: Done 2400 out of 2400 | elapsed: 10.8min finished


In [None]:
print(gradient_boosting_reg_output.best_params_)

best_model = gradient_boosting_reg_output.best_estimator_

y_pred = best_model.predict(X_test_balanced)

y_pred_class = [adjust_regression(value) for value in y_pred]

MSE = mean_squared_error(y_test_balanced, y_pred)
RMSE = math.sqrt(MSE)

MAE = mean_absolute_error(y_test_balanced, y_pred)

acc = accuracy_score(y_test_balanced, y_pred_class)

R_square = best_model.score(X_test_balanced, y_test_balanced)

print('RMSE: {}'.format(RMSE))
print('MAE: {}'.format(MAE))
print('ACC: {}'.format(acc))
print('R^2: {}'.format(R_square))
print(best_model.feature_importances_)

{'learning_rate': 0.075, 'max_depth': 10, 'n_estimators': 200}
RMSE: 0.9119755420120373
MAE: 0.7768987909118673
ACC: 0.38095238095238093
R^2: -0.24754908384222385
[0.1504989  0.10885574 0.14571516 0.14054142 0.11892222 0.10231815
 0.09757351 0.1355749 ]


In [None]:
from sklearn.preprocessing import StandardScaler

model = LinearRegression()
model.fit(StandardScaler().fit_transform(X_train_balanced), y_train_balanced)
print(model.score(StandardScaler().fit_transform(X_test_balanced), y_test_balanced))
print(model.coef_)



-0.11852742917040726
[[-0.27854337  0.13931409  0.11476363  0.26694674  0.21324321 -0.34975222
   0.02087025 -0.02652279]]


#Gradient Boosted Regression (Unbalanced)

In [16]:
model_params = {'learning_rate': [0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 1],
              'n_estimators': [10, 25, 50, 75, 100, 200, 300, 400, 500, 600, 700, 800],
              'max_depth':[2, 3, 4, 5, 6, 7, 8, 9],
              'max_features':['auto', 'sqrt', 'log2']
             }

In [None]:
model_params = {'learning_rate': [0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 1],
              'n_estimators': [10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75],
              'max_depth':[2, 3, 4, 5, 6],
              'max_features':['auto', 'sqrt', 'log2']
             }

In [None]:
model_params = {'learning_rate': [0.005, 0.01, 0.05, 0.1, 0.25],
              'n_estimators': [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30],
              'max_depth':[2, 3, 4, 5, 6],
              'max_features':['auto', 'sqrt', 'log2']
             }

In [17]:
from joblib import parallel_backend
with parallel_backend('threading', n_jobs=4):

  gradient_boosting_reg_output = GridSearchCV(GradientBoostingRegressor(min_samples_split=10,
                                                      min_samples_leaf=2,
                                                      max_features='sqrt', 
                                                      random_state=123
                                                      ), 
                                        param_grid=model_params, 
                                        scoring="neg_root_mean_squared_error",
                                        verbose=1,
                                          ).fit(X_train_unbalanced, y_train_unbalanced.values.ravel())

Fitting 5 folds for each of 2016 candidates, totalling 10080 fits


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   30.4s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  2.2min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:  3.6min
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:  5.2min
[Parallel(n_jobs=4)]: Done 2442 tasks      | elapsed:  7.1min
[Parallel(n_jobs=4)]: Done 3192 tasks      | elapsed:  9.2min
[Parallel(n_jobs=4)]: Done 4042 tasks      | elapsed: 11.6min
[Parallel(n_jobs=4)]: Done 4992 tasks      | elapsed: 14.3min
[Parallel(n_jobs=4)]: Done 6042 tasks      | elapsed: 17.3min
[Parallel(n_jobs=4)]: Done 7192 tasks      | elapsed: 20.7min
[Parallel(n_jobs=4)]: Done 8442 tasks      | elapsed: 24.2min
[Parallel(n_jobs=4)]: Done 9792 tasks      | elapsed: 27.9min
[Parallel(n_jobs=4)]: Done 10080 out of 10080 | elapsed:

In [19]:
print(gradient_boosting_reg_output.best_params_)

best_model = gradient_boosting_reg_output.best_estimator_

y_pred = best_model.predict(X_test_unbalanced)

y_pred_class = [adjust_regression(value) for value in y_pred]

MSE = mean_squared_error(y_test_unbalanced, y_pred)
RMSE = math.sqrt(MSE)
MAE = mean_absolute_error(y_test_unbalanced, y_pred)
acc = accuracy_score(y_test_unbalanced, y_pred_class)
R_square = best_model.score(X_test_unbalanced, y_test_unbalanced)

labels = list(y_test_unbalanced.values.ravel())

one_count = sum([1 if i == 1 else 0 for i in labels])
two_count = sum([1 if i == 2 else 0 for i in labels])
three_count = sum([1 if i == 3 else 0 for i in labels])
total = len(labels)

print('\nPercent 1: {}'.format(one_count/total))
print('Percent 2: {}'.format(two_count/total))
print('Percent 3: {}\n'.format(three_count/total))

print('RMSE: {}'.format(RMSE))
print('MAE: {}'.format(MAE))
print('ACC: {}'.format(acc))
print('R^2: {}\n'.format(R_square))

print(best_model.feature_importances_)

{'learning_rate': 0.01, 'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 75}

Percent 1: 0.5212765957446809
Percent 2: 0.3617021276595745
Percent 3: 0.11702127659574468

RMSE: 0.7090387025155945
MAE: 0.5973488174039788
ACC: 0.48936170212765956
R^2: -0.05866879179979034

[0.91189933 0.08810067]


In [15]:
model = LinearRegression(normalize=True)
model.fit(X_train_unbalanced, y_train_unbalanced)
print(model.coef_)

score = model.score(X_test_unbalanced, y_test_unbalanced)

print(score)

[[-0.01193067  0.12588847]]
0.017410606273345453


#Gradient Boosted Regression (Sythetic Train, Unbalanced Test)

In [None]:
model_params = {'learning_rate': [0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 1],
              'n_estimators': [10, 25, 50, 75, 100, 200, 300, 400, 500, 600, 700, 800],
              'max_depth':[2, 3, 4, 5, 6, 7, 8, 9],
              'max_features':['auto', 'sqrt', 'log2']
             }

In [None]:

model_params = {'learning_rate': [0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 1],
              'n_estimators': [600, 700, 800, 900, 1000, 1250, 1500, 1750, 2000],
              'max_depth':[7, 8, 9, 10, 11, 12, 13, 14],
              'max_features':['auto', 'sqrt', 'log2']
             }


In [None]:
"""
model_params = {'learning_rate': [0.005, 0.01, 0.05, 0.1, 0.25],
              'n_estimators': [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30],
              'max_depth':[2, 3, 4, 5, 6],
              'max_features':['auto', 'sqrt', 'log2']
             }
"""

In [None]:
from joblib import parallel_backend
with parallel_backend('threading', n_jobs=4):

  gradient_boosting_reg_output = GridSearchCV(GradientBoostingRegressor(min_samples_split=10,
                                                      min_samples_leaf=2,
                                                      max_features='sqrt', 
                                                      random_state=123
                                                      ), 
                                        param_grid=model_params, 
                                        scoring="neg_root_mean_squared_error",
                                        verbose=1,
                                          ).fit(X_train_balanced, y_train_balanced.values.ravel())

Fitting 5 folds for each of 1512 candidates, totalling 7560 fits


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.


KeyboardInterrupt: ignored

In [None]:
print(gradient_boosting_reg_output.best_params_)

best_model = gradient_boosting_reg_output.best_estimator_

y_pred = best_model.predict(X_test_unbalanced)

y_pred_class = [adjust_regression(value) for value in y_pred]

MSE = mean_squared_error(y_test_unbalanced, y_pred)
RMSE = math.sqrt(MSE)
MAE = mean_absolute_error(y_test_unbalanced, y_pred)
acc = accuracy_score(y_test_unbalanced, y_pred_class)
R_square = best_model.score(X_test_unbalanced, y_test_unbalanced)

labels = list(y_test_unbalanced.values.ravel())

one_count = sum([1 if i == 1 else 0 for i in labels])
two_count = sum([1 if i == 2 else 0 for i in labels])
three_count = sum([1 if i == 3 else 0 for i in labels])
total = len(labels)

print('\nPercent 1: {}'.format(one_count/total))
print('Percent 2: {}'.format(two_count/total))
print('Percent 3: {}\n'.format(three_count/total))

print('RMSE: {}'.format(RMSE))
print('MAE: {}'.format(MAE))
print('ACC: {}'.format(acc))
print('R^2: {}\n'.format(R_square))

print(best_model.feature_importances_)

c_matrix = confusion_matrix(y_test_unbalanced, y_pred_class)

print(c_matrix)

names = ['easy', 'medium','hard']
CM = ConfusionMatrix(names, c_matrix,
                     figsize=(800,800), colbar=True, labels=True)
fig = CM.get_fig()
fig.show()

{'learning_rate': 0.01, 'max_depth': 9, 'max_features': 'sqrt', 'n_estimators': 800}

Percent 1: 0.5212765957446809
Percent 2: 0.3617021276595745
Percent 3: 0.11702127659574468

RMSE: 0.6782773429477819
MAE: 0.5901023137844529
ACC: 0.4574468085106383
R^2: 0.03119839838944416

[0.02474633 0.04194615 0.02188854 0.01672821 0.02370696 0.00124203
 0.01490963 0.02841828 0.02223968 0.01441938 0.05628539 0.01756613
 0.01833561 0.01452452 0.01518396 0.01886595 0.05685973 0.02631541
 0.02603351 0.04651099 0.030954   0.02775602 0.01695575 0.01204397
 0.01694354 0.006742   0.02493224 0.00018665 0.         0.03825028
 0.02388206 0.02941392 0.01699693 0.01743899 0.02054235 0.01548645
 0.03387353 0.01858943 0.01487781 0.01478764 0.02522672 0.01910075
 0.         0.01546417 0.01901807 0.02012424 0.01368606]
[[23 26  0]
 [14 20  0]
 [ 3  8  0]]
