In [4]:
!pip install category_encoders --quiet
!pip install pdpbox --quiet
!pip install scikit-garden --quiet

In [None]:
import pandas as pd
import numpy as np
import datetime as datetime
import matplotlib.pyplot as plt
from category_encoders import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.inspection import permutation_importance
from xgboost import XGBRegressor 
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_poisson_deviance
from skgarden.quantile import RandomForestQuantileRegressor


URL = 'https://raw.githubusercontent.com/yaobviously/DataCup2021/main/modifiedohl.csv'
shots_next_ten = pd.read_csv('https://raw.githubusercontent.com/yaobviously/DataCup2021/main/shotsnext10.csv',
                             index_col = 'game_date')

In [8]:
def wrangle(URL):
  df = pd.read_csv(URL,
                   parse_dates = ['game_date'],
                   index_col = 'game_date')
  
  # dropping columns i made awhile ago that i don't need to build models, 
  # although they may be handy for communicating insights. all of the score
  # and player differentials can now, i see, be reduced to single columns for
  # the purposes of model building

  col_drop = ['Unnamed: 0', 'possSet', 'shots_next_ten', 'goal_next_ten', 
              '5on5', '5on4home', '5on3home', '5on4away', '5on3away', 
              '4on4', 'tie', 'home_ahead_1', 'home_ahead_2', 'home_ahead_3ormore',
              'away_ahead_1', 'away_ahead_2', 'away_ahead_3ormore', 'home_team',
              'away_team', 'x_coordinate_2', 'y_coordinate_2', 'is_shot', 'player_2']

  df = df.drop(columns = col_drop).copy()
  df = df.rename(columns = {'detail_3' : 'traffic', 'detail_4' : 'one_timer',
                            'detail_1' : 'shot_type', 'detail_2' : 'shot_result'})
  
  # condensing the above columns

  df['home_skater_adv'] = df['home_team_skaters'] - df['away_team_skaters']
  df['home_score_diff'] = df['home_team_goals'] - df['away_team_goals']
  df['is_shot'] = [1 if event in ['Goal', 'Shot'] else 0 for event in df['event']]

  # converting the clock column to the seconds remaining in the period in the
  # ugliest way imaginable
  
  df['min'] = df.clock.apply(lambda x: datetime.datetime.strptime(x,'%M:%S')).dt.minute
  df['sec'] = df.clock.apply(lambda x: datetime.datetime.strptime(x, '%M:%S')).dt.second
  df['period_sec_rem'] = df['min'] * 60 + df['sec']
  
  df.drop(columns = 'clock', inplace = True)

  # calculating the seconds remaining in the game, mostly for practice (needed!)

  def gamesecs(x):
  
    if x['period'] == 1:
      return x['period_sec_rem'] + 2400

    if x['period'] == 2:
      return x['period_sec_rem'] + 1200
  
    else:
      return x['period_sec_rem']

  df['game_sec_rem'] = df.apply(gamesecs, axis = 1)

  # creating new columns that contain info on prior game states

  df['x_coordinate_1back'] = df['x_coordinate'].shift()
  df['y_coordinate_1back'] = df['y_coordinate'].shift()
  df['x_coordinate_2back'] = df['x_coordinate'].shift(2)
  df['y_coordinate_2back'] = df['y_coordinate'].shift(2)
  df['event_1back'] = df['event'].shift()
  df['event_2back'] = df['event'].shift(2)

  df['sec_last_event'] = np.abs(df.groupby(['gameid', 'period'])['period_sec_rem'].diff())
  df['sec_2ndlast_event'] = np.abs(df.groupby(['gameid', 'period'])['period_sec_rem'].diff(2))

  # dropping redundant and leaky columns. note 'event' may be useful for
  # finding and creating useful (and non-leaky) features
  
  df.drop(columns = ['min', 'sec', 'is_shot'], inplace = True)

  # dropping columns with negligible importance. note 'team' & 'period may be 
  # useful for finding and creating useful (and non-leaky) features.

  lowimp_columns = ['away_team_goals', 'home_team_goals', 
                    'player', 'home_team_skaters',
                    'away_team_skaters', 'traffic', 'gameid', 'period', 'team',
                    'one_timer', 'is_goal', 'period_sec_rem']

  

  df.drop(lowimp_columns, axis = 1, inplace = True)

  # I used the below code to create a column counting the shots in the next 10
  # seconds of gameplay. because it takes so long to run, i decided to save it directly
  # to a new .csv file
  
  # sums = []
  # for game, period, sec in zip(df['gameid'], df['period'], df['period_sec_rem']):
    
  # sec_plus_10 = sec - 10  
  # mask = ((df['gameid'] == game) & \
  # (df['period'] == period) & \
  # (df['period_sec_rem'] >= sec_plus_10) & \
  # (df['period_sec_rem'] < sec))
  # sum_shots = df.loc[mask]['is_shot'].sum()
  # sums.append(sum_shots)

  # adding the shots column
  df['shots_next_ten'] = shots_next_ten['shots_next_10']
  

  return df

In [9]:
df = wrangle(URL)

In [73]:
target = 'shots_next_ten'

train_threshold = '2020-01-15'
test_threshold = '2020-02-13'

mask = df.index < train_threshold
mask2 = df.index > test_threshold

X = df.drop(target, axis = 1)
y = df[target]

X_train, y_train = X[mask], y[mask]
X_val, y_val = X[~mask & ~mask2], y[~mask & ~mask2]
X_test, y_test = X[mask2], y[mask2]

In [95]:
naive_pred_train = [y_train.mean()] * len(y_train)
baseline_MAE_train = mean_absolute_error(y_train, naive_pred_train)
baseline_poisson_deviance = mean_poisson_deviance(y_train, naive_pred_train)
print('The mean absolute error of our naive estimator is:', baseline_MAE_train)
print('The baseline Poisson deviance is:', baseline_poisson_deviance)

The mean absolute error of our naive estimator is: 0.5034843933841647
The baseline Poisson deviance is: 0.9052658453895543


In [None]:
model_xg = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(strategy = 'median'),
    XGBRegressor(objective = 'count:poisson')
)

model_xg.fit(X_train, y_train)

In [None]:
# hyperparamter tuning the XGBoost model

In [2]:
# creating another model using the QuantileRandomForestRegressor from scikit garden

model_rfq = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(strategy = 'median'),
    RandomForestQuantileRegressor()
)

model_rfq.fit(X_train, y_train);

NameError: ignored

NameError: ignored

In [96]:
model_MAE = mean_absolute_error(y_val, model_xg.predict(X_val))
model_rsquared = model_xg.score(X_val, y_val)
model_poisson_deviance = mean_poisson_deviance(y_val, model_xg.predict(X_val))
print('The MAE of my first model is:', model_MAE)
print('The R Squared of my first model is:', model_rsquared)
print('The model Poisson deviance is:', model_poisson_deviance)

The MAE of my first model is: 0.4524271821130668
The R Squared of my first model is: 0.07900258547127381
The model Poisson deviance is: 0.8182386347929967


In [None]:
features = model_xg.named_steps['ordinalencoder'].get_feature_names()
feature_importances = model_xg.named_steps['xgbregressor'].feature_importances_

feature_series = pd.Series(feature_importances, index = features).sort_values()

feature_series.plot.barh()
plt.title('Feature Importances for the XGB Regressor Model')
plt.xlabel('Importances')
plt.show();

In [87]:
perm_imp_regressor = permutation_importance(model_xg, X_val, y_val, n_repeats = 5)

perm_dict = {
    'importance_mean' : perm_imp_regressor['importances_mean'],
    'importance_std' : perm_imp_regressor['importances_std']
}


In [None]:
perm_df = pd.DataFrame(perm_dict, index = features)
perm_df