In [1]:
import pandas as pd

# Load the games table from CSV; the file's first column becomes the row index.
data = pd.read_csv('games.csv', index_col=0)

In [2]:
# Prepare the raw table for modelling in a single chained pass:
#   * venue    -> 1 for 'Home', 0 for 'Away'
#   * Date     -> pandas datetime
#   * Opponent -> integer category codes
# then drop the unused 'Time' column and exclude the Arizona Coyotes rows.
data = (
    data
    .assign(
        venue=data['venue'].map({'Home' : 1, 'Away' : 0}),
        Date=pd.to_datetime(data['Date']),
        Opponent=data['Opponent'].astype('category').cat.codes,
    )
    .drop(columns=['Time'])
    .loc[lambda frame: frame["Team"] != "Arizona Coyotes"]
)

# Non-rolling feature columns; later combined with the rolling-average columns.
predict = ['Opponent', 'venue']

In [3]:
# Display the frame after the cleaning cell above (venue/Opponent encoded, 'Time' dropped).
data

Unnamed: 0,Date,Team,Opponent,venue,Att.,G,GA,S,S%,SV%,PIM,SA,SA%,Opponent SV%,Opponent PIM,Result
0,2021-12-12,Anaheim Ducks,25,0,17010.0,3,2,39,7.7,0.920,2,25,8.0,0.923,2,1
1,2023-01-28,Anaheim Ducks,1,1,16126.0,2,1,45,4.4,0.971,20,34,2.9,0.956,10,1
2,2022-01-14,Anaheim Ducks,14,0,18300.0,3,7,42,7.1,0.793,5,42,16.7,0.929,7,0
3,2024-11-05,Anaheim Ducks,29,1,13538.0,1,5,22,4.5,0.865,10,37,13.5,0.955,8,0
4,2022-10-12,Anaheim Ducks,24,1,17530.0,5,4,27,18.5,0.917,15,48,8.3,0.815,11,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10491,2024-01-02,Winnipeg Jets,26,1,14157.0,4,2,28,14.3,0.941,6,34,5.9,0.889,6,1
10492,2025-03-07,Winnipeg Jets,17,0,16088.0,6,1,35,17.1,0.957,4,23,4.3,0.829,4,1
10493,2023-02-20,Winnipeg Jets,19,0,18006.0,4,1,21,19.0,0.980,13,51,2.0,0.810,7,1
10494,2025-03-09,Winnipeg Jets,5,0,18700.0,2,4,22,9.1,0.885,24,27,14.8,0.909,8,0


In [38]:
def rolling_averages(team, cols, new_cols, window=3):
    """Add trailing rolling-mean features for one team's games.

    The games are sorted by ``"Date"`` and, for every column in ``cols``, the
    mean over the previous ``window`` games is stored under the matching name
    in ``new_cols``. The current game is excluded from its own window
    (``closed='left'``), so a row's features only use earlier games.

    Parameters
    ----------
    team : pandas.DataFrame
        Rows for a single team; must contain a ``"Date"`` column and every
        column listed in ``cols``.
    cols : list of str
        Source columns to average.
    new_cols : list of str
        Output column names, positionally matched with ``cols``.
    window : int, default 3
        Number of preceding games averaged for each row.

    Returns
    -------
    pandas.DataFrame
        A date-sorted copy of ``team`` with the ``new_cols`` added. Rows with a
        missing value in any of ``new_cols`` (e.g. the first ``window`` games,
        which lack enough history) are dropped. The input frame is not modified.
    """
    team = team.sort_values("Date")    # chronological order; also yields a new frame
    # closed='left' shifts the window so it ends just before the current row
    rolling = team[cols].rolling(window, closed='left').mean()
    team[new_cols] = rolling
    team = team.dropna(subset=new_cols)  # rows without a full window of history
    return team

In [39]:
cols = ['G', 'GA', 'S', 'S%', 'SV%', 'PIM', 'Result']   # per-game stats to average over previous games
new_cols = [f"{c}_rolling" for c in cols]
predictors = new_cols + predict

# Build the rolling features one team at a time and stack the results.
# Iterating over the groups (instead of groupby(...).apply) keeps the 'Team'
# column in every per-team frame without relying on apply operating on the
# grouping column, which pandas has deprecated. Groups are visited in sorted
# team order and ignore_index=True gives a fresh 0..n-1 index.
games_data = pd.concat(
    [rolling_averages(team_games, cols, new_cols) for _, team_games in data.groupby('Team')],
    ignore_index=True,
)
games_data

  games_data = data.groupby('Team').apply(lambda x: rolling_averages(x, cols, new_cols))


Unnamed: 0,Date,Team,Opponent,venue,Att.,G,GA,S,S%,SV%,...,Opponent SV%,Opponent PIM,Result,G_rolling,GA_rolling,S_rolling,S%_rolling,SV%_rolling,PIM_rolling,Result_rolling
0,2021-10-19,Anaheim Ducks,11,0,14082.0,5,6,36,13.9,0.861,...,0.733,4,0,2.666667,1.666667,26.000000,10.900000,0.959000,20.000000,0.666667
1,2021-10-21,Anaheim Ducks,32,0,13886.0,1,5,39,2.6,0.846,...,0.974,6,0,3.000000,3.333333,30.666667,9.466667,0.922333,16.000000,0.333333
2,2021-10-23,Anaheim Ducks,14,0,18055.0,3,4,24,12.5,0.889,...,0.875,10,0,3.000000,4.333333,34.000000,9.200000,0.886667,9.000000,0.333333
3,2021-10-26,Anaheim Ducks,32,1,11951.0,3,4,35,8.6,0.840,...,0.914,6,0,3.000000,5.000000,33.000000,9.666667,0.865333,9.333333,0.000000
4,2021-10-28,Anaheim Ducks,3,1,12014.0,3,4,37,8.1,0.862,...,0.919,2,0,2.333333,4.333333,32.666667,7.900000,0.858333,6.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10146,2025-04-07,Winnipeg Jets,25,1,15225.0,3,1,26,11.5,0.933,...,0.920,2,1,2.000000,2.666667,24.666667,8.833333,0.910333,3.333333,0.333333
10147,2025-04-10,Winnipeg Jets,9,0,18532.0,4,0,35,11.4,1.000,...,0.886,7,1,2.666667,1.666667,27.000000,10.900000,0.940667,4.000000,0.666667
10148,2025-04-12,Winnipeg Jets,6,0,20634.0,5,4,42,9.5,0.875,...,0.905,4,1,2.666667,1.666667,31.333333,8.633333,0.940667,5.666667,0.666667
10149,2025-04-13,Winnipeg Jets,11,1,15225.0,1,4,18,5.6,0.921,...,0.944,4,0,4.000000,1.666667,34.333333,10.800000,0.936000,5.000000,1.000000


In [35]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Base classifier; only random_state is set here, all other hyperparameters are left at their defaults.
model = XGBClassifier(random_state=10)

In [36]:
from sklearn.metrics import precision_score


def make_predictions(data, predictors, model, split_date='2024-04-19'):
    """Fit ``model`` on games before ``split_date`` and evaluate on the rest.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'Date'`` column, a ``'Result'`` target column and
        every column named in ``predictors``.
    predictors : list of str
        Feature columns passed to the model.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    split_date : str or datetime-like, default '2024-04-19'
        Rows dated strictly before this go to training; rows on or after it
        go to testing, so no row is left out of both sets.

    Returns
    -------
    combined : pandas.DataFrame
        ``actual`` and ``prediction`` columns, indexed like the test rows.
    precision : float
        ``precision_score`` of the predictions against the test targets.
    """
    train = data[data['Date'] < split_date]
    # '>=' (not '>') so games played exactly on the cutoff are not silently dropped
    test = data[data['Date'] >= split_date]
    model.fit(train[predictors], train['Result'])
    preds = model.predict(test[predictors])
    combined = pd.DataFrame({'actual': test['Result'], 'prediction': preds}, index=test.index)
    precision = precision_score(test['Result'], preds)
    return combined, precision

In [8]:
# Hyperparameter grid explored exhaustively by GridSearchCV
# (4 * 3 * 2 * 4 * 5 = 480 combinations).
search_grid = {
    'n_estimators' : [50, 100, 200, 500],
    'max_depth' : [3, 6, 9],
    'learning_rate' : [0.01, 0.1],
    'reg_alpha': [0.1, 1, 5, 10],
    'reg_lambda': [0.1, 1, 5, 10, 20],
}

# 5-fold cross-validated search scored on accuracy; with refit="accuracy" the
# best combination is refit and exposed as GS.best_estimator_.
GS = GridSearchCV(
    estimator = model,
    param_grid = search_grid,
    scoring = ["accuracy"],
    refit = "accuracy",
    cv = 5,
    verbose= 4
)

# Single cutoff for the chronological split, so both sides always agree.
SPLIT_DATE = '2024-04-19'

training = games_data[games_data['Date'] < SPLIT_DATE]   # Training using 2021-2024 data
# '>=' (not '>') so games dated exactly on the cutoff are not dropped from both sets
testing = games_data[games_data['Date'] >= SPLIT_DATE]   # Testing on most recent season (2024-2025)


In [10]:
# Run the exhaustive search on the training split only: 480 candidates x 5 folds = 2400 fits.
GS.fit(training[predictors], training['Result'])

Fitting 5 folds for each of 480 candidates, totalling 2400 fits
[CV 1/5] END learning_rate=0.01, max_depth=3, n_estimators=50, reg_alpha=0.1, reg_lambda=0.1; accuracy: (test=0.594) total time=   0.0s
[CV 2/5] END learning_rate=0.01, max_depth=3, n_estimators=50, reg_alpha=0.1, reg_lambda=0.1; accuracy: (test=0.554) total time=   0.0s
[CV 3/5] END learning_rate=0.01, max_depth=3, n_estimators=50, reg_alpha=0.1, reg_lambda=0.1; accuracy: (test=0.566) total time=   0.0s
[CV 4/5] END learning_rate=0.01, max_depth=3, n_estimators=50, reg_alpha=0.1, reg_lambda=0.1; accuracy: (test=0.531) total time=   0.0s
[CV 5/5] END learning_rate=0.01, max_depth=3, n_estimators=50, reg_alpha=0.1, reg_lambda=0.1; accuracy: (test=0.527) total time=   0.0s
[CV 1/5] END learning_rate=0.01, max_depth=3, n_estimators=50, reg_alpha=0.1, reg_lambda=1; accuracy: (test=0.593) total time=   0.0s
[CV 2/5] END learning_rate=0.01, max_depth=3, n_estimators=50, reg_alpha=0.1, reg_lambda=1; accuracy: (test=0.553) total t

0,1,2
,estimator,"XGBClassifier...ree=None, ...)"
,param_grid,"{'learning_rate': [0.01, 0.1], 'max_depth': [3, 6, ...], 'n_estimators': [50, 100, ...], 'reg_alpha': [0.1, 1, ...], ...}"
,scoring,['accuracy']
,n_jobs,
,refit,'accuracy'
,cv,5
,verbose,4
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [11]:
# Mean cross-validated accuracy of the best parameter combination found by the search.
GS.best_score_

np.float64(0.5695883134130146)

In [12]:
# Estimator that GridSearchCV refit with the best-scoring parameters (refit="accuracy").
new_model = GS.best_estimator_
new_model

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [40]:
# Fit the tuned model on the pre-cutoff games and report precision on the later games.
combined, precision = make_predictions(games_data, predictors, new_model)
precision

0.5970443349753695

In [41]:
# Actual vs. predicted 'Result' for each test-period row, indexed like games_data.
combined

Unnamed: 0,actual,prediction
243,1,1
244,0,0
245,1,0
246,0,0
247,0,0
...,...,...
10146,1,0
10147,1,0
10148,1,1
10149,0,1


In [42]:
# Attach each prediction's game context (date, team, opponent code, result),
# matching rows on the shared index.
game_context = games_data[['Date', 'Team', 'Opponent', 'Result']]
combined = pd.merge(combined, game_context, left_index=True, right_index=True)
combined

Unnamed: 0,actual,prediction,Date,Team,Opponent,Result
243,1,1,2024-10-12,Anaheim Ducks,23,1
244,0,0,2024-10-13,Anaheim Ducks,30,0
245,1,0,2024-10-16,Anaheim Ducks,28,1
246,0,0,2024-10-18,Anaheim Ducks,7,0
247,0,0,2024-10-20,Anaheim Ducks,13,0
...,...,...,...,...,...,...
10146,1,0,2025-04-07,Winnipeg Jets,25,1
10147,1,0,2025-04-10,Winnipeg Jets,9,1
10148,1,1,2025-04-12,Winnipeg Jets,6,1
10149,0,1,2025-04-13,Winnipeg Jets,11,0
