In [None]:
import pandas as pd

# Load the games table; the first CSV column becomes the row index
data = pd.read_csv('games.csv', index_col=0)

In [None]:
# Prepare the data for modelling: encode the venue as a binary flag
# (Home -> 1, Away -> 0); any value missing from the mapping becomes NaN
data['venue'] = data['venue'].map(dict(Home=1, Away=0))

In [None]:
# Display the frame to check the encoded 'venue' column
data

In [None]:
# Replace every opponent name with its integer category code
data['Opponent'] = (
    data['Opponent']
    .astype('category')
    .cat
    .codes
)
data

In [None]:
# New frame without the 'Time' column; `data` itself is left untouched
games = data.drop('Time', axis=1)
games

In [None]:
# Exclude the rows whose Team is the Arizona Coyotes (no longer an active team)
games = games.loc[games["Team"].ne("Arizona Coyotes")]

In [None]:
# Chronological split at the cutoff date. Using `<` together with `>=` makes
# the two sets a complete partition: previously a game dated exactly on the
# cutoff was silently excluded from both sets.
# These two names are reassigned further down from `games_data`, once the
# rolling features exist.
# NOTE(review): 'Date' is compared against a string literal — this assumes the
# column holds ISO 'YYYY-MM-DD' values (or datetimes); confirm.
training = games[games['Date'] < '2024-04-19']   # Training using 2021-2024 data
testing = games[games['Date'] >= '2024-04-19']   # Testing on most recent season (2024-2025)

import xgboost as xgb

In [None]:
from xgboost import XGBClassifier

# Base classifier passed to the grid search below; the fixed random_state keeps fits repeatable
model = XGBClassifier(random_state=10)

In [None]:
# Hyper-parameter candidates explored by GridSearchCV
# (4 * 3 * 3 * 5 * 5 = 900 combinations)
search_grid = dict(
    n_estimators=[50, 100, 200, 500],
    max_depth=[3, 6, 9],
    learning_rate=[0.001, 0.01, 0.1],
    reg_alpha=[0, 0.1, 1, 5, 10],
    reg_lambda=[0.1, 1, 5, 10, 20],
)

In [None]:
# Display the cleaned games frame
games

In [None]:
# Rolling-average preparation: group the games by team and pull out one
# team's games, ordered by date, as a sample to inspect
games_o = games.groupby(by="Team")
team = games_o.get_group("Anaheim Ducks").sort_values(by="Date")

def rolling_averages(team, cols, new_cols, window=3):
    """Add rolling means of a team's previous games to its game log.

    Parameters
    ----------
    team : pandas.DataFrame
        Games of one team. Must contain a "Date" column and every column
        named in ``cols``.
    cols : list of str
        Columns to average.
    new_cols : list of str
        Names of the columns that receive the averages, in the same order
        as ``cols``.
    window : int, default 3
        Number of preceding games each average is computed over. The
        default reproduces the previous hard-coded behaviour.

    Returns
    -------
    pandas.DataFrame
        A copy of ``team`` sorted by "Date" with ``new_cols`` added. Rows
        whose rolling values are NaN are removed: the first ``window``
        games, and any game whose window contains a missing value.
        The input frame is not modified.
    """
    # sort_values returns a new frame, so the caller's data is left untouched
    team = team.sort_values("Date")
    # closed='left' leaves the current game out of its own window, so each
    # average only uses games played before the one in that row
    rolling = team[cols].rolling(window, closed='left').mean()
    team[new_cols] = rolling
    # the first rows have fewer than `window` earlier games and are NaN
    return team.dropna(subset=new_cols)


In [None]:
# Display the sample team's games (sorted by date)
team

In [None]:
# Per-game stats to average, and the matching names for the averaged columns
cols = ['G', 'GA', 'S', 'S%', 'SV%', 'PIM']
new_cols = [c + "_rolling" for c in cols]

In [None]:
# Build the rolling features one team at a time and stack the results.
# Iterating over the groups (rather than GroupBy.apply) hands every call the
# full sub-frame including its 'Team' column; apply operating on the grouping
# column is deprecated in recent pandas, and losing 'Team' would break the
# later cells that read or drop that column.
# The result keeps the original row index (no extra 'Team' index level), and
# groups are stacked in sorted team order, as before.
games_data = pd.concat(
    [
        rolling_averages(team_games, cols, new_cols)
        for _, team_games in games.groupby('Team')
    ]
)
games_data

In [None]:
# Sanity check on how many rows the rolling step removed.
# Expectation noted by the author: 3 games dropped for each of 33 teams
# (32 + Utah Hockey Club added last season) -> 99 rows.
# NOTE(review): Arizona Coyotes rows are filtered out of `games` earlier, so
# confirm that 33 is still the right team count for this check.
print(f"Number of rows before: {len(games)}")
print(f"Number of rows after: {len(games_data)}")
print(f"Rows removed: {len(games) - len(games_data)}")

In [None]:
# Give the stacked frame a clean 0..n-1 row index
games_data.index = pd.RangeIndex(len(games_data))
games_data

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over `search_grid` with 5-fold cross-validation, scored
# on accuracy; the best configuration by accuracy is refit afterwards
GS = GridSearchCV(
    estimator=model,
    param_grid=search_grid,
    scoring=["accuracy"],
    refit="accuracy",
    cv=5,
    verbose=4,
)

In [None]:
# Chronological split of the feature frame at the cutoff date. Using `<`
# together with `>=` makes the two sets a complete partition: previously a
# game dated exactly on the cutoff was silently excluded from both sets.
# NOTE(review): 'Date' is compared against a string literal — this assumes the
# column holds ISO 'YYYY-MM-DD' values (or datetimes); confirm.
training = games_data[games_data['Date'] < '2024-04-19']   # Training using 2021-2024 data
testing = games_data[games_data['Date'] >= '2024-04-19']   # Testing on most recent season (2024-2025)


In [None]:
# Dropping 'Date' column because not needed anymore
training = training.drop(columns=['Date'])
testing = testing.drop(columns=['Date'])

In [None]:
# Getting training setup
x_train = training.drop(columns=['Result', 'Team'])
y_train = training['Result']

# Getting testing setup
x_test = testing.drop(columns=['Result', 'Team'])
y_test = testing['Result']

In [None]:
# Run the cross-validated grid search on the training set
GS.fit(x_train, y_train)

In [None]:
# Best cross-validated score found by the search
GS.best_score_

In [None]:
# Keep a handle on the estimator that the search refit with the best parameters
actual_model = GS.best_estimator_

In [None]:
# Predict the results of the held-out test games
predictions = actual_model.predict(x_test)

In [None]:
# Side-by-side table of true and predicted results, indexed like the test features
combined = pd.DataFrame(
    {'actual': y_test, 'predicted': predictions},
    index=x_test.index,
)

In [None]:
# Display actual vs. predicted results
combined

In [None]:
# Attach date, team and opponent to each prediction by matching row index
combined = pd.merge(
    combined,
    games_data[['Date', 'Team', 'Opponent']],
    left_index=True,
    right_index=True,
)

In [None]:
# Display the predictions together with their game details
combined

In [None]:
# Keep only the games where the prediction differs from the actual result
test = combined.loc[combined['actual'] != combined['predicted']]

In [None]:
# Display the misclassified games
test

In [None]:
# Inspect the training feature matrix
x_train

In [None]:
# Inspect the training targets
y_train

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Classification report followed by the confusion matrix for the held-out games
print(classification_report(y_true=y_test, y_pred=predictions))
print(confusion_matrix(y_true=y_test, y_pred=predictions))

In [None]:
import numpy as np
# Sanity check: refit the search on randomly permuted labels. With the link
# between features and target destroyed, the score should fall to chance
# level; a high score here would point to a flaw in the setup.
# A seeded generator makes the permutation identical on every run (the seed
# matches the random_state given to the model).
# Note: this call refits `GS` itself, so its attributes afterwards describe
# the shuffled-label search; `actual_model` was captured before this point.
y_train_shuffled = np.random.default_rng(10).permutation(y_train.to_numpy())
GS.fit(x_train, y_train_shuffled)

In [None]:
# Best cross-validated score of the most recent GS.fit call (the shuffled-label fit above)
GS.best_score_