In [1]:
import pandas as pd

data = pd.read_csv('newdata.csv', index_col=0)
data

Unnamed: 0,Team,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,...,LDGF%,LDSH%,LDSV%,SH%,SV%,PDO,Attendance,Opponent,Date,Result
0,Pittsburgh Penguins,60:00,58,55,51.33,47,40,54.02,35,28,...,100.00,11.11,100.00,17.14,92.86,1.100,19092,Tampa Bay Lightning,2021-10-12,1
1,Tampa Bay Lightning,60:00,55,58,48.67,40,47,45.98,28,35,...,0.00,0.00,88.89,7.14,82.86,0.900,19092,Pittsburgh Penguins,2021-10-12,0
2,Seattle Kraken,60:00,63,58,52.07,44,47,48.35,31,30,...,100.00,6.67,100.00,9.68,86.67,0.963,18431,Vegas Golden Knights,2021-10-12,0
3,Vegas Golden Knights,60:00,58,63,47.93,47,44,51.65,30,31,...,0.00,0.00,93.33,13.33,90.32,1.037,18431,Seattle Kraken,2021-10-12,1
4,Montreal Canadiens,60:00,55,64,46.22,41,51,44.57,32,30,...,-,0.00,100.00,3.13,93.33,0.965,18493,Toronto Maple Leafs,2021-10-13,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10489,Tampa Bay Lightning,60:00,50,46,52.08,36,35,50.70,27,22,...,-,0.00,100.00,0.00,81.82,0.818,18006,New York Rangers,2025-04-17,0
10490,Pittsburgh Penguins,60:00,74,44,62.71,51,33,60.71,38,20,...,0.00,0.00,90.00,13.16,90.00,1.032,18348,Washington Capitals,2025-04-17,1
10491,Washington Capitals,60:00,44,74,37.29,33,51,39.29,20,38,...,100.00,10.00,100.00,10.00,86.84,0.968,18348,Pittsburgh Penguins,2025-04-17,0
10492,Columbus Blue Jackets,60:00,63,72,46.67,44,56,44.00,26,37,...,100.00,16.67,100.00,23.08,97.30,1.204,18874,New York Islanders,2025-04-17,1


In [2]:

def rolling_averages(team, cols, new_cols, window=5):
    team = team.sort_values("Date")    # Getting team data organized chronologically
    rolling = team[cols].rolling(window, closed='left').mean()   # closed=left to ignore current row in sliding window
    team[new_cols] = rolling
    team = team.dropna(subset=new_cols) # dropping first rows because not enough data
    return team


In [3]:
data_opp = data.drop(columns=['Opponent']).rename(columns={
    'Team': 'Opponent',
    'CF': 'Opponent_CF',
    'CA': 'Opponent_CA',
    'CF%': 'Opponent_CF%',
    'FF': 'Opponent_FF',
    'FA': 'Opponent_FA',
    'FF%': 'Opponent_FF%',
    'SF': 'Opponent_SF',
    'SA': 'Opponent_SA',
    'GF': 'Opponent_GF',
    'GA': 'Opponent_GA',
    'xGF': 'Opponent_xGF',
    'xGA': 'Opponent_xGA',
    'xGF%': 'Opponent_xGF%',
    'HDCF' : 'Opponent_HDCF',
    'HDCF%' : 'Opponent_HDCF%',
    'SCF' : 'Opponent_SCF',
    'PDO' : 'Opponent_PDO'
})


merged = data.merge(
    data_opp,
    left_on=['Date', 'Opponent'],
    right_on=['Date', 'Opponent'],
    how='inner',
    suffixes=('', '_y')  # Avoids conflicts if any columns aren’t renamed
)

In [4]:
merged

Unnamed: 0,Team,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,...,LDGF_y,LDGA_y,LDGF%_y,LDSH%_y,LDSV%_y,SH%_y,SV%_y,Opponent_PDO,Attendance_y,Result_y
0,Pittsburgh Penguins,60:00,58,55,51.33,47,40,54.02,35,28,...,0,1,0.00,0.00,88.89,7.14,82.86,0.900,19092,0
1,Tampa Bay Lightning,60:00,55,58,48.67,40,47,45.98,28,35,...,1,0,100.00,11.11,100.00,17.14,92.86,1.100,19092,1
2,Seattle Kraken,60:00,63,58,52.07,44,47,48.35,31,30,...,0,1,0.00,0.00,93.33,13.33,90.32,1.037,18431,1
3,Vegas Golden Knights,60:00,58,63,47.93,47,44,51.65,30,31,...,1,0,100.00,6.67,100.00,9.68,86.67,0.963,18431,0
4,Montreal Canadiens,60:00,55,64,46.22,41,51,44.57,32,30,...,0,0,-,0.00,100.00,6.67,96.88,1.035,18493,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10489,Tampa Bay Lightning,60:00,50,46,52.08,36,35,50.70,27,22,...,0,0,-,0.00,100.00,18.18,100.00,1.182,18006,1
10490,Pittsburgh Penguins,60:00,74,44,62.71,51,33,60.71,38,20,...,1,0,100.00,10.00,100.00,10.00,86.84,0.968,18348,0
10491,Washington Capitals,60:00,44,74,37.29,33,51,39.29,20,38,...,0,1,0.00,0.00,90.00,13.16,90.00,1.032,18348,1
10492,Columbus Blue Jackets,60:00,63,72,46.67,44,56,44.00,26,37,...,0,2,0.00,0.00,83.33,2.70,76.92,0.796,18874,0


In [5]:
merged['CF_diff'] = merged['CF'] - merged['Opponent_CF']
merged['CF%_diff'] = merged['CF%'] - merged['Opponent_CF%']
merged['GF_diff'] = merged['GF'] - merged['Opponent_GF']
merged['xGF_diff'] = merged['xGF'] - merged['Opponent_xGF']
merged['HDCF_diff'] = merged['HDCF'] - merged['Opponent_HDCF']
merged['HDCF%_diff'] = merged['HDCF%'] - merged['Opponent_HDCF%']
merged['FF_diff'] = merged['FF'] - merged['Opponent_FF']
merged['FF%_diff'] = merged['FF%'] - merged['Opponent_FF%']
merged['SCF_diff'] = merged['SCF'] - merged['Opponent_SCF']
merged['PDO_diff'] = merged['PDO'] - merged['Opponent_PDO']


In [6]:
merged

Unnamed: 0,Team,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,...,CF_diff,CF%_diff,GF_diff,xGF_diff,HDCF_diff,HDCF%_diff,FF_diff,FF%_diff,SCF_diff,PDO_diff
0,Pittsburgh Penguins,60:00,58,55,51.33,47,40,54.02,35,28,...,3,2.66,4,2.10,4,20.00,7,8.04,11,0.200
1,Tampa Bay Lightning,60:00,55,58,48.67,40,47,45.98,28,35,...,-3,-2.66,-4,-2.10,-4,-20.00,-7,-8.04,-11,-0.200
2,Seattle Kraken,60:00,63,58,52.07,44,47,48.35,31,30,...,5,4.14,-1,-1.17,1,4.00,-3,-3.30,0,-0.074
3,Vegas Golden Knights,60:00,58,63,47.93,47,44,51.65,30,31,...,-5,-4.14,1,1.17,-1,-4.00,3,3.30,0,0.074
4,Montreal Canadiens,60:00,55,64,46.22,41,51,44.57,32,30,...,-9,-7.56,-1,-1.21,-8,-33.34,-10,-10.86,-16,-0.070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10489,Tampa Bay Lightning,60:00,50,46,52.08,36,35,50.70,27,22,...,4,4.16,-4,-0.05,2,12.50,1,1.40,6,-0.364
10490,Pittsburgh Penguins,60:00,74,44,62.71,51,33,60.71,38,20,...,30,25.42,3,0.83,3,13.04,18,21.42,19,0.064
10491,Washington Capitals,60:00,44,74,37.29,33,51,39.29,20,38,...,-30,-25.42,-3,-0.83,-3,-13.04,-18,-21.42,-19,-0.064
10492,Columbus Blue Jackets,60:00,63,72,46.67,44,56,44.00,26,37,...,-9,-6.66,5,-0.11,2,9.10,-12,-12.00,-8,0.408


In [7]:
merged['GF%'] = pd.to_numeric(merged['GF%'], errors='coerce')
merged['xGF%'] = pd.to_numeric(merged['xGF%'], errors='coerce')

print(merged[['GF%', 'xGF%']].dtypes)

GF%     float64
xGF%    float64
dtype: object


In [8]:
feature = [
    'CF%', 'FF%', 'SF%', 'xGF%', 'SCF%', 'HDCF%', 'GF%', 'SH%', 'SV%', 'HDCA', 'xGA', 'PDO'
]
feature += ['CF', 'CA', 'FF', 'FA', 'SF', 'GA', 'GF', 'SCA', 'SCF', 'HDCF']
feature_diff = ['CF_diff', 'CF%_diff', 'GF_diff', 'xGF_diff', 'HDCF_diff', 'HDCF%_diff', 'FF_diff', 'FF%_diff', 'SCF_diff', 'PDO_diff']

features = feature + feature_diff

features = ['SF%', 'PDO', 'PDO_diff', 'SV%', 'CF%', 'CF%_diff', 'FF%', 'FF%_diff', 'HDCF%', 'HDCF%_diff', 'SCF%', 'SCF_diff', 'GF%', 'xGF%']

#features = ['SCF%', 'xGF%', 'CF%_diff']

#all_predictors = ['CF%', 'SF%', 'xGF%', 'SV%']

new_cols = [f'{c}_rolling' for c in features]

predictors = new_cols




In [9]:
print(merged[features].dtypes)

SF%           float64
PDO           float64
PDO_diff      float64
SV%           float64
CF%           float64
CF%_diff      float64
FF%           float64
FF%_diff      float64
HDCF%         float64
HDCF%_diff    float64
SCF%          float64
SCF_diff        int64
GF%           float64
xGF%          float64
dtype: object


In [10]:
# Adjusting columns order for debugging / clarity
columns = list(merged.columns)

columns.remove('Date')
columns.remove('Result')

columns.insert(1, 'Date')
columns.insert(2, 'Result')

merged = merged[columns]
merged

Unnamed: 0,Team,Date,Result,TOI,CF,CA,CF%,FF,FA,FF%,...,CF_diff,CF%_diff,GF_diff,xGF_diff,HDCF_diff,HDCF%_diff,FF_diff,FF%_diff,SCF_diff,PDO_diff
0,Pittsburgh Penguins,2021-10-12,1,60:00,58,55,51.33,47,40,54.02,...,3,2.66,4,2.10,4,20.00,7,8.04,11,0.200
1,Tampa Bay Lightning,2021-10-12,0,60:00,55,58,48.67,40,47,45.98,...,-3,-2.66,-4,-2.10,-4,-20.00,-7,-8.04,-11,-0.200
2,Seattle Kraken,2021-10-12,0,60:00,63,58,52.07,44,47,48.35,...,5,4.14,-1,-1.17,1,4.00,-3,-3.30,0,-0.074
3,Vegas Golden Knights,2021-10-12,1,60:00,58,63,47.93,47,44,51.65,...,-5,-4.14,1,1.17,-1,-4.00,3,3.30,0,0.074
4,Montreal Canadiens,2021-10-13,0,60:00,55,64,46.22,41,51,44.57,...,-9,-7.56,-1,-1.21,-8,-33.34,-10,-10.86,-16,-0.070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10489,Tampa Bay Lightning,2025-04-17,0,60:00,50,46,52.08,36,35,50.70,...,4,4.16,-4,-0.05,2,12.50,1,1.40,6,-0.364
10490,Pittsburgh Penguins,2025-04-17,1,60:00,74,44,62.71,51,33,60.71,...,30,25.42,3,0.83,3,13.04,18,21.42,19,0.064
10491,Washington Capitals,2025-04-17,0,60:00,44,74,37.29,33,51,39.29,...,-30,-25.42,-3,-0.83,-3,-13.04,-18,-21.42,-19,-0.064
10492,Columbus Blue Jackets,2025-04-17,1,60:00,63,72,46.67,44,56,44.00,...,-9,-6.66,5,-0.11,2,9.10,-12,-12.00,-8,0.408


In [11]:
merged = merged[merged['Team'] != 'Arizona Coyotes']
merged = merged[merged['Opponent'] != 'Arizona Coyotes']
merged

Unnamed: 0,Team,Date,Result,TOI,CF,CA,CF%,FF,FA,FF%,...,CF_diff,CF%_diff,GF_diff,xGF_diff,HDCF_diff,HDCF%_diff,FF_diff,FF%_diff,SCF_diff,PDO_diff
0,Pittsburgh Penguins,2021-10-12,1,60:00,58,55,51.33,47,40,54.02,...,3,2.66,4,2.10,4,20.00,7,8.04,11,0.200
1,Tampa Bay Lightning,2021-10-12,0,60:00,55,58,48.67,40,47,45.98,...,-3,-2.66,-4,-2.10,-4,-20.00,-7,-8.04,-11,-0.200
2,Seattle Kraken,2021-10-12,0,60:00,63,58,52.07,44,47,48.35,...,5,4.14,-1,-1.17,1,4.00,-3,-3.30,0,-0.074
3,Vegas Golden Knights,2021-10-12,1,60:00,58,63,47.93,47,44,51.65,...,-5,-4.14,1,1.17,-1,-4.00,3,3.30,0,0.074
4,Montreal Canadiens,2021-10-13,0,60:00,55,64,46.22,41,51,44.57,...,-9,-7.56,-1,-1.21,-8,-33.34,-10,-10.86,-16,-0.070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10489,Tampa Bay Lightning,2025-04-17,0,60:00,50,46,52.08,36,35,50.70,...,4,4.16,-4,-0.05,2,12.50,1,1.40,6,-0.364
10490,Pittsburgh Penguins,2025-04-17,1,60:00,74,44,62.71,51,33,60.71,...,30,25.42,3,0.83,3,13.04,18,21.42,19,0.064
10491,Washington Capitals,2025-04-17,0,60:00,44,74,37.29,33,51,39.29,...,-30,-25.42,-3,-0.83,-3,-13.04,-18,-21.42,-19,-0.064
10492,Columbus Blue Jackets,2025-04-17,1,60:00,63,72,46.67,44,56,44.00,...,-9,-6.66,5,-0.11,2,9.10,-12,-12.00,-8,0.408


In [12]:
merged['Date'] = pd.to_datetime(merged['Date'])

data = merged.groupby('Team').apply(lambda x: rolling_averages(x, features, new_cols, 3))
data = data.droplevel('Team')
data.index = range(data.shape[0])
data

  data = merged.groupby('Team').apply(lambda x: rolling_averages(x, features, new_cols, 3))


Unnamed: 0,Team,Date,Result,TOI,CF,CA,CF%,FF,FA,FF%,...,CF%_rolling,CF%_diff_rolling,FF%_rolling,FF%_diff_rolling,HDCF%_rolling,HDCF%_diff_rolling,SCF%_rolling,SCF_diff_rolling,GF%_rolling,xGF%_rolling
0,Anaheim Ducks,2021-10-19,0,60:00,63,61,50.81,46,47,49.46,...,39.750000,-20.500000,41.586667,-16.826667,43.136667,-13.726667,37.813333,-13.666667,57.776667,41.020000
1,Anaheim Ducks,2021-10-21,0,60:00,68,50,57.63,50,38,56.82,...,44.423333,-11.153333,44.493333,-11.013333,31.056667,-37.886667,38.960000,-13.000000,46.260000,35.170000
2,Anaheim Ducks,2021-10-23,0,64:47,51,63,44.74,32,43,42.67,...,48.986667,-2.026667,49.516667,-0.966667,42.586667,-14.826667,44.060000,-7.333333,40.706667,38.920000
3,Anaheim Ducks,2021-10-26,0,60:00,62,53,53.91,51,39,56.67,...,51.060000,2.120000,49.650000,-0.700000,40.190000,-19.620000,46.070000,-4.333333,34.993333,37.603333
4,Anaheim Ducks,2021-10-28,0,64:16,56,53,51.38,47,42,52.81,...,52.093333,4.186667,52.053333,4.106667,49.606667,-0.786667,49.943333,0.666667,34.130000,45.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9865,Winnipeg Jets,2025-04-07,1,60:00,64,40,61.54,43,21,67.19,...,50.793333,1.586667,49.960000,-0.080000,56.196667,12.393333,52.853333,3.333333,46.666667,52.583333
9866,Winnipeg Jets,2025-04-10,1,60:00,60,59,50.42,47,43,52.22,...,54.330000,8.660000,55.183333,10.366667,69.290000,38.580000,56.283333,6.333333,65.000000,60.800000
9867,Winnipeg Jets,2025-04-12,0,65:00,76,49,60.80,59,42,58.42,...,56.480000,12.960000,58.596667,17.193333,66.960000,33.920000,60.673333,10.666667,65.000000,61.426667
9868,Winnipeg Jets,2025-04-13,0,60:00,41,75,35.34,31,57,35.23,...,57.586667,15.173333,59.276667,18.553333,66.736667,33.473333,62.636667,13.333333,75.000000,61.123333


In [13]:
data.drop(columns=['Result_y'], inplace=True)

In [14]:
data

Unnamed: 0,Team,Date,Result,TOI,CF,CA,CF%,FF,FA,FF%,...,CF%_rolling,CF%_diff_rolling,FF%_rolling,FF%_diff_rolling,HDCF%_rolling,HDCF%_diff_rolling,SCF%_rolling,SCF_diff_rolling,GF%_rolling,xGF%_rolling
0,Anaheim Ducks,2021-10-19,0,60:00,63,61,50.81,46,47,49.46,...,39.750000,-20.500000,41.586667,-16.826667,43.136667,-13.726667,37.813333,-13.666667,57.776667,41.020000
1,Anaheim Ducks,2021-10-21,0,60:00,68,50,57.63,50,38,56.82,...,44.423333,-11.153333,44.493333,-11.013333,31.056667,-37.886667,38.960000,-13.000000,46.260000,35.170000
2,Anaheim Ducks,2021-10-23,0,64:47,51,63,44.74,32,43,42.67,...,48.986667,-2.026667,49.516667,-0.966667,42.586667,-14.826667,44.060000,-7.333333,40.706667,38.920000
3,Anaheim Ducks,2021-10-26,0,60:00,62,53,53.91,51,39,56.67,...,51.060000,2.120000,49.650000,-0.700000,40.190000,-19.620000,46.070000,-4.333333,34.993333,37.603333
4,Anaheim Ducks,2021-10-28,0,64:16,56,53,51.38,47,42,52.81,...,52.093333,4.186667,52.053333,4.106667,49.606667,-0.786667,49.943333,0.666667,34.130000,45.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9865,Winnipeg Jets,2025-04-07,1,60:00,64,40,61.54,43,21,67.19,...,50.793333,1.586667,49.960000,-0.080000,56.196667,12.393333,52.853333,3.333333,46.666667,52.583333
9866,Winnipeg Jets,2025-04-10,1,60:00,60,59,50.42,47,43,52.22,...,54.330000,8.660000,55.183333,10.366667,69.290000,38.580000,56.283333,6.333333,65.000000,60.800000
9867,Winnipeg Jets,2025-04-12,0,65:00,76,49,60.80,59,42,58.42,...,56.480000,12.960000,58.596667,17.193333,66.960000,33.920000,60.673333,10.666667,65.000000,61.426667
9868,Winnipeg Jets,2025-04-13,0,60:00,41,75,35.34,31,57,35.23,...,57.586667,15.173333,59.276667,18.553333,66.736667,33.473333,62.636667,13.333333,75.000000,61.123333


In [15]:
# Try scaling features

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(data[predictors])
data[predictors] = scaler.transform(data[predictors])
data

Unnamed: 0,Team,Date,Result,TOI,CF,CA,CF%,FF,FA,FF%,...,CF%_rolling,CF%_diff_rolling,FF%_rolling,FF%_diff_rolling,HDCF%_rolling,HDCF%_diff_rolling,SCF%_rolling,SCF_diff_rolling,GF%_rolling,xGF%_rolling
0,Anaheim Ducks,2021-10-19,0,60:00,63,61,50.81,46,47,49.46,...,-1.863736,-1.863691,-1.518873,-1.518828,-0.809523,-0.809348,-1.838340,-1.775814,0.535401,-1.247155
1,Anaheim Ducks,2021-10-21,0,60:00,68,50,57.63,50,38,56.82,...,-1.014220,-1.014131,-0.994029,-0.993943,-2.233918,-2.233742,-1.665393,-1.689200,-0.256436,-2.059763
2,Anaheim Ducks,2021-10-23,0,64:47,51,63,44.74,32,43,42.67,...,-0.184699,-0.184569,-0.086989,-0.086831,-0.874375,-0.874200,-0.896181,-0.952979,-0.638259,-1.538860
3,Anaheim Ducks,2021-10-26,0,60:00,62,53,53.91,51,39,56.67,...,0.192190,0.192340,-0.062914,-0.062754,-1.156975,-1.156800,-0.593021,-0.563216,-1.031083,-1.721755
4,Anaheim Ducks,2021-10-28,0,64:16,56,53,51.38,47,42,52.81,...,0.380029,0.380188,0.371045,0.371239,-0.046622,-0.046448,-0.008822,0.086390,-1.090442,-0.648001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9865,Winnipeg Jets,2025-04-07,1,60:00,64,40,61.54,43,21,67.19,...,0.143716,0.143863,-0.006938,-0.006774,0.730428,0.730602,0.430082,0.432847,-0.228475,0.359077
9866,Winnipeg Jets,2025-04-10,1,60:00,60,59,50.42,47,43,52.22,...,0.786609,0.786789,0.936215,0.936454,2.274310,2.274482,0.947415,0.822611,1.032046,1.500432
9867,Winnipeg Jets,2025-04-12,0,65:00,76,49,60.80,59,42,58.42,...,1.177435,1.177635,1.552545,1.552832,1.999571,1.999743,1.609541,1.385603,1.032046,1.587481
9868,Winnipeg Jets,2025-04-13,0,60:00,41,75,35.34,31,57,35.23,...,1.378604,1.378815,1.675330,1.675627,1.973237,1.973409,1.905662,1.732059,1.719603,1.545345


In [16]:
#####
# 1. Initialize Elo ratings
initial_elo = 1500
teams = data['Team'].unique()
elo_ratings = {team: initial_elo for team in teams}

elo_features = []

# 2. Loop through each game and update ratings
for idx, row in data.iterrows():
    team = row['Team']
    opponent = row['Opponent']
    result = row['Result']  # 1 if win, 0 if loss

    # Optional: home-ice advantage
    team_elo = elo_ratings[team]
    opponent_elo = elo_ratings[opponent]

    # Store Elo features BEFORE the game
    elo_features.append({
        'team_elo': team_elo,
        'opponent_elo': opponent_elo,
        'elo_diff': team_elo - opponent_elo
    })

    # Calculate expected outcome
    expected_win = 1 / (1 + 10 ** ((opponent_elo - team_elo) / 400))

    # Elo update (K-factor can be tuned)
    #k = 40

    k = 30
    change = k * (result - expected_win)
    elo_ratings[team] += change
    elo_ratings[opponent] -= change

# Convert Elo features to DataFrame
elo_df = pd.DataFrame(elo_features)

# Merge with combined_team_view
dataset = pd.concat([data.reset_index(drop=True), elo_df], axis=1)

####
new = ['team_elo', 'opponent_elo', 'elo_diff']
predictors = predictors + new

In [17]:
# Computation for scale_pos_weight
class_counts = data['Result'].value_counts()

count_class_0 = class_counts[0]
count_class_1 = class_counts[1]

print(f"Losses (0): {count_class_0}")
print(f"Wins   (1): {count_class_1}")

scale = count_class_0 / count_class_1

Losses (0): 5285
Wins   (1): 4585


In [18]:
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression

model = XGBClassifier(scale_pos_weight = scale, random_state=10)

# Defining Time Series Split
TSS = TimeSeriesSplit(n_splits=10)

test_model = RandomForestClassifier(random_state=10)

lin = BaggingClassifier(LogisticRegression(random_state=10, solver='liblinear', penalty='l2', max_iter=1000))


In [19]:
from sklearn.metrics import precision_score

# Function to make predictions given the data, input features and chosen model

def make_predictions(data, predictors, model):
    train = data[data['Date'] < '2024-04-19']
    #train = train[train['Date'] > '2022-10-06']
    test = data[data['Date'] > '2024-04-19']
    model.fit(train[predictors], train['Result'])
    preds = model.predict(test[predictors])
    combined  = pd.DataFrame(dict(actual=test['Result'], prediction = preds), index=test.index)
    precision = precision_score(test['Result'], preds)
    return combined, precision

In [20]:
# Defining search space for GridSearchCV
search_grid = {
    'n_estimators': [50, 75, 100, 150],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.03, 0.05],
    'reg_alpha': [1, 5, 10],
    'reg_lambda': [1, 5, 10]

}

alt_search_grid = {
    'n_estimators' : [50, 100, 200, 500],
    'max_depth' : [3, 6, 9],
    'min_samples_split': [3, 5, 10]
}

lin_search_grid = {
    # Logistic Regression hyperparameters (base_estimator__)
    'estimator__C' : [0.5, 0.8, 1.0],
    'n_estimators': [3, 5, 10, 50, 100],
}

GS = GridSearchCV(
    estimator = test_model,
    param_grid = alt_search_grid,
    scoring = 'accuracy', #neg_log_loss',
    refit = True,
    cv = TSS,
    verbose= 4
)

training = dataset[dataset['Date'] < '2024-04-19']  # Training using 2021-2024 data
#training = training[training['Date'] > '2022-10-06']
testing = dataset[dataset['Date'] > '2024-04-19']   # Testing on most recent season (2024-2025)
print(training.columns[training.columns.str.contains('Result')])

Index(['Result'], dtype='object')


In [21]:
testing

Unnamed: 0,Team,Date,Result,TOI,CF,CA,CF%,FF,FA,FF%,...,FF%_diff_rolling,HDCF%_rolling,HDCF%_diff_rolling,SCF%_rolling,SCF_diff_rolling,GF%_rolling,xGF%_rolling,team_elo,opponent_elo,elo_diff
234,Anaheim Ducks,2024-10-12,1,60:00,52,65,44.44,41,46,47.13,...,-1.281065,-1.641206,-1.641031,-2.425556,-2.122270,-0.266749,-2.166258,1358.992011,1429.237499,-70.245488
235,Anaheim Ducks,2024-10-13,0,60:00,62,71,46.62,39,55,41.49,...,-0.952410,-1.572423,-1.572248,-2.202333,-1.775814,1.261232,-1.374024,1376.984093,1478.639022,-101.654929
236,Anaheim Ducks,2024-10-16,1,60:54,67,67,50.00,46,46,50.00,...,-1.238930,-1.965468,-1.965293,-1.992183,-1.732507,1.261232,-1.215669,1366.251797,1500.000000,-133.748203
237,Anaheim Ducks,2024-10-18,0,64:19,58,99,36.94,38,68,35.85,...,-0.684550,-1.188811,-1.188636,-0.802166,-0.823058,0.701102,-0.228038,1386.756882,1551.594064,-164.837181
238,Anaheim Ducks,2024-10-20,0,60:00,37,72,33.94,26,53,32.91,...,-1.363530,-1.436430,-1.436254,-0.694074,-0.952979,-0.608465,-1.077688,1378.383573,1554.894615,-176.511042
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9865,Winnipeg Jets,2025-04-07,1,60:00,64,40,61.54,43,21,67.19,...,-0.006774,0.730428,0.730602,0.430082,0.432847,-0.228475,0.359077,1600.138532,1488.894445,111.244087
9866,Winnipeg Jets,2025-04-10,1,60:00,60,59,50.42,47,43,52.22,...,0.936454,2.274310,2.274482,0.947415,0.822611,1.032046,1.500432,1610.493403,1588.241474,22.251929
9867,Winnipeg Jets,2025-04-12,0,65:00,76,49,60.80,59,42,58.42,...,1.552832,1.999571,1.999743,1.609541,1.385603,1.032046,1.587481,1624.534021,1325.523635,299.010386
9868,Winnipeg Jets,2025-04-13,0,60:00,41,75,35.34,31,57,35.23,...,1.675627,1.973237,1.973409,1.905662,1.732059,1.719603,1.545345,1599.085358,1527.939768,71.145591


In [22]:
predictors

['SF%_rolling',
 'PDO_rolling',
 'PDO_diff_rolling',
 'SV%_rolling',
 'CF%_rolling',
 'CF%_diff_rolling',
 'FF%_rolling',
 'FF%_diff_rolling',
 'HDCF%_rolling',
 'HDCF%_diff_rolling',
 'SCF%_rolling',
 'SCF_diff_rolling',
 'GF%_rolling',
 'xGF%_rolling',
 'team_elo',
 'opponent_elo',
 'elo_diff']

In [23]:
dataset

Unnamed: 0,Team,Date,Result,TOI,CF,CA,CF%,FF,FA,FF%,...,FF%_diff_rolling,HDCF%_rolling,HDCF%_diff_rolling,SCF%_rolling,SCF_diff_rolling,GF%_rolling,xGF%_rolling,team_elo,opponent_elo,elo_diff
0,Anaheim Ducks,2021-10-19,0,60:00,63,61,50.81,46,47,49.46,...,-1.518828,-0.809523,-0.809348,-1.838340,-1.775814,0.535401,-1.247155,1500.000000,1500.000000,0.000000
1,Anaheim Ducks,2021-10-21,0,60:00,68,50,57.63,50,38,56.82,...,-0.993943,-2.233918,-2.233742,-1.665393,-1.689200,-0.256436,-2.059763,1485.000000,1500.000000,-15.000000
2,Anaheim Ducks,2021-10-23,0,64:47,51,63,44.74,32,43,42.67,...,-0.086831,-0.874375,-0.874200,-0.896181,-0.952979,-0.638259,-1.538860,1470.647200,1500.000000,-29.352800
3,Anaheim Ducks,2021-10-26,0,60:00,62,53,53.91,51,39,56.67,...,-0.062754,-1.156975,-1.156800,-0.593021,-0.563216,-1.031083,-1.721755,1456.911456,1514.352800,-57.441344
4,Anaheim Ducks,2021-10-28,0,64:16,56,53,51.38,47,42,52.81,...,0.371239,-0.046622,-0.046448,-0.008822,0.086390,-1.090442,-0.648001,1444.369047,1500.000000,-55.630953
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9865,Winnipeg Jets,2025-04-07,1,60:00,64,40,61.54,43,21,67.19,...,-0.006774,0.730428,0.730602,0.430082,0.432847,-0.228475,0.359077,1600.138532,1488.894445,111.244087
9866,Winnipeg Jets,2025-04-10,1,60:00,60,59,50.42,47,43,52.22,...,0.936454,2.274310,2.274482,0.947415,0.822611,1.032046,1.500432,1610.493403,1588.241474,22.251929
9867,Winnipeg Jets,2025-04-12,0,65:00,76,49,60.80,59,42,58.42,...,1.552832,1.999571,1.999743,1.609541,1.385603,1.032046,1.587481,1624.534021,1325.523635,299.010386
9868,Winnipeg Jets,2025-04-13,0,60:00,41,75,35.34,31,57,35.23,...,1.675627,1.973237,1.973409,1.905662,1.732059,1.719603,1.545345,1599.085358,1527.939768,71.145591


In [24]:
print(type(training['Result']))         # should be <class 'pandas.Series'>
print(training['Result'].shape)        # should be (n_samples,)

<class 'pandas.core.series.Series'>
(7255,)


In [25]:
GS.fit(training[predictors], training['Result'])     # Training

Fitting 10 folds for each of 36 candidates, totalling 360 fits
[CV 1/10] END max_depth=3, min_samples_split=3, n_estimators=50;, score=0.578 total time=   0.0s
[CV 2/10] END max_depth=3, min_samples_split=3, n_estimators=50;, score=0.619 total time=   0.0s
[CV 3/10] END max_depth=3, min_samples_split=3, n_estimators=50;, score=0.595 total time=   0.0s
[CV 4/10] END max_depth=3, min_samples_split=3, n_estimators=50;, score=0.574 total time=   0.0s
[CV 5/10] END max_depth=3, min_samples_split=3, n_estimators=50;, score=0.604 total time=   0.0s
[CV 6/10] END max_depth=3, min_samples_split=3, n_estimators=50;, score=0.563 total time=   0.0s
[CV 7/10] END max_depth=3, min_samples_split=3, n_estimators=50;, score=0.608 total time=   0.1s
[CV 8/10] END max_depth=3, min_samples_split=3, n_estimators=50;, score=0.613 total time=   0.1s
[CV 9/10] END max_depth=3, min_samples_split=3, n_estimators=50;, score=0.558 total time=   0.1s
[CV 10/10] END max_depth=3, min_samples_split=3, n_estimators=50

0,1,2
,estimator,RandomForestC...ndom_state=10)
,param_grid,"{'max_depth': [3, 6, ...], 'min_samples_split': [3, 5, ...], 'n_estimators': [50, 100, ...]}"
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,TimeSeriesSpl...est_size=None)
,verbose,4
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,6
,min_samples_split,3
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [26]:
GS.best_score_

np.float64(0.5899848254931715)

In [27]:
new_model = GS.best_estimator_
new_model

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,6
,min_samples_split,3
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [28]:
combined, precision = make_predictions(dataset, predictors, new_model)
precision

0.5616717635066258

In [29]:
from sklearn.metrics import classification_report, roc_auc_score, log_loss

predictions = new_model.predict(testing[predictors])

print(classification_report(testing['Result'], predictions))

              precision    recall  f1-score   support

           0       0.59      0.69      0.63      1386
           1       0.56      0.45      0.50      1229

    accuracy                           0.58      2615
   macro avg       0.57      0.57      0.57      2615
weighted avg       0.57      0.58      0.57      2615



In [30]:
# Create DataFrame pairing features with their importances
importances = pd.DataFrame({
    'Feature': predictors,
    'Importance': new_model.feature_importances_
})

# Sort by importance
importances = importances.sort_values(by='Importance', ascending=False)

# Display top features
print(importances.head(10))

             Feature  Importance
16          elo_diff    0.195937
14          team_elo    0.131573
15      opponent_elo    0.125605
0        SF%_rolling    0.053048
10      SCF%_rolling    0.051483
13      xGF%_rolling    0.050909
11  SCF_diff_rolling    0.045060
6        FF%_rolling    0.040722
7   FF%_diff_rolling    0.039649
4        CF%_rolling    0.038149


In [31]:
print(dataset['Result'].unique())
print(training['Result'].value_counts())

[0 1]
Result
0    3899
1    3356
Name: count, dtype: int64


In [32]:
print(training[new_cols].info())
print(training[new_cols].isna().sum())
print(training[new_cols].describe())


<class 'pandas.core.frame.DataFrame'>
Index: 7255 entries, 0 to 9787
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   SF%_rolling         7255 non-null   float64
 1   PDO_rolling         7255 non-null   float64
 2   PDO_diff_rolling    7255 non-null   float64
 3   SV%_rolling         7255 non-null   float64
 4   CF%_rolling         7255 non-null   float64
 5   CF%_diff_rolling    7255 non-null   float64
 6   FF%_rolling         7255 non-null   float64
 7   FF%_diff_rolling    7255 non-null   float64
 8   HDCF%_rolling       7255 non-null   float64
 9   HDCF%_diff_rolling  7255 non-null   float64
 10  SCF%_rolling        7255 non-null   float64
 11  SCF_diff_rolling    7255 non-null   float64
 12  GF%_rolling         7255 non-null   float64
 13  xGF%_rolling        7255 non-null   float64
dtypes: float64(14)
memory usage: 850.2 KB
None
SF%_rolling           0
PDO_rolling           0
PDO_diff_rolling  

In [33]:
import numpy as np
# Check class balance after preprocessing
print(data['Result'].value_counts())

# Check if model predicted any 1s
print(np.unique(predictions, return_counts=True))

# Look at rolling feature distribution
print(training[new_cols].describe())

Result
0    5285
1    4585
Name: count, dtype: int64
(array([0, 1]), array([1634,  981]))
       SF%_rolling  PDO_rolling  PDO_diff_rolling  SV%_rolling  CF%_rolling  \
count  7255.000000  7255.000000       7255.000000  7255.000000  7255.000000   
mean     -0.000775    -0.000550         -0.000536     0.049561    -0.000679   
std       1.001218     0.977646          0.977637     0.977028     1.008017   
min      -3.392208    -3.724152         -3.724171    -4.326644    -3.122256   
25%      -0.694124    -0.675397         -0.675359    -0.596027    -0.697620   
50%       0.003813     0.006799          0.003832     0.099311     0.009805   
75%       0.692648     0.664847          0.664911     0.749143     0.694205   
max       3.312923     3.798122          3.795225     2.852449     3.456344   

       CF%_diff_rolling  FF%_rolling  FF%_diff_rolling  HDCF%_rolling  \
count       7255.000000  7255.000000       7255.000000    7255.000000   
mean          -0.000728    -0.000460         -0.0005

In [34]:
combined = combined.merge(dataset[['Date', 'Team', 'Opponent', 'Result']], left_index=True, right_index=True)
combined

Unnamed: 0,actual,prediction,Date,Team,Opponent,Result
234,1,0,2024-10-12,Anaheim Ducks,San Jose Sharks,1
235,0,0,2024-10-13,Anaheim Ducks,Vegas Golden Knights,0
236,1,0,2024-10-16,Anaheim Ducks,Utah Hockey Club,1
237,0,0,2024-10-18,Anaheim Ducks,Colorado Avalanche,0
238,0,0,2024-10-20,Anaheim Ducks,Los Angeles Kings,0
...,...,...,...,...,...,...
9865,1,1,2025-04-07,Winnipeg Jets,St Louis Blues,1
9866,1,1,2025-04-10,Winnipeg Jets,Dallas Stars,1
9867,0,1,2025-04-12,Winnipeg Jets,Chicago Blackhawks,0
9868,0,1,2025-04-13,Winnipeg Jets,Edmonton Oilers,0


In [35]:
final = combined.merge(combined, left_on=['Date', 'Team'], right_on=['Date', 'Opponent'])  # few games will drop due to rolling windows
final

Unnamed: 0,actual_x,prediction_x,Date,Team_x,Opponent_x,Result_x,actual_y,prediction_y,Team_y,Opponent_y,Result_y
0,1,0,2024-10-12,Anaheim Ducks,San Jose Sharks,1,0,0,San Jose Sharks,Anaheim Ducks,0
1,0,0,2024-10-13,Anaheim Ducks,Vegas Golden Knights,0,1,1,Vegas Golden Knights,Anaheim Ducks,1
2,1,0,2024-10-16,Anaheim Ducks,Utah Hockey Club,1,0,1,Utah Hockey Club,Anaheim Ducks,0
3,0,0,2024-10-18,Anaheim Ducks,Colorado Avalanche,0,1,1,Colorado Avalanche,Anaheim Ducks,1
4,0,0,2024-10-20,Anaheim Ducks,Los Angeles Kings,0,1,1,Los Angeles Kings,Anaheim Ducks,1
...,...,...,...,...,...,...,...,...,...,...,...
2601,1,1,2025-04-07,Winnipeg Jets,St Louis Blues,1,0,0,St Louis Blues,Winnipeg Jets,0
2602,1,1,2025-04-10,Winnipeg Jets,Dallas Stars,1,0,0,Dallas Stars,Winnipeg Jets,0
2603,0,1,2025-04-12,Winnipeg Jets,Chicago Blackhawks,0,0,0,Chicago Blackhawks,Winnipeg Jets,0
2604,0,1,2025-04-13,Winnipeg Jets,Edmonton Oilers,0,1,0,Edmonton Oilers,Winnipeg Jets,1


In [36]:
final[(final['prediction_x'] == 1) & (final['prediction_y'] == 0)]['actual_x'].value_counts()

actual_x
1    488
0    361
Name: count, dtype: int64

In [37]:
533 / (533 + 382)

0.5825136612021858

In [38]:
print(dataset['Result'].value_counts())

Result
0    5285
1    4585
Name: count, dtype: int64


In [39]:
def make_pred(data, predictors, model):
    train = data[data['Date'] < '2024-04-19']
    test = data[data['Date'] > '2024-04-19']

    model.fit(train[predictors], train['Result'])

    preds = model.predict(test[predictors])
    probs = model.predict_proba(test[predictors])[:, 1]  # Probability of class 1 (win)

    combined = pd.DataFrame({
        'actual': test['Result'],
        'prediction': preds,
        'win_probability': probs
    }, index=test.index)

    # Keep useful columns
    combined = pd.concat([combined, test[['Team', 'Opponent', 'Date']]], axis=1)

    precision = precision_score(test['Result'], preds)
    return combined, precision

In [40]:
combined, precision = make_pred(dataset, predictors, new_model)
combined

Unnamed: 0,actual,prediction,win_probability,Team,Opponent,Date
234,1,0,0.300098,Anaheim Ducks,San Jose Sharks,2024-10-12
235,0,0,0.307236,Anaheim Ducks,Vegas Golden Knights,2024-10-13
236,1,0,0.281756,Anaheim Ducks,Utah Hockey Club,2024-10-16
237,0,0,0.286994,Anaheim Ducks,Colorado Avalanche,2024-10-18
238,0,0,0.270019,Anaheim Ducks,Los Angeles Kings,2024-10-20
...,...,...,...,...,...,...
9865,1,1,0.547259,Winnipeg Jets,St Louis Blues,2025-04-07
9866,1,1,0.533147,Winnipeg Jets,Dallas Stars,2025-04-10
9867,0,1,0.620901,Winnipeg Jets,Chicago Blackhawks,2025-04-12
9868,0,1,0.549877,Winnipeg Jets,Edmonton Oilers,2025-04-13


In [41]:
# Merge team vs opponent predictions
paired = combined.merge(
    combined,
    left_on=['Date', 'Team'],
    right_on=['Date', 'Opponent'],
    suffixes=('_team', '_opp')
)

# Filter out same-team merges (shouldn't happen if data is clean)
paired = paired[paired['Team_team'] != paired['Team_opp']]
paired

Unnamed: 0,actual_team,prediction_team,win_probability_team,Team_team,Opponent_team,Date,actual_opp,prediction_opp,win_probability_opp,Team_opp,Opponent_opp
0,1,0,0.300098,Anaheim Ducks,San Jose Sharks,2024-10-12,0,0,0.265429,San Jose Sharks,Anaheim Ducks
1,0,0,0.307236,Anaheim Ducks,Vegas Golden Knights,2024-10-13,1,1,0.619329,Vegas Golden Knights,Anaheim Ducks
2,1,0,0.281756,Anaheim Ducks,Utah Hockey Club,2024-10-16,0,1,0.583297,Utah Hockey Club,Anaheim Ducks
3,0,0,0.286994,Anaheim Ducks,Colorado Avalanche,2024-10-18,1,1,0.665757,Colorado Avalanche,Anaheim Ducks
4,0,0,0.270019,Anaheim Ducks,Los Angeles Kings,2024-10-20,1,1,0.615926,Los Angeles Kings,Anaheim Ducks
...,...,...,...,...,...,...,...,...,...,...,...
2601,1,1,0.547259,Winnipeg Jets,St Louis Blues,2025-04-07,0,0,0.441375,St Louis Blues,Winnipeg Jets
2602,1,1,0.533147,Winnipeg Jets,Dallas Stars,2025-04-10,0,0,0.450971,Dallas Stars,Winnipeg Jets
2603,0,1,0.620901,Winnipeg Jets,Chicago Blackhawks,2025-04-12,0,0,0.325890,Chicago Blackhawks,Winnipeg Jets
2604,0,1,0.549877,Winnipeg Jets,Edmonton Oilers,2025-04-13,1,0,0.464486,Edmonton Oilers,Winnipeg Jets


In [42]:
#Filter only valid pairings
paired = paired[paired['Team_team'] != paired['Team_opp']]

# Choose team with higher probability to win
paired['predicted_winner'] = paired.apply(
    lambda row: row['Team_team'] if row['win_probability_team'] > row['win_probability_opp'] else row['Team_opp'],
    axis=1
)

# Determine actual winner from true result
paired['actual_winner'] = paired.apply(
    lambda row: row['Team_team'] if row['actual_team'] == 1 else row['Team_opp'],
    axis=1
)

paired

Unnamed: 0,actual_team,prediction_team,win_probability_team,Team_team,Opponent_team,Date,actual_opp,prediction_opp,win_probability_opp,Team_opp,Opponent_opp,predicted_winner,actual_winner
0,1,0,0.300098,Anaheim Ducks,San Jose Sharks,2024-10-12,0,0,0.265429,San Jose Sharks,Anaheim Ducks,Anaheim Ducks,Anaheim Ducks
1,0,0,0.307236,Anaheim Ducks,Vegas Golden Knights,2024-10-13,1,1,0.619329,Vegas Golden Knights,Anaheim Ducks,Vegas Golden Knights,Vegas Golden Knights
2,1,0,0.281756,Anaheim Ducks,Utah Hockey Club,2024-10-16,0,1,0.583297,Utah Hockey Club,Anaheim Ducks,Utah Hockey Club,Anaheim Ducks
3,0,0,0.286994,Anaheim Ducks,Colorado Avalanche,2024-10-18,1,1,0.665757,Colorado Avalanche,Anaheim Ducks,Colorado Avalanche,Colorado Avalanche
4,0,0,0.270019,Anaheim Ducks,Los Angeles Kings,2024-10-20,1,1,0.615926,Los Angeles Kings,Anaheim Ducks,Los Angeles Kings,Los Angeles Kings
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2601,1,1,0.547259,Winnipeg Jets,St Louis Blues,2025-04-07,0,0,0.441375,St Louis Blues,Winnipeg Jets,Winnipeg Jets,Winnipeg Jets
2602,1,1,0.533147,Winnipeg Jets,Dallas Stars,2025-04-10,0,0,0.450971,Dallas Stars,Winnipeg Jets,Winnipeg Jets,Winnipeg Jets
2603,0,1,0.620901,Winnipeg Jets,Chicago Blackhawks,2025-04-12,0,0,0.325890,Chicago Blackhawks,Winnipeg Jets,Winnipeg Jets,Chicago Blackhawks
2604,0,1,0.549877,Winnipeg Jets,Edmonton Oilers,2025-04-13,1,0,0.464486,Edmonton Oilers,Winnipeg Jets,Winnipeg Jets,Edmonton Oilers


In [43]:

# Evaluate how accurate our prediction was
paired['correct'] = paired['predicted_winner'] == paired['actual_winner']
accuracy = paired['correct'].mean()
print(f"Match-level accuracy: {accuracy:.3f}")

Match-level accuracy: 0.581
