In [1]:
import pandas as pd

data = pd.read_csv('newdata.csv', index_col=0)
data

Unnamed: 0,Team,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,...,LDGF%,LDSH%,LDSV%,SH%,SV%,PDO,Attendance,Opponent,Date,Result
0,Pittsburgh Penguins,60:00,58,55,51.33,47,40,54.02,35,28,...,100.00,11.11,100.00,17.14,92.86,1.100,19092,Tampa Bay Lightning,2021-10-12,1
1,Tampa Bay Lightning,60:00,55,58,48.67,40,47,45.98,28,35,...,0.00,0.00,88.89,7.14,82.86,0.900,19092,Pittsburgh Penguins,2021-10-12,0
2,Seattle Kraken,60:00,63,58,52.07,44,47,48.35,31,30,...,100.00,6.67,100.00,9.68,86.67,0.963,18431,Vegas Golden Knights,2021-10-12,0
3,Vegas Golden Knights,60:00,58,63,47.93,47,44,51.65,30,31,...,0.00,0.00,93.33,13.33,90.32,1.037,18431,Seattle Kraken,2021-10-12,1
4,Montreal Canadiens,60:00,55,64,46.22,41,51,44.57,32,30,...,-,0.00,100.00,3.13,93.33,0.965,18493,Toronto Maple Leafs,2021-10-13,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10489,Tampa Bay Lightning,60:00,50,46,52.08,36,35,50.70,27,22,...,-,0.00,100.00,0.00,81.82,0.818,18006,New York Rangers,2025-04-17,0
10490,Pittsburgh Penguins,60:00,74,44,62.71,51,33,60.71,38,20,...,0.00,0.00,90.00,13.16,90.00,1.032,18348,Washington Capitals,2025-04-17,1
10491,Washington Capitals,60:00,44,74,37.29,33,51,39.29,20,38,...,100.00,10.00,100.00,10.00,86.84,0.968,18348,Pittsburgh Penguins,2025-04-17,0
10492,Columbus Blue Jackets,60:00,63,72,46.67,44,56,44.00,26,37,...,100.00,16.67,100.00,23.08,97.30,1.204,18874,New York Islanders,2025-04-17,1


In [2]:

def rolling_averages(team, cols, new_cols, window=5):
    team = team.sort_values("Date")    # Getting team data organized chronologically
    rolling = team[cols].rolling(window, closed='left').mean()   # closed=left to ignore current row in sliding window
    team[new_cols] = rolling
    team = team.dropna(subset=new_cols) # dropping first rows because not enough data
    return team


In [3]:
data_opp = data.drop(columns=['Opponent']).rename(columns={
    'Team': 'Opponent',
    'CF': 'Opponent_CF',
    'CA': 'Opponent_CA',
    'CF%': 'Opponent_CF%',
    'FF': 'Opponent_FF',
    'FA': 'Opponent_FA',
    'FF%': 'Opponent_FF%',
    'SF': 'Opponent_SF',
    'SA': 'Opponent_SA',
    'GF': 'Opponent_GF',
    'GA': 'Opponent_GA',
    'xGF': 'Opponent_xGF',
    'xGA': 'Opponent_xGA',
    'xGF%': 'Opponent_xGF%',
    'HDCF' : 'Opponent_HDCF',
    'HDCF%' : 'Opponent_HDCF%',
    'SCF' : 'Opponent_SCF',
    'PDO' : 'Opponent_PDO'
})


merged = data.merge(
    data_opp,
    left_on=['Date', 'Opponent'],
    right_on=['Date', 'Opponent'],
    how='inner',
    suffixes=('', '_y')  # Avoids conflicts if any columns aren’t renamed
)

In [4]:
merged

Unnamed: 0,Team,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,...,LDGF_y,LDGA_y,LDGF%_y,LDSH%_y,LDSV%_y,SH%_y,SV%_y,Opponent_PDO,Attendance_y,Result_y
0,Pittsburgh Penguins,60:00,58,55,51.33,47,40,54.02,35,28,...,0,1,0.00,0.00,88.89,7.14,82.86,0.900,19092,0
1,Tampa Bay Lightning,60:00,55,58,48.67,40,47,45.98,28,35,...,1,0,100.00,11.11,100.00,17.14,92.86,1.100,19092,1
2,Seattle Kraken,60:00,63,58,52.07,44,47,48.35,31,30,...,0,1,0.00,0.00,93.33,13.33,90.32,1.037,18431,1
3,Vegas Golden Knights,60:00,58,63,47.93,47,44,51.65,30,31,...,1,0,100.00,6.67,100.00,9.68,86.67,0.963,18431,0
4,Montreal Canadiens,60:00,55,64,46.22,41,51,44.57,32,30,...,0,0,-,0.00,100.00,6.67,96.88,1.035,18493,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10489,Tampa Bay Lightning,60:00,50,46,52.08,36,35,50.70,27,22,...,0,0,-,0.00,100.00,18.18,100.00,1.182,18006,1
10490,Pittsburgh Penguins,60:00,74,44,62.71,51,33,60.71,38,20,...,1,0,100.00,10.00,100.00,10.00,86.84,0.968,18348,0
10491,Washington Capitals,60:00,44,74,37.29,33,51,39.29,20,38,...,0,1,0.00,0.00,90.00,13.16,90.00,1.032,18348,1
10492,Columbus Blue Jackets,60:00,63,72,46.67,44,56,44.00,26,37,...,0,2,0.00,0.00,83.33,2.70,76.92,0.796,18874,0


In [5]:
merged['CF_diff'] = merged['CF'] - merged['Opponent_CF']
merged['CF%_diff'] = merged['CF%'] - merged['Opponent_CF%']
merged['GF_diff'] = merged['GF'] - merged['Opponent_GF']
merged['xGF_diff'] = merged['xGF'] - merged['Opponent_xGF']
merged['HDCF_diff'] = merged['HDCF'] - merged['Opponent_HDCF']
merged['HDCF%_diff'] = merged['HDCF%'] - merged['Opponent_HDCF%']
merged['FF_diff'] = merged['FF'] - merged['Opponent_FF']
merged['FF%_diff'] = merged['FF%'] - merged['Opponent_FF%']
merged['SCF_diff'] = merged['SCF'] - merged['Opponent_SCF']
merged['PDO_diff'] = merged['PDO'] - merged['Opponent_PDO']


In [6]:
merged

Unnamed: 0,Team,TOI,CF,CA,CF%,FF,FA,FF%,SF,SA,...,CF_diff,CF%_diff,GF_diff,xGF_diff,HDCF_diff,HDCF%_diff,FF_diff,FF%_diff,SCF_diff,PDO_diff
0,Pittsburgh Penguins,60:00,58,55,51.33,47,40,54.02,35,28,...,3,2.66,4,2.10,4,20.00,7,8.04,11,0.200
1,Tampa Bay Lightning,60:00,55,58,48.67,40,47,45.98,28,35,...,-3,-2.66,-4,-2.10,-4,-20.00,-7,-8.04,-11,-0.200
2,Seattle Kraken,60:00,63,58,52.07,44,47,48.35,31,30,...,5,4.14,-1,-1.17,1,4.00,-3,-3.30,0,-0.074
3,Vegas Golden Knights,60:00,58,63,47.93,47,44,51.65,30,31,...,-5,-4.14,1,1.17,-1,-4.00,3,3.30,0,0.074
4,Montreal Canadiens,60:00,55,64,46.22,41,51,44.57,32,30,...,-9,-7.56,-1,-1.21,-8,-33.34,-10,-10.86,-16,-0.070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10489,Tampa Bay Lightning,60:00,50,46,52.08,36,35,50.70,27,22,...,4,4.16,-4,-0.05,2,12.50,1,1.40,6,-0.364
10490,Pittsburgh Penguins,60:00,74,44,62.71,51,33,60.71,38,20,...,30,25.42,3,0.83,3,13.04,18,21.42,19,0.064
10491,Washington Capitals,60:00,44,74,37.29,33,51,39.29,20,38,...,-30,-25.42,-3,-0.83,-3,-13.04,-18,-21.42,-19,-0.064
10492,Columbus Blue Jackets,60:00,63,72,46.67,44,56,44.00,26,37,...,-9,-6.66,5,-0.11,2,9.10,-12,-12.00,-8,0.408


In [7]:
merged['GF%'] = pd.to_numeric(merged['GF%'], errors='coerce')
merged['xGF%'] = pd.to_numeric(merged['xGF%'], errors='coerce')

print(merged[['GF%', 'xGF%']].dtypes)

GF%     float64
xGF%    float64
dtype: object


In [8]:
feature = [
    'CF%', 'FF%', 'SF%', 'xGF%', 'SCF%', 'HDCF%', 'GF%', 'SH%', 'SV%', 'HDCA', 'xGA', 'PDO'
]
feature += ['CF', 'CA', 'FF', 'FA', 'SF', 'GA', 'GF', 'SCA', 'SCF', 'HDCF']
feature_diff = ['CF_diff', 'CF%_diff', 'GF_diff', 'xGF_diff', 'HDCF_diff', 'HDCF%_diff', 'FF_diff', 'FF%_diff', 'SCF_diff', 'PDO_diff']

features = feature + feature_diff

features = ['SF%', 'PDO', 'PDO_diff', 'SV%', 'CF%', 'CF%_diff', 'FF%', 'FF%_diff', 'HDCF%', 'HDCF%_diff', 'SCF%', 'SCF_diff', 'GF%', 'xGF%']

features = ['SCF%', 'xGF%', 'CF%_diff']

#all_predictors = ['CF%', 'SF%', 'xGF%', 'SV%']

new_cols = [f'{c}_rolling' for c in features]

predictors = new_cols




In [9]:
print(merged[features].dtypes)

SCF%        float64
xGF%        float64
CF%_diff    float64
dtype: object


In [10]:
# Adjusting columns order for debugging / clarity
columns = list(merged.columns)

columns.remove('Date')
columns.remove('Result')

columns.insert(1, 'Date')
columns.insert(2, 'Result')

merged = merged[columns]
merged

Unnamed: 0,Team,Date,Result,TOI,CF,CA,CF%,FF,FA,FF%,...,CF_diff,CF%_diff,GF_diff,xGF_diff,HDCF_diff,HDCF%_diff,FF_diff,FF%_diff,SCF_diff,PDO_diff
0,Pittsburgh Penguins,2021-10-12,1,60:00,58,55,51.33,47,40,54.02,...,3,2.66,4,2.10,4,20.00,7,8.04,11,0.200
1,Tampa Bay Lightning,2021-10-12,0,60:00,55,58,48.67,40,47,45.98,...,-3,-2.66,-4,-2.10,-4,-20.00,-7,-8.04,-11,-0.200
2,Seattle Kraken,2021-10-12,0,60:00,63,58,52.07,44,47,48.35,...,5,4.14,-1,-1.17,1,4.00,-3,-3.30,0,-0.074
3,Vegas Golden Knights,2021-10-12,1,60:00,58,63,47.93,47,44,51.65,...,-5,-4.14,1,1.17,-1,-4.00,3,3.30,0,0.074
4,Montreal Canadiens,2021-10-13,0,60:00,55,64,46.22,41,51,44.57,...,-9,-7.56,-1,-1.21,-8,-33.34,-10,-10.86,-16,-0.070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10489,Tampa Bay Lightning,2025-04-17,0,60:00,50,46,52.08,36,35,50.70,...,4,4.16,-4,-0.05,2,12.50,1,1.40,6,-0.364
10490,Pittsburgh Penguins,2025-04-17,1,60:00,74,44,62.71,51,33,60.71,...,30,25.42,3,0.83,3,13.04,18,21.42,19,0.064
10491,Washington Capitals,2025-04-17,0,60:00,44,74,37.29,33,51,39.29,...,-30,-25.42,-3,-0.83,-3,-13.04,-18,-21.42,-19,-0.064
10492,Columbus Blue Jackets,2025-04-17,1,60:00,63,72,46.67,44,56,44.00,...,-9,-6.66,5,-0.11,2,9.10,-12,-12.00,-8,0.408


In [11]:
merged = merged[merged['Team'] != 'Arizona Coyotes']
merged = merged[merged['Opponent'] != 'Arizona Coyotes']
merged

Unnamed: 0,Team,Date,Result,TOI,CF,CA,CF%,FF,FA,FF%,...,CF_diff,CF%_diff,GF_diff,xGF_diff,HDCF_diff,HDCF%_diff,FF_diff,FF%_diff,SCF_diff,PDO_diff
0,Pittsburgh Penguins,2021-10-12,1,60:00,58,55,51.33,47,40,54.02,...,3,2.66,4,2.10,4,20.00,7,8.04,11,0.200
1,Tampa Bay Lightning,2021-10-12,0,60:00,55,58,48.67,40,47,45.98,...,-3,-2.66,-4,-2.10,-4,-20.00,-7,-8.04,-11,-0.200
2,Seattle Kraken,2021-10-12,0,60:00,63,58,52.07,44,47,48.35,...,5,4.14,-1,-1.17,1,4.00,-3,-3.30,0,-0.074
3,Vegas Golden Knights,2021-10-12,1,60:00,58,63,47.93,47,44,51.65,...,-5,-4.14,1,1.17,-1,-4.00,3,3.30,0,0.074
4,Montreal Canadiens,2021-10-13,0,60:00,55,64,46.22,41,51,44.57,...,-9,-7.56,-1,-1.21,-8,-33.34,-10,-10.86,-16,-0.070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10489,Tampa Bay Lightning,2025-04-17,0,60:00,50,46,52.08,36,35,50.70,...,4,4.16,-4,-0.05,2,12.50,1,1.40,6,-0.364
10490,Pittsburgh Penguins,2025-04-17,1,60:00,74,44,62.71,51,33,60.71,...,30,25.42,3,0.83,3,13.04,18,21.42,19,0.064
10491,Washington Capitals,2025-04-17,0,60:00,44,74,37.29,33,51,39.29,...,-30,-25.42,-3,-0.83,-3,-13.04,-18,-21.42,-19,-0.064
10492,Columbus Blue Jackets,2025-04-17,1,60:00,63,72,46.67,44,56,44.00,...,-9,-6.66,5,-0.11,2,9.10,-12,-12.00,-8,0.408


In [12]:
merged['Date'] = pd.to_datetime(merged['Date'])

data = merged.groupby('Team').apply(lambda x: rolling_averages(x, features, new_cols, 3))
data = data.droplevel('Team')
data.index = range(data.shape[0])
data

  data = merged.groupby('Team').apply(lambda x: rolling_averages(x, features, new_cols, 3))


Unnamed: 0,Team,Date,Result,TOI,CF,CA,CF%,FF,FA,FF%,...,xGF_diff,HDCF_diff,HDCF%_diff,FF_diff,FF%_diff,SCF_diff,PDO_diff,SCF%_rolling,xGF%_rolling,CF%_diff_rolling
0,Anaheim Ducks,2021-10-19,0,60:00,63,61,50.81,46,47,49.46,...,-2.49,-12,-46.16,-1,-1.08,-7,-0.046,37.813333,41.020000,-20.500000
1,Anaheim Ducks,2021-10-21,0,60:00,68,50,57.63,50,38,56.82,...,0.03,9,42.86,12,13.64,8,-0.320,38.960000,35.170000,-11.153333
2,Anaheim Ducks,2021-10-23,0,64:47,51,63,44.74,32,43,42.67,...,-1.93,-10,-55.56,-11,-14.66,-14,0.028,44.060000,38.920000,-2.026667
3,Anaheim Ducks,2021-10-26,0,60:00,62,53,53.91,51,39,56.67,...,0.75,3,10.34,12,13.34,8,-0.148,46.070000,37.603333,2.120000
4,Anaheim Ducks,2021-10-28,0,64:16,56,53,51.38,47,42,52.81,...,0.41,9,36.00,5,5.62,3,-0.114,49.943333,45.333333,4.186667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9895,Winnipeg Jets,2025-04-07,1,60:00,64,40,61.54,43,21,67.19,...,3.02,13,61.90,22,34.38,13,0.098,52.853333,52.583333,1.586667
9896,Winnipeg Jets,2025-04-10,1,60:00,60,59,50.42,47,43,52.22,...,0.23,2,9.10,4,4.44,9,0.228,56.283333,60.800000,8.660000
9897,Winnipeg Jets,2025-04-12,0,65:00,76,49,60.80,59,42,58.42,...,0.68,10,29.42,17,16.84,18,-0.060,60.673333,61.426667,12.960000
9898,Winnipeg Jets,2025-04-13,0,60:00,41,75,35.34,31,57,35.23,...,-2.37,-7,-30.44,-26,-29.54,-14,-0.094,62.636667,61.123333,15.173333


In [13]:
data.drop(columns=['Result_y'], inplace=True)

In [14]:
data

Unnamed: 0,Team,Date,Result,TOI,CF,CA,CF%,FF,FA,FF%,...,xGF_diff,HDCF_diff,HDCF%_diff,FF_diff,FF%_diff,SCF_diff,PDO_diff,SCF%_rolling,xGF%_rolling,CF%_diff_rolling
0,Anaheim Ducks,2021-10-19,0,60:00,63,61,50.81,46,47,49.46,...,-2.49,-12,-46.16,-1,-1.08,-7,-0.046,37.813333,41.020000,-20.500000
1,Anaheim Ducks,2021-10-21,0,60:00,68,50,57.63,50,38,56.82,...,0.03,9,42.86,12,13.64,8,-0.320,38.960000,35.170000,-11.153333
2,Anaheim Ducks,2021-10-23,0,64:47,51,63,44.74,32,43,42.67,...,-1.93,-10,-55.56,-11,-14.66,-14,0.028,44.060000,38.920000,-2.026667
3,Anaheim Ducks,2021-10-26,0,60:00,62,53,53.91,51,39,56.67,...,0.75,3,10.34,12,13.34,8,-0.148,46.070000,37.603333,2.120000
4,Anaheim Ducks,2021-10-28,0,64:16,56,53,51.38,47,42,52.81,...,0.41,9,36.00,5,5.62,3,-0.114,49.943333,45.333333,4.186667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9895,Winnipeg Jets,2025-04-07,1,60:00,64,40,61.54,43,21,67.19,...,3.02,13,61.90,22,34.38,13,0.098,52.853333,52.583333,1.586667
9896,Winnipeg Jets,2025-04-10,1,60:00,60,59,50.42,47,43,52.22,...,0.23,2,9.10,4,4.44,9,0.228,56.283333,60.800000,8.660000
9897,Winnipeg Jets,2025-04-12,0,65:00,76,49,60.80,59,42,58.42,...,0.68,10,29.42,17,16.84,18,-0.060,60.673333,61.426667,12.960000
9898,Winnipeg Jets,2025-04-13,0,60:00,41,75,35.34,31,57,35.23,...,-2.37,-7,-30.44,-26,-29.54,-14,-0.094,62.636667,61.123333,15.173333


In [15]:
#####
# 1. Initialize Elo ratings
initial_elo = 1500
teams = data['Team'].unique()
elo_ratings = {team: initial_elo for team in teams}

elo_features = []

# 2. Loop through each game and update ratings
for idx, row in data.iterrows():
    team = row['Team']
    opponent = row['Opponent']
    result = row['Result']  # 1 if win, 0 if loss

    # Optional: home-ice advantage
    team_elo = elo_ratings[team]
    opponent_elo = elo_ratings[opponent]

    # Store Elo features BEFORE the game
    elo_features.append({
        'team_elo': team_elo,
        'opponent_elo': opponent_elo,
        'elo_diff': team_elo - opponent_elo
    })

    # Calculate expected outcome
    expected_win = 1 / (1 + 10 ** ((opponent_elo - team_elo) / 400))

    # Elo update (K-factor can be tuned)
    #k = 40

    k = 30
    change = k * (result - expected_win)
    elo_ratings[team] += change
    elo_ratings[opponent] -= change

# Convert Elo features to DataFrame
elo_df = pd.DataFrame(elo_features)

# Merge with combined_team_view
dataset = pd.concat([data.reset_index(drop=True), elo_df], axis=1)

####
new = ['team_elo', 'opponent_elo', 'elo_diff']
predictors = predictors + new

In [16]:
# Computation for scale_pos_weight
class_counts = data['Result'].value_counts()

count_class_0 = class_counts[0]
count_class_1 = class_counts[1]

print(f"Losses (0): {count_class_0}")
print(f"Wins   (1): {count_class_1}")

scale = count_class_0 / count_class_1

Losses (0): 5298
Wins   (1): 4602


In [17]:
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression

model = XGBClassifier(scale_pos_weight = scale, random_state=10)

# Defining Time Series Split
TSS = TimeSeriesSplit(n_splits=5)

test_model = RandomForestClassifier(random_state=10)

lin = BaggingClassifier(LogisticRegression(random_state=10, solver='liblinear', penalty='l2', max_iter=1000))


In [18]:
from sklearn.metrics import precision_score

# Function to make predictions given the data, input features and chosen model

def make_predictions(data, predictors, model):
    train = data[data['Date'] < '2024-04-19']
    #train = train[train['Date'] > '2022-10-06']
    test = data[data['Date'] > '2024-04-19']
    model.fit(train[predictors], train['Result'])
    preds = model.predict(test[predictors])
    combined  = pd.DataFrame(dict(actual=test['Result'], prediction = preds), index=test.index)
    precision = precision_score(test['Result'], preds)
    return combined, precision

In [20]:
# Defining search space for GridSearchCV
search_grid = {
    'n_estimators': [50, 75, 100, 150],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.03, 0.05],
    'reg_alpha': [1, 5, 10],
    'reg_lambda': [1, 5, 10]

}

alt_search_grid = {
    'n_estimators' : [50, 100, 200, 500],
    'max_depth' : [3, 6, 9],
    'min_samples_split': [3, 5, 10]
}

lin_search_grid = {
    # Logistic Regression hyperparameters (base_estimator__)
    'estimator__C' : [0.5, 0.8, 1.0],
    'n_estimators': [3, 5, 10, 50, 100],
}

GS = GridSearchCV(
    estimator = lin,
    param_grid = lin_search_grid,
    scoring = 'neg_log_loss',
    refit = True,
    cv = TSS,
    verbose= 4
)

training = dataset[dataset['Date'] < '2024-04-19']  # Training using 2021-2024 data
#training = training[training['Date'] > '2022-10-06']
testing = dataset[dataset['Date'] > '2024-04-19']   # Testing on most recent season (2024-2025)
print(training.columns[training.columns.str.contains('Result')])

Index(['Result'], dtype='object')


In [21]:
predictors

['SCF%_rolling',
 'xGF%_rolling',
 'CF%_diff_rolling',
 'team_elo',
 'opponent_elo',
 'elo_diff']

In [22]:
dataset

Unnamed: 0,Team,Date,Result,TOI,CF,CA,CF%,FF,FA,FF%,...,FF_diff,FF%_diff,SCF_diff,PDO_diff,SCF%_rolling,xGF%_rolling,CF%_diff_rolling,team_elo,opponent_elo,elo_diff
0,Anaheim Ducks,2021-10-19,0,60:00,63,61,50.81,46,47,49.46,...,-1,-1.08,-7,-0.046,37.813333,41.020000,-20.500000,1500.000000,1500.000000,0.000000
1,Anaheim Ducks,2021-10-21,0,60:00,68,50,57.63,50,38,56.82,...,12,13.64,8,-0.320,38.960000,35.170000,-11.153333,1485.000000,1500.000000,-15.000000
2,Anaheim Ducks,2021-10-23,0,64:47,51,63,44.74,32,43,42.67,...,-11,-14.66,-14,0.028,44.060000,38.920000,-2.026667,1470.647200,1500.000000,-29.352800
3,Anaheim Ducks,2021-10-26,0,60:00,62,53,53.91,51,39,56.67,...,12,13.34,8,-0.148,46.070000,37.603333,2.120000,1456.911456,1514.352800,-57.441344
4,Anaheim Ducks,2021-10-28,0,64:16,56,53,51.38,47,42,52.81,...,5,5.62,3,-0.114,49.943333,45.333333,4.186667,1444.369047,1500.000000,-55.630953
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9895,Winnipeg Jets,2025-04-07,1,60:00,64,40,61.54,43,21,67.19,...,22,34.38,13,0.098,52.853333,52.583333,1.586667,1600.145994,1488.885215,111.260779
9896,Winnipeg Jets,2025-04-10,1,60:00,60,59,50.42,47,43,52.22,...,4,4.44,9,0.228,56.283333,60.800000,8.660000,1610.500213,1588.233282,22.266931
9897,Winnipeg Jets,2025-04-12,0,65:00,76,49,60.80,59,42,58.42,...,17,16.84,18,-0.060,60.673333,61.426667,12.960000,1624.540186,1325.511517,299.028669
9898,Winnipeg Jets,2025-04-13,0,60:00,41,75,35.34,31,57,35.23,...,-26,-29.54,-14,-0.094,62.636667,61.123333,15.173333,1599.091117,1527.933815,71.157302


In [23]:
print(type(training['Result']))         # should be <class 'pandas.Series'>
print(training['Result'].shape)        # should be (n_samples,)

<class 'pandas.core.series.Series'>
(7279,)


In [24]:
GS.fit(training[predictors], training['Result'])     # Training

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV 1/5] END .estimator__C=0.5, n_estimators=3;, score=-0.655 total time=   0.0s
[CV 2/5] END .estimator__C=0.5, n_estimators=3;, score=-0.658 total time=   0.0s
[CV 3/5] END .estimator__C=0.5, n_estimators=3;, score=-0.677 total time=   0.0s
[CV 4/5] END .estimator__C=0.5, n_estimators=3;, score=-0.660 total time=   0.0s
[CV 5/5] END .estimator__C=0.5, n_estimators=3;, score=-0.688 total time=   0.0s
[CV 1/5] END .estimator__C=0.5, n_estimators=5;, score=-0.657 total time=   0.0s
[CV 2/5] END .estimator__C=0.5, n_estimators=5;, score=-0.659 total time=   0.0s
[CV 3/5] END .estimator__C=0.5, n_estimators=5;, score=-0.677 total time=   0.0s
[CV 4/5] END .estimator__C=0.5, n_estimators=5;, score=-0.661 total time=   0.0s
[CV 5/5] END .estimator__C=0.5, n_estimators=5;, score=-0.689 total time=   0.0s
[CV 1/5] END estimator__C=0.5, n_estimators=10;, score=-0.658 total time=   0.0s
[CV 2/5] END estimator__C=0.5, n_estimators=10;,

0,1,2
,estimator,BaggingClassi...='liblinear'))
,param_grid,"{'estimator__C': [0.5, 0.8, ...], 'n_estimators': [3, 5, ...]}"
,scoring,'neg_log_loss'
,n_jobs,
,refit,True
,cv,TimeSeriesSpl...est_size=None)
,verbose,4
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.5
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,10
,solver,'liblinear'
,max_iter,1000


In [25]:
GS.best_score_

np.float64(-0.6677605040185831)

In [26]:
new_model = GS.best_estimator_
new_model

0,1,2
,estimator,LogisticRegre...r='liblinear')
,n_estimators,3
,max_samples,1.0
,max_features,1.0
,bootstrap,True
,bootstrap_features,False
,oob_score,False
,warm_start,False
,n_jobs,
,random_state,

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.5
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,10
,solver,'liblinear'
,max_iter,1000


In [27]:
combined, precision = make_predictions(dataset, predictors, new_model)
precision

0.564901349948079

In [28]:
from sklearn.metrics import classification_report, roc_auc_score, log_loss

predictions = new_model.predict(testing[predictors])

print(classification_report(testing['Result'], predictions))

              precision    recall  f1-score   support

           0       0.59      0.70      0.64      1389
           1       0.56      0.44      0.50      1232

    accuracy                           0.58      2621
   macro avg       0.57      0.57      0.57      2621
weighted avg       0.58      0.58      0.57      2621



In [29]:
# Create DataFrame pairing features with their importances
importances = pd.DataFrame({
    'Feature': predictors,
    'Importance': new_model.feature_importances_
})

# Sort by importance
importances = importances.sort_values(by='Importance', ascending=False)

# Display top features
print(importances.head(10))

AttributeError: 'BaggingClassifier' object has no attribute 'feature_importances_'

In [30]:
print(dataset['Result'].unique())
print(training['Result'].value_counts())

[0 1]
Result
0    3909
1    3370
Name: count, dtype: int64


In [31]:
print(training[new_cols].info())
print(training[new_cols].isna().sum())
print(training[new_cols].describe())


<class 'pandas.core.frame.DataFrame'>
Index: 7279 entries, 0 to 9817
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   SCF%_rolling      7279 non-null   float64
 1   xGF%_rolling      7279 non-null   float64
 2   CF%_diff_rolling  7279 non-null   float64
dtypes: float64(3)
memory usage: 227.5 KB
None
SCF%_rolling        0
xGF%_rolling        0
CF%_diff_rolling    0
dtype: int64
       SCF%_rolling  xGF%_rolling  CF%_diff_rolling
count   7279.000000   7279.000000       7279.000000
mean      49.997730     49.996944         -0.008171
std        6.684933      7.177986         11.108291
min       25.650000     27.910000        -34.346667
25%       45.465000     45.100000         -7.710000
50%       50.076667     50.133333          0.113333
75%       54.500000     55.035000          7.643333
max       75.516667     71.103333         38.033333


In [32]:
import numpy as np
# Check class balance after preprocessing
print(data['Result'].value_counts())

# Check if model predicted any 1s
print(np.unique(predictions, return_counts=True))

# Look at rolling feature distribution
print(training[new_cols].describe())

Result
0    5298
1    4602
Name: count, dtype: int64
(array([0, 1]), array([1658,  963]))
       SCF%_rolling  xGF%_rolling  CF%_diff_rolling
count   7279.000000   7279.000000       7279.000000
mean      49.997730     49.996944         -0.008171
std        6.684933      7.177986         11.108291
min       25.650000     27.910000        -34.346667
25%       45.465000     45.100000         -7.710000
50%       50.076667     50.133333          0.113333
75%       54.500000     55.035000          7.643333
max       75.516667     71.103333         38.033333


In [33]:
combined = combined.merge(dataset[['Date', 'Team', 'Opponent', 'Result']], left_index=True, right_index=True)
combined

Unnamed: 0,actual,prediction,Date,Team,Opponent,Result
234,1,0,2024-10-12,Anaheim Ducks,San Jose Sharks,1
235,0,0,2024-10-13,Anaheim Ducks,Vegas Golden Knights,0
236,1,0,2024-10-16,Anaheim Ducks,Utah Hockey Club,1
237,0,0,2024-10-18,Anaheim Ducks,Colorado Avalanche,0
238,0,0,2024-10-20,Anaheim Ducks,Los Angeles Kings,0
...,...,...,...,...,...,...
9895,1,1,2025-04-07,Winnipeg Jets,St Louis Blues,1
9896,1,1,2025-04-10,Winnipeg Jets,Dallas Stars,1
9897,0,1,2025-04-12,Winnipeg Jets,Chicago Blackhawks,0
9898,0,1,2025-04-13,Winnipeg Jets,Edmonton Oilers,0


In [34]:
final = combined.merge(combined, left_on=['Date', 'Team'], right_on=['Date', 'Opponent'])  # few games will drop due to rolling windows
final

Unnamed: 0,actual_x,prediction_x,Date,Team_x,Opponent_x,Result_x,actual_y,prediction_y,Team_y,Opponent_y,Result_y
0,1,0,2024-10-12,Anaheim Ducks,San Jose Sharks,1,0,0,San Jose Sharks,Anaheim Ducks,0
1,0,0,2024-10-13,Anaheim Ducks,Vegas Golden Knights,0,1,1,Vegas Golden Knights,Anaheim Ducks,1
2,1,0,2024-10-16,Anaheim Ducks,Utah Hockey Club,1,0,0,Utah Hockey Club,Anaheim Ducks,0
3,0,0,2024-10-18,Anaheim Ducks,Colorado Avalanche,0,1,1,Colorado Avalanche,Anaheim Ducks,1
4,0,0,2024-10-20,Anaheim Ducks,Los Angeles Kings,0,1,1,Los Angeles Kings,Anaheim Ducks,1
...,...,...,...,...,...,...,...,...,...,...,...
2613,1,1,2025-04-07,Winnipeg Jets,St Louis Blues,1,0,0,St Louis Blues,Winnipeg Jets,0
2614,1,1,2025-04-10,Winnipeg Jets,Dallas Stars,1,0,0,Dallas Stars,Winnipeg Jets,0
2615,0,1,2025-04-12,Winnipeg Jets,Chicago Blackhawks,0,0,0,Chicago Blackhawks,Winnipeg Jets,0
2616,0,1,2025-04-13,Winnipeg Jets,Edmonton Oilers,0,1,1,Edmonton Oilers,Winnipeg Jets,1


In [35]:
final[(final['prediction_x'] == 1) & (final['prediction_y'] == 0)]['actual_x'].value_counts()

actual_x
1    501
0    371
Name: count, dtype: int64

In [36]:
533 / (533 + 382)

0.5825136612021858

In [37]:
print(dataset['Result'].value_counts())

Result
0    5298
1    4602
Name: count, dtype: int64


In [38]:
def make_pred(data, predictors, model):
    train = data[data['Date'] < '2024-04-19']
    test = data[data['Date'] > '2024-04-19']

    model.fit(train[predictors], train['Result'])

    preds = model.predict(test[predictors])
    probs = model.predict_proba(test[predictors])[:, 1]  # Probability of class 1 (win)

    combined = pd.DataFrame({
        'actual': test['Result'],
        'prediction': preds,
        'win_probability': probs
    }, index=test.index)

    # Keep useful columns
    combined = pd.concat([combined, test[['Team', 'Opponent', 'Date']]], axis=1)

    precision = precision_score(test['Result'], preds)
    return combined, precision

In [39]:
combined, precision = make_pred(dataset, predictors, new_model)
combined

Unnamed: 0,actual,prediction,win_probability,Team,Opponent,Date
234,1,0,0.345305,Anaheim Ducks,San Jose Sharks,2024-10-12
235,0,0,0.340884,Anaheim Ducks,Vegas Golden Knights,2024-10-13
236,1,0,0.332033,Anaheim Ducks,Utah Hockey Club,2024-10-16
237,0,0,0.339955,Anaheim Ducks,Colorado Avalanche,2024-10-18
238,0,0,0.310977,Anaheim Ducks,Los Angeles Kings,2024-10-20
...,...,...,...,...,...,...
9895,1,1,0.576432,Winnipeg Jets,St Louis Blues,2025-04-07
9896,1,1,0.540239,Winnipeg Jets,Dallas Stars,2025-04-10
9897,0,1,0.757475,Winnipeg Jets,Chicago Blackhawks,2025-04-12
9898,0,1,0.605368,Winnipeg Jets,Edmonton Oilers,2025-04-13


In [40]:
# Merge team vs opponent predictions
paired = combined.merge(
    combined,
    left_on=['Date', 'Team'],
    right_on=['Date', 'Opponent'],
    suffixes=('_team', '_opp')
)

# Filter out same-team merges (shouldn't happen if data is clean)
paired = paired[paired['Team_team'] != paired['Team_opp']]
paired

Unnamed: 0,actual_team,prediction_team,win_probability_team,Team_team,Opponent_team,Date,actual_opp,prediction_opp,win_probability_opp,Team_opp,Opponent_opp
0,1,0,0.345305,Anaheim Ducks,San Jose Sharks,2024-10-12,0,0,0.289390,San Jose Sharks,Anaheim Ducks
1,0,0,0.340884,Anaheim Ducks,Vegas Golden Knights,2024-10-13,1,1,0.606443,Vegas Golden Knights,Anaheim Ducks
2,1,0,0.332033,Anaheim Ducks,Utah Hockey Club,2024-10-16,0,0,0.494984,Utah Hockey Club,Anaheim Ducks
3,0,0,0.339955,Anaheim Ducks,Colorado Avalanche,2024-10-18,1,1,0.628310,Colorado Avalanche,Anaheim Ducks
4,0,0,0.310977,Anaheim Ducks,Los Angeles Kings,2024-10-20,1,1,0.619471,Los Angeles Kings,Anaheim Ducks
...,...,...,...,...,...,...,...,...,...,...,...
2613,1,1,0.576432,Winnipeg Jets,St Louis Blues,2025-04-07,0,0,0.394369,St Louis Blues,Winnipeg Jets
2614,1,1,0.540239,Winnipeg Jets,Dallas Stars,2025-04-10,0,0,0.442402,Dallas Stars,Winnipeg Jets
2615,0,1,0.757475,Winnipeg Jets,Chicago Blackhawks,2025-04-12,0,0,0.282797,Chicago Blackhawks,Winnipeg Jets
2616,0,1,0.605368,Winnipeg Jets,Edmonton Oilers,2025-04-13,1,1,0.542286,Edmonton Oilers,Winnipeg Jets


In [41]:
#Filter only valid pairings
paired = paired[paired['Team_team'] != paired['Team_opp']]

# Choose team with higher probability to win
paired['predicted_winner'] = paired.apply(
    lambda row: row['Team_team'] if row['win_probability_team'] > row['win_probability_opp'] else row['Team_opp'],
    axis=1
)

# Determine actual winner from true result
paired['actual_winner'] = paired.apply(
    lambda row: row['Team_team'] if row['actual_team'] == 1 else row['Team_opp'],
    axis=1
)

paired

Unnamed: 0,actual_team,prediction_team,win_probability_team,Team_team,Opponent_team,Date,actual_opp,prediction_opp,win_probability_opp,Team_opp,Opponent_opp,predicted_winner,actual_winner
0,1,0,0.345305,Anaheim Ducks,San Jose Sharks,2024-10-12,0,0,0.289390,San Jose Sharks,Anaheim Ducks,Anaheim Ducks,Anaheim Ducks
1,0,0,0.340884,Anaheim Ducks,Vegas Golden Knights,2024-10-13,1,1,0.606443,Vegas Golden Knights,Anaheim Ducks,Vegas Golden Knights,Vegas Golden Knights
2,1,0,0.332033,Anaheim Ducks,Utah Hockey Club,2024-10-16,0,0,0.494984,Utah Hockey Club,Anaheim Ducks,Utah Hockey Club,Anaheim Ducks
3,0,0,0.339955,Anaheim Ducks,Colorado Avalanche,2024-10-18,1,1,0.628310,Colorado Avalanche,Anaheim Ducks,Colorado Avalanche,Colorado Avalanche
4,0,0,0.310977,Anaheim Ducks,Los Angeles Kings,2024-10-20,1,1,0.619471,Los Angeles Kings,Anaheim Ducks,Los Angeles Kings,Los Angeles Kings
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2613,1,1,0.576432,Winnipeg Jets,St Louis Blues,2025-04-07,0,0,0.394369,St Louis Blues,Winnipeg Jets,Winnipeg Jets,Winnipeg Jets
2614,1,1,0.540239,Winnipeg Jets,Dallas Stars,2025-04-10,0,0,0.442402,Dallas Stars,Winnipeg Jets,Winnipeg Jets,Winnipeg Jets
2615,0,1,0.757475,Winnipeg Jets,Chicago Blackhawks,2025-04-12,0,0,0.282797,Chicago Blackhawks,Winnipeg Jets,Winnipeg Jets,Chicago Blackhawks
2616,0,1,0.605368,Winnipeg Jets,Edmonton Oilers,2025-04-13,1,1,0.542286,Edmonton Oilers,Winnipeg Jets,Winnipeg Jets,Edmonton Oilers


In [42]:

# Evaluate how accurate our prediction was
paired['correct'] = paired['predicted_winner'] == paired['actual_winner']
accuracy = paired['correct'].mean()
print(f"Match-level accuracy: {accuracy:.3f}")

Match-level accuracy: 0.583
