In [None]:
import pandas as pd

# Load newdata.csv, using its first column as the DataFrame index.
data = pd.read_csv('newdata.csv', index_col=0)
data

In [None]:

def rolling_averages(team, cols, new_cols, window=5):
    """Add trailing rolling means of ``cols`` to one team's rows.

    The rows are ordered by "Date" and, for every row, the mean of the
    preceding ``window`` rows is stored under the matching name in
    ``new_cols``. ``closed='left'`` keeps the current row out of its own
    window. Rows whose rolling value is missing (for example the first rows,
    which lack enough history) are dropped.

    Returns a new DataFrame; the frame passed in is left untouched.
    """
    ordered = team.sort_values("Date")
    history_means = ordered[cols].rolling(window, closed='left').mean()
    ordered[new_cols] = history_means
    return ordered.dropna(subset=new_cols)


In [None]:
# Opponent-side view of every game: take each team's own row, relabel the team
# as 'Opponent', and prefix its stats so they can sit beside the other team's.
data_opp = data.drop(columns=['Opponent']).rename(columns={
    'Team': 'Opponent',
    **{
        stat: f'Opponent_{stat}'
        for stat in (
            'CF', 'CA', 'CF%', 'FF', 'FA', 'FF%', 'SF', 'SA', 'GF', 'GA',
            'xGF', 'xGA', 'xGF%', 'HDCF', 'HDCF%', 'SCF', 'PDO',
        )
    },
})


# Pair each row with the row of the team it faced on that date. Right-hand
# columns that were not prefixed above collide with the left ones and get '_y'.
merged = data.merge(
    data_opp,
    on=['Date', 'Opponent'],
    how='inner',
    suffixes=('', '_y'),
)

In [None]:
# Display the team/opponent merge result.
merged

In [None]:
# Team-minus-opponent differential for each stat below; the result is stored
# as '<stat>_diff', in the same order the columns were originally added.
for stat in ('CF', 'CF%', 'GF', 'xGF', 'HDCF', 'HDCF%', 'FF', 'FF%', 'SCF', 'PDO'):
    merged[f'{stat}_diff'] = merged[stat] - merged[f'Opponent_{stat}']


In [None]:
# Display the merged frame with the new '_diff' columns.
merged

In [None]:
# Force the two percentage columns to numeric; unparseable entries become NaN.
for pct_col in ('GF%', 'xGF%'):
    merged[pct_col] = pd.to_numeric(merged[pct_col], errors='coerce')

print(merged[['GF%', 'xGF%']].dtypes)

In [None]:
# Candidate pools kept for experimentation; they are not fed to the model directly.
feature = [
    'CF%', 'FF%', 'SF%', 'xGF%', 'SCF%', 'HDCF%', 'GF%', 'SH%', 'SV%', 'HDCA', 'xGA', 'PDO',
    'CF', 'CA', 'FF', 'FA', 'SF', 'GA', 'GF', 'SCA', 'SCF', 'HDCF',
]
feature_diff = [
    'CF_diff', 'CF%_diff', 'GF_diff', 'xGF_diff', 'HDCF_diff',
    'HDCF%_diff', 'FF_diff', 'FF%_diff', 'SCF_diff', 'PDO_diff',
]

# Stats that actually get rolling averages. This name used to be assigned two
# larger sets first and then overwritten; only this final value was ever used.
features = ['SCF%', 'xGF%', 'CF%_diff']

# Names of the rolling-average columns derived from `features`.
new_cols = [f'{c}_rolling' for c in features]

# Model inputs start out as the rolling columns (a separate list, so extending
# the predictors later cannot alter `new_cols`).
predictors = list(new_cols)




In [None]:
# Show the dtype of every selected feature column before rolling means are taken.
print(merged[features].dtypes)

In [None]:
# Reorder columns for readability: 'Date' and 'Result' go to positions 1 and 2,
# right after the first column; every other column keeps its relative order.
columns = merged.columns.tolist()

for moved in ('Date', 'Result'):
    columns.remove(moved)

columns[1:1] = ['Date', 'Result']

merged = merged[columns]
merged

In [None]:
# Remove every game in which the Arizona Coyotes appear on either side.
merged = merged[~merged[['Team', 'Opponent']].eq('Arizona Coyotes').any(axis=1)]
merged

In [None]:
# Parse dates with assign(), which returns a new frame: `merged` is a filtered
# slice here, and writing a column into it in place raises SettingWithCopyWarning.
merged = merged.assign(Date=pd.to_datetime(merged['Date']))

# Rolling means over the previous 3 games, computed separately for each team.
# The groups are iterated explicitly instead of using groupby.apply, whose
# handling of the grouping column ('Team', which later cells need) is
# deprecated in recent pandas. Groups come out in sorted team order and each
# team's rows are date-ordered by rolling_averages.
# NOTE: `data` is rebound here from the raw CSV frame to the engineered frame.
data = pd.concat(
    [
        rolling_averages(team_games, features, new_cols, 3)
        for _, team_games in merged.groupby('Team')
    ],
    ignore_index=True,
)
data

In [None]:
# Drop the opponent-side copy of the target. errors='ignore' and rebinding
# (instead of inplace=True) keep this cell safe to run more than once.
data = data.drop(columns=['Result_y'], errors='ignore')

In [None]:
# Display the frame after removing 'Result_y'.
data

In [None]:
#####
# Elo ratings as pre-game features.
#
# Ratings must be replayed in chronological order, but `data` is laid out
# team-by-team (it is built one team at a time), so it is sorted by date first.
# Within a date, every row reads the ratings as they stood before that date's
# games, and each game updates the ratings exactly once even though it appears
# once per participating team.
#
# NOTE(review): applying each game once (rather than once per row) halves the
# effective step size for games that have both rows present; re-tune `k` if the
# previous value was chosen with the double update in mind.

# 1. Initialize Elo ratings
initial_elo = 1500
k = 30  # Elo K-factor (can be tuned)

# mergesort is stable, so rows sharing a date keep their relative order.
games = data.sort_values('Date', kind='mergesort').reset_index(drop=True)

# Seed every club that appears on either side, so an opponent that never shows
# up in the 'Team' column cannot cause a KeyError.
teams = pd.unique(games[['Team', 'Opponent']].to_numpy().ravel())
elo_ratings = {team: initial_elo for team in teams}

elo_features = []

# 2. Walk through the schedule one date at a time
for _, day_games in games.groupby('Date', sort=False, dropna=False):
    ratings_before = dict(elo_ratings)  # snapshot taken before any result of this date
    settled = set()                     # matchups already applied on this date

    for team, opponent, result in zip(day_games['Team'], day_games['Opponent'], day_games['Result']):
        team_elo = ratings_before[team]
        opponent_elo = ratings_before[opponent]

        # Store Elo features BEFORE the game
        elo_features.append({
            'team_elo': team_elo,
            'opponent_elo': opponent_elo,
            'elo_diff': team_elo - opponent_elo
        })

        # The mirrored row of the same game must not update the ratings again.
        matchup = frozenset((team, opponent))
        if matchup in settled:
            continue
        settled.add(matchup)

        # Expected score of `team`, then the zero-sum rating transfer.
        # `result` is 1 for a win and 0 for a loss.
        expected_win = 1 / (1 + 10 ** ((opponent_elo - team_elo) / 400))
        change = k * (result - expected_win)
        elo_ratings[team] += change
        elo_ratings[opponent] -= change

# Rows were visited in `games` order, so the features line up with its index.
assert len(elo_features) == len(games)
elo_df = pd.DataFrame(elo_features, index=games.index)

# Chronologically ordered modelling table: engineered features plus Elo columns.
dataset = pd.concat([games, elo_df], axis=1)

####
new = ['team_elo', 'opponent_elo', 'elo_diff']
# Built from new_cols (not from predictors itself) so that re-running this cell
# does not append the Elo columns a second time.
predictors = new_cols + new

In [None]:
# Class balance of the target; the ratio of losses to wins is what gets passed
# as scale_pos_weight.
class_counts = data['Result'].value_counts()
count_class_0, count_class_1 = class_counts[0], class_counts[1]

print(f"Losses (0): {count_class_0}")
print(f"Wins   (1): {count_class_1}")

scale = count_class_0 / count_class_1

In [None]:
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression

# Candidate estimators. Every one of them is seeded so results are reproducible.
model = XGBClassifier(scale_pos_weight=scale, random_state=10)

# Defining Time Series Split (folds are cut by row position, so the rows passed
# in must already be in chronological order)
TSS = TimeSeriesSplit(n_splits=5)

test_model = RandomForestClassifier(random_state=10)

# The bagging wrapper draws bootstrap samples, so it needs its own seed too;
# without it two runs of the notebook give different ensembles.
lin = BaggingClassifier(
    LogisticRegression(random_state=10, solver='liblinear', penalty='l2', max_iter=1000),
    random_state=10,
)


In [None]:
from sklearn.metrics import precision_score

# Function to make predictions given the data, input features and chosen model

def make_predictions(data, predictors, model, split_date='2024-04-19'):
    """Fit ``model`` on games before ``split_date`` and evaluate on games after it.

    Parameters
    ----------
    data : DataFrame with a 'Date' column, a 'Result' column and the predictor columns.
    predictors : list of column names used as model inputs.
    model : estimator exposing ``fit`` and ``predict``; it is fitted in place.
    split_date : boundary between train and test. Rows dated exactly on the
        boundary fall in neither set (both comparisons are strict).

    Returns
    -------
    combined : DataFrame indexed like the test rows, with 'actual' and 'prediction'.
    precision : ``precision_score`` of the predictions on the test rows.
    """
    train = data[data['Date'] < split_date]
    test = data[data['Date'] > split_date]

    model.fit(train[predictors], train['Result'])
    preds = model.predict(test[predictors])

    combined = pd.DataFrame({'actual': test['Result'], 'prediction': preds}, index=test.index)
    precision = precision_score(test['Result'], preds)
    return combined, precision

In [None]:
# Defining search space for GridSearchCV
# NOTE(review): this grid looks intended for `model` (XGBClassifier) — confirm.
search_grid = {
    'n_estimators': [50, 75, 100, 150],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.03, 0.05],
    'reg_alpha': [1, 5, 10],
    'reg_lambda': [1, 5, 10]

}

# NOTE(review): this grid looks intended for `test_model` (RandomForestClassifier) — confirm.
alt_search_grid = {
    'n_estimators' : [50, 100, 200, 500],
    'max_depth' : [3, 6, 9],
    'min_samples_split': [3, 5, 10]
}

lin_search_grid = {
    # 'estimator__' routes the parameter to the LogisticRegression wrapped by the bagger
    'estimator__C' : [0.5, 0.8, 1.0],
    # number of bagged models
    'n_estimators': [3, 5, 10, 50, 100],
}

# Grid search over the bagged logistic regression, scored by negated log loss
# and refit on the whole training set with the best parameters.
GS = GridSearchCV(
    estimator = lin,
    param_grid = lin_search_grid,
    scoring = 'neg_log_loss',
    refit = True,
    cv = TSS,
    verbose= 4
)

# TimeSeriesSplit cuts folds by row position, so the training rows are put in
# chronological order first (stable sort: same-day rows keep their order).
# Without this the folds would follow whatever order `dataset` happens to be in.
training = dataset[dataset['Date'] < '2024-04-19'].sort_values('Date', kind='mergesort')  # Training using 2021-2024 data
testing = dataset[dataset['Date'] > '2024-04-19']   # Testing on most recent season (2024-2025)
print(training.columns[training.columns.str.contains('Result')])

In [None]:
# Display the final list of model input columns.
predictors

In [None]:
# Display the modelling table (engineered features plus Elo columns).
dataset

In [None]:
# Sanity check on the target: selecting 'Result' must give a single 1-D Series.
# (A DataFrame here would mean more than one column is named 'Result'.)
print(type(training['Result']))         # should be <class 'pandas.Series'>
print(training['Result'].shape)        # should be (n_samples,)

In [None]:
# Run the grid search on the training rows only.
GS.fit(training[predictors], training['Result'])     # Training

In [None]:
# Best mean cross-validated score; scoring is 'neg_log_loss', so closer to 0 is better.
GS.best_score_

In [None]:
# Best configuration found; GS was built with refit=True, so this estimator is already fitted.
new_model = GS.best_estimator_
new_model

In [None]:
# Hold-out evaluation. Note that make_predictions calls fit on the model it is
# given, so `new_model` is refitted here.
combined, precision = make_predictions(dataset, predictors, new_model)
precision

In [None]:
from sklearn.metrics import classification_report, roc_auc_score, log_loss

# Predict the hold-out rows and print the classification report against the true results.
predictions = new_model.predict(testing[predictors])

print(classification_report(testing['Result'], predictions))

In [None]:
def _importance_per_feature(fitted, feature_names):
    """Return a Series with one importance value per feature.

    Tree-based models expose ``feature_importances_``. Linear models expose
    ``coef_``, for which the absolute coefficient is used. Bagging ensembles
    expose neither, so the values of their fitted members are averaged, using
    ``estimators_features_`` to map each member's inputs back to column names.
    Raises AttributeError for a model that offers none of these.
    """
    if hasattr(fitted, 'feature_importances_'):
        return pd.Series(fitted.feature_importances_, index=feature_names)
    if hasattr(fitted, 'coef_'):
        # coef_ has one row per class; averaging the rows gives one value per feature.
        return pd.Series(abs(fitted.coef_).mean(axis=0), index=feature_names)
    if hasattr(fitted, 'estimators_') and hasattr(fitted, 'estimators_features_'):
        per_member = [
            _importance_per_feature(member, [feature_names[i] for i in used])
            for member, used in zip(fitted.estimators_, fitted.estimators_features_)
        ]
        return pd.DataFrame(per_member).mean().reindex(feature_names)
    raise AttributeError(
        f"{type(fitted).__name__} exposes neither feature_importances_ nor coef_"
    )


# Create DataFrame pairing features with their importances.
# The tuned model is a bagged logistic regression, which has no
# feature_importances_ attribute, hence the helper above.
# Caveat: coefficient magnitudes are only comparable across features that are
# on similar scales.
importances = (
    _importance_per_feature(new_model, list(predictors))
    .rename_axis('Feature')
    .reset_index(name='Importance')
)

# Sort by importance
importances = importances.sort_values(by='Importance', ascending=False)

# Display top features
print(importances.head(10))

In [None]:
# Diagnostics: distinct target values in the full table, and class counts in the training rows.
print(dataset['Result'].unique())
print(training['Result'].value_counts())

In [None]:
# Diagnostics on the rolling-average inputs: dtypes/non-null counts,
# missing values per column, and summary statistics.
print(training[new_cols].info())
print(training[new_cols].isna().sum())
print(training[new_cols].describe())


In [None]:
import numpy as np
# Check class balance after preprocessing (computed on `data`, i.e. all rows, not only training)
print(data['Result'].value_counts())

# Check if model predicted any 1s: distinct predicted labels with their counts
print(np.unique(predictions, return_counts=True))

# Look at rolling feature distribution (same summary as printed in the previous cell)
print(training[new_cols].describe())

In [None]:
# Attach the game identifiers (and the true result) to each prediction by
# matching on the row index, keeping only rows present in both frames.
game_columns = ['Date', 'Team', 'Opponent', 'Result']
combined = combined.join(dataset[game_columns], how='inner')
combined

In [None]:
# Self-join: line up each team's prediction row (suffix '_x') with the row of
# the same game seen from the other side (suffix '_y'). Games with only one
# surviving row (a few are lost to the rolling windows) drop out.
final = combined.merge(
    combined,
    how='inner',
    left_on=['Date', 'Team'],
    right_on=['Date', 'Opponent'],
    suffixes=('_x', '_y'),
)
final

In [None]:
# Actual outcomes for the games where the model picked the team to win and its opponent to lose.
final.loc[final['prediction_x'].eq(1) & final['prediction_y'].eq(0), 'actual_x'].value_counts()

In [None]:
# Share of "confident" picks (team predicted to win, opponent predicted to
# lose) in which the team really won. Computed from `final` so the figure stays
# correct when the data or the model changes.
# NOTE(review): this cell previously held the literal 533 / (533 + 382),
# presumably the two counts shown by the cell above — confirm they were the
# win/loss counts in that order.
confident_picks = final.loc[
    (final['prediction_x'] == 1) & (final['prediction_y'] == 0), 'actual_x'
]
(confident_picks == 1).mean()

In [None]:
# Class counts of the target over the whole modelling table.
print(dataset['Result'].value_counts())

In [None]:
def make_pred(data, predictors, model, split_date='2024-04-19'):
    """Fit ``model`` before ``split_date`` and return predictions with win probabilities.

    Parameters
    ----------
    data : DataFrame with 'Date', 'Result', 'Team', 'Opponent' and the predictor columns.
    predictors : list of column names used as model inputs.
    model : estimator exposing ``fit``, ``predict`` and ``predict_proba``; fitted in place.
    split_date : boundary between train and test. Rows dated exactly on the
        boundary fall in neither set (both comparisons are strict).

    Returns
    -------
    combined : DataFrame indexed like the test rows with 'actual', 'prediction',
        'win_probability' (second column of ``predict_proba``), 'Team',
        'Opponent' and 'Date'.
    precision : ``precision_score`` of the predictions on the test rows.
    """
    train = data[data['Date'] < split_date]
    test = data[data['Date'] > split_date]

    model.fit(train[predictors], train['Result'])

    preds = model.predict(test[predictors])
    probs = model.predict_proba(test[predictors])[:, 1]  # Probability of class 1 (win)

    combined = pd.DataFrame({
        'actual': test['Result'],
        'prediction': preds,
        'win_probability': probs
    }, index=test.index)

    # Keep the columns needed to pair the two sides of a game later on
    combined = pd.concat([combined, test[['Team', 'Opponent', 'Date']]], axis=1)

    precision = precision_score(test['Result'], preds)
    return combined, precision

In [None]:
# Re-run the hold-out evaluation, this time keeping win probabilities and game
# identifiers. make_pred fits the model it is given, so `new_model` is refitted.
combined, precision = make_pred(dataset, predictors, new_model)
combined

In [None]:
# Line up the two views of each game: a team's row ('_team' suffix) is matched
# with the row in which that team is listed as the opponent ('_opp' suffix).
paired = combined.merge(
    combined,
    how='inner',
    left_on=['Date', 'Team'],
    right_on=['Date', 'Opponent'],
    suffixes=('_team', '_opp'),
)

# Guard against a row being matched with its own team (not expected with clean data).
paired = paired.loc[paired['Team_team'].ne(paired['Team_opp'])]
paired

In [None]:
# Filter only valid pairings. The explicit copy makes the column assignments
# below write to an independent frame instead of a filtered slice, which would
# raise SettingWithCopyWarning.
paired = paired[paired['Team_team'] != paired['Team_opp']].copy()

# Choose team with higher probability to win (a tie goes to the '_opp' side).
# Series.where keeps 'Team_team' where the condition holds and takes
# 'Team_opp' elsewhere; it is vectorized and also copes with an empty frame.
paired['predicted_winner'] = paired['Team_team'].where(
    paired['win_probability_team'] > paired['win_probability_opp'],
    paired['Team_opp'],
)

# Determine actual winner from true result
paired['actual_winner'] = paired['Team_team'].where(
    paired['actual_team'] == 1,
    paired['Team_opp'],
)

paired

In [None]:

# Score the picks: a row counts as correct when the predicted and actual winners match.
paired['correct'] = paired['predicted_winner'].eq(paired['actual_winner'])
accuracy = paired['correct'].mean()
print(f"Match-level accuracy: {accuracy:.3f}")