In [47]:
import os
from pathlib import Path
from sqlalchemy import create_engine
import pandas as pd
import numpy as np

In [19]:
BASE_DIR = os.path.dirname(Path(os.path.abspath('')))
DATABASE_DIR = os.path.join(BASE_DIR, "db")
DATABASE_FILE_PATH = os.path.join(DATABASE_DIR, "game-data.sqlite")
engine = create_engine(f"sqlite:///{DATABASE_FILE_PATH}")

In [140]:
unblocked_query = """
WITH GoalieIdJoin AS (
SELECT
shots.gameId,
shots.season,
shots.TeamId,
shots.TeamName,
shots.isHomeTeam,
shots.Prev25FenwickForPerHour,
shots.Prev25FenwickAgainstPerHour,
pp.Prev25ShotsFor5v4PerHour,
pp.Prev25ShotsAgainst4v5PerHour,
spct.Prev25ShootingPercentage,
playerId as GoalieId
FROM UnblockedShotGenSupLast25 AS shots 
LEFT JOIN goalie_saves as gs ON shots.gameId = gs.gameId AND ((shots.isHomeTeam = 1 AND gs.homeRoad = 'H') OR (shots.isHomeTeam = 0 AND gs.homeRoad = 'R')) and gs.gamesStarted = 1 
LEFT JOIN ShotGenSup5v4Last25 as pp ON pp.gameId = shots.gameId AND pp.teamId = shots.TeamId
LEFT JOIN TeamScoringPercentage as spct ON spct.gameId = shots.gameId AND spct.eventOwnerTeamId = shots.TeamId
)
SELECT GoalieIdJoin.*,
gsave.diluted_save_pct,
games.homeScore > games.visitingScore AS HomeTeamWin
FROM GoalieIdJoin
LEFT JOIN GoalieSavePct as gsave ON gsave.goalieId = GoalieIdJoin.GoalieId
LEFT JOIN games ON games.id = GoalieIdJoin.gameId
"""
UnblockedShotGenSup = pd.read_sql_query(unblocked_query, engine)

In [103]:
df = UnblockedShotGenSup

In [104]:
df_nn = df[df["Prev25FenwickAgainstPerHour"].notna()]

In [114]:
df_home = df_nn[df_nn["isHomeTeam"] == 1].rename(columns={
    "TeamName": "HomeTeamName",
    "TeamId": "HomeTeamId",
    "Prev25FenwickForPerHour": "HomePrev25FenwickForPerHour",
    "Prev25FenwickAgainstPerHour": "HomePrev25FenwickAgainstPerHour",
    "Prev25ShotsFor5v4PerHour": "HomePrev25ShotsFor5v4PerHour",
    "Prev25ShotsAgainst4v5PerHour": "HomePrev25ShotsAgainst4v5PerHour",
    "Prev25ShootingPercentage": "HomePrev25ShootingPercentage",
    "GoalieId": "HomeGoalieId",
    "diluted_save_pct": "HomeDilutedSavePct"
}).drop(["isHomeTeam"], axis=1)
df_away = df_nn[df_nn["isHomeTeam"] == 0].rename(columns={
    "TeamName": "AwayTeamName",
    "TeamId": "AwayTeamId",
    "Prev25FenwickForPerHour": "AwayPrev25FenwickForPerHour",
    "Prev25FenwickAgainstPerHour": "AwayPrev25FenwickAgainstPerHour",
    "Prev25ShotsFor5v4PerHour": "AwayPrev25ShotsFor5v4PerHour",
    "Prev25ShotsAgainst4v5PerHour": "AwayPrev25ShotsAgainst4v5PerHour",
    "Prev25ShootingPercentage": "AwayPrev25ShootingPercentage",
    "GoalieId": "AwayGoalieId",
    "diluted_save_pct": "AwayDilutedSavePct"
}).drop(["season", "isHomeTeam", "HomeTeamWin"], axis=1)

In [115]:
df_com = pd.merge(left=df_home, right=df_away, how="left", on="gameId")

In [149]:
df_com

Unnamed: 0,gameId,season,HomeTeamId,HomeTeamName,HomePrev25FenwickForPerHour,HomePrev25FenwickAgainstPerHour,HomePrev25ShotsFor5v4PerHour,HomePrev25ShotsAgainst4v5PerHour,HomePrev25ShootingPercentage,HomeGoalieId,...,HomeTeamWin,AwayTeamId,AwayTeamName,AwayPrev25FenwickForPerHour,AwayPrev25FenwickAgainstPerHour,AwayPrev25ShotsFor5v4PerHour,AwayPrev25ShotsAgainst4v5PerHour,AwayPrev25ShootingPercentage,AwayGoalieId,AwayDilutedSavePct
0,2010020027,20102011,1.0,New Jersey Devils,39.099006,41.675320,79.120879,56.814921,0.061224,8455710,...,0,5.0,Pittsburgh Penguins,48.225839,40.013350,71.523179,84.297521,0.038462,8462161.0,0.916907
1,2010020048,20102011,1.0,New Jersey Devils,40.258206,40.970074,91.099476,69.973190,0.040000,8455710,...,0,21.0,Colorado Avalanche,42.873685,42.268354,62.983425,110.297872,0.082353,8467950.0,0.919413
2,2010020056,20102011,1.0,New Jersey Devils,41.300980,40.910896,101.160660,67.648553,0.039062,8455710,...,0,6.0,Boston Bruins,52.909065,45.152840,105.017503,108.724832,0.066667,8460703.0,0.927824
3,2010020100,20102011,1.0,New Jersey Devils,39.889462,41.585541,94.778412,74.913295,0.052632,8460704,...,0,7.0,Buffalo Sabres,44.323312,38.073489,98.392283,101.694915,0.071770,8468011.0,0.923425
4,2010020179,20102011,1.0,New Jersey Devils,40.356640,41.198609,88.415882,84.229919,0.053254,8460704,...,0,3.0,New York Rangers,42.574722,42.543636,93.307839,100.987433,0.088123,8468685.0,0.926831
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17583,2024021188,20242025,59.0,Utah Hockey Club,46.859218,39.264275,89.936486,93.218249,0.077944,8478872,...,1,20.0,Calgary Flames,43.461463,40.868321,101.610971,110.021322,0.062395,8481692.0,0.920215
17584,2024021202,20242025,59.0,Utah Hockey Club,46.698060,39.120470,89.735331,92.615012,0.080000,8478872,...,0,26.0,Los Angeles Kings,46.449416,37.867495,106.172840,82.765092,0.102694,8475311.0,0.921607
17585,2024021219,20242025,59.0,Utah Hockey Club,45.811441,39.457845,90.485657,90.998767,0.080134,8478872,...,1,52.0,Winnipeg Jets,42.317290,39.229430,93.831957,87.378641,0.101504,8476945.0,0.926401
17586,2024021243,20242025,59.0,Utah Hockey Club,46.092442,39.494459,96.321070,88.661961,0.080000,8478872,...,1,55.0,Seattle Kraken,43.352008,44.222161,80.946746,103.323263,0.097938,8478916.0,0.920074


In [135]:
df_std = df_com.copy()

In [138]:
cols_to_standardize = ["HomePrev25FenwickForPerHour", "HomePrev25FenwickAgainstPerHour", "HomePrev25ShotsFor5v4PerHour", "HomePrev25ShotsAgainst4v5PerHour", "HomePrev25ShootingPercentage", "HomeDilutedSavePct", "AwayPrev25FenwickForPerHour", "AwayPrev25FenwickAgainstPerHour", "AwayPrev25ShotsFor5v4PerHour", "AwayPrev25ShotsAgainst4v5PerHour", "AwayPrev25ShootingPercentage", "AwayDilutedSavePct"]
df_std[cols_to_standardize] = df_std.groupby('season')[cols_to_standardize].transform(
    lambda x: (x - x.mean()) / x.std()
)

In [151]:
df_std.dtypes

gameId                                int64
season                                int64
HomeTeamId                          float64
HomeTeamName                         object
HomePrev25FenwickForPerHour         float64
HomePrev25FenwickAgainstPerHour     float64
HomePrev25ShotsFor5v4PerHour        float64
HomePrev25ShotsAgainst4v5PerHour    float64
HomePrev25ShootingPercentage        float64
HomeGoalieId                          int64
HomeDilutedSavePct                  float64
HomeTeamWin                           int64
AwayTeamId                          float64
AwayTeamName                         object
AwayPrev25FenwickForPerHour         float64
AwayPrev25FenwickAgainstPerHour     float64
AwayPrev25ShotsFor5v4PerHour        float64
AwayPrev25ShotsAgainst4v5PerHour    float64
AwayPrev25ShootingPercentage        float64
AwayGoalieId                        float64
AwayDilutedSavePct                  float64
dtype: object

## Prediction Model Fitting

In [153]:
from sklearn.linear_model import LogisticRegression