In [1]:
import os
from pathlib import Path
from sqlalchemy import create_engine
import pandas as pd
import numpy as np

In [2]:
BASE_DIR = os.path.dirname(Path(os.path.abspath('')))
DATABASE_DIR = os.path.join(BASE_DIR, "db")
DATABASE_FILE_PATH = os.path.join(DATABASE_DIR, "game-data.sqlite")
engine = create_engine(f"sqlite:///{DATABASE_FILE_PATH}")

In [3]:
pd.set_option('display.max_columns', None)

## Data Import

In [4]:
unblocked_query = """
WITH GoalieIdJoin AS (
SELECT
shots.gameId,
shots.season,
shots.TeamId,
shots.TeamName,
shots.isHomeTeam,
shots.Prev25FenwickForPerHour,
shots.Prev25FenwickAgainstPerHour,
pp.Prev25ShotsFor5v4PerHour,
pp.Prev25ShotsAgainst4v5PerHour,
spct.Prev25ShootingPercentage,
playerId as GoalieId
FROM UnblockedShotGenSupLast25 AS shots 
LEFT JOIN goalie_saves as gs ON shots.gameId = gs.gameId AND ((shots.isHomeTeam = 1 AND gs.homeRoad = 'H') OR (shots.isHomeTeam = 0 AND gs.homeRoad = 'R')) and gs.gamesStarted = 1 
LEFT JOIN ShotGenSup5v4Last25 as pp ON pp.gameId = shots.gameId AND pp.teamId = shots.TeamId
LEFT JOIN TeamScoringPercentage as spct ON spct.gameId = shots.gameId AND spct.eventOwnerTeamId = shots.TeamId
)
SELECT GoalieIdJoin.*,
gsave.diluted_save_pct,
games.homeScore > games.visitingScore AS HomeTeamWin
FROM GoalieIdJoin
LEFT JOIN GoalieSavePct as gsave ON gsave.goalieId = GoalieIdJoin.GoalieId
LEFT JOIN games ON games.id = GoalieIdJoin.gameId
"""
UnblockedShotGenSup = pd.read_sql_query(unblocked_query, engine)

In [5]:
df = UnblockedShotGenSup

## Data Processing

In [6]:
df_home = df[df["isHomeTeam"] == 1].rename(columns={
    "TeamName": "HomeTeamName",
    "TeamId": "HomeTeamId",
    "Prev25FenwickForPerHour": "HomePrev25FenwickForPerHour",
    "Prev25FenwickAgainstPerHour": "HomePrev25FenwickAgainstPerHour",
    "Prev25ShotsFor5v4PerHour": "HomePrev25ShotsFor5v4PerHour",
    "Prev25ShotsAgainst4v5PerHour": "HomePrev25ShotsAgainst4v5PerHour",
    "Prev25ShootingPercentage": "HomePrev25ShootingPercentage",
    "GoalieId": "HomeGoalieId",
    "diluted_save_pct": "HomeDilutedSavePct"
}).drop(["isHomeTeam"], axis=1)
df_away = df[df["isHomeTeam"] == 0].rename(columns={
    "TeamName": "AwayTeamName",
    "TeamId": "AwayTeamId",
    "Prev25FenwickForPerHour": "AwayPrev25FenwickForPerHour",
    "Prev25FenwickAgainstPerHour": "AwayPrev25FenwickAgainstPerHour",
    "Prev25ShotsFor5v4PerHour": "AwayPrev25ShotsFor5v4PerHour",
    "Prev25ShotsAgainst4v5PerHour": "AwayPrev25ShotsAgainst4v5PerHour",
    "Prev25ShootingPercentage": "AwayPrev25ShootingPercentage",
    "GoalieId": "AwayGoalieId",
    "diluted_save_pct": "AwayDilutedSavePct"
}).drop(["season", "isHomeTeam", "HomeTeamWin"], axis=1)

In [7]:
df_com = pd.merge(left=df_home, right=df_away, how="left", on="gameId")

In [8]:
df_com.loc[df_com.isnull().any(axis=1)]

Unnamed: 0,gameId,season,HomeTeamId,HomeTeamName,HomePrev25FenwickForPerHour,HomePrev25FenwickAgainstPerHour,HomePrev25ShotsFor5v4PerHour,HomePrev25ShotsAgainst4v5PerHour,HomePrev25ShootingPercentage,HomeGoalieId,HomeDilutedSavePct,HomeTeamWin,AwayTeamId,AwayTeamName,AwayPrev25FenwickForPerHour,AwayPrev25FenwickAgainstPerHour,AwayPrev25ShotsFor5v4PerHour,AwayPrev25ShotsAgainst4v5PerHour,AwayPrev25ShootingPercentage,AwayGoalieId,AwayDilutedSavePct
0,2010020007,20102011,1,New Jersey Devils,,,,,,8455710,0.913166,0,25,Dallas Stars,,,,,,8470140,0.923361
41,2010020015,20102011,2,New York Islanders,,,,,,8468481,0.906516,0,25,Dallas Stars,36.724883,35.229596,60.0,40.449438,0.130435,8470140,0.923361
172,2010020002,20102011,5,Pittsburgh Penguins,,,,,,8470594,0.921912,0,4,Philadelphia Flyers,,,,,,8475683,0.924061
217,2010020012,20102011,6,Boston Bruins,,,,,,8471695,0.927981,0,27,Phoenix Coyotes,,,,,,8468524,0.922107
271,2010020013,20102011,7,Buffalo Sabres,44.573072,37.649607,112.720157,93.413174,0.043478,8468011,0.923425,0,3,New York Rangers,,,,,,8468685,0.926831
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18895,2024020037,20242025,28,San Jose Sharks,31.127050,52.168905,141.176471,105.000000,0.095238,8477970,0.916120,0,24,Anaheim Ducks,,,,,,8480843,0.916953
18976,2024020018,20242025,30,Minnesota Wild,,,,,,8479406,0.921956,1,29,Columbus Blue Jackets,,,,,,8478007,0.912389
19068,2024020010,20242025,54,Vegas Golden Knights,,,,,,8478499,0.914868,1,21,Colorado Avalanche,,,,,,8480382,0.914913
19115,2024020003,20242025,55,Seattle Kraken,,,,,,8475831,0.916550,0,19,St. Louis Blues,,,,,,8476412,0.917520


In [9]:
df_nn = df_com.loc[df_com["HomePrev25FenwickAgainstPerHour"].notna() & df_com["AwayPrev25FenwickForPerHour"].notna()].copy()
df_nn.dropna(inplace=True)

In [10]:
df_std = df_nn.copy()
cols_to_standardize = ["HomePrev25FenwickForPerHour", "HomePrev25FenwickAgainstPerHour", "HomePrev25ShotsFor5v4PerHour", "HomePrev25ShotsAgainst4v5PerHour", "HomePrev25ShootingPercentage", "HomeDilutedSavePct", "AwayPrev25FenwickForPerHour", "AwayPrev25FenwickAgainstPerHour", "AwayPrev25ShotsFor5v4PerHour", "AwayPrev25ShotsAgainst4v5PerHour", "AwayPrev25ShootingPercentage", "AwayDilutedSavePct"]
df_std[cols_to_standardize] = df_std.groupby('season')[cols_to_standardize].transform(
    lambda x: (x - x.mean()) / x.std()
)

In [11]:
df_std.dtypes

gameId                                int64
season                                int64
HomeTeamId                            int64
HomeTeamName                         object
HomePrev25FenwickForPerHour         float64
HomePrev25FenwickAgainstPerHour     float64
HomePrev25ShotsFor5v4PerHour        float64
HomePrev25ShotsAgainst4v5PerHour    float64
HomePrev25ShootingPercentage        float64
HomeGoalieId                          int64
HomeDilutedSavePct                  float64
HomeTeamWin                           int64
AwayTeamId                            int64
AwayTeamName                         object
AwayPrev25FenwickForPerHour         float64
AwayPrev25FenwickAgainstPerHour     float64
AwayPrev25ShotsFor5v4PerHour        float64
AwayPrev25ShotsAgainst4v5PerHour    float64
AwayPrev25ShootingPercentage        float64
AwayGoalieId                          int64
AwayDilutedSavePct                  float64
dtype: object

## Logistic Regression Model Fitting

In [12]:
from sklearn.linear_model import LogisticRegression

In [13]:
df_std.dtypes

gameId                                int64
season                                int64
HomeTeamId                            int64
HomeTeamName                         object
HomePrev25FenwickForPerHour         float64
HomePrev25FenwickAgainstPerHour     float64
HomePrev25ShotsFor5v4PerHour        float64
HomePrev25ShotsAgainst4v5PerHour    float64
HomePrev25ShootingPercentage        float64
HomeGoalieId                          int64
HomeDilutedSavePct                  float64
HomeTeamWin                           int64
AwayTeamId                            int64
AwayTeamName                         object
AwayPrev25FenwickForPerHour         float64
AwayPrev25FenwickAgainstPerHour     float64
AwayPrev25ShotsFor5v4PerHour        float64
AwayPrev25ShotsAgainst4v5PerHour    float64
AwayPrev25ShootingPercentage        float64
AwayGoalieId                          int64
AwayDilutedSavePct                  float64
dtype: object

In [14]:
cols = ["HomePrev25FenwickForPerHour",
        "AwayPrev25FenwickForPerHour",
        "HomePrev25FenwickAgainstPerHour", 
        "AwayPrev25FenwickAgainstPerHour",
        "HomePrev25ShotsFor5v4PerHour", 
        "AwayPrev25ShotsFor5v4PerHour",
        "HomePrev25ShotsAgainst4v5PerHour", 
        "AwayPrev25ShotsAgainst4v5PerHour",
        "HomePrev25ShootingPercentage", 
        "AwayPrev25ShootingPercentage",
        "HomeDilutedSavePct", 
        "AwayDilutedSavePct"]
X = df_std[cols].copy()
Y = df_std[["HomeTeamWin"]].copy()

In [15]:
log_reg = LogisticRegression()
log_reg.fit(X, Y.values.ravel())

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [16]:
list(zip(cols, log_reg.coef_[0]))

[('HomePrev25FenwickForPerHour', np.float64(0.1035523953667874)),
 ('AwayPrev25FenwickForPerHour', np.float64(-0.1025966114292798)),
 ('HomePrev25FenwickAgainstPerHour', np.float64(-0.15003063964910018)),
 ('AwayPrev25FenwickAgainstPerHour', np.float64(0.12092185808289044)),
 ('HomePrev25ShotsFor5v4PerHour', np.float64(0.04371643495619142)),
 ('AwayPrev25ShotsFor5v4PerHour', np.float64(-0.0471648882413947)),
 ('HomePrev25ShotsAgainst4v5PerHour', np.float64(-0.008686878265346587)),
 ('AwayPrev25ShotsAgainst4v5PerHour', np.float64(0.035609897824414795)),
 ('HomePrev25ShootingPercentage', np.float64(0.04056951619886304)),
 ('AwayPrev25ShootingPercentage', np.float64(-0.06173745583661633)),
 ('HomeDilutedSavePct', np.float64(0.112436533385258)),
 ('AwayDilutedSavePct', np.float64(-0.1286673024372822))]

In [17]:
cols_pip = ["HomePrev25FenwickForPerHour",
        "AwayPrev25FenwickForPerHour",
        "HomePrev25FenwickAgainstPerHour", 
        "AwayPrev25FenwickAgainstPerHour"]
X_Pip = df_std[cols_pip].copy()
log_reg.fit(X_Pip, Y.values.ravel())

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [18]:
list(zip(cols_pip, log_reg.coef_[0]))

[('HomePrev25FenwickForPerHour', np.float64(0.1217403474451301)),
 ('AwayPrev25FenwickForPerHour', np.float64(-0.12292205744998448)),
 ('HomePrev25FenwickAgainstPerHour', np.float64(-0.15762394716559122)),
 ('AwayPrev25FenwickAgainstPerHour', np.float64(0.13635005384430976))]

## Logistic Regression Grid Search

In [19]:
from sklearn.model_selection import cross_val_score, GridSearchCV

In [20]:
log_reg = LogisticRegression()
scores = cross_val_score(log_reg, X, Y.values.ravel(), cv=5, scoring="neg_brier_score")
print(scores)

[-0.24008909 -0.24275874 -0.24265521 -0.23810482 -0.23765567]


In [21]:
gs = GridSearchCV(log_reg, {
    'C': [0.01, 0.1, 1, 10, 100],
    'fit_intercept': [True, False],
    'solver': ["liblinear", "newton-cholesky", "newton-cg", "saga"]
}, cv=5)

In [22]:
gs.fit(X, Y.values.ravel())

0,1,2
,estimator,LogisticRegression()
,param_grid,"{'C': [0.01, 0.1, ...], 'fit_intercept': [True, False], 'solver': ['liblinear', 'newton-cholesky', ...]}"
,scoring,
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.1
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'newton-cholesky'
,max_iter,100


In [23]:
gs.best_score_

np.float64(0.5794439428359188)

In [24]:
gs.best_estimator_.coef_

array([[ 0.10322895, -0.10251913, -0.14953365,  0.12081983,  0.04364969,
        -0.04724276, -0.00872349,  0.03571123,  0.04045621, -0.06154613,
         0.11216729, -0.12844073]])