In [1]:
import pandas as pd
import duckdb

### Data

In [2]:
db = duckdb.connect('./data/data.db')
Xy = db.sql(
"""
SELECT
    * Exclude(Home_Score, Away_Score)
FROM game_features
"""
).df()
db.close()

### Model

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

In [4]:
X = Xy.drop(columns=['Date', 'Year', 'Home_Team', 'Away_Team', 'Home_Won'])
y = Xy['Home_Won']

In [5]:
def additive_smoothing_win_rate_diff(X, k=2):
    htw = (X["Home_Wins"] + k) / (X["Home_Games_Played"] + 2*k)
    atw = (X["Away_Wins"] + k) / (X["Away_Games_Played"] + 2*k)
    return (htw - atw).values.reshape(-1, 1)

def weighted_win_rate_diff(X, C=4, max_rank=32):
    X_ = X.copy()
    X_["Home_Raw_Win_Pct"] = (X_["Home_Wins"] / X_["Home_Games_Played"]).fillna(0.5)    
    X_["Away_Raw_Win_Pct"] = (X_["Away_Wins"] / X_["Away_Games_Played"]).fillna(0.5)

    hps = 1 - (X_["Home_Rank"] - 1) / (max_rank - 1)
    aps = 1 - (X_["Away_Rank"] - 1) / (max_rank - 1)
    
    hcw = X_["Home_Games_Played"] / (X_["Home_Games_Played"] + C)
    acw = X_["Away_Games_Played"] / (X_["Away_Games_Played"] + C)
    
    hwr = (hcw * X_["Home_Raw_Win_Pct"]) + ((1 - hcw) * hps)
    awr = (acw * X_["Away_Raw_Win_Pct"]) + ((1 - acw) * aps)
    
    return (hwr - awr).values.reshape(-1, 1)

In [6]:
def create_model(
    penalty='l2',
    C=1.0,
    tol=1e-4,
    solver='lbfgs',
    max_iter=1000,
    wr_C=2,
    random_state=42,
    **kwargs
):

    # Define the column transformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('passthrough', 'passthrough', ["Is_Neutral", "Spread"]),
            # ('Diff_Rank', 
            #     FunctionTransformer(
            #         lambda X_: (X_['Home_Rank'] - X_['Away_Rank']).values.reshape(-1, 1)
            #     ), 
            #     ['Home_Rank', 'Away_Rank']
            # ),
            ('Diff_Days_Rest', 
                FunctionTransformer(
                    lambda X_: (X_['Home_Days_Since_Last_Game'] - X_['Away_Days_Since_Last_Game']).values.reshape(-1, 1)
                ), 
                ['Home_Days_Since_Last_Game', 'Away_Days_Since_Last_Game']
            ),
            ('Win_Rate_Diff', 
                FunctionTransformer(lambda X_: weighted_win_rate_diff(X_, C=wr_C)), 
                ['Home_Wins', 'Home_Games_Played', 'Away_Wins', 'Away_Games_Played', 'Home_Rank', 'Away_Rank']
            ),
            ('Season_Stage', 
                Pipeline([
                    ("transform", FunctionTransformer(
                        lambda X_: X_["Week"].map(lambda x: 0 if x <= 6 else 1 if x <= 12 else 2).values.reshape(-1, 1)
                        )
                    ),
                    ("encode", OneHotEncoder())
                ]), 
                ['Week']
            ),
        ]
    )

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('clf', LogisticRegression(
            penalty=penalty,
            C=C, tol=tol,
            solver=solver,
            max_iter=max_iter,
            random_state=random_state,
            **kwargs
        ))
    ])
    return pipeline

In [7]:
# model =  create_model()
# pd.DataFrame(
#     model.named_steps['preprocessor'].fit_transform(X, y),
#     columns=["Is_Neutral", "Rank_Age", "Diff_Rank", "Diff_Days_Rest", "Win_Rate_Diff", "1", "2", "3"]
# )mk

In [8]:
import optuna
from sklearn.model_selection import StratifiedKFold, cross_val_score
import numpy as np

optuna.logging.set_verbosity(optuna.logging.WARNING)

def objective(trial):
    pipeline = create_model(
        C=trial.suggest_float('C', 0.0001, 10.0, log=True),
        tol=trial.suggest_float('tol', 1e-6, 1e-2, log=True),
        wr_C=trial.suggest_int("wr_C", 1, 18, step=1)
    )

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y, cv=cv, scoring='neg_log_loss')
    return -scores.mean()

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=250, show_progress_bar=True)

print("Best trial:")
print(study.best_trial)

  0%|          | 0/250 [00:00<?, ?it/s]

Best trial:
FrozenTrial(number=104, state=1, values=[0.610355907440373], datetime_start=datetime.datetime(2025, 7, 13, 14, 2, 38, 704513), datetime_complete=datetime.datetime(2025, 7, 13, 14, 2, 38, 761115), params={'C': 0.010463208808686153, 'tol': 0.0021796347772512804, 'wr_C': 14}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'C': FloatDistribution(high=10.0, log=True, low=0.0001, step=None), 'tol': FloatDistribution(high=0.01, log=True, low=1e-06, step=None), 'wr_C': IntDistribution(high=18, log=False, low=1, step=1)}, trial_id=104, value=None)


In [9]:
# Get the best parameters from the Optuna study
best_params = study.best_trial.params

# Create the model using the optimal parameters
model = create_model(**best_params)

# Train the model on all data
model.fit(X, y)

0,1,2
,steps,"[('preprocessor', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('passthrough', ...), ('Diff_Days_Rest', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,func,<function cre...t 0x1242b6c00>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,func,<function cre...t 0x1242b7240>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,func,<function cre...t 0x1242b6ca0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0021796347772512804
,C,0.010463208808686153
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,1000


In [10]:
X_test = Xy.copy().drop(columns=["Home_Team", "Away_Team", "Year"]) # [ (Xy.Year == 2024) & (Xy.Week == 1) ]
X_test["p"] = model.predict_proba(X_test.drop(columns=["Home_Won"]))[:, 1]
X_test["Home_Won-p"] = X_test["p"].map(lambda p: 1 if p >= 0.5 else 0)
X_test["Home_Won-home"] = 1
X_test

Unnamed: 0,Week,Date,Is_Neutral,Spread,Home_Rank,Away_Rank,Home_Days_Since_Last_Game,Away_Days_Since_Last_Game,Home_Games_Played,Away_Games_Played,Home_Wins,Away_Wins,Home_Losses,Away_Losses,Home_Won,p,Home_Won-p,Home_Won-home
0,1,2013-09-08,0,10.0,22,1,14,14,0,0,0.0,0.0,0.0,0.0,0,0.162455,0,1
1,1,2013-09-08,0,3.0,29,6,14,14,0,0,0.0,0.0,0.0,0.0,0,0.354910,0,1
2,1,2013-09-08,0,-3.0,18,15,14,14,0,0,0.0,0.0,0.0,0.0,1,0.576293,1,1
3,1,2013-09-08,0,-2.5,25,20,14,14,0,0,0.0,0.0,0.0,0.0,0,0.557735,1,1
4,1,2013-09-08,0,-3.5,12,10,14,14,0,0,0.0,0.0,0.0,0.0,1,0.594511,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3130,18,2025-01-05,0,3.0,31,2,8,7,16,16,3.0,13.0,13.0,3.0,1,0.381244,0,1
3131,18,2025-01-05,0,-3.0,5,30,7,7,16,16,13.0,3.0,3.0,13.0,1,0.606591,1,1
3132,18,2025-01-04,0,2.5,10,11,10,7,16,16,10.0,8.0,6.0,8.0,0,0.403051,0,1
3133,18,2025-01-05,0,-15.0,8,25,7,7,16,16,9.0,5.0,7.0,11.0,1,0.901748,1,1


In [11]:
(X_test["Home_Won"] == X_test["Home_Won-p"]).sum() / len(X_test)

np.float64(0.6650717703349283)

### Save model

In [12]:
import cloudpickle as pickle

with open('./models/lr_full.pkl', 'wb') as m:
    pickle.dump(model, m)