In [2]:
"""
Downloading data
"""
from nba_api.stats.endpoints import leaguegamefinder
import pandas as pd
import numpy as np
import time

years = range(10, 25)
seasons = [f"20{i:02d}-{(i+1)%100:02d}" for i in years]

to_merge = []
for s in seasons:
    time.sleep(1)
    finder = leaguegamefinder.LeagueGameFinder(
        season_nullable=s,
        season_type_nullable="Regular Season",
        league_id_nullable='00'
    )

    game_df = finder.get_data_frames()[0]
    to_merge.append(game_df)

games_df = pd.concat(to_merge)

In [3]:
"""
Merging games into single rows
"""

def merge_games(df):
    df['HOME'] = df['MATCHUP'].str.contains('vs.') 
    home_df = df[df['HOME']].copy()
    away_df = df[~df['HOME']].copy()

    merged = home_df.merge(
        away_df,
        on='GAME_ID',
        suffixes=('_home', '_away')
    )

    merged = merged[[
        'SEASON_ID_home', 'GAME_DATE_home', 'TEAM_ABBREVIATION_home', 'TEAM_ABBREVIATION_away',
        'PTS_home', 'PTS_away'
    ]].rename(columns={
        'SEASON_ID_home' : 'sid',
        'GAME_DATE_home': 'date',
        'TEAM_ABBREVIATION_home': 'team_home',
        'TEAM_ABBREVIATION_away': 'team_away',
        'PTS_home': 'score_home',
        'PTS_away': 'score_away'
    })

    merged["win_home"] = (merged["score_home"] > merged["score_away"]).astype(int)

    merged['date'] = pd.to_datetime(merged['date'])
    merged = merged.sort_values('date').reset_index(drop=True)
    return merged

games = merge_games(games_df)
games


Unnamed: 0,sid,date,team_home,team_away,score_home,score_away,win_home
0,22010,2010-10-26,LAL,HOU,112,110,1
1,22010,2010-10-26,BOS,MIA,88,80,1
2,22010,2010-10-26,POR,PHX,106,92,1
3,22010,2010-10-27,MIN,SAC,116,117,0
4,22010,2010-10-27,CLE,BOS,95,87,1
...,...,...,...,...,...,...,...
17878,22024,2025-04-13,MIN,UTA,116,105,1
17879,22024,2025-04-13,POR,LAL,109,81,1
17880,22024,2025-04-13,CLE,IND,118,126,0
17881,22024,2025-04-13,MIL,DET,140,133,1


In [73]:
"""
Updating Elos
"""

import math

p_home = (games["win_home"] == 1).mean()
HCA = 400 * math.log10(p_home / (1 - p_home))
K = 20


def update_elo(games_df, K=K, HCA=HCA):
    elo = {team: 1500 for team in pd.unique(games_df[['team_home', 'team_away']].values.ravel())}
    records = []

    prev_sid = object() 
    for idx, row in games_df.iterrows():
        if row['sid'] != prev_sid:
            for team, rating in elo.items():
                elo[team] = 0.5 * rating + 0.5 * 1500

        home, away = row['team_home'], row['team_away']
        score_home, score_away = row['score_home'], row['score_away']
        win_home = 1 if score_home > score_away else 0

        Ra, Rb = elo[home], elo[away]
        Ea = 1 / (1 + 10 ** ((Rb - Ra + HCA) / 400))
        margin = abs(score_home - score_away)
        mult = np.log1p(margin) * 2.2 / ((Ra - Rb) * 0.001 + 2.2)
        delta = K * mult * (win_home - Ea)

        elo[home] += delta
        elo[away] -= delta

        games_df.loc[idx, 'elo_pre_home'] = Ra
        games_df.loc[idx, 'elo_pre_away'] = Rb
        games_df.loc[idx, 'elo_post_home'] = elo[home]
        games_df.loc[idx, 'elo_post_away'] = elo[away]

        prev_sid = row['sid']

    games_df['elo_diff'] = games_df['elo_pre_home'] - games_df['elo_pre_away'] + HCA

    return elo

elos = update_elo(games)

In [74]:
"""
Adding Rest Days, Rolling Win Pct, and Streak
"""

WIN_WINDOW = 10

def add_rest_and_rolling_winpct(games: pd.DataFrame, win_window: int = WIN_WINDOW,
                                fill_rest: int = 7) -> pd.DataFrame:
    df = games.copy()
    df["date"] = pd.to_datetime(df["date"])

    if "win_home" not in df.columns:
        if {"score_home", "score_away"}.issubset(df.columns):
            df["win_home"] = (df["score_home"] > df["score_away"]).astype(int)
        else:
            raise ValueError("Need win_home or scores to compute wins.")

    logs = []
    for _, r in df.sort_values("date").iterrows():
        logs.append({"date": r["date"], "team": r["team_home"], "win": r["win_home"]})
        logs.append({"date": r["date"], "team": r["team_away"], "win": 1 - r["win_home"]})
    logs = pd.DataFrame(logs).sort_values(["team", "date"]).reset_index(drop=True)

    logs["prev_date"] = logs.groupby("team")["date"].shift(1)
    logs["rest_days"] = ((logs["date"] - logs["prev_date"]).dt.days - 1).clip(0.0)

    logs["prev_win"] = logs.groupby("team")["win"].shift(1)
    logs["rw_pct"] = (logs
                      .groupby("team")["prev_win"]
                      .rolling(win_window, min_periods=1)
                      .mean()
                      .reset_index(level=0, drop=True))

    logs["prev_sign"] = logs["prev_win"].map({1: 1, 0: -1})
    def _streak_from_prev(g):
        cur = 0
        out = []
        for s in g["prev_sign"]:
            if pd.isna(s):
                cur = 0
            else:
                s = int(s)
                if s > 0:
                    cur = cur + 1 if cur > 0 else 1
                else:
                    cur = cur - 1 if cur < 0 else -1
            out.append(cur)
        g = g.copy()
        g["streak"] = out
        return g
    logs = logs.groupby("team", group_keys=False).apply(_streak_from_prev)

    # Spread into home/away features
    h = logs.rename(columns={
        "team": "team_home",
        "rest_days": "rest_h",
        "rw_pct": "rw_pct_h",
        "streak": "streak_h",
    })[["team_home", "date", "rest_h", "rw_pct_h", "streak_h"]]

    a = logs.rename(columns={
        "team": "team_away",
        "rest_days": "rest_a",
        "rw_pct": "rw_pct_a",
        "streak": "streak_a",
    })[["team_away", "date", "rest_a", "rw_pct_a", "streak_a"]]

    out = (df.merge(h, on=["team_home", "date"], how="left")
             .merge(a, on=["team_away", "date"], how="left"))

    out["rest_h"]   = out["rest_h"].fillna(fill_rest)
    out["rest_a"]   = out["rest_a"].fillna(fill_rest)
    out["rw_pct_h"] = out["rw_pct_h"].fillna(0.5)
    out["rw_pct_a"] = out["rw_pct_a"].fillna(0.5)
    out["streak_h"] = out["streak_h"].fillna(0).astype(int)
    out["streak_a"] = out["streak_a"].fillna(0).astype(int)

    out["rest_diff"] = out["rest_h"]   - out["rest_a"]
    out["rw_pct_diff"] = out["rw_pct_h"] - out["rw_pct_a"]
    out["streak_diff"] = out["streak_h"] - out["streak_a"]

    return out

games_d = add_rest_and_rolling_winpct(games)
games_d[(games_d['team_home'] == 'PHX') | (games_d['team_away'] == 'PHX')]


  logs = logs.groupby("team", group_keys=False).apply(_streak_from_prev)


Unnamed: 0,sid,date,team_home,team_away,score_home,score_away,win_home,elo_pre_home,elo_pre_away,elo_post_home,...,elo_diff,rest_h,rw_pct_h,streak_h,rest_a,rw_pct_a,streak_a,rest_diff,rw_pct_diff,streak_diff
2,22010,2010-10-26,POR,PHX,106,92,1,1500.000000,1500.000000,1531.097981,...,51.926381,7.0,0.500000,0,7.0,0.500000,0,0.0,0.000000,0
16,22010,2010-10-28,UTA,PHX,94,110,0,1463.993452,1468.902019,1440.202020,...,47.017814,0.0,0.000000,-1,1.0,0.000000,-1,-1.0,0.000000,0
22,22010,2010-10-29,PHX,LAL,106,114,0,1492.693451,1512.615949,1475.041481,...,32.003883,0.0,0.500000,1,2.0,1.000000,1,-2.0,-0.500000,0
56,22010,2010-11-03,PHX,SAS,110,112,0,1475.041481,1526.614436,1467.047174,...,0.353426,4.0,0.333333,-1,1.0,0.666667,1,3.0,-0.333333,-2
70,22010,2010-11-05,PHX,MEM,123,118,1,1467.047174,1471.084793,1487.864388,...,47.888762,1.0,0.250000,-2,1.0,0.400000,-2,0.0,-0.150000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17819,22024,2025-04-06,NYK,PHX,112,98,1,1566.311013,1454.864077,1587.712935,...,163.373317,0.0,0.600000,1,1.0,0.400000,-5,-1.0,0.200000,6
17834,22024,2025-04-08,PHX,GSW,95,133,0,1433.462156,1636.224957,1418.326252,...,-150.836421,1.0,0.400000,-6,1.0,0.700000,-1,0.0,-0.300000,-5
17846,22024,2025-04-09,PHX,OKC,112,125,0,1418.326252,1819.583488,1413.899148,...,-349.330855,0.0,0.300000,-7,0.0,0.800000,1,0.0,-0.500000,-8
17862,22024,2025-04-11,PHX,SAS,117,98,1,1413.899148,1397.743198,1446.689096,...,68.082331,1.0,0.200000,-8,1.0,0.200000,1,0.0,0.000000,-9


In [None]:
"""
Training Model
"""
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

FEATS = ["elo_diff", "rest_diff", "rw_pct_diff", "streak_diff"]
X = games_d[FEATS]
y = games_d['win_home']

model = Pipeline([
    ("poly", PolynomialFeatures(degree=1, include_bias=False, interaction_only=True)),
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(C=10.0, penalty="l2", solver="lbfgs", max_iter=2000))
])
model.fit(X, y)
#print("Model coefficients:", model.coef_, "intercept:", model.intercept_)

In [None]:
"""
Model Evaluation
"""

from sklearn.metrics import (
    log_loss, brier_score_loss, roc_auc_score, accuracy_score,
    confusion_matrix
)

def evaluate_model_on_holdout(
    model,
    df,
    feature_cols,
    label_col="win_home",
    date_col="date",
    holdout_frac=0.2, 
):
    # 1) Time-based split
    df = df.sort_values(date_col).copy()
    cut = df[date_col].quantile(1 - holdout_frac)
    train = df[df[date_col] < cut]
    test  = df[df[date_col] >= cut]

    X_tr, y_tr = train[feature_cols], train[label_col].astype(int)
    X_te, y_te = test[feature_cols],  test[label_col].astype(int)

    # 2) Fit and predict (pipeline handles scaling etc.)
    model.fit(X_tr, y_tr)
    p_te = model.predict_proba(X_te)[:, 1]
    yhat  = (p_te >= 0.5).astype(int)

    # 3) Core metrics
    metrics = {
        "n_train": int(len(train)),
        "n_test": int(len(test)),
        "log_loss": float(log_loss(y_te, p_te)),
        "brier": float(brier_score_loss(y_te, p_te)),
        "roc_auc": float(roc_auc_score(y_te, p_te)),
        "accuracy@0.5": float(accuracy_score(y_te, yhat)),
    }

    # 4) Confusion matrix @ 0.5
    tn, fp, fn, tp = confusion_matrix(y_te, yhat).ravel()
    metrics.update({"TP": int(tp), "FP": int(fp), "TN": int(tn), "FN": int(fn)})

    # 5) Calibration summary (10 bins)
    bins = pd.qcut(p_te, q=10, duplicates="drop")
    cal = (pd.DataFrame({"p": p_te, "y": y_te, "bin": bins})
             .groupby("bin", observed=True).agg(p_mean=("p","mean"), y_rate=("y","mean"), n=("y","size"))
             .reset_index(drop=True))
    cal["abs_err"] = (cal["p_mean"] - cal["y_rate"]).abs()
    metrics["calibration_mae(10-bins)"] = float(np.average(cal["abs_err"], weights=cal["n"]))


    return metrics, cal, test.assign(pred_prob=p_te)

feature_cols = ["elo_diff", "rest_diff", "rw_pct_diff", "streak_diff"]
metrics, calibration_table, test_with_preds = evaluate_model_on_holdout(
    model, games_d, feature_cols,
    label_col="win_home", date_col="date",)
print(metrics)
calibration_table


{'n_train': 14304, 'n_test': 3579, 'log_loss': 0.6250410570635226, 'brier': 0.2180246562321294, 'roc_auc': 0.6962689250214991, 'accuracy@0.5': 0.6457110924839341, 'TP': 1584, 'FP': 870, 'TN': 727, 'FN': 398, 'calibration_mae(10-bins)': 0.03407647120749428}


Unnamed: 0,p_mean,y_rate,n,abs_err
0,0.268907,0.231844,358,0.037063
1,0.365799,0.349162,358,0.016637
2,0.444922,0.455307,358,0.010385
3,0.517418,0.553073,358,0.035654
4,0.560343,0.519553,358,0.04079
5,0.605637,0.565826,357,0.039811
6,0.65601,0.625698,358,0.030312
7,0.71925,0.650838,358,0.068412
8,0.769691,0.723464,358,0.046227
9,0.847639,0.863128,358,0.015489


In [None]:
"""
Secondary Model Evaluation
"""

correct = (games['elo_pre_home'] > games['elo_pre_away']) == (games['win_home'] == 1)
elo_accuracy = correct.mean()

home_accuracy = games['win_home'].mean()

preds = model.predict_proba(X)[:, 1]
pred_classes = (preds >= 0.5).astype(int)

acc = accuracy_score(y, pred_classes)
auc = roc_auc_score(y, preds)
ll  = log_loss(y, preds)

print(f"Home predicted winner correctly {home_accuracy:.2%} of the time")
print(f"Elo predicted winner correctly {elo_accuracy:.2%} of the time")
print(f"Model accuracy: {acc:.2%}")
print(f"Model AUC: {auc:.3f}")
print(f"Model logloss: {ll:.4f}")

Home predicted winner correctly 57.42% of the time
Elo predicted winner correctly 63.76% of the time
Model accuracy: 65.87%
Model AUC: 0.710
Model logloss: 0.6121


In [55]:
"""
Saving Initial Elo states
"""

from importlib import reload
import utils 
from datetime import datetime
reload(utils)
from utils import SeasonState, save_state

elos_25 = {}

for team, rating in elos.items():
    elos_25[team] = 0.5 * rating + 0.5 * 1500

elo_params = {
    'K' : K,
    'HCA' : HCA
}

state = SeasonState(
    season="2025-26",
    last_updated=datetime.now(),
    elo=elos_25,
    params={"K": K, "HCA": HCA, "win_window": WIN_WINDOW}
)

save_state(state, "../states/2025-26_season_state.pkl")

In [64]:
"""
Saving Model
"""
import joblib as jl

jl.dump(model, "../models/elo_model_1_(10-23-25).pkl")

['../models/elo_model_1_(10-23-25).pkl']