In [1]:
from nba_api.stats.endpoints import leaguegamefinder
import pandas as pd
import numpy as np
import joblib as jl
import math
import time

In [2]:
years = range(10, 25)
seasons = [f"20{i:02d}-{(i+1)%100:02d}" for i in years]

to_merge = []
for s in seasons:
    time.sleep(1)
    finder = leaguegamefinder.LeagueGameFinder(
        season_nullable=s,
        season_type_nullable="Regular Season",
        league_id_nullable='00'
    )

    game_df = finder.get_data_frames()[0]
    to_merge.append(game_df)

games_df = pd.concat(to_merge)

In [3]:
def merge_games(df):
    df['HOME'] = df['MATCHUP'].str.contains('vs.') 
    home_df = df[df['HOME']].copy()
    away_df = df[~df['HOME']].copy()

    merged = home_df.merge(
        away_df,
        on='GAME_ID',
        suffixes=('_home', '_away')
    )

    merged = merged[[
        'SEASON_ID_home', 'GAME_DATE_home', 'TEAM_ABBREVIATION_home', 'TEAM_ABBREVIATION_away',
        'PTS_home', 'PTS_away'
    ]].rename(columns={
        'SEASON_ID_home' : 'sid',
        'GAME_DATE_home': 'date',
        'TEAM_ABBREVIATION_home': 'team_home',
        'TEAM_ABBREVIATION_away': 'team_away',
        'PTS_home': 'score_home',
        'PTS_away': 'score_away'
    })

    merged["win_home"] = (merged["score_home"] > merged["score_away"]).astype(int)

    merged['date'] = pd.to_datetime(merged['date'])
    merged = merged.sort_values('date').reset_index(drop=True)
    return merged

games = merge_games(games_df)
games


Unnamed: 0,sid,date,team_home,team_away,score_home,score_away,win_home
0,22010,2010-10-26,LAL,HOU,112,110,1
1,22010,2010-10-26,BOS,MIA,88,80,1
2,22010,2010-10-26,POR,PHX,106,92,1
3,22010,2010-10-27,MIN,SAC,116,117,0
4,22010,2010-10-27,CLE,BOS,95,87,1
...,...,...,...,...,...,...,...
17878,22024,2025-04-13,MIN,UTA,116,105,1
17879,22024,2025-04-13,POR,LAL,109,81,1
17880,22024,2025-04-13,CLE,IND,118,126,0
17881,22024,2025-04-13,MIL,DET,140,133,1


In [4]:
import numpy as np

p_home = (games["win_home"] == 1).mean()
HCA = 400 * math.log10(p_home / (1 - p_home))
K = 25


def update_elo(games_df, K=K, HCA=HCA):
    elo = {team: 1500 for team in pd.unique(games_df[['team_home', 'team_away']].values.ravel())}
    records = []

    prev_sid = object() 
    for idx, row in games_df.iterrows():
        if row['sid'] != prev_sid:
            for team, rating in elo.items():
                elo[team] = 0.75 * rating + 0.25 * 1500

        home, away = row['team_home'], row['team_away']
        score_home, score_away = row['score_home'], row['score_away']
        win_home = 1 if score_home > score_away else 0

        Ra, Rb = elo[home], elo[away]
        Ea = 1 / (1 + 10 ** ((Rb - Ra + HCA) / 400))
        margin = abs(score_home - score_away)
        mult = np.log1p(margin) * 2.2 / ((Ra - Rb) * 0.001 + 2.2)
        delta = K * mult * (win_home - Ea)

        elo[home] += delta
        elo[away] -= delta

        games_df.loc[idx, 'elo_pre_home'] = Ra
        games_df.loc[idx, 'elo_pre_away'] = Rb
        games_df.loc[idx, 'elo_post_home'] = elo[home]
        games_df.loc[idx, 'elo_post_away'] = elo[away]

        prev_sid = row['sid']

    games_df['elo_diff'] = games_df['elo_pre_home'] - games_df['elo_pre_away'] + HCA

    return elo

elos = update_elo(games)

In [5]:
WIN_WINDOW = 10

def add_rest_and_rolling_winpct(games: pd.DataFrame, win_window: int = WIN_WINDOW,
                                fill_rest: int = 7) -> pd.DataFrame:
    
    df = games.copy()
    df["date"] = pd.to_datetime(df["date"])

    if "win_home" not in df.columns:
        if {"score_home", "score_away"}.issubset(df.columns):
            df["win_home"] = (df["score_home"] > df["score_away"]).astype(int)
        else:
            raise ValueError("Need win_home or scores to compute wins.")

    logs = []
    for _, r in df.sort_values("date").iterrows():
        logs.append({"date": r["date"], "team": r["team_home"], "win": r["win_home"]})
        logs.append({"date": r["date"], "team": r["team_away"], "win": 1 - r["win_home"]})

    logs = pd.DataFrame(logs).sort_values(["team", "date"]).reset_index(drop=True)

    logs["prev_date"] = logs.groupby("team")["date"].shift(1)
    logs["rest_days"] = ((logs["date"] - logs["prev_date"]).dt.days - 1).clip(0.0)

    logs["prev_win"] = logs.groupby("team")["win"].shift(1)
    logs["rw_pct"] = (logs
                      .groupby("team")["prev_win"]
                      .rolling(win_window, min_periods=1)
                      .mean()
                      .reset_index(level=0, drop=True))

    h = logs.rename(columns={
        "team": "team_home",
        "rest_days": "rest_h",
        "rw_pct": "rw_pct_h"
    })[["team_home", "date", "rest_h", "rw_pct_h"]]

    a = logs.rename(columns={
        "team": "team_away",
        "rest_days": "rest_a",
        "rw_pct": "rw_pct_a"
    })[["team_away", "date", "rest_a", "rw_pct_a"]]

    out = (df.merge(h, on=["team_home", "date"], how="left")
             .merge(a, on=["team_away", "date"], how="left"))

    out["rest_h"] = out["rest_h"].fillna(fill_rest)
    out["rest_a"] = out["rest_a"].fillna(fill_rest)
    out["rw_pct_h"] = out["rw_pct_h"].fillna(0.5)
    out["rw_pct_a"] = out["rw_pct_a"].fillna(0.5)

    out["rest_diff"]   = out["rest_h"]   - out["rest_a"]
    out["rw_pct_diff"] = out["rw_pct_h"] - out["rw_pct_a"]

    return out

games_d = add_rest_and_rolling_winpct(games)
games_d[(games_d['team_home'] == 'PHX') | (games_d['team_away'] == 'PHX')]

Unnamed: 0,sid,date,team_home,team_away,score_home,score_away,win_home,elo_pre_home,elo_pre_away,elo_post_home,elo_post_away,elo_diff,rest_h,rw_pct_h,rest_a,rw_pct_a,rest_diff,rw_pct_diff
2,22010,2010-10-26,POR,PHX,106,92,1,1500.000000,1500.000000,1538.872476,1461.127524,51.926381,7.0,0.500000,7.0,0.500000,0.0,0.000000
16,22010,2010-10-28,UTA,PHX,94,110,0,1454.991815,1461.127524,1425.357959,1490.761381,45.790672,0.0,0.000000,1.0,0.000000,-1.0,0.000000
22,22010,2010-10-29,PHX,LAL,106,114,0,1490.761381,1515.769937,1469.033409,1537.497908,26.917825,0.0,0.500000,2.0,1.000000,-2.0,-0.500000
56,22010,2010-11-03,PHX,SAS,110,112,0,1469.033409,1532.381778,1459.420122,1541.995065,-11.421988,4.0,0.333333,1.0,0.666667,3.0,-0.333333
70,22010,2010-11-05,PHX,MEM,123,118,1,1459.420122,1466.535939,1485.671886,1440.284175,44.810564,1.0,0.250000,1.0,0.400000,0.0,-0.150000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17819,22024,2025-04-06,NYK,PHX,112,98,1,1562.461701,1457.320499,1589.858448,1429.923752,157.067583,0.0,0.600000,1.0,0.400000,-1.0,0.200000
17834,22024,2025-04-08,PHX,GSW,95,133,0,1429.923752,1645.134982,1411.968911,1663.089823,-163.284849,1.0,0.400000,1.0,0.700000,0.0,-0.300000
17846,22024,2025-04-09,PHX,OKC,112,125,0,1411.968911,1825.143414,1406.743714,1830.368610,-361.248122,0.0,0.300000,0.0,0.800000,0.0,-0.500000
17862,22024,2025-04-11,PHX,SAS,117,98,1,1406.743714,1394.241475,1448.185854,1352.799335,64.428620,1.0,0.200000,1.0,0.200000,0.0,0.000000


In [55]:
from xgboost import XGBClassifier
from xgboost.callback import EarlyStopping
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
from sklearn.model_selection import train_test_split

FEATS = ["elo_diff", "rest_diff", "rw_pct_diff"]
X = games_d[FEATS].astype(float)
y = games_d["win_home"].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

xgb = XGBClassifier(
    n_estimators=10_000,       
    learning_rate=0.01,
    max_depth=2,
    subsample=0.5,
    colsample_bytree=0.5,
    min_child_weight=3,
    reg_lambda=1.0,
    objective="binary:logistic",
    eval_metric="logloss",      
    tree_method="hist",
    random_state=42,
)

xgb.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=False,
)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.5
,device,
,early_stopping_rounds,
,enable_categorical,False


In [56]:
probs = xgb.predict_proba(X_test)[:, 1]
pred_classes = (probs >= 0.5).astype(int)

acc = accuracy_score(y_test, pred_classes)
auc = roc_auc_score(y_test, probs)
ll  = log_loss(y_test, probs)

print(f"Model accuracy: {acc:.2%}")
print(f"Model AUC:      {auc:.3f}")
print(f"Model logloss:  {ll:.4f}")
print(f"Best trees used: {xgb.best_iteration+1 if hasattr(xgb, 'best_iteration') else xgb.n_estimators}")

# --------------------------
# Feature importance (quick sanity check)
# --------------------------
imp = pd.Series(xgb.feature_importances_, index=FEATS).sort_values(ascending=False)
print("\nFeature importances:")
print(imp.round(3))

# --------------------------
# Full-dataset probabilities (if you want them for today’s table)
# --------------------------
games_d["p_model_home_xgb"] = xgb.predict_proba(X)[:, 1]

Model accuracy: 65.31%
Model AUC:      0.696
Model logloss:  0.6219
Best trees used: 10000

Feature importances:
elo_diff       0.594
rw_pct_diff    0.285
rest_diff      0.121
dtype: float32
