In [1]:
from nba_api.stats.endpoints import leaguegamefinder
import pandas as pd
import numpy as np
import joblib as jl
import math

In [7]:
years = range(10, 25)
seasons = [f"20{i:02d}-{(i+1)%100:02d}" for i in years]

to_merge = []
for s in seasons:
    finder = leaguegamefinder.LeagueGameFinder(
        season_nullable=s,
        season_type_nullable="Regular Season",
        league_id_nullable='00'
    )

    games = finder.get_data_frames()[0]
    to_merge.append(games)

games = pd.concat(to_merge)

In [8]:
def merge_games(df):
    df['HOME'] = df['MATCHUP'].str.contains('vs.') 
    home_df = df[df['HOME']].copy()
    away_df = df[~df['HOME']].copy()

    merged = home_df.merge(
        away_df,
        on='GAME_ID',
        suffixes=('_home', '_away')
    )
    
    cols = ['TEAM_ABBREVIATION', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT',
            'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
            'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS']
    
    games = merged[['SEASON_ID_home', 'GAME_DATE_home'] + [f'{col}_home' for col in cols] + [f'{col}_away' for col in cols]].rename(columns={
        'SEASON_ID_home' : 'sid',
        'GAME_DATE_home': 'date',
        'TEAM_ABBREVIATION_home': 'team_home',
        'TEAM_ABBREVIATION_away': 'team_away',
        'PTS_home': 'score_home',
        'PTS_away': 'score_away'
    })

    games['date'] = pd.to_datetime(games['date'])
    games = games.sort_values('date').reset_index(drop=True)
    return games

merged_games = merge_games(games)
merged_games


Unnamed: 0,sid,date,team_home,MIN_home,score_home,FGM_home,FGA_home,FG_PCT_home,FG3M_home,FG3A_home,...,FT_PCT_away,OREB_away,DREB_away,REB_away,AST_away,STL_away,BLK_away,TOV_away,PF_away,PLUS_MINUS_away
0,22010,2010-10-26,LAL,240,112,40,96,0.417,9,21,...,0.929,16,37,53,25,6,7,20,25,-2.0
1,22010,2010-10-26,BOS,239,88,32,69,0.464,8,16,...,0.720,11,28,39,15,10,6,17,21,-8.0
2,22010,2010-10-26,POR,240,106,43,93,0.462,10,20,...,0.688,7,23,30,15,3,4,19,19,-14.0
3,22010,2010-10-27,MIN,240,116,43,91,0.473,8,18,...,0.745,14,31,45,25,8,4,13,24,1.0
4,22010,2010-10-27,CLE,240,95,36,81,0.444,6,20,...,0.762,6,32,38,24,8,1,19,24,-8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17878,22024,2025-04-13,MIN,240,116,46,98,0.469,13,43,...,0.818,9,31,40,24,6,5,13,15,-11.0
17879,22024,2025-04-13,POR,240,109,42,96,0.438,15,42,...,0.714,13,29,42,21,9,11,20,21,-28.0
17880,22024,2025-04-13,CLE,292,118,44,115,0.383,18,60,...,0.909,14,47,61,27,15,10,13,20,8.0
17881,22024,2025-04-13,MIL,265,140,50,86,0.581,23,44,...,0.643,14,28,42,35,9,3,13,19,-7.0


In [None]:
STAT_COLS = ['MIN', 'score', 'FGM', 'FGA', 'FG_PCT',
            'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
            'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS']

def build_prev_stats_training(games_df):
    df = games_df.sort_values("date").copy()

    logs = []
    for _, r in df.iterrows():
        h = { "date": r["date"], "team": r["team_home"], "is_home": 1 }
        for c in STAT_COLS:
            h[c] = r[f"{c}_home"]
        logs.append(h)

        a = { "date": r["date"], "team": r["team_away"], "is_home": 0 }
        for c in STAT_COLS:
            a[c] = r[f"{c}_away"]
        logs.append(a)

    logs = pd.DataFrame(logs).sort_values(["team","date"])

    for c in STAT_COLS:
        logs[f"prev_{c}"] = logs.groupby("team")[c].shift(1)

    h_prev = logs[logs["is_home"]==1][["team","date"] + [f"prev_{c}" for c in STAT_COLS]].rename(
        columns={"team":"team_home"})
    a_prev = logs[logs["is_home"]==0][["team","date"] + [f"prev_{c}" for c in STAT_COLS]].rename(
        columns={"team":"team_away"})

    out = (df
           .merge(h_prev, on=["team_home","date"], how="left")
           .merge(a_prev, on=["team_away","date"], how="left", suffixes=("", "_away")))

    rename_map = {f"prev_{c}": f"prev_{c}_home" for c in STAT_COLS}
    out = out.rename(columns=rename_map)
    rename_map_away = {f"prev_{c}_away": f"prev_{c}_away" for c in STAT_COLS}

    out["home_win"] = (out["score_home"] > out["score_away"]).astype(int)

    out.dropna(inplace=True)

    out.drop(['MIN_home', 'FGM_home', 'FGA_home', 'FG_PCT_home', 'FG3M_home', 'FG3A_home',
       'FG3_PCT_home', 'FTM_home', 'FTA_home', 'FT_PCT_home', 'OREB_home',
       'DREB_home', 'REB_home', 'AST_home', 'STL_home', 'BLK_home', 'TOV_home',
       'PF_home', 'PLUS_MINUS_home', 'MIN_away', 'FGM_away', 'FGA_away', 'FG_PCT_away', 'FG3M_away', 'FG3A_away',
       'FG3_PCT_away', 'FTM_away', 'FTA_away', 'FT_PCT_away', 'OREB_away',
       'DREB_away', 'REB_away', 'AST_away', 'STL_away', 'BLK_away', 'TOV_away',
       'PF_away', 'PLUS_MINUS_away'], axis=1, inplace=True)

    return out

games_prev = build_prev_stats_training(merged_games)
games_prev

Unnamed: 0,sid,date,team_home,score_home,team_away,score_away,prev_MIN_home,prev_score_home,prev_FGM_home,prev_FGA_home,...,prev_OREB_away,prev_DREB_away,prev_REB_away,prev_AST_away,prev_STL_away,prev_BLK_away,prev_TOV_away,prev_PF_away,prev_PLUS_MINUS_away,home_win
17,22010,2010-10-28,UTA,94,PHX,110,241.0,88.0,27.0,70.0,...,7.0,23.0,30.0,15.0,3.0,4.0,19.0,19.0,-14.0,0
18,22010,2010-10-29,CHA,101,IND,104,239.0,86.0,29.0,73.0,...,9.0,27.0,36.0,20.0,9.0,6.0,23.0,26.0,-13.0,0
19,22010,2010-10-29,PHI,101,ATL,104,241.0,87.0,36.0,83.0,...,9.0,35.0,44.0,20.0,7.0,9.0,15.0,20.0,15.0,0
20,22010,2010-10-29,NJN,106,SAC,100,238.0,101.0,39.0,80.0,...,14.0,31.0,45.0,25.0,8.0,4.0,13.0,24.0,1.0,1
21,22010,2010-10-29,BOS,105,NYK,101,238.0,87.0,34.0,72.0,...,13.0,36.0,49.0,12.0,4.0,10.0,15.0,23.0,5.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17878,22024,2025-04-13,NOP,100,OKC,115,240.0,104.0,38.0,95.0,...,5.0,42.0,47.0,41.0,12.0,4.0,10.0,17.0,34.0,0
17879,22024,2025-04-13,MEM,132,DAL,97,239.0,109.0,41.0,91.0,...,12.0,44.0,56.0,34.0,8.0,10.0,17.0,12.0,22.0,1
17880,22024,2025-04-13,GSW,119,LAC,124,240.0,103.0,33.0,83.0,...,7.0,30.0,37.0,19.0,6.0,2.0,12.0,19.0,1.0,0
17881,22024,2025-04-13,SAC,109,PHX,98,240.0,100.0,37.0,85.0,...,13.0,39.0,52.0,27.0,9.0,10.0,11.0,9.0,19.0,1


In [None]:
import numpy as np

p_home = (games_prev["home_win"] == 1).mean()
HCA = 400 * math.log10(p_home / (1 - p_home))
K = 25.0

def run_elo_over_frame(
    games_df: pd.DataFrame,
    *,
    K: float = K,
    HCA: float = HCA,
    base: float = 1500.0,
    carry_w: float = 0.75,
    season_col: str = "sid",
    date_col: str = "date",
    home_col: str = "team_home",
    away_col: str = "team_away",
    hs_col: str = "score_home",
    as_col: str = "score_away",
):
    
    df = games_df.copy()
    df = df.sort_values([season_col, date_col, home_col, away_col]).reset_index(drop=True)

    df["home_elo_pre"]  = np.nan
    df["away_elo_pre"]  = np.nan
    df["home_elo_post"] = np.nan
    df["away_elo_post"] = np.nan

    # State
    elo: dict[str, float] = {}           
    prev_final_elos: dict[str, float] = {} 
    final_elos_by_season: dict[str, dict[str, float]] = {}

    current_sid = None

    def season_reset(new_sid: str):
        nonlocal elo, current_sid, prev_final_elos
        if prev_final_elos:
            elo = {t: carry_w * r + (1.0 - carry_w) * base for t, r in prev_final_elos.items()}
        else:
            elo = {}
        current_sid = new_sid

    for i, row in enumerate(df.itertuples(index=False)):
        sid  = getattr(row, season_col)
        d    = getattr(row, date_col)
        home = getattr(row, home_col)
        away = getattr(row, away_col)
        hs   = getattr(row, hs_col)
        aS   = getattr(row, as_col)

        if current_sid != sid:
            if current_sid is not None:
                final_elos_by_season[current_sid] = dict(elo)
                prev_final_elos = dict(elo)
            season_reset(sid)

        if home not in elo: elo[home] = base
        if away not in elo: elo[away] = base

        Ra, Rb = elo[home], elo[away]
        df.at[i, "home_elo_pre"] = Ra
        df.at[i, "away_elo_pre"] = Rb

        if pd.isna(hs) or pd.isna(aS):
            df.at[i, "home_elo_post"] = Ra
            df.at[i, "away_elo_post"] = Rb
            continue

        win_home = 1 if hs > aS else 0
        margin   = abs(hs - aS)

        Ea = 1.0 / (1.0 + 10.0 ** ((Rb - Ra + HCA) / 400.0))

        mult  = np.log1p(margin) * 2.2 / (((Ra - Rb) * 0.001) + 2.2)
        delta = K * mult * (win_home - Ea)

        elo[home] = Ra + delta
        elo[away] = Rb - delta

        df.at[i, "home_elo_post"] = elo[home]
        df.at[i, "away_elo_post"] = elo[away]

    if current_sid is not None:
        final_elos_by_season[current_sid] = dict(elo)

    return df, final_elos_by_season

games_elo, final_elo = run_elo_over_frame(games_prev)

In [15]:
games_elo['elo_diff'] = games_elo['home_elo_pre'] - games_elo['away_elo_pre'] + HCA
games_elo

Unnamed: 0,sid,date,team_home,score_home,team_away,score_away,prev_MIN_home,prev_score_home,prev_FGM_home,prev_FGA_home,...,prev_TOV_away,prev_PF_away,prev_PLUS_MINUS_away,home_win,home_elo_pre,away_elo_pre,home_elo_post,away_elo_post,elo_dff,elo_diff
0,22010,2010-10-28,UTA,94,PHX,110,241.0,88.0,27.0,70.0,...,19.0,19.0,-14.0,0,1500.000000,1500.000000,1469.831084,1530.168916,51.849124,51.849124
1,22010,2010-10-29,BOS,105,NYK,101,238.0,87.0,34.0,72.0,...,15.0,23.0,5.0,1,1500.000000,1500.000000,1523.098164,1476.901836,51.849124,51.849124
2,22010,2010-10-29,CHA,101,IND,104,239.0,86.0,29.0,73.0,...,23.0,26.0,-13.0,0,1500.000000,1500.000000,1485.238317,1514.761683,51.849124,51.849124
3,22010,2010-10-29,DAL,90,MEM,91,239.0,101.0,41.0,75.0,...,14.0,26.0,-15.0,0,1500.000000,1500.000000,1492.619158,1507.380842,51.849124,51.849124
4,22010,2010-10-29,DET,104,OKC,105,240.0,98.0,37.0,83.0,...,12.0,15.0,11.0,0,1500.000000,1500.000000,1492.619158,1507.380842,51.849124,51.849124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17857,22024,2025-04-13,NOP,100,OKC,115,240.0,104.0,38.0,95.0,...,10.0,17.0,34.0,0,1241.500835,1833.062838,1239.221812,1835.341861,-539.712879,-539.712879
17858,22024,2025-04-13,PHI,102,CHI,122,240.0,110.0,43.0,97.0,...,12.0,13.0,30.0,0,1173.459966,1611.554831,1168.115511,1616.899286,-386.245742,-386.245742
17859,22024,2025-04-13,POR,109,LAL,81,240.0,86.0,32.0,85.0,...,12.0,15.0,31.0,1,1455.999795,1679.558350,1533.771104,1601.787042,-171.709431,-171.709431
17860,22024,2025-04-13,SAC,109,PHX,98,240.0,100.0,37.0,85.0,...,11.0,9.0,19.0,1,1464.972486,1448.214462,1498.901949,1414.285000,68.607148,68.607148


In [20]:
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.metrics import accuracy_score

FEATS = (["elo_diff"] +
         [f"prev_{c}_home" for c in STAT_COLS] +
         [f"prev_{c}_away" for c in STAT_COLS])

games_elo["date"] = pd.to_datetime(games_elo["date"])

cutoff = games_elo["date"].quantile(0.80)
train = games_elo[games_elo["date"] <= cutoff]
test  = games_elo[games_elo["date"]  > cutoff]

X_train, y_train = train[FEATS], train["home_win"].values
X_test,  y_test  = test[FEATS],  test["home_win"].values

model = XGBClassifier(
    n_estimators=600, max_depth=4, learning_rate=0.03,
    subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0, random_state=42
)
model.fit(X_train, y_train)

p_test = model.predict_proba(X_test)[:, 1]
y_hat  = (p_test >= 0.5).astype(int)

print("Test Accuracy:", f"{accuracy_score(y_test, y_hat):.2%}")
print("Test LogLoss:", f"{log_loss(y_test, p_test):.4f}")
print("Test AUC:",     f"{roc_auc_score(y_test, p_test):.3f}")

Test Accuracy: 63.04%
Test LogLoss: 0.6356
Test AUC: 0.681
