# QEPC NBA — Leakage‑free Eoin totals backtest + optional Odds baseline

This notebook is designed to be **portable across machines** (no hardcoded `C:\Users\...` paths) and **leakage‑free** (rolling features use only past games).

It:
- Loads Eoin games parquet from `cache/imports/eoin_games_qepc.parquet`
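- Builds simple per-team offense/defense priors: expanding (cumulative) averages of points scored and points allowed per game, accumulated from a cutoff date and using only games played before the one being predicted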
- Predicts home/away points and totals, then reports the mean absolute error (MAE)
- Adds an **environment drift** correction (a blend of a fast and a slow rolling window of past residuals) to reduce bias from league-wide scoring drift
- Optionally attaches Kaggle “long odds” and reports a Vegas baseline MAE when the CSV exists

If the odds CSV is not present on this machine, the odds section will **skip gracefully**.


In [None]:
# --- Cell 1: Robust project-root bootstrap (portable; no hardcoded paths) ---
from __future__ import annotations

from pathlib import Path
import sys
import datetime as dt

import numpy as np
import pandas as pd

# Locate the repo root: the closest directory, starting at the current working
# directory and moving upward, that contains qepc/__init__.py.
_cwd = Path.cwd().resolve()
PROJECT_ROOT = None
for _candidate in (_cwd, *_cwd.parents):
    _package_marker = _candidate / "qepc" / "__init__.py"
    if _package_marker.exists():
        PROJECT_ROOT = _candidate
        break

if PROJECT_ROOT is None:
    raise RuntimeError(
        f"Could not find PROJECT_ROOT above {_cwd}. Expected qepc/__init__.py to exist."
    )

# Put this repo at the front of the import path so `import qepc` resolves to the
# copy found above rather than to some other installed/checked-out copy.
_root_entry = str(PROJECT_ROOT)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

import qepc

# Echo where everything was resolved from, to make environment mix-ups visible.
for _label, _value in (
    ("PROJECT_ROOT:", PROJECT_ROOT),
    ("qepc imported from:", Path(qepc.__file__).resolve()),
    ("python:", sys.executable),
    ("cwd:", _cwd),
):
    print(_label, _value)

# Backtest settings read by the later cells.
CONFIG = dict(
    # Team priors are accumulated from this date forward (keeps things "modern").
    modern_cutoff=dt.date(2022, 10, 1),

    # Scoring window. When None, the start falls back to modern_cutoff and the end
    # falls back to the odds coverage (if odds were loaded) or the last game date.
    backtest_start=None,
    backtest_end=None,

    # A game is scored only once both teams have at least this many prior games.
    min_games_per_team=5,

    # Simple additive point adjustments (tune later).
    home_bonus=1.5,
    away_penalty=0.5,
    b2b_penalty=1.5,

    # Optional walk-forward linear calibration (raw_pred -> actual), fitted
    # separately for the home and the away prediction.
    use_calibration=True,
    min_calibration_rows=200,
)
print("CONFIG:", CONFIG)

In [None]:
# --- Cell 2: Load Eoin games parquet (no hardcoded paths) ---
from pathlib import Path

_repo_root = Path(PROJECT_ROOT)
games_path = _repo_root / "cache" / "imports" / "eoin_games_qepc.parquet"
if not games_path.exists():
    # Not at the expected location: search the whole repo (helps if cache/ moved)
    # and take the first hit.
    matches = list(_repo_root.rglob("eoin_games_qepc.parquet"))
    if not matches:
        raise FileNotFoundError(
            f"Could not find eoin_games_qepc.parquet under {PROJECT_ROOT}.\n"
            "Expected at: cache/imports/eoin_games_qepc.parquet\n"
            "Run your Eoin fetch/build notebook on this machine first."
        )
    games_path = matches[0]

games = pd.read_parquet(games_path).copy()

# The date column must exist; it is coerced to plain `datetime.date` values
# (unparseable entries become missing values because of errors="coerce").
if "game_date" not in games.columns:
    raise KeyError("Expected 'game_date' in games parquet.")

_parsed_dates = pd.to_datetime(games["game_date"], errors="coerce")
games["game_date"] = _parsed_dates.dt.date

# Names of the score columns; later cells read scores through these two names.
HOME_COL = "home_score" if "home_score" in games.columns else None
AWAY_COL = "away_score" if "away_score" in games.columns else None
if None in (HOME_COL, AWAY_COL):
    raise KeyError("Expected 'home_score' and 'away_score' columns in Eoin games table.")

# Identifier columns: report the first one that is absent.
_absent_ids = [
    name
    for name in ("game_id", "home_team_id", "away_team_id")
    if name not in games.columns
]
if _absent_ids:
    raise KeyError(f"Expected '{_absent_ids[0]}' in games parquet.")

# Chronological order, with game_id as the tie-breaker within a date.
games = games.sort_values(["game_date", "game_id"])
games = games.reset_index(drop=True)

_game_dates = games["game_date"]
print("games rows:", len(games))
print("games date range:", _game_dates.min(), "→", _game_dates.max())
games.head()

In [None]:
# --- Cell 3: Try to load odds (optional). If missing, we continue without odds. ---
from pathlib import Path

# Start from the "no odds" state; it is only upgraded when the load succeeds.
HAS_ODDS, odds, odds_max_date = False, None, None

try:
    from qepc.nba.odds_long_loader import load_long_odds, attach_odds_to_games

    _odds_filename = "nba_2008-2025.csv"
    odds_path = Path(PROJECT_ROOT) / "data" / "raw" / "nba" / "odds_long" / _odds_filename
    if not odds_path.exists():
        # Not at the expected location: fall back to the first match in the repo.
        matches = list(Path(PROJECT_ROOT).rglob(_odds_filename))
        if matches:
            odds_path = matches[0]

    if not odds_path.exists():
        print("[odds] csv not found under repo; skipping odds attach.")
        HAS_ODDS = False
    else:
        odds = load_long_odds(odds_path)
        HAS_ODDS = True
        # Last date covered by the odds file; Cell 4 uses it as the default
        # end of the backtest window.
        _odds_dates = pd.to_datetime(odds["game_date"], errors="coerce")
        odds_max_date = _odds_dates.max().date()
        print("[odds] loaded:", odds_path)
        print("[odds] date range:", odds["game_date"].min(), "→", odds["game_date"].max(), "rows:", len(odds))

except Exception as e:
    # Odds are optional: any failure (missing loader module, unreadable CSV, ...)
    # is reported and the notebook carries on without them.
    print("[odds] failed to load odds; skipping. Error:", repr(e))
    HAS_ODDS = False

HAS_ODDS, odds_max_date

In [None]:
# --- Cell 4: Backtest builder (leakage-free) ---
# Walks the games in (game_date, game_id) order and keeps, per team, running
# totals of games played, points scored and points allowed since modern_cutoff.
# Each game is scored from the state as it was BEFORE that game is folded in, so
# the priors only ever contain earlier games:
# - off_ppg_prev = avg points scored so far
# - def_ppg_prev = avg points allowed so far
#
# Predict:
#   raw_home = (home_off_ppg_prev + away_def_ppg_prev)/2 + home_bonus - b2b_penalty*(home_is_b2b)
#   raw_away = (away_off_ppg_prev + home_def_ppg_prev)/2 - away_penalty - b2b_penalty*(away_is_b2b)
#
# Games whose home or away score is missing are skipped entirely: they are not
# scored and they do not update any team's running state (a missing score would
# otherwise turn that team's running totals, and every later prior, into NaN).
#
# Optional calibration happens in the next cell.

modern_cutoff = CONFIG["modern_cutoff"]
start = CONFIG["backtest_start"] or modern_cutoff

# If odds exist, default end to odds coverage (keeps backtest comparable to Vegas).
# Otherwise, end at the max game_date in games.
end_default = odds_max_date if HAS_ODDS and odds_max_date is not None else games["game_date"].max()
end = CONFIG["backtest_end"] or end_default

# At least one prior game is always required: with zero prior games the priors
# are undefined (NaN) and the row could not be scored anyway.
min_gp = max(1, int(CONFIG["min_games_per_team"]))

# Loop-invariant adjustments, converted once instead of once per game.
home_bonus = float(CONFIG["home_bonus"])
away_penalty = float(CONFIG["away_penalty"])
b2b_penalty = float(CONFIG["b2b_penalty"])

games_slice = games[(games["game_date"] >= modern_cutoff) & (games["game_date"] <= end)].copy()
games_slice = games_slice.sort_values(["game_date", "game_id"]).reset_index(drop=True)

# Column layout of the output frame. Passing it explicitly keeps the columns
# present even when no row is scored, so the sort below and the emptiness check
# in the next cell work instead of failing with a KeyError.
_BACKTEST_COLUMNS = [
    "game_id",
    "game_date",
    "home_team_id",
    "away_team_id",
    "home_off_ppg_prev",
    "home_def_ppg_prev",
    "away_off_ppg_prev",
    "away_def_ppg_prev",
    "home_is_b2b",
    "away_is_b2b",
    "exp_home_pts_raw",
    "exp_away_pts_raw",
    "exp_home_pts",
    "exp_away_pts",
    "actual_home_pts",
    "actual_away_pts",
]

# Rolling state per team (since modern_cutoff)
gp = {}         # games played
pf = {}         # points scored ("points for")
pa = {}         # points allowed ("points against")
last_date = {}  # date of the team's most recent game

rows = []
skipped_no_history = 0
skipped_missing_score = 0

_loop_cols = ["game_id", "game_date", "home_team_id", "away_team_id", HOME_COL, AWAY_COL]
for game_id, gdate, home_raw, away_raw, home_pts, away_pts in games_slice[_loop_cols].itertuples(
    index=False, name=None
):
    if pd.isna(home_pts) or pd.isna(away_pts):
        skipped_missing_score += 1
        continue

    home_id = int(home_raw)
    away_id = int(away_raw)
    actual_home = float(home_pts)
    actual_away = float(away_pts)

    # Prior state BEFORE updating with this game
    home_gp = gp.get(home_id, 0)
    away_gp = gp.get(away_id, 0)

    home_off = (pf.get(home_id, 0.0) / home_gp) if home_gp > 0 else np.nan
    home_def = (pa.get(home_id, 0.0) / home_gp) if home_gp > 0 else np.nan
    away_off = (pf.get(away_id, 0.0) / away_gp) if away_gp > 0 else np.nan
    away_def = (pa.get(away_id, 0.0) / away_gp) if away_gp > 0 else np.nan

    # Back-to-back: the team's previous game was exactly one calendar day earlier.
    home_is_b2b = bool((home_id in last_date) and ((gdate - last_date[home_id]).days == 1))
    away_is_b2b = bool((away_id in last_date) and ((gdate - last_date[away_id]).days == 1))

    in_window = gdate >= start
    # Emit scored row only if both teams have enough history
    if in_window and (home_gp >= min_gp) and (away_gp >= min_gp):
        raw_home = 0.5 * (home_off + away_def) + home_bonus - (b2b_penalty if home_is_b2b else 0.0)
        raw_away = 0.5 * (away_off + home_def) - away_penalty - (b2b_penalty if away_is_b2b else 0.0)

        rows.append({
            "game_id": int(game_id),
            "game_date": gdate,
            "home_team_id": home_id,
            "away_team_id": away_id,

            "home_off_ppg_prev": float(home_off),
            "home_def_ppg_prev": float(home_def),
            "away_off_ppg_prev": float(away_off),
            "away_def_ppg_prev": float(away_def),

            "home_is_b2b": home_is_b2b,
            "away_is_b2b": away_is_b2b,

            "exp_home_pts_raw": float(raw_home),
            "exp_away_pts_raw": float(raw_away),

            # placeholders to fill after calibration (or copy raw)
            "exp_home_pts": float(raw_home),
            "exp_away_pts": float(raw_away),

            "actual_home_pts": actual_home,
            "actual_away_pts": actual_away,
        })
    elif in_window:
        skipped_no_history += 1

    # UPDATE state with this game AFTER scoring (prevents leakage)
    gp[home_id] = home_gp + 1
    pf[home_id] = pf.get(home_id, 0.0) + actual_home
    pa[home_id] = pa.get(home_id, 0.0) + actual_away
    last_date[home_id] = gdate

    gp[away_id] = away_gp + 1
    pf[away_id] = pf.get(away_id, 0.0) + actual_away
    pa[away_id] = pa.get(away_id, 0.0) + actual_home
    last_date[away_id] = gdate

backtest_df = pd.DataFrame(rows, columns=_BACKTEST_COLUMNS)
backtest_df = backtest_df.sort_values(["game_date", "game_id"]).reset_index(drop=True)

print("built backtest rows:", len(backtest_df))
print("skipped (insufficient history during scoring window):", skipped_no_history)
print("skipped (missing score; excluded from priors too):", skipped_missing_score)
backtest_df.head()

In [None]:
# --- Cell 5: Walk-forward calibration (optional; leakage-free) ---
# Work on a copy so the column assignments below do not write into the frame
# object produced by the previous cell.
df = backtest_df.copy()

# Fail fast, with a hint on what to change, when there is nothing to score.
_no_rows_message = "No backtest rows built. Try lowering min_games_per_team or widening the date range."
if df.empty:
    raise RuntimeError(_no_rows_message)

def expanding_ols_yhat(x: pd.Series, y: pd.Series, min_train: int = 200) -> pd.Series:
    """Return walk-forward linear-calibration predictions for ``y ≈ a + b*x``.

    For every row, ``a`` and ``b`` are the ordinary-least-squares coefficients
    fitted on the rows that come strictly before it, so a row's own target never
    influences its own prediction. The fit is computed from running sums rather
    than by refitting per row.

    Only rows where both ``x`` and ``y`` are finite take part in the fit. Rows
    with a missing or infinite value are left out of every running sum and of
    the training-row count, so they cannot corrupt the coefficients of the rows
    that follow them.

    Args:
        x: Raw predictions, in walk-forward order.
        y: Observed values, positionally aligned with ``x`` (same length).
        min_train: Minimum number of usable prior rows before the fitted line
            is applied. Below that, the identity mapping (``a=0, b=1``) is used.

    Returns:
        A float Series indexed like ``x`` holding ``a + b*x`` for each row. A
        row whose ``x`` is NaN yields NaN.

    Raises:
        ValueError: If ``x`` and ``y`` have different lengths.
    """
    xv = x.astype(float).to_numpy()
    yv = y.astype(float).to_numpy()
    if len(xv) != len(yv):
        raise ValueError("x and y must have the same length")

    # A row is usable for training only when both sides are finite; unusable rows
    # contribute zero to every sum and zero to the count.
    usable = np.isfinite(xv) & np.isfinite(yv)
    x_fit = np.where(usable, xv, 0.0)
    y_fit = np.where(usable, yv, 0.0)

    def _prior_total(values: np.ndarray) -> np.ndarray:
        # Exclusive running sum: element i is the total over rows 0..i-1.
        running = np.cumsum(values, dtype=float)
        prior = np.zeros_like(running)
        prior[1:] = running[:-1]
        return prior

    n = _prior_total(usable.astype(float))
    sum_x = _prior_total(x_fit)
    sum_y = _prior_total(y_fit)
    sum_xx = _prior_total(x_fit * x_fit)
    sum_xy = _prior_total(x_fit * y_fit)

    den = n * sum_xx - sum_x * sum_x
    # np.where evaluates both branches, so silence the expected 0/0 warnings for
    # the rows that end up taking the fallback value.
    with np.errstate(divide="ignore", invalid="ignore"):
        # Degenerate predictor (no spread in x so far): keep slope 1.
        slope = np.where(den != 0, (n * sum_xy - sum_x * sum_y) / den, 1.0)
        intercept = np.where(n != 0, (sum_y - slope * sum_x) / n, 0.0)

    # Not enough training rows yet: pass the raw prediction through unchanged.
    trained = n >= float(min_train)
    intercept = np.where(trained, intercept, 0.0)
    slope = np.where(trained, slope, 1.0)

    return pd.Series(intercept + slope * xv, index=x.index)

# Final per-side predictions: either the walk-forward calibrated values or a
# straight copy of the raw ones.
if not CONFIG["use_calibration"]:
    df["exp_home_pts"] = df["exp_home_pts_raw"]
    df["exp_away_pts"] = df["exp_away_pts_raw"]
else:
    min_cal = int(CONFIG["min_calibration_rows"])
    df["exp_home_pts"] = expanding_ols_yhat(
        df["exp_home_pts_raw"], df["actual_home_pts"], min_train=min_cal
    )
    df["exp_away_pts"] = expanding_ols_yhat(
        df["exp_away_pts_raw"], df["actual_away_pts"], min_train=min_cal
    )

# Totals are the sum of the two sides, for predictions and actuals alike.
df["total_pred"] = df["exp_home_pts"] + df["exp_away_pts"]
df["total_actual"] = df["actual_home_pts"] + df["actual_away_pts"]

# Absolute errors per side and for the total (error = predicted - actual).
_total_error = df["total_pred"] - df["total_actual"]
df["home_abs_err"] = (df["exp_home_pts"] - df["actual_home_pts"]).abs()
df["away_abs_err"] = (df["exp_away_pts"] - df["actual_away_pts"]).abs()
df["total_abs_err"] = _total_error.abs()

backtest_df = df

print("Backtest rows:", len(df))
for _label, _column in (
    ("MAE home:", "home_abs_err"),
    ("MAE away:", "away_abs_err"),
    ("MAE total:", "total_abs_err"),
):
    print(_label, round(df[_column].mean(), 3))
print("Bias total:", round(_total_error.mean(), 3))
df.head()

In [None]:
# --- Cell 6: Environment drift correction (two-window blend; leakage-free) ---
# Tracks how far actual totals have recently been running above/below the
# predicted totals and adds a blend of two such running averages to the
# prediction. Every average is shifted by one row, so a game's own residual is
# never part of its own correction.
# NOTE(review): rows are ordered by (game_date, game_id) and the lag is one ROW,
# not one DAY, so residuals of earlier-listed games on the same date can feed a
# later-listed game's correction. Confirm that is acceptable for the use case.

df = backtest_df.copy().sort_values(["game_date", "game_id"]).reset_index(drop=True)

resid = (df["total_actual"] - df["total_pred"]).astype(float)


def _lagged_window_mean(series, window):
    # Mean over a full window of `window` rows, lagged by one row; 0.0 wherever
    # that lagged mean is not available.
    return series.rolling(window, min_periods=window).mean().shift(1).fillna(0.0)


adj_fast = _lagged_window_mean(resid, 50)
adj_slow = _lagged_window_mean(resid, 250)

# 60/40 blend of the fast and the slow estimate.
df["env_drift_total"] = 0.6 * adj_fast + 0.4 * adj_slow
df["total_pred_env"] = df["total_pred"] + df["env_drift_total"]

_raw_error = df["total_pred"] - df["total_actual"]
_env_error = df["total_pred_env"] - df["total_actual"]
df["total_abs_err_env"] = _env_error.abs()

for _label, _value in (
    ("MAE raw :", _raw_error.abs().mean()),
    ("MAE env :", df["total_abs_err_env"].mean()),
    ("Bias raw:", _raw_error.mean()),
    ("Bias env:", _env_error.mean()),
):
    print(_label, round(_value, 3))

backtest_df = df

In [None]:
# --- Cell 7: Optional odds attach to backtest_df and Vegas baseline ---
# When odds were loaded, joins them onto the backtest rows by game_id and compares
# the model's total-points MAE with the MAE of the odds' `total_points` column on
# the rows where that column is present. Otherwise the cell only prints a notice.
#
# The cell is safe to re-run: odds columns that a previous run of this cell
# attached are dropped before merging again. Without that, the second merge
# would produce suffixed duplicates (`total_points_x` / `total_points_y`) and the
# `total_points` lookup below would fail.
df = backtest_df.copy()

if HAS_ODDS and odds is not None:
    # Columns attached by an earlier run of this cell in the same session, if any.
    _stale_cols = [c for c in globals().get("_odds_attached_cols", []) if c in df.columns]
    if _stale_cols:
        df = df.drop(columns=_stale_cols)

    games_for_join = df[["game_id", "game_date", "home_team_id", "away_team_id"]].copy()
    games_for_join["game_date"] = pd.to_datetime(games_for_join["game_date"], errors="coerce").dt.normalize().dt.date

    joined, diag = attach_odds_to_games(games_for_join, odds)

    # Everything the attach step added on top of the join keys.
    odds_cols = [c for c in joined.columns if c not in games_for_join.columns]
    odds_by_game = joined[["game_id"] + odds_cols]

    # Several odds rows for one game would silently duplicate backtest rows in the
    # left merge below; keep the first row per game and say so.
    # NOTE(review): attach_odds_to_games is expected to return one row per game;
    # confirm and tighten this if it can legitimately return more.
    _dup_rows = int(odds_by_game["game_id"].duplicated().sum())
    if _dup_rows:
        print("[odds] warning: dropping", _dup_rows, "duplicate odds rows (keeping first per game_id).")
        odds_by_game = odds_by_game.drop_duplicates(subset="game_id", keep="first")

    _cols_before = set(df.columns)
    df = df.merge(odds_by_game, on="game_id", how="left")
    # Remember what was attached so that a re-run can remove it first.
    _odds_attached_cols = [c for c in df.columns if c not in _cols_before]

    if "total_points" in df.columns:
        dfo = df[df["total_points"].notna()].copy()
        print("[odds] rows with Vegas totals:", len(dfo))
        print("QEPC raw MAE   :", round((dfo["total_pred"] - dfo["total_actual"]).abs().mean(), 3))
        print("QEPC env MAE   :", round((dfo["total_pred_env"] - dfo["total_actual"]).abs().mean(), 3))
        print("Vegas total MAE:", round((dfo["total_points"] - dfo["total_actual"]).abs().mean(), 3))
    else:
        print("[odds] attached, but total_points column not present (unexpected).")
else:
    print("Skipping odds attach (odds CSV missing or loader unavailable on this machine).")

backtest_df = df

In [None]:
# --- Cell 8: Quick diagnostics (shared-direction errors + bias) ---
# Signed errors (predicted - actual) per side and for both total predictions,
# followed by how strongly the two sides' errors move together and the mean and
# spread of the total error before/after the environment correction.
df = backtest_df.copy()

for _err_name, _pred_col, _actual_col in (
    ("err_home", "exp_home_pts", "actual_home_pts"),
    ("err_away", "exp_away_pts", "actual_away_pts"),
    ("err_total", "total_pred", "total_actual"),
    ("err_total_env", "total_pred_env", "total_actual"),
):
    df[_err_name] = df[_pred_col] - df[_actual_col]

_total_raw = df["err_total"]
_total_env = df["err_total_env"]

_summary = (
    ("Corr(err_home, err_away):", df["err_home"].corr(df["err_away"])),
    ("Mean err_total raw:", _total_raw.mean()),
    ("Mean err_total env:", _total_env.mean()),
    ("Std err_total raw:", _total_raw.std()),
    ("Std err_total env:", _total_env.std()),
)
for _label, _value in _summary:
    print(_label, round(_value, 3))