In [46]:
# ==========================================
# CELL 1 – QEPC PATH SETUP
# ==========================================
import sys
from pathlib import Path

import os


def find_project_root(start, marker="qepc"):
    """Return the nearest directory at or above ``start`` that contains ``marker``.

    Parameters
    ----------
    start : str or Path
        Directory where the upward search begins (checked first).
    marker : str, default "qepc"
        Name of the sub-directory whose presence identifies the project root.

    Returns
    -------
    Path or None
        The first matching directory, or None when no ancestor contains
        a ``marker`` directory.
    """
    start = Path(start)
    for candidate in [start] + list(start.parents):
        if (candidate / marker).is_dir():
            return candidate
    return None


NOTEBOOK_DIR = Path.cwd()

# Resolution order:
#   1. walk up from the notebook directory until a "qepc" package folder is found
#   2. the QEPC_PROJECT_ROOT environment variable (portable override)
#   3. the legacy hard-coded location (kept only for backward compatibility)
PROJECT_ROOT = find_project_root(NOTEBOOK_DIR)

if PROJECT_ROOT is None:
    env_root = os.environ.get("QEPC_PROJECT_ROOT")
    if env_root:
        PROJECT_ROOT = Path(env_root).expanduser().resolve()
    else:
        # NOTE(review): machine-specific path; the recorded run output shows the
        # project under a different user directory than this one — confirm it,
        # or set QEPC_PROJECT_ROOT instead of relying on this fallback.
        PROJECT_ROOT = Path(r"C:\Users\wdors\qepc_project").resolve()

# Make `import qepc` work from anywhere inside the project tree
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

print("NOTEBOOK_DIR:", NOTEBOOK_DIR)
print("PROJECT_ROOT:", PROJECT_ROOT)
print("qepc exists?:", (PROJECT_ROOT / "qepc").is_dir())

DATA_DIR = PROJECT_ROOT / "data"
CACHE_DIR = PROJECT_ROOT / "cache"

print("DATA_DIR:", DATA_DIR)
print("CACHE_DIR:", CACHE_DIR)


NOTEBOOK_DIR: C:\Users\wdorsey\qepc_project\notebooks\nba
PROJECT_ROOT: C:\Users\wdorsey\qepc_project
qepc exists?: True
DATA_DIR: C:\Users\wdorsey\qepc_project\data
CACHE_DIR: C:\Users\wdorsey\qepc_project\cache


In [47]:
# ==========================================
# CELL 2 – LOAD GAMES + MATCHUP ENGINE
# ==========================================
import pandas as pd

from qepc.nba.eoin_data_source import load_eoin_games
from qepc.nba.matchups_eoin import build_matchups_for_date

# Load the game table through the project loader (one row per game, judging
# by the columns previewed below)
games_qepc = load_eoin_games()

# Columns worth eyeballing: identifiers, date, and the final score of each side
preview_cols = [
    "game_id",
    "game_date",
    "home_team_id",
    "away_team_id",
    "home_score",
    "away_score",
]

print("games_qepc shape:", games_qepc.shape)
print(games_qepc.loc[:, preview_cols].head())


games_qepc shape: (72290, 19)
    game_id   game_date  home_team_id  away_team_id  home_score  away_score
0  22500349  2025-12-05    1610612760    1610612742         132         111
1  22500345  2025-12-05    1610612741    1610612754         105         120
2  22500347  2025-12-05    1610612763    1610612746         107          98
3  22500348  2025-12-05    1610612749    1610612755         101         116
4  22500346  2025-12-05    1610612745    1610612756         117          98


In [48]:
# ==========================================
# CELL 2B – LOAD TEAM BOXES + TEAM STRENGTHS
# ==========================================
from qepc.nba.eoin_data_source import load_eoin_team_boxes
from qepc.nba.eoin_team_stats import build_team_stats_from_eoin
from qepc.nba.team_strengths_eoin import calculate_advanced_strengths_from_eoin

def _shape_and_head(label, frame):
    """Print ``frame.shape`` after ``label`` and render the first rows."""
    print(label, frame.shape)
    display(frame.head())


# Raw per-team box scores from the project loader
team_boxes_qepc = load_eoin_team_boxes()
print("team_boxes_qepc shape:", team_boxes_qepc.shape)

# Aggregate team stats built from the box scores
team_stats = build_team_stats_from_eoin(team_boxes_qepc)
_shape_and_head("team_stats shape:", team_stats)

# Advanced strengths derived from the aggregates
# (off_ppg, def_ppg, strength_score, etc.)
strengths_df = calculate_advanced_strengths_from_eoin(team_stats)
_shape_and_head("strengths_df shape:", strengths_df)

# team_id-indexed view for quick per-team lookups
strengths_idx = strengths_df.set_index("team_id")


team_boxes_qepc shape: (144580, 49)
team_stats shape: (34, 14)


Unnamed: 0,team_id,games_played,wins,losses,win_pct,pts_for,pts_against,pts_diff,off_ppg,def_ppg,reb_total,reb_pg,ast_total,ast_pg
0,15016,1,0,1,0.0,97,107,-10,97.0,107.0,48.0,48.0,23.0,23.0
1,15018,3,0,3,0.0,257,395,-138,85.666667,131.666667,80.0,26.666667,49.0,16.333333
2,50013,1,0,1,0.0,92,127,-35,92.0,127.0,42.0,42.0,24.0,24.0
3,50014,1,0,1,0.0,88,123,-35,88.0,123.0,38.0,38.0,18.0,18.0
4,1610612737,6462,3163,3299,0.489477,668300,671894,-3594,103.419994,103.976168,174313.0,26.975085,87465.0,13.535283


Built advanced strengths from Eoin team_stats:
      team_id  games_played   win_pct     off_ppg     def_ppg  \
0  1610612738          6883  0.596833  106.253523  103.205434   
1  1610612747          6898  0.585242  106.762540  104.581328   
2  1610612759          4492  0.585485  104.987311  102.372663   
3  1610612756          5074  0.532913  107.760347  106.615491   
4  1610612760          5192  0.539676  105.905817  104.664291   
5  1610612749          5075  0.521773  105.209064  104.075862   
6  1610612743          4360  0.505963  108.001376  108.068349   
7  1610612745          5160  0.518411  105.497093  104.875000   
8  1610612762          4552  0.527900  103.588752  102.658172   
9  1610612757          4865  0.515313  105.058582  104.502980   

   pts_diff_per_game  strength_score  strength_rank  
0           3.048089        0.779025              1  
1           2.181212        0.728369              2  
2           2.614648        0.700651              3  
3           1.144856 

Unnamed: 0,team_id,games_played,wins,losses,win_pct,pts_for,pts_against,pts_diff,off_ppg,def_ppg,...,reb_pg,ast_total,ast_pg,pts_diff_per_game,z_win_pct,z_off_ppg,z_def_ppg,z_pts_diff_pg,strength_score,strength_rank
0,1610612738,6883,4108,2775,0.596833,731343,710363,20980,106.253523,103.205434,...,26.04315,98151.0,14.259916,3.048089,0.971514,0.818578,0.412303,0.618246,0.779025,1
1,1610612747,6898,4037,2861,0.585242,736448,721402,15046,106.76254,104.581328,...,26.703392,100839.0,14.618585,2.181212,0.901392,0.921677,0.213312,0.540485,0.728369,2
2,1610612759,4492,2630,1862,0.585485,471603,459858,11745,104.987311,102.372663,...,38.395815,94752.0,21.0935,2.614648,0.902864,0.56211,0.532743,0.579365,0.700651,3
3,1610612756,5074,2704,2370,0.532913,546776,540967,5809,107.760347,106.615491,...,32.803114,95952.0,18.910524,1.144856,0.584811,1.12378,-0.080881,0.447522,0.584849,4
4,1610612760,5192,2802,2390,0.539676,549863,543417,6446,105.905817,104.664291,...,32.47265,89262.0,17.192219,1.241525,0.625729,0.748151,0.201313,0.456194,0.556911,5


In [49]:
# ==========================================
# CELL 2C – MODERN-ERA TEAM STRENGTHS
# ==========================================
import datetime as dt

# Choose a "modern" cutoff season. You can tweak this.
modern_cutoff = dt.date(2015, 10, 1)

# The filter below needs a game_date column; fail early with a clear message
if "game_date" not in team_boxes_qepc.columns:
    raise ValueError("team_boxes_qepc is missing 'game_date' column.")

# Always normalise game_date to plain datetime.date objects.
# The previous guard skipped the conversion when the column was already
# datetime64, which left a datetime64 column to be compared against a
# datetime.date cutoff below — pandas rejects that comparison as invalid.
# pd.to_datetime accepts strings, date objects and datetime64 alike.
team_boxes_qepc["game_date"] = pd.to_datetime(team_boxes_qepc["game_date"]).dt.date

# Keep only rows on/after the cutoff; .copy() detaches the subset from the full frame
team_boxes_modern = team_boxes_qepc[team_boxes_qepc["game_date"] >= modern_cutoff].copy()

print("Modern-era team boxes:", len(team_boxes_modern))
print("Modern date range:",
      team_boxes_modern["game_date"].min(),
      "→",
      team_boxes_modern["game_date"].max())

# Rebuild stats + strengths from modern-only data
team_stats_modern = build_team_stats_from_eoin(team_boxes_modern)
print("team_stats_modern shape:", team_stats_modern.shape)

strengths_modern = calculate_advanced_strengths_from_eoin(team_stats_modern)
print("strengths_modern shape:", strengths_modern.shape)

# NOTE: this deliberately replaces the all-time strengths_idx built in Cell 2B,
# so any later lookup through strengths_idx sees modern-era numbers only.
strengths_idx = strengths_modern.set_index("team_id")  # overwrite the old index
display(strengths_modern.head())


Modern-era team boxes: 27988
Modern date range: 2015-10-02 → 2025-12-05
team_stats_modern shape: (34, 14)
Built advanced strengths from Eoin team_stats:
      team_id  games_played   win_pct     off_ppg     def_ppg  \
0  1610612738          1023  0.641251  111.378299  106.176931   
1  1610612744          1008  0.631944  113.995040  109.601190   
2  1610612749           968  0.586777  112.411157  109.772727   
3  1610612743           976  0.587090  112.147541  109.907787   
4  1610612760           948  0.575949  111.506329  109.031646   
5  1610612746           942  0.569002  110.988323  108.733546   
6  1610612761           962  0.563410  109.864865  107.860707   
7  1610612745           936  0.524573  112.228632  111.275641   
8  1610612748           989  0.541962  107.308392  106.227503   
9  1610612739           955  0.526702  109.218848  108.635602   

   pts_diff_per_game  strength_score  strength_rank  
0           5.201369        0.918521              1  
1           4.393849   

Unnamed: 0,team_id,games_played,wins,losses,win_pct,pts_for,pts_against,pts_diff,off_ppg,def_ppg,...,reb_pg,ast_total,ast_pg,pts_diff_per_game,z_win_pct,z_off_ppg,z_def_ppg,z_pts_diff_pg,strength_score,strength_rank
0,1610612738,1023,656,367,0.641251,113940,108619,5321,111.378299,106.176931,...,44.72825,25228.0,24.660802,5.201369,1.174658,0.553398,0.996284,0.794501,0.918521,1
1,1610612744,1008,637,371,0.631944,114907,110478,4429,113.99504,109.60119,...,44.824405,28530.0,28.303571,4.393849,1.120539,0.949004,0.353203,0.72294,0.890219,2
2,1610612749,968,568,400,0.586777,108814,106260,2554,112.411157,109.772727,...,45.365702,23978.0,24.770661,2.63843,0.857888,0.709548,0.320988,0.567378,0.687377,3
3,1610612743,976,573,403,0.58709,109456,107270,2186,112.147541,109.907787,...,44.798156,26102.0,26.743852,2.239754,0.85971,0.669694,0.295624,0.532048,0.667,4
4,1610612760,948,546,402,0.575949,105708,103362,2346,111.506329,109.031646,...,45.334388,22033.0,23.241561,2.474684,0.794926,0.572753,0.460164,0.552867,0.644398,5


In [50]:
# ==========================================
# CELL 2D – PACE & OFF/DEF RATING FROM EOIN
# ==========================================
import pandas as pd
import numpy as np

# Work on a private copy of the modern subset so the helper columns added
# below never leak back into team_boxes_modern
# (re-run Cell 2C first if needed so team_boxes_modern exists)
tb = team_boxes_modern.copy()

# Short aliases for the raw box-score columns; missing values count as 0
possession_inputs = {
    "fga": "fieldgoalsattempted",
    "fta": "freethrowsattempted",
    "orb": "reboundsoffensive",
    "tov": "turnovers",
}
for alias, raw_col in possession_inputs.items():
    tb[alias] = tb[raw_col].fillna(0)

# Dean Oliver-style possessions estimate:
# poss ≈ FGA + 0.44 * FTA - ORB + TOV
tb["possessions"] = tb["fga"] + 0.44 * tb["fta"] - tb["orb"] + tb["tov"]

# Points scored and conceded, again with missing values counted as 0
score_inputs = {
    "points_for": "teamscore",
    "points_against": "opponentscore",
}
for alias, raw_col in score_inputs.items():
    tb[alias] = tb[raw_col].fillna(0)

# One summary row per team_id
grouped = tb.groupby("team_id")

team_totals = {
    "games_played": ("game_id", "nunique"),
    "total_possessions": ("possessions", "sum"),
    "total_pts_for": ("points_for", "sum"),
    "total_pts_against": ("points_against", "sum"),
}
pace_stats = grouped.agg(**team_totals).reset_index()

# Pace = estimated possessions per game played
pace_stats["pace_per_game"] = pace_stats["total_possessions"].div(
    pace_stats["games_played"]
)

# Offensive & defensive rating (points per 100 of the team's own estimated
# possessions)
for rating_col, points_col in (
    ("off_rating", "total_pts_for"),
    ("def_rating", "total_pts_against"),
):
    pace_stats[rating_col] = (
        100.0 * pace_stats[points_col] / pace_stats["total_possessions"]
    )

# Sanity summary: unweighted means across every team_id present
print("pace_stats shape:", pace_stats.shape)
print("League avg pace:", pace_stats["pace_per_game"].mean())
print("League avg off_rating:", pace_stats["off_rating"].mean())
print("League avg def_rating:", pace_stats["def_rating"].mean())

display(pace_stats.head())

# team_id-indexed view for fast per-team lookup
pace_idx = pace_stats.set_index("team_id")


pace_stats shape: (34, 8)
League avg pace: 101.13498873559854
League avg off_rating: 106.47805608268818
League avg def_rating: 110.2990867518534


Unnamed: 0,team_id,games_played,total_possessions,total_pts_for,total_pts_against,pace_per_game,off_rating,def_rating
0,15016,1,103.8,97,107,103.8,93.44894,103.082852
1,15018,3,296.32,257,395,98.773333,86.730562,133.301836
2,50013,1,101.8,92,127,101.8,90.373281,124.75442
3,50014,1,93.28,88,123,93.28,94.339623,131.861063
4,1610612737,922,94868.56,102414,103855,102.894317,107.953573,109.472517


In [51]:
# ==========================================
# CELL 3 – DEFINE BACKTEST POOL (24–25+)
# ==========================================
import datetime as dt

# Always normalise game_date to plain datetime.date objects.
# The previous guard skipped the conversion when the column was already
# datetime64, which left a datetime64 column to be compared against the
# datetime.date bounds below — pandas rejects that comparison as invalid.
games_qepc["game_date"] = pd.to_datetime(games_qepc["game_date"]).dt.date

# A game counts as completed once both final scores are present
completed_mask = games_qepc["home_score"].notna() & games_qepc["away_score"].notna()

# Backtest window: from the season start through yesterday.
# NOTE: the upper bound depends on the day the cell is run, so the pool grows
# over time; pin `yesterday` to a fixed date for a reproducible backtest.
season_start = dt.date(2024, 10, 1)
today = dt.date.today()
yesterday = today - dt.timedelta(days=1)

season_mask = (games_qepc["game_date"] >= season_start) & (games_qepc["game_date"] <= yesterday)

# .copy() so later edits to the pool never touch games_qepc
backtest_pool = games_qepc[completed_mask & season_mask].copy()

print("Backtest pool games:", len(backtest_pool))
print("Date range:",
      backtest_pool["game_date"].min(),
      "→",
      backtest_pool["game_date"].max())


Backtest pool games: 1795
Date range: 2024-10-04 → 2025-12-05


In [52]:
# ==========================================
# CELL 3A – BUILD TEAM-GAME REST / B2B INFO
# ==========================================
import pandas as pd

# Normalise game_date to plain datetime.date objects regardless of the
# incoming dtype (strings, date objects and datetime64 are all accepted),
# so the column has one consistent type for sorting and date arithmetic.
games_qepc["game_date"] = pd.to_datetime(games_qepc["game_date"]).dt.date


def _team_side_rows(games, team_col, is_home):
    """Return one row per game for a single side (home or away) of each matchup.

    ``team_col`` names the column of ``games`` holding that side's team id;
    ``is_home`` is stored verbatim in the ``is_home`` column of every row.
    """
    return pd.DataFrame({
        "game_id": games["game_id"].to_numpy(),
        "team_id": games[team_col].to_numpy(),
        "is_home": is_home,
        "game_date": games["game_date"].to_numpy(),
    })


# Build a long table: one row per (team, game).
# Built column-wise instead of with a Python-level iterrows() loop; the stable
# sort_index interleaves the two halves so each game's home row is directly
# followed by its away row, exactly as the row-by-row construction produced.
team_games = (
    pd.concat([
        _team_side_rows(games_qepc, "home_team_id", True),
        _team_side_rows(games_qepc, "away_team_id", False),
    ])
    .sort_index(kind="mergesort")
    .reset_index(drop=True)
    .sort_values(["team_id", "game_date"])
)

# Previous game date per team (missing for each team's earliest row)
team_games["prev_date"] = team_games.groupby("team_id")["game_date"].shift(1)

# Days since previous game
team_games["days_since_prev"] = (
    pd.to_datetime(team_games["game_date"])
    - pd.to_datetime(team_games["prev_date"])
).dt.days

# Back-to-back flag: played yesterday.
# A missing days_since_prev compares as False, so the result is already a
# clean boolean column and needs no fillna.
team_games["is_b2b"] = team_games["days_since_prev"] == 1

# A team's earliest game in the table has no previous date → sentinel 999.
# (The shift is per team over the whole table, not per season, so only that
# very first appearance is affected.)
team_games["days_since_prev"] = team_games["days_since_prev"].fillna(999)

print("Total team-game rows:", len(team_games))
print("Total B2B appearances:", int(team_games["is_b2b"].sum()))

# Index for fast lookup: (game_id, team_id) → is_b2b, is_home, etc.
rest_idx = team_games.set_index(["game_id", "team_id"])
rest_idx.head()


Total team-game rows: 144580
Total B2B appearances: 36332


Unnamed: 0_level_0,Unnamed: 1_level_0,is_home,game_date,prev_date,days_since_prev,is_b2b
game_id,team_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
12500009,15016,False,2025-10-03,,999.0,False
12500032,15018,False,2025-10-06,,999.0,False
12500043,15018,False,2025-10-09,2025-10-06,3.0,False
12500055,15018,False,2025-10-13,2025-10-09,4.0,False
12500011,50013,False,2025-10-04,,999.0,False


In [53]:
# ==========================================
# CELL 4P – PACE-BASED BACKTEST (WITH HOME + B2B)
# ==========================================
import numpy as np

# Tunable venue / schedule adjustments, all expressed in points
HOME_BONUS = 1.5     # points added to home team
AWAY_PENALTY = 0.5   # points subtracted from away team
B2B_PENALTY = 1.5    # points subtracted if team is on B2B


def predict_team_points_pace(row, pace_index, rest_index,
                             home_bonus=HOME_BONUS,
                             away_penalty=AWAY_PENALTY,
                             b2b_penalty=B2B_PENALTY):
    """
    Estimate (home_points, away_points) for a single game.

    Each side's base score is the shared game pace (mean of both teams'
    ``pace_per_game``) multiplied by that side's ``off_rating`` / 100.
    The home side then gains ``home_bonus``, the away side loses
    ``away_penalty``, and any side flagged ``is_b2b`` for this game in
    ``rest_index`` loses ``b2b_penalty``.

    Parameters:
      - row: mapping with "game_id", "home_team_id" and "away_team_id"
      - pace_index: frame indexed by team id, with "pace_per_game" and
        "off_rating" columns
      - rest_index: frame indexed by (game_id, team_id), with an "is_b2b"
        column; a game/team pair absent from it gets no B2B adjustment

    Returns (nan, nan) when either team is absent from ``pace_index``.
    """
    home_id, away_id = row["home_team_id"], row["away_team_id"]

    # Both teams need a pace/rating profile, otherwise no prediction is made
    known_teams = pace_index.index
    if not (home_id in known_teams and away_id in known_teams):
        return np.nan, np.nan

    home_profile = pace_index.loc[home_id]
    away_profile = pace_index.loc[away_id]

    # Game pace: average of the two teams' typical possessions per game
    game_pace = (home_profile["pace_per_game"] + away_profile["pace_per_game"]) / 2.0

    def _on_back_to_back(team_id):
        # True only when this (game, team) pair exists and is flagged as B2B
        lookup_key = (row["game_id"], team_id)
        return lookup_key in rest_index.index and bool(rest_index.loc[lookup_key, "is_b2b"])

    # Pts = Possessions * OffRtg / 100, followed by the home-court tweak
    home_pts = game_pace * home_profile["off_rating"] / 100.0 + home_bonus
    away_pts = game_pace * away_profile["off_rating"] / 100.0 - away_penalty

    # Schedule tweak for sides playing on consecutive days
    if _on_back_to_back(home_id):
        home_pts = home_pts - b2b_penalty
    if _on_back_to_back(away_id):
        away_pts = away_pts - b2b_penalty

    return home_pts, away_pts


# Score every game in the backtest pool (no sub-sampling)
sample_games = backtest_pool.copy()
print("Sampled games (all):", len(sample_games))


def _prediction_record(game):
    """Return the prediction/actual record for one game, or None if unpredictable."""
    pred_home, pred_away = predict_team_points_pace(game, pace_idx, rest_idx)
    # NaN means at least one team had no pace profile → skip the game
    if np.isnan(pred_home) or np.isnan(pred_away):
        return None

    record = {
        key: game[key]
        for key in ("game_id", "game_date", "home_team_id", "away_team_id")
    }
    record["pred_home_pts"] = pred_home
    record["pred_away_pts"] = pred_away
    record["actual_home_pts"] = float(game["home_score"])
    record["actual_away_pts"] = float(game["away_score"])
    return record


# One dict per game that could be predicted, in pool order
results = [
    record
    for record in (_prediction_record(game) for _, game in sample_games.iterrows())
    if record is not None
]

print("Total matched game predictions:", len(results))


Sampled games (all): 1795
Total matched game predictions: 1795


In [54]:
# ==========================================
# CELL 5 – EVALUATE BACKTEST ERRORS
# ==========================================
# One row per predicted game
results_df = pd.DataFrame(results)

print("Backtest rows:", len(results_df))
display(results_df.head())

# Signed residuals (actual - predicted); every metric below derives from these
home_resid = results_df["actual_home_pts"] - results_df["pred_home_pts"]
away_resid = results_df["actual_away_pts"] - results_df["pred_away_pts"]

# Absolute errors
results_df["home_abs_err"] = home_resid.abs()
results_df["away_abs_err"] = away_resid.abs()

# Squared errors
results_df["home_sq_err"] = home_resid ** 2
results_df["away_sq_err"] = away_resid ** 2

# Mean absolute error per side
home_mae = results_df["home_abs_err"].mean()
away_mae = results_df["away_abs_err"].mean()

# Root mean squared error per side
home_rmse = np.sqrt(results_df["home_sq_err"].mean())
away_rmse = np.sqrt(results_df["away_sq_err"].mean())

print(f"Home MAE:  {home_mae:.2f} points")
print(f"Away MAE:  {away_mae:.2f} points")
print(f"Home RMSE: {home_rmse:.2f} points")
print(f"Away RMSE: {away_rmse:.2f} points")

# Bias (mean error: actual - predicted); positive means under-prediction
results_df["home_err"] = home_resid
results_df["away_err"] = away_resid

print("\nBias (mean error, actual - predicted):")
print(f"Home bias: {results_df['home_err'].mean():+.2f}")
print(f"Away bias: {results_df['away_err'].mean():+.2f}")


Backtest rows: 1795


Unnamed: 0,game_id,game_date,home_team_id,away_team_id,pred_home_pts,pred_away_pts,actual_home_pts,actual_away_pts
0,22500349,2025-12-05,1610612760,1610612742,111.545211,110.007727,132.0,111.0
1,22500345,2025-12-05,1610612741,1610612754,110.16308,110.511187,105.0,120.0
2,22500347,2025-12-05,1610612763,1610612746,109.587517,111.073864,107.0,98.0
3,22500348,2025-12-05,1610612749,1610612755,113.610309,107.87385,101.0,116.0
4,22500346,2025-12-05,1610612745,1610612756,113.343214,110.450376,117.0,98.0


Home MAE:  10.56 points
Away MAE:  10.69 points
Home RMSE: 13.57 points
Away RMSE: 13.77 points

Bias (mean error, actual - predicted):
Home bias: +3.64
Away bias: +3.73


In [55]:
# ==========================================
# CELL 6 – LINEAR CALIBRATION OF PREDICTIONS
# ==========================================
import numpy as np

# Least-squares line actual ≈ m * predicted + b, fitted per side.
# NOTE: the line is fitted on, and then scored against, the same rows of
# results_df, so the calibrated metrics below are in-sample figures.
home_m, home_b = np.polyfit(results_df["pred_home_pts"], results_df["actual_home_pts"], 1)
away_m, away_b = np.polyfit(results_df["pred_away_pts"], results_df["actual_away_pts"], 1)

print("Home calibration: actual_home ≈ "
      f"{home_m:.3f} * pred_home + {home_b:.3f}")
print("Away calibration: actual_away ≈ "
      f"{away_m:.3f} * pred_away + {away_b:.3f}")

# Calibrated predictions: push the raw predictions through the fitted lines
results_df["cal_pred_home_pts"] = home_m * results_df["pred_home_pts"] + home_b
results_df["cal_pred_away_pts"] = away_m * results_df["pred_away_pts"] + away_b

# Signed residuals of the calibrated predictions (actual - calibrated)
cal_home_resid = results_df["actual_home_pts"] - results_df["cal_pred_home_pts"]
cal_away_resid = results_df["actual_away_pts"] - results_df["cal_pred_away_pts"]

# Absolute and squared errors after calibration
results_df["cal_home_abs_err"] = cal_home_resid.abs()
results_df["cal_away_abs_err"] = cal_away_resid.abs()

results_df["cal_home_sq_err"] = cal_home_resid ** 2
results_df["cal_away_sq_err"] = cal_away_resid ** 2

cal_home_mae = results_df["cal_home_abs_err"].mean()
cal_away_mae = results_df["cal_away_abs_err"].mean()
cal_home_rmse = np.sqrt(results_df["cal_home_sq_err"].mean())
cal_away_rmse = np.sqrt(results_df["cal_away_sq_err"].mean())

# Bias after calibration (mean signed residual)
results_df["cal_home_err"] = cal_home_resid
results_df["cal_away_err"] = cal_away_resid

cal_home_bias = results_df["cal_home_err"].mean()
cal_away_bias = results_df["cal_away_err"].mean()

print("\n--- After linear calibration ---")
print(f"Calibrated Home MAE:  {cal_home_mae:.2f} points")
print(f"Calibrated Away MAE:  {cal_away_mae:.2f} points")
print(f"Calibrated Home RMSE: {cal_home_rmse:.2f} points")
print(f"Calibrated Away RMSE: {cal_away_rmse:.2f} points")
print(f"Calibrated Home bias: {cal_home_bias:+.2f}")
print(f"Calibrated Away bias: {cal_away_bias:+.2f}")


Home calibration: actual_home ≈ 0.868 * pred_home + 18.316
Away calibration: actual_away ≈ 0.918 * pred_away + 12.741

--- After linear calibration ---
Calibrated Home MAE:  10.12 points
Calibrated Away MAE:  10.26 points
Calibrated Home RMSE: 13.07 points
Calibrated Away RMSE: 13.26 points
Calibrated Home bias: +0.00
Calibrated Away bias: -0.00
