## QEPC NBA ‚Äì Quantum Backtest Notebook

**Identity**

- QEPC (Quantum Entangled Poisson Cascade) is a **quantum-inspired sports engine**.
- It thinks in **multiverses of game outcomes**, not single-point predictions.
- Each game exists in a **superposition of scripts** (Grind / Balanced / Chaos / etc.) until ‚Äúcollapse,‚Äù and QEPC‚Äôs job is to find the most probable collapse.
- Team and player stats are treated as **entangled variables**: pace, usage, injuries, volatility and matchup context all interact, not just add up.

**Core Objective**

1. Build the **most accurate real sports prediction model** we can, starting with NBA totals, spreads, and win probabilities.
2. Let **data and backtests** decide what survives:
   - Verifiable statistics > narrative.
   - No vibes, no astrology, no ‚Äúrevenge game‚Äù fluff.
3. Use this notebook as a **truth table**:
   - Measure QEPC‚Äôs raw performance.
   - Add calibration layers.
   - Track improvements over time (Win%, MAE, and out-of-sample results).


## üîß Setup

In [None]:
# --- Robust bootstrap to load notebook_header.py no matter where Jupyter started ---

import sys
import importlib.util
from pathlib import Path

# 1) Find the project root: the folder that contains notebook_header.py
cur = Path.cwd()
project_root = None

for _ in range(6):  # walk up a few levels just in case
    if (cur / "notebook_header.py").exists():
        project_root = cur
        break
    cur = cur.parent

if project_root is None:
    raise FileNotFoundError(
        "Could not find notebook_header.py in the current directory or its parents."
    )

# 2) Make sure project root is on sys.path
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# 3) Load notebook_header.py as a proper module
header_path = project_root / "notebook_header.py"
spec = importlib.util.spec_from_file_location("notebook_header", header_path)
notebook_header = importlib.util.module_from_spec(spec)

# IMPORTANT: register it in sys.modules so @dataclass doesn't break
sys.modules[spec.name] = notebook_header

spec.loader.exec_module(notebook_header)

# 4) Now call qepc_notebook_setup from that module
env = notebook_header.qepc_notebook_setup(run_diagnostics=False)
data_dir = env.data_dir
raw_dir = env.raw_dir

from qepc_autoload import qepc_step

print("‚úÖ QEPC environment initialized")
print("project_root:", project_root)
print("data_dir:", data_dir)
print("raw_dir:", raw_dir)


In [None]:
# CELL 2: Load 10-year team game logs (NBA_API_QEPC_Format)

import pandas as pd

# Point this to wherever you put the file:
# Example: data/raw/NBA_API_QEPC_Format.csv
team_stats_path = data_dir / "raw" / "NBA_API_QEPC_Format.csv"  # or .xls, both ok with read_csv

if not team_stats_path.exists():
    raise FileNotFoundError(f"NBA_API_QEPC_Format file not found at {team_stats_path}")

qepc_step(f"Loading 10-year team game logs from {team_stats_path}")

team_stats = pd.read_csv(team_stats_path)

# Make sure we have a usable date column
if "gameDate" not in team_stats.columns:
    raise ValueError(f"'gameDate' column not found. Columns: {list(team_stats.columns)[:15]}")

team_stats["gameDate"] = pd.to_datetime(team_stats["gameDate"], errors="coerce")

invalid = team_stats["gameDate"].isna().sum()
if invalid > 0:
    print(f"‚ö†Ô∏è Dropped {invalid} rows with invalid dates")
    team_stats = team_stats[team_stats["gameDate"].notna()].copy()

if len(team_stats) == 0:
    raise RuntimeError("After dropping invalid dates, no rows remain in team_stats.")

print(f"‚úÖ Loaded team_stats: {len(team_stats):,} rows")
print(f"üìÖ Date range: {team_stats['gameDate'].min().date()} to {team_stats['gameDate'].max().date()}")
print("üìã Columns:", list(team_stats.columns)[:12], "...")


In [None]:
# CELL 3: Build backtest_games = one row per game from 10-year log

from datetime import timedelta
import pandas as pd

# --- CONFIG ---
LOOKBACK_MODE  = "all"    # "days", "years", or "all"
# LOOKBACK_DAYS  = 30       # used if LOOKBACK_MODE == "days"
# LOOKBACK_YEARS = 3        # used if LOOKBACK_MODE == "years"

latest_date = team_stats["gameDate"].max()
earliest_date = team_stats["gameDate"].min()

if LOOKBACK_MODE == "days":
    BACKTEST_START = latest_date - timedelta(days=int(LOOKBACK_DAYS))
elif LOOKBACK_MODE == "years":
    BACKTEST_START = latest_date - timedelta(days=365 * int(LOOKBACK_YEARS))
elif LOOKBACK_MODE == "all":
    BACKTEST_START = earliest_date
else:
    raise ValueError(f"Unknown LOOKBACK_MODE: {LOOKBACK_MODE}")

BACKTEST_END = latest_date

print("üéØ Backtest Configuration")
print(f"   Mode:  {LOOKBACK_MODE}")
print(f"   Start: {BACKTEST_START.date()}")
print(f"   End:   {BACKTEST_END.date()}")
print(f"   Span:  {(BACKTEST_END - BACKTEST_START).days} days")

# Filter to window
mask = (team_stats["gameDate"] >= BACKTEST_START) & (team_stats["gameDate"] <= BACKTEST_END)
window_stats = team_stats[mask].copy()

print(f"\nüßπ Filtered to {len(window_stats)} rows in window.")

# --- NEW PART: build one row per game using gameId + home flag ---

if "gameId" not in window_stats.columns or "home" not in window_stats.columns:
    raise RuntimeError("Expected 'gameId' and 'home' columns in team_stats for game grouping.")

games_rows = []

for gid, group in window_stats.groupby("gameId"):
    # try to identify home/away rows
    home_rows = group[group["home"] == 1]
    away_rows = group[group["home"] == 0]

    if len(home_rows) == 0 or len(away_rows) == 0:
        # skip weird/incomplete games
        continue

    home_row = home_rows.iloc[0]
    away_row = away_rows.iloc[0]

    # Build nice names, coerce to string
    home_city  = str(home_row.get("teamCity", "") or "")
    home_name  = str(home_row.get("teamName", "") or "")
    away_city  = str(away_row.get("teamCity", "") or "")
    away_name  = str(away_row.get("teamName", "") or "")

    home_full = (home_city + " " + home_name).strip()
    away_full = (away_city + " " + away_name).strip()

    # Scores: use the teamScore columns from each row
    home_score = home_row.get("teamScore", 0)
    away_score = away_row.get("teamScore", 0)

    games_rows.append({
        "gameId": gid,
        "gameDate": home_row["gameDate"],  # same for both rows
        "Home_Team_Full": home_full,
        "Away_Team_Full": away_full,
        "Home_Score": home_score,
        "Away_Score": away_score,
        "Home_Short": home_name,
        "Away_Short": away_name,
    })

backtest_games = pd.DataFrame(games_rows)

print(f"üèÄ Built backtest_games with {len(backtest_games)} games (one row per game)")

print("\nüîç Sample of backtest_games:")
display(
    backtest_games[
        ["gameDate", "Home_Team_Full", "Away_Team_Full", "Home_Score", "Away_Score"]
    ].head()
)


In [None]:
# CELL 4: Compute team strengths table (ORtg/DRtg/Pace/Volatility)

from qepc.sports.nba.strengths_v2 import calculate_advanced_strengths

print("‚ßâ QEPC: Calculating team strengths...")

# Let strengths_v2 load from the canonical CSVs it expects
strengths_df = calculate_advanced_strengths(
    verbose=True,    # show one-time logs
)

if strengths_df is None or strengths_df.empty:
    raise RuntimeError("Strengths table is empty - cannot continue backtest.")

print(f"\n‚úÖ Calculated strengths for {len(strengths_df)} teams.")
print(strengths_df.head())


In [None]:
# CELL 5: Build schedule and compute raw lambdas for all backtest games

from qepc.core.lambda_engine import compute_lambda
import pandas as pd

if "backtest_games" not in globals() or len(backtest_games) == 0:
    raise RuntimeError("backtest_games is empty or not defined. Run Cell 3 first.")

# Filter to games where both teams exist in strengths_df
known_teams = set(str(t) for t in strengths_df["Team"].unique())

mask_known = (
    backtest_games["Home_Short"].isin(known_teams)
    & backtest_games["Away_Short"].isin(known_teams)
)
games_for_sim = backtest_games[mask_known].reset_index(drop=True)

skipped_unknown = len(backtest_games) - len(games_for_sim)
print(f"üß™ Using {len(games_for_sim)} games with known strengths (skipped {skipped_unknown}).")

if len(games_for_sim) == 0:
    raise RuntimeError("No games with matching team strengths - check naming alignment.")

# Build schedule DataFrame for lambda engine
schedule_df = pd.DataFrame({
    "Home Team": games_for_sim["Home_Short"],
    "Away Team": games_for_sim["Away_Short"],
})

print("\n‚ßâ QEPC: Computing raw lambdas for backtest schedule...")
lambda_df = compute_lambda(schedule_df, strengths_df)

print(f"‚úÖ Lambda table shape: {lambda_df.shape}")
display(lambda_df.head())


In [None]:
# CELL 6: Patch vol_home / vol_away to use strengths_df Volatility

# Drop any existing vol_* columns from lambda_df
for col in ["vol_home", "vol_away"]:
    if col in lambda_df.columns:
        lambda_df.drop(columns=[col], inplace=True)

# Map team -> volatility from strengths_df
vol_map = strengths_df.set_index("Team")["Volatility"]

lambda_df["vol_home"] = lambda_df["Home Team"].map(vol_map).astype(float)
lambda_df["vol_away"] = lambda_df["Away Team"].map(vol_map).astype(float)

print("Sample of lambda_df with patched vol columns:")
display(lambda_df[["Home Team", "Away Team", "lambda_home", "lambda_away", "vol_home", "vol_away"]].head())


In [None]:
# CELL 7: Run QEPC simulation on all games

from qepc.core.simulator import run_qepc_simulation

print("‚ßâ QEPC: Running QEPC simulation across backtest games...")
sim_df = run_qepc_simulation(lambda_df, num_trials=5000)

if sim_df is None or sim_df.empty:
    raise RuntimeError("Simulation returned no data.")

print(f"‚úÖ Simulation complete for {len(sim_df)} games")
display(sim_df.head())


In [None]:
# CELL 8: Combine predictions with actual results

# Align indices
sim_df = sim_df.reset_index(drop=True)
games_for_sim = games_for_sim.reset_index(drop=True)

if len(sim_df) != len(games_for_sim):
    raise RuntimeError("Mismatch between sim_df and games_for_sim lengths.")

results_df = pd.DataFrame({
    "Date":             games_for_sim["gameDate"],
    "Home_Team":        games_for_sim["Home_Team_Full"],
    "Away_Team":        games_for_sim["Away_Team_Full"],
    "Home_Short":       games_for_sim["Home_Short"],
    "Away_Short":       games_for_sim["Away_Short"],
    "Pred_Home_Score":  sim_df["Sim_Home_Score"],
    "Pred_Away_Score":  sim_df["Sim_Away_Score"],
})

results_df["Pred_Total"]   = results_df["Pred_Home_Score"] + results_df["Pred_Away_Score"]
results_df["Pred_Spread"]  = results_df["Pred_Home_Score"] - results_df["Pred_Away_Score"]

# Win prob if present
results_df["Home_Win_Prob"] = sim_df.get("Home_Win_Prob", 0.5)

# Actuals from games_for_sim
results_df["Actual_Home_Score"] = games_for_sim["Home_Score"]
results_df["Actual_Away_Score"] = games_for_sim["Away_Score"]
results_df["Actual_Total"]      = results_df["Actual_Home_Score"] + results_df["Actual_Away_Score"]
results_df["Actual_Spread"]     = results_df["Actual_Home_Score"] - results_df["Actual_Away_Score"]

# Winner correctness
results_df["Winner_Correct"] = (
    (results_df["Actual_Home_Score"] > results_df["Actual_Away_Score"]) ==
    (results_df["Home_Win_Prob"] > 0.5)
)

# Errors
results_df["Error_Total"]  = (results_df["Pred_Total"]  - results_df["Actual_Total"]).abs()
results_df["Error_Spread"] = (results_df["Pred_Spread"] - results_df["Actual_Spread"]).abs()

print(f"‚úÖ Built results_df with {len(results_df)} games")
display(results_df.head())


In [None]:
# CELL 9: Backtest summary metrics

if len(results_df) == 0:
    print("‚ùå No results to summarize.")
else:
    n_games = len(results_df)
    win_acc = results_df["Winner_Correct"].mean()
    mae_total = results_df["Error_Total"].mean()
    mae_spread = results_df["Error_Spread"].mean()

    print("üìä BACKTEST RESULTS")
    print("==================================================")
    print(f"Games Analyzed:  {n_games}")
    print(f"Win Accuracy:    {win_acc * 100:.1f}%")
    print(f"Avg Total Error: {mae_total:.1f} pts")
    print(f"Avg Spread Error:{mae_spread:.1f} pts")
    print("==================================================\n")

    # Best / worst by total error
    best = results_df.nsmallest(5, "Error_Total")
    worst = results_df.nlargest(5, "Error_Total")

    print("üèÜ Best predictions (by total error):")
    for _, row in best.iterrows():
        print(
            f"   {row['Away_Team']} @ {row['Home_Team']} "
            f"| Pred {row['Pred_Total']:.1f}, Actual {row['Actual_Total']:.1f} "
            f"| Error: {row['Error_Total']:.1f}"
        )

    print("\n‚ö†Ô∏è Worst predictions (by total error):")
    for _, row in worst.iterrows():
        print(
            f"   {row['Away_Team']} @ {row['Home_Team']} "
            f"| Pred {row['Pred_Total']:.1f}, Actual {row['Actual_Total']:.1f} "
            f"| Error: {row['Error_Total']:.1f}"
        )


In [None]:
# CELL: Train/Test split for total calibration (out-of-sample)

import numpy as np

if "results_df" not in globals() or results_df.empty:
    raise RuntimeError("results_df is empty ‚Äì run the main backtest cells first.")

# 1) Sort by date to simulate "time passing"
results_sorted = results_df.sort_values("Date").reset_index(drop=True)
n = len(results_sorted)
split_idx = int(n * 0.6)  # 60% train, 40% test

train = results_sorted.iloc[:split_idx].copy()
test  = results_sorted.iloc[split_idx:].copy()

print(f"Total games: {n}")
print(f"Train games: {len(train)}")
print(f"Test games:  {len(test)}")

# 2) Fit calibration on TRAIN: Actual ‚âà a + b * Pred_Total
x_train = train["Pred_Total"].values
y_train = train["Actual_Total"].values

A = np.vstack([x_train, np.ones_like(x_train)]).T
b_slope, a_intercept = np.linalg.lstsq(A, y_train, rcond=None)[0]

print(f"\nüìê Calibration fit on TRAIN:")
print(f"   Actual ‚âà {a_intercept:.2f} + {b_slope:.3f} * Pred_Total")

# 3) Apply calibration to TEST
test = test.copy()
test["Pred_Total_raw"] = test["Pred_Total"]
test["Pred_Total_cal"] = a_intercept + b_slope * test["Pred_Total_raw"]

test["Error_Total_raw"] = (test["Pred_Total_raw"] - test["Actual_Total"]).abs()
test["Error_Total_cal"] = (test["Pred_Total_cal"] - test["Actual_Total"]).abs()

mae_raw = test["Error_Total_raw"].mean()
mae_cal = test["Error_Total_cal"].mean()

# 4) Win% is unaffected by total calibration (we still use Home_Win_Prob),
#    but we can report it for the TEST window for reference.
win_acc_test = test["Winner_Correct"].mean() if "Winner_Correct" in test.columns else np.nan

print("\nüìä OUT-OF-SAMPLE TEST RESULTS")
print("==================================================")
print(f"Games in TEST:          {len(test)}")
print(f"Win Accuracy (TEST):    {win_acc_test * 100:.1f}%")
print(f"Avg Total Error (raw):  {mae_raw:.2f} pts")
print(f"Avg Total Error (cal):  {mae_cal:.2f} pts")
print("==================================================")


In [1]:
import pandas as pd
from pathlib import Path

# Try to get the true project root from QEPC's autoload paths module
try:
    from qepc.autoload.paths import get_project_root
    project_root = get_project_root()
except Exception:
    # Fallback if that import fails for some reason
    project_root = Path.cwd()
    print("‚ö†Ô∏è Falling back to cwd as project root")

print("Project root:", project_root)

# Helper: pick the "best" match for a file name among many
def pick_best_match(matches):
    if not matches:
        return None
    # Prefer paths that live under a 'data' folder and NOT under 'notebooks'
    scored = []
    for p in matches:
        score = 0
        parts = [str(part).lower() for part in p.parts]
        if "data" in parts:
            score += 2
        if "raw" in parts:
            score += 1
        if "props" in parts:
            score += 1
        if "results" in parts:
            score += 1
        if "notebooks" in parts:
            score -= 2
        if ".ipynb_checkpoints" in str(p):
            score -= 5
        scored.append((score, p))
    scored.sort(key=lambda x: x[0], reverse=True)
    return scored[0][1]

# (label, filename)
targets = [
    # Core game/team data
    ("TeamStatistics (team game logs)",      "TeamStatistics.csv"),
    ("Team_Stats (team season stats)",       "Team_Stats.csv"),
    ("PlayerStatistics (player logs)",       "PlayerStatistics.csv"),
    ("Canonical Games (schedule)",           "Games.csv"),
    ("GameResults_2025 (results)",           "GameResults_2025.csv"),
    ("Schedule_with_Rest",                   "Schedule_with_Rest.csv"),
    ("TeamForm",                             "TeamForm.csv"),

    # Roster / players
    ("Players",                              "Players.csv"),
    ("Players_Processed",                    "Players_Processed.csv"),

    # Injuries
    ("Injury_Overrides",                     "Injury_Overrides.csv"),
    ("Injury_Overrides_MASTER",              "Injury_Overrides_MASTER.csv"),
    ("Injury_Overrides_live_espn",           "Injury_Overrides_live_espn.csv"),

    # Props / aggregates
    ("Player_Season_Averages",               "Player_Season_Averages.csv"),
    ("Player_Averages_With_CI",              "Player_Averages_With_CI.csv"),
    ("Player_Recent_Form_L5",                "Player_Recent_Form_L5.csv"),
    ("Player_Recent_Form_L10",               "Player_Recent_Form_L10.csv"),
    ("Player_Recent_Form_L15",               "Player_Recent_Form_L15.csv"),
    ("Player_Home_Away_Splits",              "Player_Home_Away_Splits.csv"),
]

def preview_by_filename(label: str, filename: str, n: int = 3):
    print("\n" + "=" * 80)
    print(f"üìÑ {label}")
    print(f"Looking for filename: {filename}")

    # Find all matches anywhere under project_root
    matches = [p for p in project_root.rglob(filename)]
    if not matches:
        print("‚ö†Ô∏è No matches found in project.")
        return

    print("Found matches:")
    for m in matches:
        try:
            rel = m.relative_to(project_root)
        except ValueError:
            rel = m
        print("   ‚Ä¢", rel)

    best = pick_best_match(matches)
    if best is None:
        print("‚ö†Ô∏è Could not choose a best match.")
        return

    try:
        rel_best = best.relative_to(project_root)
    except ValueError:
        rel_best = best

    print(f"\n‚úÖ Using best match: {rel_best}")

    # Load a small sample (nrows=3) to avoid pulling full 300MB files
    try:
        df_sample = pd.read_csv(best, nrows=n)
        print(f"Sample shape: {df_sample.shape}")
        print("Columns:", list(df_sample.columns))
        print("\nSample rows:")
        display(df_sample)
    except Exception as e:
        print(f"‚ùå Error reading CSV sample: {e}")

for label, filename in targets:
    preview_by_filename(label, filename)


‚ö†Ô∏è Falling back to cwd as project root
Project root: C:\Users\wdors\qepc_project\notebooks\01_core

üìÑ TeamStatistics (team game logs)
Looking for filename: TeamStatistics.csv
‚ö†Ô∏è No matches found in project.

üìÑ Team_Stats (team season stats)
Looking for filename: Team_Stats.csv
‚ö†Ô∏è No matches found in project.

üìÑ PlayerStatistics (player logs)
Looking for filename: PlayerStatistics.csv
‚ö†Ô∏è No matches found in project.

üìÑ Canonical Games (schedule)
Looking for filename: Games.csv
‚ö†Ô∏è No matches found in project.

üìÑ GameResults_2025 (results)
Looking for filename: GameResults_2025.csv
‚ö†Ô∏è No matches found in project.

üìÑ Schedule_with_Rest
Looking for filename: Schedule_with_Rest.csv
‚ö†Ô∏è No matches found in project.

üìÑ TeamForm
Looking for filename: TeamForm.csv
‚ö†Ô∏è No matches found in project.

üìÑ Players
Looking for filename: Players.csv
‚ö†Ô∏è No matches found in project.

üìÑ Players_Processed
Looking for filename: Players_Processed.csv