# 🔬 QEPC Simple Backtest

This notebook works with your **current project structure** without needing any file modifications.

---

In [None]:
# --- Robust bootstrap to load notebook_header.py no matter where Jupyter started ---
#
# Finds the directory that holds notebook_header.py (the working directory or one
# of its five nearest ancestors), puts it on sys.path, loads the header module by
# file path, and calls its setup hook to obtain the data directories.

import sys
import importlib.util
from pathlib import Path

# Probe order: working directory first, then at most five ancestors.
start_dir = Path.cwd()
search_dirs = [start_dir, *start_dir.parents][:6]

project_root = next(
    (folder for folder in search_dirs if (folder / "notebook_header.py").exists()),
    None,
)
if project_root is None:
    raise FileNotFoundError(
        "Could not find notebook_header.py in the current directory or its parents."
    )

# Put the project root at the front of the import search path (once).
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

# Load the header from its explicit file path; the module is registered in
# sys.modules before its code is executed.
header_path = project_root / "notebook_header.py"
spec = importlib.util.spec_from_file_location("notebook_header", header_path)
notebook_header = importlib.util.module_from_spec(spec)
sys.modules[spec.name] = notebook_header
spec.loader.exec_module(notebook_header)

env = notebook_header.qepc_notebook_setup(run_diagnostics=False)
data_dir, raw_dir = env.data_dir, env.raw_dir

from qepc_autoload import qepc_step  # import placed after the sys.path update above

print("‚úÖ QEPC environment initialized")
print("project_root:", project_root)
print("data_dir:", data_dir)
print("raw_dir:", raw_dir)


In [None]:
# CELL 2: LOAD DATA
#
# Reads the first available game-data CSV under data_dir into `game_data`
# (left as None when nothing loads), then normalizes its date column into a
# datetime `gameDate` column and discards rows whose date cannot be parsed.

import pandas as pd

# The data directory was already resolved by qepc_notebook_setup
data_dir = env.data_dir
print("Using data_dir:", data_dir)

# Candidate files are probed in this order; the first one that reads cleanly wins
game_data = None
for filename in ('raw/TeamStatistics.csv', 'GameResults_2025.csv', 'Games.csv'):
    path = data_dir / filename
    print("Checking:", path)  # debug
    if not path.exists():
        continue
    try:
        game_data = pd.read_csv(path)
        print(f"‚úÖ Loaded: {filename} ({len(game_data):,} rows)")
    except Exception as e:
        # A file that fails to read is reported and the next candidate is tried
        print(f"‚ö†Ô∏è Failed to read {filename}: {e}")
    else:
        break

if game_data is not None:
    # Pick the first recognised date column name present in the data
    date_col = next(
        (name for name in ['gameDate', 'Date', 'date', 'GAME_DATE'] if name in game_data.columns),
        None,
    )

    if not date_col:
        print(f"‚ö†Ô∏è No date column found. Columns: {list(game_data.columns)[:10]}")
    else:
        # Unparseable values become NaT (errors='coerce') and are removed below
        game_data['gameDate'] = pd.to_datetime(game_data[date_col], errors='coerce')

        bad_dates = game_data['gameDate'].isna()
        invalid_count = bad_dates.sum()

        if invalid_count > 0:
            print(f"‚ö†Ô∏è Dropped {invalid_count} rows with invalid dates")
            game_data = game_data[~bad_dates].copy()

        if len(game_data) == 0:
            print("‚ùå No valid dates found in data")
        else:
            print(f"üìÖ Date range: {game_data['gameDate'].min().date()} to {game_data['gameDate'].max().date()}")

    print(f"üìã Columns: {list(game_data.columns)[:8]}...")
else:
    print("‚ùå No game data found!")
    print(f"   Looked in: {data_dir}")


In [None]:
# CELL 3: IMPORT QEPC MODULES
# Probe for the QEPC modules and record the outcome in USE_QEPC
# (True when all three imports succeed, False on ImportError).
try:
    from qepc.sports.nba.strengths_v2 import calculate_advanced_strengths
    from qepc.core.lambda_engine import compute_lambda
    from qepc.core.simulator import run_qepc_simulation
except ImportError as e:
    USE_QEPC = False
    print(f"‚ö†Ô∏è QEPC import error: {e}")
    print("   Will use simple fallback prediction...")
else:
    USE_QEPC = True
    print("‚úÖ QEPC modules loaded!")

In [None]:
# CELL 4: RUN BACKTEST (quiet logging + in-place progress)
#
# Replays the most recent BACKTEST_WINDOW_DAYS of home games, predicts each one
# (QEPC simulation, or a constant baseline), and collects one row per game in
# `results_df`. Games that cannot be scored honestly are skipped and counted
# instead of being silently dropped or filled with made-up numbers.

from datetime import timedelta
import pandas as pd

# Core QEPC imports. Guarded so a missing QEPC install degrades to the constant
# baseline instead of aborting the cell with ImportError.
try:
    from qepc.sports.nba.strengths_v2 import calculate_advanced_strengths
    from qepc.core.lambda_engine import compute_lambda
    from qepc.core.simulator import run_qepc_simulation
    QEPC_AVAILABLE = True
except ImportError as e:
    print(f"‚ö†Ô∏è QEPC import error: {e}")
    print("   Will use simple fallback prediction...")
    QEPC_AVAILABLE = False

USE_QEPC = QEPC_AVAILABLE  # set to False if you ever want a dumb baseline

BACKTEST_WINDOW_DAYS = 30                # how far back from the latest game to replay
NUM_TRIALS = 2000                        # simulation trials per game
BASELINE_PREDICTION = (112, 109, 0.55)   # (home score, away score, home win probability)


def _team_label(value, last_word_only=False):
    """Return a team name as a string, or None when the value is missing or blank."""
    if value is None or pd.isna(value):
        return None
    text = str(value)
    words = text.split()
    if not words:
        return None
    return words[-1] if last_word_only else text


def _as_score(value):
    """Return a score as a number, or None when it is missing, NaN or non-numeric."""
    try:
        score = float(value)
    except (TypeError, ValueError):
        return None
    if pd.isna(score):
        return None
    # Keep whole-number scores as ints so saved output stays free of ".0"
    return int(score) if score.is_integer() else score


def resolve_team_names(game):
    """Return (home_team, away_team) for one game row, or None if either is missing.

    Uses teamName / opponentTeamName when both columns exist; otherwise falls
    back to the last word of Home_Team / Away_Team (strips the city).
    """
    if "teamName" in game.index and "opponentTeamName" in game.index:
        home_team = _team_label(game["teamName"])
        away_team = _team_label(game["opponentTeamName"])
    else:
        home_team = _team_label(game.get("Home_Team"), last_word_only=True)
        away_team = _team_label(game.get("Away_Team"), last_word_only=True)
    if home_team is None or away_team is None:
        return None
    return home_team, away_team


def extract_actual_scores(game):
    """Return (home_score, away_score) for one game row, or None if either is unusable.

    No default is substituted for a missing score: an invented "actual" result
    would corrupt both the accuracy and the error metrics.
    """
    actual_home = _as_score(game.get("teamScore", game.get("Home_Score")))
    actual_away = _as_score(game.get("opponentScore", game.get("Away_Score")))
    if actual_home is None or actual_away is None:
        return None
    return actual_home, actual_away


def predict_game(home_team, away_team, strengths):
    """Run the QEPC pipeline for one matchup.

    Returns (pred_home, pred_away, home_win_prob), or None when compute_lambda
    produces an empty table for the matchup.
    """
    schedule = pd.DataFrame([{"Home Team": home_team, "Away Team": away_team}])
    # verbose=False turns off logging inside the lambda engine
    schedule_lambda = compute_lambda(schedule, strengths, verbose=False)
    if schedule_lambda.empty:
        return None
    predictions = run_qepc_simulation(schedule_lambda, num_trials=NUM_TRIALS)
    row0 = predictions.iloc[0]
    return (
        row0.get("Sim_Home_Score", 110),
        row0.get("Sim_Away_Score", 108),
        row0.get("Home_Win_Prob", 0.5),
    )


def backtest_one_game(game, strengths, known_teams, use_qepc):
    """Score a single game row.

    Args:
        game: one row of the game table (a pandas Series).
        strengths: strengths table passed through to compute_lambda.
        known_teams: set of team names to accept, or None to accept every team.
        use_qepc: run the QEPC simulation when True, else use BASELINE_PREDICTION.

    Returns:
        (record, skip_reason, sim_error). `record` is the result dict, or None
        when the game was skipped, in which case `skip_reason` says why.
        `sim_error` is set when the simulation raised and the constant baseline
        was recorded in its place.
    """
    teams = resolve_team_names(game)
    if teams is None or (known_teams is not None and not known_teams.issuperset(teams)):
        return None, "team not in strengths table", None

    scores = extract_actual_scores(game)
    if scores is None:
        return None, "missing or non-numeric actual score", None

    home_team, away_team = teams
    actual_home, actual_away = scores

    sim_error = None
    prediction = BASELINE_PREDICTION
    if use_qepc:
        try:
            prediction = predict_game(home_team, away_team, strengths)
        except Exception as exc:
            # Best-effort: keep the game with the baseline, but surface the failure
            sim_error = f"{type(exc).__name__}: {exc}"
            prediction = BASELINE_PREDICTION
        if prediction is None:
            return None, "compute_lambda returned no rows", None

    pred_home, pred_away, home_win_prob = prediction
    actual_home_won = actual_home > actual_away
    pred_home_won = home_win_prob > 0.5

    record = {
        "Date": game["gameDate"],
        "Home_Team": home_team,
        "Away_Team": away_team,
        "Pred_Home": pred_home,
        "Pred_Away": pred_away,
        "Actual_Home": actual_home,
        "Actual_Away": actual_away,
        "Home_Win_Prob": home_win_prob,
        "Correct": actual_home_won == pred_home_won,
        "Total_Error": abs((pred_home + pred_away) - (actual_home + actual_away)),
    }
    return record, None, sim_error


# Reset first so later cells never pick up results from an earlier run
results_df = pd.DataFrame()

if 'game_data' not in dir() or game_data is None:
    print("‚ùå No game data to backtest")
elif 'gameDate' not in game_data.columns:
    print("   Game data has no 'gameDate' column; cannot select a backtest window.")
else:
    print("\nüöÄ Running backtest...")

    # Get home games only (if we have a 'home' flag)
    if 'home' in game_data.columns:
        home_games = game_data[game_data['home'] == 1].copy()
    else:
        home_games = game_data.copy()

    # Limit to the most recent window of data
    latest = home_games['gameDate'].max()
    cutoff = latest - timedelta(days=BACKTEST_WINDOW_DAYS)
    backtest_games = home_games[home_games['gameDate'] >= cutoff].copy()

    total_games = len(backtest_games)
    print(f"üìä Backtesting {total_games} games from last {BACKTEST_WINDOW_DAYS} days")

    # Compute team strengths ONCE; without QEPC there is no table and no team filter
    strengths = None
    known_teams = None
    ready = True
    if QEPC_AVAILABLE:
        strengths = calculate_advanced_strengths(verbose=False)
        if strengths is None or strengths.empty:
            print("‚ùå Could not compute team strengths; aborting backtest.")
            ready = False
        else:
            known_teams = {str(t) for t in strengths["Team"].unique()}
            print(f"‚úÖ Strengths table ready for {len(known_teams)} teams")

    if ready:
        results = []
        skip_counts = {}
        fallback_games = 0
        first_error = None

        for n, (_, game) in enumerate(backtest_games.iterrows(), start=1):
            try:
                record, skip_reason, sim_error = backtest_one_game(
                    game, strengths, known_teams, USE_QEPC
                )
            except Exception as exc:
                # Keep going past a game that blows up, but count and report it
                record, skip_reason = None, "unexpected error"
                sim_error = f"{type(exc).__name__}: {exc}"

            if record is not None:
                results.append(record)
                if sim_error is not None:
                    fallback_games += 1
            else:
                skip_counts[skip_reason] = skip_counts.get(skip_reason, 0) + 1
            if sim_error is not None and first_error is None:
                first_error = sim_error

            # In-place progress update (runs for skipped games too)
            if n % 5 == 0 or n == total_games:
                print(f"   Processed {n}/{total_games} games...", end="\r")

        # Final newline so the last progress line doesn't overwrite the next print
        print()
        results_df = pd.DataFrame(results)
        print(f"\n‚úÖ Processed {len(results_df)} games")

        for reason, count in skip_counts.items():
            print(f"   Skipped {count} games: {reason}")
        if fallback_games:
            print(f"   {fallback_games} games used the constant fallback prediction (simulation failed)")
        if first_error is not None:
            print(f"   First error: {first_error}")


In [None]:
# CELL 5: SHOW RESULTS
# Prints the headline metrics (win accuracy, mean total-score error) and the
# three smallest / three largest total-score errors from results_df.
has_results = 'results_df' in dir() and len(results_df) > 0

if not has_results:
    print("‚ùå No results to show")
else:
    win_acc = results_df['Correct'].mean()
    avg_error = results_df['Total_Error'].mean()
    banner = "=" * 50

    print("\n" + banner)
    print("üìä BACKTEST RESULTS")
    print(banner)
    print(f"Games Analyzed:  {len(results_df)}")
    print(f"Win Accuracy:    {win_acc:.1%}")
    print(f"Avg Total Error: {avg_error:.1f} pts")
    print(banner)

    # Same listing for both extremes; only the heading and the row selector differ
    for heading, pick_rows in (
        ("\nüèÜ Best predictions:", results_df.nsmallest),
        ("\n‚ö†Ô∏è Worst predictions:", results_df.nlargest),
    ):
        print(heading)
        for _, r in pick_rows(3, 'Total_Error').iterrows():
            print(f"   {r['Away_Team'][:20]:20} @ {r['Home_Team'][:20]:20} | Error: {r['Total_Error']:.0f}")

In [None]:
# CELL 6: SAVE RESULTS
# Writes results_df to a timestamped CSV under <project_root>/data/results/backtests
# and prints a short summary. Does nothing but print the closing line when there
# are no results.

# `datetime` is not imported by any earlier cell (only `timedelta` is), so it is
# imported here; without it the filename line raises NameError.
from datetime import datetime

if 'results_df' in dir() and len(results_df) > 0:
    output_dir = project_root / "data" / "results" / "backtests"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Minute-resolution timestamp: two saves within the same minute share a name
    filename = f"Backtest_{datetime.now():%Y%m%d_%H%M}.csv"
    output_path = output_dir / filename

    results_df.to_csv(output_path, index=False)
    print(f"üíæ Saved to: {output_path}")

    # Recompute the metrics from results_df so this cell does not depend on the
    # results-display cell having been run first
    win_acc = results_df['Correct'].mean()
    avg_error = results_df['Total_Error'].mean()

    print("\nüìã SUMMARY:")
    print(f"   Win Accuracy: {win_acc:.1%}")
    print(f"   Avg Error: {avg_error:.1f} pts")
    print(f"   Games: {len(results_df)}")

print("\nüèÅ Done!")