# 🔬 QEPC Backtest

**Validate predictions against actual results**

This notebook:
1. Runs time-travel backtesting (only uses data available before each game)
2. Calculates accuracy metrics
3. Analyzes calibration
4. Generates visualizations

---

In [None]:
# SETUP - Run this first!
#
# Resolves the project root, puts the QEPC v2 package directory on sys.path,
# imports everything the notebook needs, and records in HAS_PLOTS whether
# matplotlib could be imported.
import os
import sys
from datetime import datetime, timedelta
from pathlib import Path

import numpy as np
import pandas as pd

# Project root (the folder that holds the data directory).
# Set the QEPC_PROJECT_ROOT environment variable to run the notebook from a
# different checkout or machine; when it is unset or empty, the original
# hard-coded location is used, so existing setups keep working unchanged.
DEFAULT_PROJECT_ROOT = r"C:\Users\wdors\qepc_project"
project_root = Path(os.environ.get("QEPC_PROJECT_ROOT") or DEFAULT_PROJECT_ROOT)

# Add the new QEPC v2 code to Python path
qepc_v2_path = project_root / "experimental" / "CLAUDE_REWRITE" / "qepc_v2"

if str(qepc_v2_path) not in sys.path:
    sys.path.insert(0, str(qepc_v2_path))

print(f"üìÅ Project root: {project_root}")
print(f"üì¶ QEPC v2 code: {qepc_v2_path}")

# Warn (rather than raise) so an environment where `qepc` is importable some
# other way still works, while a wrong path is easy to spot before the import
# below fails.
if not qepc_v2_path.is_dir():
    print(f"WARNING: QEPC v2 code directory not found: {qepc_v2_path} "
          "(set QEPC_PROJECT_ROOT to your project folder)")

# Project imports - these must come after the sys.path update above.
from qepc.data.loader import DataLoader
from qepc.sports.nba.backtest import BacktestEngine

# Optional: Plotting
try:
    import matplotlib.pyplot as plt
    HAS_PLOTS = True
    print("‚úÖ Matplotlib available")
except ImportError:
    HAS_PLOTS = False
    print("‚ö†Ô∏è  Matplotlib not available - skipping plots")

print("\n‚úÖ Ready to backtest!")

---
## ⚙️ Configure Backtest

In [None]:
# BACKTEST CONFIGURATION
# Tunable settings for the run below - edit here, then re-run from this cell down.

# Size of the look-back window, counted in days.
N_DAYS = 14

print(f"üìÖ Will backtest last {N_DAYS} days of games")

---
## 🚀 Run Backtest

In [None]:
# Wire the backtest together: the loader is given the explicit project root,
# and the engine receives that loader as its data source.
loader = DataLoader(project_root=project_root)
engine = BacktestEngine(data_loader=loader)

# Run over the configured window with progress output enabled.
backtest_options = {"n_days": N_DAYS, "verbose": True}
summary = engine.run_backtest(**backtest_options)

---
## 📊 Detailed Results

In [None]:
# Pull the engine's results into a DataFrame; later cells read `results_df`.
results_df = engine.results_to_dataframe()

# Report the empty case first, otherwise show the game count and a 10-row preview.
if results_df.empty:
    print("‚ùå No results generated")
else:
    print(f"üìä {len(results_df)} games analyzed")
    print("\nSample results:")
    display(results_df.head(10))

---
## 📈 Calibration Analysis

Do 60% predictions actually win 60% of the time?

In [None]:
# Calibration analysis: only meaningful when the backtest produced results.
if not results_df.empty:
    calibration = engine.calibration_analysis()

    # Banner, rule, explanatory caption, then a blank spacer line.
    header_lines = (
        "\nüéØ CALIBRATION ANALYSIS",
        "=" * 50,
        "(Predicted probability vs Actual win rate)",
        "",
    )
    for header_line in header_lines:
        print(header_line)

    display(calibration)

---
## 📊 Visualizations

In [None]:
# Four-panel diagnostic figure built from results_df:
#   1. histogram of spread errors        2. predicted vs actual spread scatter
#   3. win accuracy per confidence bin   4. cumulative win accuracy by game order
# Reads the columns Spread_Error, Actual_Spread, Pred_Spread, Confidence,
# Winner_Correct and Date. results_df itself is left unmodified, so the columns
# written by the save cell do not depend on whether matplotlib is installed.
if HAS_PLOTS and not results_df.empty:
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # 1. Spread Error Distribution
    ax1 = axes[0, 0]
    mean_spread_error = results_df['Spread_Error'].mean()
    ax1.hist(results_df['Spread_Error'], bins=20, edgecolor='black', alpha=0.7, color='steelblue')
    ax1.axvline(0, color='red', linestyle='--', linewidth=2, label='Perfect')
    ax1.axvline(mean_spread_error, color='orange', linestyle='--',
                linewidth=2, label=f'Mean: {mean_spread_error:.1f}')
    ax1.set_xlabel('Spread Error (Predicted - Actual)', fontsize=12)
    ax1.set_ylabel('Frequency', fontsize=12)
    ax1.set_title('Spread Prediction Error Distribution', fontsize=14)
    ax1.legend()

    # 2. Predicted vs Actual Spread
    ax2 = axes[0, 1]
    ax2.scatter(results_df['Actual_Spread'], results_df['Pred_Spread'], alpha=0.6, s=50)
    # Shared limits (padded by 5) so the diagonal spans every plotted point.
    lims = [min(results_df['Actual_Spread'].min(), results_df['Pred_Spread'].min()) - 5,
            max(results_df['Actual_Spread'].max(), results_df['Pred_Spread'].max()) + 5]
    ax2.plot(lims, lims, 'r--', linewidth=2, label='Perfect Prediction')
    ax2.set_xlabel('Actual Spread', fontsize=12)
    ax2.set_ylabel('Predicted Spread', fontsize=12)
    ax2.set_title('Predicted vs Actual Spread', fontsize=14)
    ax2.legend()

    # 3. Accuracy by Confidence
    ax3 = axes[1, 0]
    # Bucket into a standalone Series instead of adding a column to results_df:
    # the bins are only needed for this chart. include_lowest=True keeps a
    # confidence of exactly 0 in the first bin instead of turning it into NaN.
    conf_bin = pd.cut(results_df['Confidence'],
                      bins=[0, 0.4, 0.5, 0.6, 0.7, 1.0],
                      labels=['<40%', '40-50%', '50-60%', '60-70%', '>70%'],
                      include_lowest=True).rename('Conf_Bin')
    conf_acc = results_df.groupby(conf_bin, observed=True)['Winner_Correct'].agg(['mean', 'count'])

    bar_positions = range(len(conf_acc))
    bars = ax3.bar(bar_positions, conf_acc['mean'], color='steelblue')
    ax3.axhline(0.5, color='red', linestyle='--', linewidth=2, label='50% (Random)')
    ax3.set_xticks(bar_positions)
    ax3.set_xticklabels(conf_acc.index)
    ax3.set_xlabel('Model Confidence', fontsize=12)
    ax3.set_ylabel('Win Accuracy', fontsize=12)
    ax3.set_title('Win Accuracy by Confidence Level', fontsize=14)
    ax3.set_ylim(0, 1)
    ax3.legend()

    # Add count labels just above each bar
    for bar, count in zip(bars, conf_acc['count']):
        ax3.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02,
                f'n={int(count)}', ha='center', fontsize=10)

    # 4. Cumulative Accuracy Over Time
    ax4 = axes[1, 1]
    # Stable sort: games sharing a Date keep their original relative order, so
    # the running average is the same on every run.
    results_df_sorted = results_df.sort_values('Date', kind='mergesort')
    results_df_sorted['Cumulative_Accuracy'] = results_df_sorted['Winner_Correct'].expanding().mean()
    cumulative_accuracy = results_df_sorted['Cumulative_Accuracy']
    game_numbers = range(len(results_df_sorted))

    ax4.plot(game_numbers, cumulative_accuracy, linewidth=2, color='steelblue')
    ax4.axhline(0.5, color='red', linestyle='--', linewidth=2, label='50% (Random)')
    # Shade green where the running accuracy beats a coin flip, red where it trails.
    ax4.fill_between(game_numbers, 0.5, cumulative_accuracy,
                     where=cumulative_accuracy > 0.5, alpha=0.3, color='green')
    ax4.fill_between(game_numbers, 0.5, cumulative_accuracy,
                     where=cumulative_accuracy < 0.5, alpha=0.3, color='red')
    ax4.set_xlabel('Game Number', fontsize=12)
    ax4.set_ylabel('Cumulative Accuracy', fontsize=12)
    ax4.set_title('Cumulative Win Accuracy Over Time', fontsize=14)
    ax4.set_ylim(0.3, 0.8)
    ax4.legend()

    plt.tight_layout()
    plt.show()

    print("‚úÖ Visualizations complete")
elif not HAS_PLOTS:
    print("‚ö†Ô∏è  Install matplotlib for visualizations: pip install matplotlib")

---
## 🏆 Best and Worst Predictions

In [None]:
def _print_prediction_rows(games):
    """Print one line per game: hit/miss marker, the matchup, and the signed spread error.

    Team names are cut to 18 characters and padded to width 18 so the columns line up.
    """
    for _, row in games.iterrows():
        correct = "‚úÖ" if row['Winner_Correct'] else "‚ùå"
        print(f"   {correct} {row['Away_Team'][:18]:18} @ {row['Home_Team'][:18]:18} | Error: {row['Spread_Error']:+.1f}")


if not results_df.empty:
    rule = "=" * 65

    print("\nüèÜ BEST PREDICTIONS (smallest spread error)")
    print(rule)

    # Rank by the magnitude of the miss, regardless of its direction.
    results_df['Abs_Error'] = results_df['Spread_Error'].abs()
    _print_prediction_rows(results_df.nsmallest(5, 'Abs_Error'))

    print("\n‚ö†Ô∏è WORST PREDICTIONS (largest spread error)")
    print(rule)

    _print_prediction_rows(results_df.nlargest(5, 'Abs_Error'))

---
## 💾 Save Results

In [None]:
# Write the results to a timestamped CSV under <project_root>/data/results/backtests.
# Nothing is written when the backtest produced no rows.
if not results_df.empty:
    # Minute-resolution timestamp keeps successive runs in separate files.
    timestamp = datetime.now().strftime('%Y%m%d_%H%M')
    filename = f"QEPC_v2_Backtest_{timestamp}.csv"

    output_dir = project_root.joinpath('data', 'results', 'backtests')
    output_dir.mkdir(parents=True, exist_ok=True)  # create the folder tree on first use
    output_path = output_dir / filename

    results_df.to_csv(output_path, index=False)
    print(f"üíæ Saved to: {output_path}")

---

## 📝 Interpretation Guide

| Metric | Good | Great | Elite |
|--------|------|-------|-------|
| Win Accuracy | >52% | >55% | >58% |
| Spread MAE | <12 pts | <10 pts | <9 pts |
| Brier Score | <0.24 | <0.22 | <0.20 |

**Key insights:**
- If spread bias is positive → model overestimates home team
- If high confidence accuracy < overall accuracy → model is overconfident
- Perfect calibration = predicted probability matches actual win rate