# 🔬 QEPC Enhanced Backtest - FIXED

This notebook:
1. Uses **actual game results** from your data files
2. Compares QEPC predictions to real outcomes
3. Calculates detailed accuracy metrics
4. Generates visualizations

---

## 🔧 Setup

In [7]:
# --- Robust bootstrap to load notebook_header.py no matter where Jupyter started ---

import sys
import importlib.util
from pathlib import Path

# Step 1 - locate the project root: the closest directory (cwd first, then up
# to five ancestors) that contains notebook_header.py.
start_dir = Path.cwd()
search_dirs = [start_dir, *start_dir.parents][:6]
project_root = next(
    (folder for folder in search_dirs if (folder / "notebook_header.py").exists()),
    None,
)

if project_root is None:
    raise FileNotFoundError(
        "Could not find notebook_header.py in the current directory or its parents."
    )

# Step 2 - expose the project root to the import system (added only once).
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

# Step 3 - build the notebook_header module straight from its file path.
header_path = project_root / "notebook_header.py"
spec = importlib.util.spec_from_file_location("notebook_header", header_path)
notebook_header = importlib.util.module_from_spec(spec)

# The module is registered in sys.modules *before* it executes; the original
# author notes that @dataclass breaks otherwise.
sys.modules[spec.name] = notebook_header
spec.loader.exec_module(notebook_header)

# Step 4 - run the shared notebook setup and keep the directories it reports.
env = notebook_header.qepc_notebook_setup(run_diagnostics=False)
data_dir, raw_dir = env.data_dir, env.raw_dir

from qepc_autoload import qepc_step

print("‚úÖ QEPC environment initialized")
print("project_root:", project_root)
print("data_dir:", data_dir)
print("raw_dir:", raw_dir)


[NotebookHeader] QEPC project root: C:\Users\wdors\qepc_project
[NotebookHeader] Project root already on sys.path: C:\Users\wdors\qepc_project
[QEPC] Autoload complete.
[NotebookHeader] qepc_autoload imported successfully.
[NotebookHeader] data_dir: C:\Users\wdors\qepc_project\data
[NotebookHeader] raw_dir:  C:\Users\wdors\qepc_project\data\raw
[NotebookHeader] Notebook environment ready.
‚úÖ QEPC environment initialized
project_root: C:\Users\wdors\qepc_project
data_dir: C:\Users\wdors\qepc_project\data
raw_dir: C:\Users\wdors\qepc_project\data\raw


---
## 📊 Load Actual Game Results

In [8]:
import pandas as pd
from pathlib import Path

# ---------------------------------------------------------
# STEP: Resolve data_dir and raw_dir using QEPC path helpers
# ---------------------------------------------------------
# Resolve data_dir / raw_dir, preferring the QEPC path helpers.
#
# The lookups are nested on purpose. An exception raised inside an `except`
# handler is not seen by sibling handlers of the same `try`, so a flat
# "except ImportError: <import again>" chain can never reach its last-resort
# branch when qepc.autoload.paths itself is missing. With the nesting:
#   * get_raw_data_dir missing      -> raw_dir = data_dir / "raw"
#   * anything else going wrong     -> cwd-based fallback below
try:
    from qepc.autoload.paths import get_data_dir

    data_dir = get_data_dir()
    try:
        from qepc.autoload.paths import get_raw_data_dir

        raw_dir = get_raw_data_dir()
    except ImportError:
        # Only get_data_dir exists in this QEPC version.
        raw_dir = data_dir / "raw"
except Exception as path_error:
    # Last-resort fallback (shouldn't normally be needed).
    print("⚠️ Could not import qepc.autoload.paths cleanly, falling back to cwd-based paths.")
    print(f"   Reason: {type(path_error).__name__}: {path_error}")
    # NOTE(review): assumes the notebook runs one level below the project
    # root (e.g. <root>/notebooks) - confirm for your layout.
    project_root = Path.cwd().parents[0]
    data_dir = project_root / "data"
    raw_dir = data_dir / "raw"

print("data_dir:", data_dir)
print("raw_dir:", raw_dir)

# ---------------------------------------------------------
# STEP: Read game-level team stats from the first usable CSV
# ---------------------------------------------------------
print("\nüìä Looking for game data...")

# Candidates in priority order: (directory, file name).
possible_paths = [
    folder / file_name
    for folder, file_name in (
        (raw_dir, "TeamStatistics.csv"),
        (data_dir, "TeamStatistics.csv"),
        (data_dir, "GameResults_2025.csv"),
        (data_dir, "Games.csv"),
    )
]

# Stays None when no candidate exists or every existing candidate fails to read.
team_stats = None
for path in possible_paths:
    if not path.exists():
        continue
    try:
        team_stats = pd.read_csv(path)
    except Exception as e:
        # A broken file is reported and the next candidate is tried.
        print(f"‚ö†Ô∏è Error reading {path}: {e}")
        continue
    print(f"‚úÖ Loaded: {path.name} ({len(team_stats):,} rows)")
    break

if team_stats is not None:
    # ---------------------------------------------------------
    # STEP: Two-pass date parsing (tz-aware while parsing, tz-naive at the end)
    # ---------------------------------------------------------
    # First recognised date column wins; None when the file has none of them.
    date_col = next(
        (
            name
            for name in ("gameDate", "Date", "date", "GAME_DATE")
            if name in team_stats.columns
        ),
        None,
    )

    if date_col:
        print(f"\nüìå Using date column: {date_col}")

        # Untouched text copy, so values that fail to parse can be inspected.
        team_stats["gameDate_raw"] = team_stats[date_col].astype(str)

        # Pass 1: let pandas infer the format; failures become NaT, all UTC.
        parsed = pd.to_datetime(team_stats[date_col], errors="coerce", utc=True)
        invalid_mask = parsed.isna()
        invalid_count = int(invalid_mask.sum())
        print(f"‚ö†Ô∏è NaT after generic parse: {invalid_count}")

        # Pass 2: retry only the failures with the explicit layout used by
        # older rows such as '11/3/1995 20:00'.
        if invalid_count > 0:
            unparsed_text = team_stats.loc[invalid_mask, date_col]
            alt_parsed = pd.to_datetime(
                unparsed_text,
                format="%m/%d/%Y %H:%M",
                errors="coerce",
                utc=True,
            )
            parsed.loc[invalid_mask] = alt_parsed

            invalid_mask = parsed.isna()
            invalid_count = int(invalid_mask.sum())
            print(f"‚ö†Ô∏è Remaining NaT after m/d/Y H:M parse: {invalid_count}")

            if invalid_count > 0:
                print("\nüîç Sample of still-invalid 'gameDate_raw' values:")
                sample = team_stats.loc[invalid_mask, "gameDate_raw"].value_counts().head(10)
                print(sample)

        # Drop the timezone so later comparisons use plain (naive) timestamps.
        parsed = parsed.dt.tz_convert("UTC").dt.tz_localize(None)
        team_stats["gameDate"] = parsed

        # Rows that neither pass could parse are removed.
        valid_dates = team_stats["gameDate"].notna()
        dropped = int((~valid_dates).sum())
        if dropped > 0:
            print(f"\n‚ö†Ô∏è Dropping {dropped} rows with unparseable dates after both passes.")
        team_stats = team_stats[valid_dates].copy()

        if len(team_stats) > 0:
            # Chronological order, fresh 0..n-1 index.
            team_stats = team_stats.sort_values("gameDate").reset_index(drop=True)

            print(f"\n‚úÖ Remaining rows: {len(team_stats):,}")
            first_day = team_stats["gameDate"].min().date()
            last_day = team_stats["gameDate"].max().date()
            print("üìÖ Date range:", first_day, "to", last_day)
        else:
            print("‚ùå No valid dates in data after parsing!")
    else:
        print("‚ö†Ô∏è No date column found in team_stats!")

    print(f"\nüìã Columns available: {list(team_stats.columns)[:10]}...")
else:
    print("‚ùå No game data found!")
    print("   Searched:", [str(p) for p in possible_paths])


data_dir: C:\Users\wdors\qepc_project\data
raw_dir: C:\Users\wdors\qepc_project\data\raw

üìä Looking for game data...
‚úÖ Loaded: TeamStatistics.csv (144,314 rows)

üìå Using date column: gameDate
‚ö†Ô∏è NaT after generic parse: 143758
‚ö†Ô∏è Remaining NaT after m/d/Y H:M parse: 0

‚úÖ Remaining rows: 144,314
üìÖ Date range: 1946-11-26 to 2025-11-17

üìã Columns available: ['gameDate', 'teamCity', 'teamName', 'opponentTeamCity', 'opponentTeamName', 'teamScore', 'opponentScore', 'reboundsTotal', 'assists', 'threePointersMade']...


---
## 🎯 Set Backtest Parameters

In [9]:
# Imported here because no other visible cell brings timedelta into scope.
from datetime import timedelta

# Length of the backtest window in days, counted back from the newest game.
BACKTEST_WINDOW_DAYS = 180


def _text_column(frame, column):
    """Return `column` of `frame` as strings; an all-empty Series if it is absent.

    Missing values become '' so that "city + ' ' + name" never yields NaN and
    `.str.strip()` always has a Series to work on.
    """
    if column in frame.columns:
        return frame[column].fillna('').astype(str)
    return pd.Series('', index=frame.index, dtype=object)


# The emptiness check avoids calling .date() on NaT when every row was dropped
# during date parsing.
if team_stats is not None and 'gameDate' in team_stats.columns and len(team_stats) > 0:
    # Auto-detect date range from data
    latest_date = team_stats['gameDate'].max()
    earliest_date = team_stats['gameDate'].min()  # not used below; kept for inspection

    # Default: the last BACKTEST_WINDOW_DAYS days of available data
    BACKTEST_START = latest_date - timedelta(days=BACKTEST_WINDOW_DAYS)
    BACKTEST_END = latest_date

    print("🎯 Backtest Configuration:")
    print(f"   Start: {BACKTEST_START.date()}")
    print(f"   End:   {BACKTEST_END.date()}")
    print(f"   Days:  {(BACKTEST_END - BACKTEST_START).days}")

    # Filter to backtest window (both ends inclusive)
    in_window = team_stats['gameDate'].between(BACKTEST_START, BACKTEST_END)
    backtest_data = team_stats[in_window].copy()

    # Keep home rows only so each game appears once.
    # NOTE(review): without a 'home' column every row is kept, so a game that
    # is logged once per team will be evaluated twice - confirm the schema.
    if 'home' in backtest_data.columns:
        backtest_games = backtest_data[backtest_data['home'] == 1].copy()
    else:
        backtest_games = backtest_data.copy()

    print(f"\n📊 Games in backtest window: {len(backtest_games)}")

    # Create standardized "<city> <name>" team columns
    if 'teamName' in backtest_games.columns:
        backtest_games['Home_Team'] = (
            _text_column(backtest_games, 'teamCity')
            + ' '
            + _text_column(backtest_games, 'teamName')
        ).str.strip()
        backtest_games['Away_Team'] = (
            _text_column(backtest_games, 'opponentTeamCity')
            + ' '
            + _text_column(backtest_games, 'opponentTeamName')
        ).str.strip()

    # Create score columns
    for src, dst in [('teamScore', 'Home_Score'), ('opponentScore', 'Away_Score')]:
        if src in backtest_games.columns:
            backtest_games[dst] = backtest_games[src]

    if len(backtest_games) > 0:
        print("✅ Ready to backtest!")
    else:
        print("❌ No games found in date range")
else:
    print("❌ Cannot set parameters - no data loaded")

üéØ Backtest Configuration:
   Start: 2025-05-21
   End:   2025-11-17
   Days:  180

üìä Games in backtest window: 294
‚úÖ Ready to backtest!


---
## 🚀 Run QEPC Predictions

In [10]:
print("üîÆ Running QEPC predictions...\n")

results = []
errors_log = []

if 'backtest_games' in dir() and len(backtest_games) > 0:
    total_games = len(backtest_games)
    
    for i, (idx, game) in enumerate(backtest_games.iterrows()):
        # Progress indicator
        if (i + 1) % 10 == 0 or i == 0:
            print(f"‚è≥ Processing game {i+1}/{total_games}...", end="\r")
        
        try:
            home_team = game.get('Home_Team', game.get('teamName', 'Home'))
            away_team = game.get('Away_Team', game.get('opponentTeamName', 'Away'))
            
            # Get team strengths
            strengths = calculate_advanced_strengths(verbose=False)
            
            if strengths.empty:
                errors_log.append(f"Game {i}: No strength data")
                continue
            
            # Build schedule
            schedule = pd.DataFrame([{
                'Home Team': home_team,
                'Away Team': away_team
            }])
            
            # Compute lambdas
            schedule_with_lambda = compute_lambda(schedule, strengths)
            
            # Run simulation
            predictions = run_qepc_simulation(schedule_with_lambda, num_trials=5000)
            
            if len(predictions) == 0:
                continue
            
            pred = predictions.iloc[0]
            
            # Get predictions
            pred_home = pred.get('Sim_Home_Score', pred.get('lambda_home', 110))
            pred_away = pred.get('Sim_Away_Score', pred.get('lambda_away', 108))
            home_win_prob = pred.get('Home_Win_Prob', 0.5)
            
            # Get actuals
            actual_home = game.get('Home_Score', game.get('teamScore', 0))
            actual_away = game.get('Away_Score', game.get('opponentScore', 0))
            
            # Calculate outcomes
            actual_home_won = actual_home > actual_away
            pred_home_won = home_win_prob > 0.5
            
            results.append({
                'Date': game['gameDate'],
                'Home_Team': home_team,
                'Away_Team': away_team,
                'Pred_Home_Score': round(pred_home, 1),
                'Pred_Away_Score': round(pred_away, 1),
                'Pred_Total': round(pred_home + pred_away, 1),
                'Pred_Spread': round(pred_home - pred_away, 1),
                'Home_Win_Prob': round(home_win_prob, 3),
                'Actual_Home_Score': actual_home,
                'Actual_Away_Score': actual_away,
                'Actual_Total': actual_home + actual_away,
                'Actual_Spread': actual_home - actual_away,
                'Winner_Correct': actual_home_won == pred_home_won,
                'Error_Total': abs((pred_home + pred_away) - (actual_home + actual_away)),
                'Error_Spread': abs((pred_home - pred_away) - (actual_home - actual_away)),
            })
            
        except Exception as e:
            errors_log.append(f"Game {i}: {str(e)[:40]}")
    
    print("\n")  # Clear progress line
    
    results_df = pd.DataFrame(results)
    print(f"‚úÖ Backtest complete!")
    print(f"   Games analyzed: {len(results_df)}")
    print(f"   Errors skipped: {len(errors_log)}")
else:
    print("‚ùå No games to backtest")

üîÆ Running QEPC predictions...

[Strengths] Loading game data from C:\Users\wdors\qepc_project\data\raw\TeamStatistics.csv
Computed real lambdas for 1 games.
[Strengths] Loading game data from C:\Users\wdors\qepc_project\data\raw\TeamStatistics.csv
Computed real lambdas for 1 games.
[Strengths] Loading game data from C:\Users\wdors\qepc_project\data\raw\TeamStatistics.csv
Computed real lambdas for 1 games.
[Strengths] Loading game data from C:\Users\wdors\qepc_project\data\raw\TeamStatistics.csv


KeyboardInterrupt: 

---
## 📈 Analyze Results

In [None]:
def build_summary_box(n_games, win_accuracy, avg_total_error,
                      median_total_error, avg_spread_error, width=49):
    """Return the performance summary as a text box with equal-width rows.

    Borders are generated and every row is padded with ``ljust`` so the right
    edge lines up regardless of the numbers' widths.

    Parameters: game count, win accuracy as a fraction (0-1), and the three
    error figures in points; `width` is the inner width of the box.
    Returns the box as one newline-joined string (no leading/trailing newline).
    """
    def rule(left, right):
        return left + "─" * width + right

    def row(text):
        return "│" + text.ljust(width) + "│"

    lines = [
        rule("┌", "┐"),
        "│" + "PERFORMANCE SUMMARY".center(width) + "│",
        rule("├", "┤"),
        row(f"  Games Analyzed:     {n_games:>6}"),
        row(f"  Win Accuracy:       {win_accuracy:>6.1%}"),
        rule("├", "┤"),
        row(f"  Avg Total Error:    {avg_total_error:>6.1f} pts"),
        row(f"  Median Total Error: {median_total_error:>6.1f} pts"),
        row(f"  Avg Spread Error:   {avg_spread_error:>6.1f} pts"),
        rule("└", "┘"),
    ]
    return "\n".join(lines)


def print_prediction_rows(frame):
    """Print one 'MM-DD: away @ home | Error' line per row of `frame`."""
    for _, row in frame.iterrows():
        date = pd.Timestamp(row['Date']).strftime('%m-%d')
        # str() guards the slice against non-string (e.g. NaN) team names.
        away = str(row['Away_Team'])[:18]
        home = str(row['Home_Team'])[:18]
        print(f"   {date}: {away:18} @ {home:18} | Error: {row['Error_Total']:.1f}")


if 'results_df' in dir() and len(results_df) > 0:
    # Headline metrics (also read by later cells)
    win_accuracy = results_df['Winner_Correct'].mean()
    avg_total_error = results_df['Error_Total'].mean()
    avg_spread_error = results_df['Error_Spread'].mean()
    median_total_error = results_df['Error_Total'].median()

    print("="*60)
    print("📊 BACKTEST RESULTS")
    print("="*60)
    print("\n" + build_summary_box(
        len(results_df),
        win_accuracy,
        avg_total_error,
        median_total_error,
        avg_spread_error,
    ) + "\n")

    # High confidence analysis: confidence is the size of the predicted spread
    results_df['Confidence'] = results_df['Pred_Spread'].abs()
    high_conf = results_df[results_df['Confidence'] > 5]

    if len(high_conf) > 0:
        print("\n🎯 High Confidence Games (|spread| > 5):")
        print(f"   Count: {len(high_conf)}")
        print(f"   Accuracy: {high_conf['Winner_Correct'].mean():.1%}")

    # Best predictions
    print("\n🏆 Best Predictions (smallest error):")
    best = results_df.nsmallest(5, 'Error_Total')
    print_prediction_rows(best)

    # Worst predictions
    print("\n⚠️ Worst Predictions (largest error):")
    worst = results_df.nlargest(5, 'Error_Total')
    print_prediction_rows(worst)

    print("\n" + "="*60)
else:
    print("❌ No results to analyze")

---
## 📊 Visualizations

In [None]:
# HAS_PLOTS / plt are expected from an earlier cell; when no cell defined
# them, fall back to a guarded import so this cell also works on its own.
if 'HAS_PLOTS' not in globals():
    try:
        import matplotlib.pyplot as plt
        HAS_PLOTS = True
    except ImportError:
        HAS_PLOTS = False

# Confidence buckets on |predicted spread|, in points.
CONF_BINS = [0, 3, 6, 10, 100]
CONF_LABELS = ['0-3', '3-6', '6-10', '10+']


def confidence_bins(frame):
    """Return the confidence bucket of every game in `frame`.

    Uses the 'Confidence' column when present, otherwise |Pred_Spread|.
    include_lowest=True keeps games whose confidence is exactly 0 in the
    first bucket instead of turning them into NaN.
    """
    if 'Confidence' in frame.columns:
        confidence = frame['Confidence']
    else:
        confidence = frame['Pred_Spread'].abs()
    return pd.cut(confidence, bins=CONF_BINS, labels=CONF_LABELS, include_lowest=True)


def accuracy_by_confidence(frame):
    """Return mean and count of 'Winner_Correct' per *observed* confidence bucket."""
    return (
        frame.assign(Conf_Bin=confidence_bins(frame))
        .groupby('Conf_Bin', observed=True)['Winner_Correct']
        .agg(['mean', 'count'])
    )


if HAS_PLOTS and 'results_df' in dir() and len(results_df) > 0:
    # Computed locally so the charts do not depend on another cell having run.
    mean_error = results_df['Error_Total'].mean()
    median_error = results_df['Error_Total'].median()
    overall_accuracy = results_df['Winner_Correct'].mean()
    if 'Confidence' not in results_df.columns:
        results_df['Confidence'] = results_df['Pred_Spread'].abs()

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # 1. Predicted vs Actual Total
    ax1 = axes[0, 0]
    ax1.scatter(results_df['Actual_Total'], results_df['Pred_Total'], alpha=0.6, s=50)
    min_val = min(results_df['Actual_Total'].min(), results_df['Pred_Total'].min()) - 10
    max_val = max(results_df['Actual_Total'].max(), results_df['Pred_Total'].max()) + 10
    ax1.plot([min_val, max_val], [min_val, max_val], 'r--', linewidth=2, label='Perfect')
    ax1.set_xlabel('Actual Total', fontsize=12)
    ax1.set_ylabel('Predicted Total', fontsize=12)
    ax1.set_title('Predicted vs Actual Total Score', fontsize=14)
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # 2. Error Distribution
    ax2 = axes[0, 1]
    ax2.hist(results_df['Error_Total'], bins=20, edgecolor='black', alpha=0.7, color='steelblue')
    ax2.axvline(mean_error, color='r', linestyle='--', linewidth=2,
                label=f'Mean: {mean_error:.1f}')
    ax2.axvline(median_error, color='orange', linestyle='--', linewidth=2,
                label=f'Median: {median_error:.1f}')
    ax2.set_xlabel('Total Error (points)', fontsize=12)
    ax2.set_ylabel('Frequency', fontsize=12)
    ax2.set_title('Distribution of Total Score Error', fontsize=14)
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    # 3. Error Over Time
    ax3 = axes[1, 0]
    results_sorted = results_df.sort_values('Date')
    ax3.plot(range(len(results_sorted)), results_sorted['Error_Total'],
             marker='o', alpha=0.6, markersize=5, linewidth=1)
    ax3.axhline(mean_error, color='r', linestyle='--', linewidth=2, label='Mean Error')
    ax3.set_xlabel('Game Number', fontsize=12)
    ax3.set_ylabel('Total Error (points)', fontsize=12)
    ax3.set_title('Prediction Error Over Time', fontsize=14)
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    # 4. Win Accuracy by Confidence
    ax4 = axes[1, 1]
    results_df['Conf_Bin'] = confidence_bins(results_df)
    accuracy_by_conf = accuracy_by_confidence(results_df)

    bars = ax4.bar(range(len(accuracy_by_conf)), accuracy_by_conf['mean'], color='steelblue')
    ax4.axhline(0.5, color='r', linestyle='--', linewidth=2, label='50% (coin flip)')
    ax4.axhline(overall_accuracy, color='green', linestyle='--', linewidth=2,
                label=f'Overall: {overall_accuracy:.1%}')
    # Label only the buckets that actually have a bar: with observed=True an
    # empty bucket has no row, so passing the full label list would mismatch
    # the tick count or put labels under the wrong bars.
    ax4.set_xticks(range(len(accuracy_by_conf)))
    ax4.set_xticklabels([str(bucket) for bucket in accuracy_by_conf.index])
    ax4.set_xlabel('Predicted Spread (confidence)', fontsize=12)
    ax4.set_ylabel('Win Accuracy', fontsize=12)
    ax4.set_title('Win Accuracy by Confidence Level', fontsize=14)
    ax4.set_ylim(0, 1)
    ax4.legend()
    ax4.grid(True, alpha=0.3, axis='y')

    # Add count labels on bars
    for bar, count in zip(bars, accuracy_by_conf['count']):
        ax4.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02,
                 f'n={int(count)}', ha='center', fontsize=10)

    plt.tight_layout()
    plt.show()

    print("✅ Visualizations complete")
elif not HAS_PLOTS:
    print("⚠️ Matplotlib not available - skipping visualizations")
else:
    print("❌ No data to visualize")

---
## 💾 Save Results

In [None]:
# Imported here because no other visible cell brings datetime into scope.
from datetime import datetime


def backtest_output_path(root, when):
    """Return the CSV path for a backtest saved at time `when`.

    Layout: <root>/data/results/backtests/Enhanced_Backtest_<YYYYmmdd_HHMMSS>.csv
    The directory is NOT created here.
    """
    stamp = when.strftime("%Y%m%d_%H%M%S")
    return root / "data" / "results" / "backtests" / f"Enhanced_Backtest_{stamp}.csv"


if 'results_df' in dir() and len(results_df) > 0:
    # Save detailed results
    output_path = backtest_output_path(project_root, datetime.now())
    output_dir = output_path.parent
    output_dir.mkdir(parents=True, exist_ok=True)

    results_df.to_csv(output_path, index=False)
    print(f"💾 Saved results to: {output_path}")

    # Recomputed from the saved rows so the summary is correct even if the
    # analysis cell was skipped or ran on an older results_df.
    win_accuracy = results_df['Winner_Correct'].mean()
    avg_total_error = results_df['Error_Total'].mean()

    # Print final summary
    print(f"""
📋 FINAL SUMMARY
================
Period:       {BACKTEST_START.date()} to {BACKTEST_END.date()}
Games:        {len(results_df)}
Win Accuracy: {win_accuracy:.1%}
Avg Error:    {avg_total_error:.1f} pts
    """)
else:
    print("❌ No results to save")

print("🏁 Backtest complete!")

---
## 🎯 Next Steps

### Based on your results:

**If Win Accuracy < 55%:**
- Add recency weighting to team strengths
- Include rest day adjustments
- Consider injuries impact

**If Total Error > 15 points:**
- Calibrate lambda calculations
- Add pace adjustments
- Review team volatility modeling

**If High Confidence games underperform:**
- Add upset probability (quantum tunneling)
- Consider travel factors
- Review matchup-specific adjustments

---

**Use these insights to improve QEPC!** 🚀

In [None]:
import pandas as pd
from pathlib import Path

# Ask QEPC's path helper for the project root; the current working directory
# is used (with a warning) whenever the import or the call fails.
cwd_root = Path.cwd()
try:
    from qepc.autoload.paths import get_project_root
    project_root = get_project_root()
except Exception:
    project_root = cwd_root
    print("‚ö†Ô∏è Falling back to cwd as project root")

print("Project root:", project_root)

# Helper: pick the "best" match for a file name among many
def pick_best_match(matches):
    """Return the highest-scoring path in `matches`, or None when it is empty.

    Scoring is based on the lower-cased path components:
    'data' +2, 'raw' +1, 'props' +1, 'results' +1, 'notebooks' -2, and -5 when
    '.ipynb_checkpoints' occurs anywhere in the path text (case-sensitive).
    Among equal scores the earliest entry of `matches` wins.
    """
    if not matches:
        return None

    folder_weights = {
        "data": 2,
        "raw": 1,
        "props": 1,
        "results": 1,
        "notebooks": -2,
    }

    def rank(path):
        folders = {str(part).lower() for part in path.parts}
        total = sum(
            weight for folder, weight in folder_weights.items() if folder in folders
        )
        if ".ipynb_checkpoints" in str(path):
            total -= 5
        return total

    # max() keeps the first of several equally ranked paths.
    return max(matches, key=rank)

# Files to preview, as (label, filename) pairs.
# Core game/team data: the label differs from the file stem.
_described_targets = [
    ("TeamStatistics (team game logs)", "TeamStatistics.csv"),
    ("Team_Stats (team season stats)", "Team_Stats.csv"),
    ("PlayerStatistics (player logs)", "PlayerStatistics.csv"),
    ("Canonical Games (schedule)", "Games.csv"),
    ("GameResults_2025 (results)", "GameResults_2025.csv"),
]

# Everything else is labelled by its file stem.
_stem_targets = [
    # Core game/team data (continued)
    "Schedule_with_Rest",
    "TeamForm",
    # Roster / players
    "Players",
    "Players_Processed",
    # Injuries
    "Injury_Overrides",
    "Injury_Overrides_MASTER",
    "Injury_Overrides_live_espn",
    # Props / aggregates
    "Player_Season_Averages",
    "Player_Averages_With_CI",
    "Player_Recent_Form_L5",
    "Player_Recent_Form_L10",
    "Player_Recent_Form_L15",
    "Player_Home_Away_Splits",
]

targets = _described_targets + [(stem, f"{stem}.csv") for stem in _stem_targets]

def preview_by_filename(label: str, filename: str, n: int = 3):
    """Find `filename` under project_root and show its first `n` rows.

    Prints a banner, every match found by a recursive search, the match chosen
    by pick_best_match, then the sample's shape and columns before displaying
    the rows. Returns None; returns early when nothing matches or no best
    match can be chosen. CSV read errors are printed, not raised.
    """
    def shown(path):
        # Paths are printed relative to project_root whenever possible.
        try:
            return path.relative_to(project_root)
        except ValueError:
            return path

    print("\n" + "=" * 80)
    print(f"üìÑ {label}")
    print(f"Looking for filename: {filename}")

    # Recursive search of the whole project tree.
    matches = list(project_root.rglob(filename))
    if not matches:
        print("‚ö†Ô∏è No matches found in project.")
        return

    print("Found matches:")
    for found in matches:
        print("   ‚Ä¢", shown(found))

    best = pick_best_match(matches)
    if best is None:
        print("‚ö†Ô∏è Could not choose a best match.")
        return

    rel_best = shown(best)
    print(f"\n‚úÖ Using best match: {rel_best}")

    # Only the first n rows are read, so very large files are never fully loaded.
    try:
        df_sample = pd.read_csv(best, nrows=n)
        print(f"Sample shape: {df_sample.shape}")
        print("Columns:", list(df_sample.columns))
        print("\nSample rows:")
        display(df_sample)
    except Exception as e:
        print(f"‚ùå Error reading CSV sample: {e}")


# Preview every configured target in order.
for label, filename in targets:
    preview_by_filename(label, filename)


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# QEPC imports
from qepc.sports.nba.strengths_v2 import calculate_advanced_strengths
from qepc.core.lambda_engine import compute_lambda
from qepc.core.simulator import run_qepc_simulation

# ---------------------------------------------------------
# 0) Resolve data paths (project_root / data / raw)
# ---------------------------------------------------------
# Resolve data_dir / raw_dir, preferring the QEPC path helpers.
#
# The lookups are nested on purpose. An exception raised inside an `except`
# handler is not seen by sibling handlers of the same `try`, so a flat
# "except ImportError: <import again>" chain can never reach its last-resort
# branch when qepc.autoload.paths itself is missing. With the nesting:
#   * get_raw_data_dir missing      -> raw_dir = data_dir / "raw"
#   * anything else going wrong     -> cwd-based fallback below
try:
    from qepc.autoload.paths import get_data_dir

    data_dir = get_data_dir()
    try:
        from qepc.autoload.paths import get_raw_data_dir

        raw_dir = get_raw_data_dir()
    except ImportError:
        # Only get_data_dir exists in this QEPC version.
        raw_dir = data_dir / "raw"
except Exception as path_error:
    # Last-resort fallback
    print(f"[QEPC] path helpers unavailable ({type(path_error).__name__}: {path_error}); using cwd-based paths.")
    # NOTE(review): assumes the notebook runs one level below the project
    # root (e.g. <root>/notebooks) - confirm for your layout.
    project_root = Path.cwd().parents[0]
    data_dir = project_root / "data"
    raw_dir = data_dir / "raw"

print("[QEPC] data_dir:", data_dir)
print("[QEPC] raw_dir:", raw_dir)

# ---------------------------------------------------------
# 1) Read TeamStatistics.csv and normalise gameDate (two parsing passes)
# ---------------------------------------------------------
team_stats_path = raw_dir / "TeamStatistics.csv"
print("\nüìä Loading team game logs from:", team_stats_path)

if not team_stats_path.exists():
    raise FileNotFoundError(f"TeamStatistics.csv not found at {team_stats_path}")

team_stats_raw = pd.read_csv(team_stats_path)
print(f"‚úÖ Loaded TeamStatistics: {len(team_stats_raw):,} rows, {len(team_stats_raw.columns)} columns")

# First recognised date column wins; unlike the exploratory cell above, a
# missing date column is a hard error here.
date_col = next(
    (
        name
        for name in ("gameDate", "Date", "date", "GAME_DATE")
        if name in team_stats_raw.columns
    ),
    None,
)
if date_col is None:
    raise RuntimeError("No date-like column found in TeamStatistics.csv")

print(f"üìå Using date column: {date_col}")

# Work on a copy; keep the untouched date text for debugging.
team_stats = team_stats_raw.copy()
team_stats["gameDate_raw"] = team_stats[date_col].astype(str)

# Pass 1: inferred format, failures -> NaT, everything UTC.
parsed = pd.to_datetime(team_stats[date_col], errors="coerce", utc=True)
invalid_mask = parsed.isna()
invalid_count = int(invalid_mask.sum())
print(f"‚ö†Ô∏è NaT after generic parse: {invalid_count}")

# Pass 2: retry only the failures with the explicit "%m/%d/%Y %H:%M" layout.
if invalid_count > 0:
    unparsed_text = team_stats.loc[invalid_mask, date_col]
    alt_parsed = pd.to_datetime(
        unparsed_text,
        format="%m/%d/%Y %H:%M",
        errors="coerce",
        utc=True,
    )
    parsed.loc[invalid_mask] = alt_parsed
    invalid_mask = parsed.isna()
    invalid_count = int(invalid_mask.sum())
    print(f"‚ö†Ô∏è Remaining NaT after m/d/Y H:M parse: {invalid_count}")

    if invalid_count > 0:
        print("\nüîç Sample of still-invalid 'gameDate_raw' values:")
        sample = team_stats.loc[invalid_mask, "gameDate_raw"].value_counts().head(10)
        print(sample)

# Remove the timezone so later comparisons use naive timestamps.
parsed = parsed.dt.tz_convert("UTC").dt.tz_localize(None)
team_stats["gameDate"] = parsed

# Discard rows neither pass could parse.
valid_dates = team_stats["gameDate"].notna()
dropped = int((~valid_dates).sum())
if dropped > 0:
    print(f"\n‚ö†Ô∏è Dropping {dropped} rows with unparseable dates after both passes.")
team_stats = team_stats[valid_dates].copy()

# Chronological order with a fresh index.
team_stats = team_stats.sort_values("gameDate").reset_index(drop=True)

print(f"\n‚úÖ Final TeamStatistics rows: {len(team_stats):,}")
first_day = team_stats["gameDate"].min().date()
last_day = team_stats["gameDate"].max().date()
print("üìÖ Date range:", first_day, "to", last_day)

# ---------------------------------------------------------
# 2) Recent evaluation sample, one row per game
# ---------------------------------------------------------
recent_cutoff = pd.Timestamp("2024-10-01")
is_recent = team_stats["gameDate"] >= recent_cutoff
recent_games = team_stats[is_recent].copy()

print(f"\nüïí Recent games since {recent_cutoff.date()}: {len(recent_games):,} team-rows")

# Each game is logged once per team. Keeping only the row whose teamName
# sorts before opponentTeamName is a deterministic way to keep one of the two.
# The resulting "Home"/"Away" labels are therefore synthetic, which is
# acceptable for calibrating totals.
mask_keep = recent_games["teamName"] < recent_games["opponentTeamName"]
eval_column_names = {
    "teamName": "Home Team",          # label only; not true home
    "opponentTeamName": "Away Team",  # label only; not true away
    "teamScore": "Home_Score",
    "opponentScore": "Away_Score",
}
games_eval = recent_games[mask_keep].copy().rename(columns=eval_column_names)

print(f"üéØ Unique game-rows for evaluation (calibration): {len(games_eval):,}")

# ---------------------------------------------------------
# 3) Team strengths from the full game log
# ---------------------------------------------------------
print("\n‚ßâ QEPC: Computing team strengths (calculate_advanced_strengths)...")
strengths_df = calculate_advanced_strengths(game_data=team_stats, cutoff_date=None, verbose=True)

if strengths_df is None or strengths_df.empty:
    raise RuntimeError("strengths_df is empty. Check strengths_v2 configuration.")

# ---------------------------------------------------------
# 4) Schedule frame -> lambdas
# ---------------------------------------------------------
schedule_df = games_eval[["Home Team", "Away Team"]].reset_index(drop=True)

# Situational adjustments are switched off to keep the calibration clean.
lambda_df = compute_lambda(
    schedule_df=schedule_df,
    team_stats_df=strengths_df,
    include_situational=False,
)

print(f"Computed lambdas for {len(lambda_df):,} games.")

# ---------------------------------------------------------
# 5) Simulate the calibration sample
# ---------------------------------------------------------
print("\n‚ßâ QEPC: Running QEPC simulation on calibration sample...")
sim_results = run_qepc_simulation(df=lambda_df, num_trials=3000)

# ---------------------------------------------------------
# 6) eval_df: actual vs predicted totals and margins
# ---------------------------------------------------------
eval_df = games_eval.reset_index(drop=True).copy()

# Model outputs are attached positionally (.values).
# NOTE(review): this relies on lambda_df / sim_results keeping the row count
# and order of schedule_df - confirm against compute_lambda and the simulator.
for column in ("lambda_home", "lambda_away"):
    eval_df[column] = lambda_df[column].values
for column in ("Sim_Home_Score", "Sim_Away_Score", "Home_Win_Prob", "Expected_Score_Total"):
    eval_df[column] = sim_results[column].values

home_pts = eval_df["Home_Score"]
away_pts = eval_df["Away_Score"]

# Actual totals / margins
eval_df["Actual_Total"] = home_pts + away_pts
eval_df["Actual_Margin"] = home_pts - away_pts

# Predicted totals / margins
eval_df["Pred_Total"] = eval_df["Expected_Score_Total"]
eval_df["Pred_Margin"] = eval_df["Sim_Home_Score"] - eval_df["Sim_Away_Score"]

# Binary outcome, relative to the synthetic "home" label
eval_df["Actual_Home_Win"] = (home_pts > away_pts).astype(int)
eval_df["Pred_Home_Win"] = (eval_df["Home_Win_Prob"] >= 0.5).astype(int)

# Absolute errors
eval_df["Total_Error"] = (eval_df["Pred_Total"] - eval_df["Actual_Total"]).abs()
eval_df["Spread_Error"] = (eval_df["Pred_Margin"] - eval_df["Actual_Margin"]).abs()

# ---------------------------------------------------------
# 7) Headline metrics
# ---------------------------------------------------------
n_games = len(eval_df)
winner_hits = eval_df["Pred_Home_Win"] == eval_df["Actual_Home_Win"]
win_acc = winner_hits.mean() * 100.0
total_errors = eval_df["Total_Error"]
mae_total = total_errors.mean()
med_total = total_errors.median()
mae_spread = eval_df["Spread_Error"].mean()

summary_lines = [
    "\nüìä MINI BACKTEST SUMMARY (CALIBRATION SAMPLE)",
    "============================================================",
    f"Games Evaluated:       {n_games}",
    f"Home-Win Accuracy:     {win_acc:5.1f}%  (NOTE: 'home' label here is synthetic)",
    f"Mean Total Error:      {mae_total:5.1f} pts",
    f"Median Total Error:    {med_total:5.1f} pts",
    f"Mean Spread Error:     {mae_spread:5.1f} pts",
]
print("\n".join(summary_lines))

# ---------------------------------------------------------
# 8) Calibration factor for lambda totals
# ---------------------------------------------------------
# Ratio of the mean actual total to the mean predicted total.
mean_actual_total = eval_df["Actual_Total"].mean()
mean_pred_total = eval_df["Pred_Total"].mean()
calib_factor = mean_actual_total / mean_pred_total

calibration_lines = [
    "\nüéØ CALIBRATION FACTOR",
    "============================================================",
    f"Mean actual total:     {mean_actual_total:.2f}",
    f"Mean predicted total:  {mean_pred_total:.2f}",
    f"Suggested LAMBDA_TOTAL_SCALE: {calib_factor:.4f}",
]
print("\n".join(calibration_lines))

print("\nüîé Sample rows (Actual vs Pred totals):")
preview_columns = [
    "gameDate",
    "Home Team",
    "Away Team",
    "Home_Score",
    "Away_Score",
    "Actual_Total",
    "Pred_Total",
    "Total_Error",
]
display(eval_df[preview_columns].head(10))
