# OrderBook Accuracy Analysis

This notebook analyzes the accuracy of OrderBook floor price predictions compared to actual subsequent sales.

## Key Questions
1. **How accurate are OrderBook predictions?** (MAE, RMSE, % error)
2. **Is confidence calibrated?** (Do high-confidence predictions perform better?)
3. **What are the failure modes?** (When does OrderBook fail?)
4. **OrderBook vs Sales Fallback?** (Which source is more accurate?)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Plot appearance: dark background with the viridis colour palette
plt.style.use('dark_background')
sns.set_palette('viridis')

# Load the backtest CSV. When the file is missing, `df` is left undefined and
# the analysis cells below (guarded by `'df' in dir()`) do nothing.
DATA_PATH = Path('../data/orderbook_backtest.csv')
if not DATA_PATH.exists():
    print(f"Data file not found: {DATA_PATH}")
    print("Run: python scripts/backtest_orderbook.py --days 90")
else:
    df = pd.read_csv(DATA_PATH, parse_dates=['prediction_date', 'next_sale_date'])
    first_prediction = df['prediction_date'].min()
    last_prediction = df['prediction_date'].max()
    print(f"Loaded {len(df)} observations")
    print(f"Date range: {first_prediction} to {last_prediction}")

## 1. Overall Accuracy Metrics

In [None]:
def calculate_metrics(data):
    """Summarise prediction accuracy for a set of backtest rows.

    Args:
        data: DataFrame-like object providing the columns ``error``,
            ``absolute_error`` and ``percentage_error``.

    Returns:
        dict with the row count, MAE, RMSE, median signed / absolute /
        percentage error, standard deviation of the signed error, the
        95th-percentile absolute error, and ``overestimate_pct`` (the
        percentage of rows whose signed error is greater than zero).
    """
    signed = data['error']
    magnitude = data['absolute_error']
    relative = data['percentage_error']

    metrics = {}
    metrics['count'] = len(data)
    metrics['mae'] = magnitude.mean()
    metrics['rmse'] = np.sqrt(signed.pow(2).mean())
    metrics['median_error'] = signed.median()
    metrics['median_abs_error'] = magnitude.median()
    metrics['median_pct_error'] = relative.median()
    metrics['std_error'] = signed.std()
    metrics['p95_abs_error'] = magnitude.quantile(0.95)
    # Share of rows with a positive signed error, expressed in percent
    metrics['overestimate_pct'] = (signed > 0).mean() * 100
    return metrics

if 'df' in dir():
    # `overall` is reused by later cells (histogram annotations, final summary)
    overall = calculate_metrics(df)

    banner = "=" * 60
    report = [
        banner,
        "OVERALL ACCURACY",
        banner,
        f"Observations:      {overall['count']:,}",
        f"MAE:               ${overall['mae']:.2f}",
        f"RMSE:              ${overall['rmse']:.2f}",
        f"Median Error:      ${overall['median_error']:.2f}",
        f"Median Abs Error:  ${overall['median_abs_error']:.2f}",
        f"Median % Error:    {overall['median_pct_error']:.1f}%",
        f"95th %ile Error:   ${overall['p95_abs_error']:.2f}",
        f"Overestimate %:    {overall['overestimate_pct']:.1f}%",
    ]
    print("\n".join(report))

## 2. Error Distribution

In [None]:
if 'df' in dir():
    # Three side-by-side histograms: signed, absolute and percentage error.
    # Relies on `overall` computed in the overall-metrics cell.
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))
    ax1, ax2, ax3 = axes

    # Signed error, with a dashed reference line at zero
    ax1.hist(df['error'], bins=50, edgecolor='white', alpha=0.7)
    ax1.axvline(0, color='red', linestyle='--', linewidth=2)
    ax1.set(
        xlabel='Error (Predicted - Actual)',
        ylabel='Frequency',
        title='Error Distribution',
    )

    # Absolute error, with the MAE marked
    mae_label = f'MAE=${overall["mae"]:.2f}'
    ax2.hist(df['absolute_error'], bins=50, edgecolor='white', alpha=0.7, color='orange')
    ax2.axvline(overall['mae'], color='red', linestyle='--', linewidth=2, label=mae_label)
    ax2.set(
        xlabel='Absolute Error ($)',
        ylabel='Frequency',
        title='Absolute Error Distribution',
    )
    ax2.legend()

    # Percentage error; values above 100 are capped at 100 before binning,
    # so they all land in the right-most bin. The median is marked.
    median_label = f'Median={overall["median_pct_error"]:.1f}%'
    capped_pct = df['percentage_error'].clip(upper=100)
    ax3.hist(capped_pct, bins=50, edgecolor='white', alpha=0.7, color='green')
    ax3.axvline(overall['median_pct_error'], color='red', linestyle='--', linewidth=2, label=median_label)
    ax3.set(
        xlabel='Percentage Error (%)',
        ylabel='Frequency',
        title='Percentage Error Distribution',
    )
    ax3.legend()

    plt.tight_layout()
    plt.show()

## 3. Confidence Calibration

Do high-confidence predictions actually perform better?

In [None]:
if 'df' in dir():
    # Confidence bucket edges, display labels, and the x position (bucket
    # midpoint) at which each bucket's bar is drawn.
    conf_edges = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
    conf_labels = ['0-0.2', '0.2-0.4', '0.4-0.6', '0.6-0.8', '0.8-1.0']
    conf_mids = [0.1, 0.3, 0.5, 0.7, 0.9]
    mid_by_label = dict(zip(conf_labels, conf_mids))

    # pd.cut uses right-closed bins by default, i.e. (0, 0.2], (0.2, 0.4], ...
    # include_lowest=True makes the first bin [0, 0.2] so that predictions
    # with confidence exactly 0 are bucketed instead of becoming NaN.
    df['confidence_bucket'] = pd.cut(
        df['confidence'],
        bins=conf_edges,
        labels=conf_labels,
        include_lowest=True,
    )

    # Calculate metrics by confidence bucket
    conf_metrics = df.groupby('confidence_bucket', observed=True).agg({
        'absolute_error': ['count', 'mean', 'median'],
        'percentage_error': 'median',
        'error': lambda x: (x > 0).mean() * 100  # Overestimate %
    }).round(2)
    conf_metrics.columns = ['Count', 'MAE', 'Median Abs Err', 'Median % Err', 'Overestimate %']

    print("Accuracy by Confidence Level")
    print("="*70)
    print(conf_metrics.to_string())

    # observed=True drops buckets that contain no rows, so conf_metrics may
    # have fewer than five rows. Bar positions are therefore looked up from
    # the buckets that are actually present; a fixed five-element position
    # list would not match `maes` in length and ax.bar would raise.
    observed_labels = conf_metrics.index.tolist()
    bucket_mids = [mid_by_label[label] for label in observed_labels]
    maes = conf_metrics['MAE'].values
    counts = conf_metrics['Count'].values

    # Visualization
    fig, ax = plt.subplots(figsize=(10, 5))
    bars = ax.bar(bucket_mids, maes, width=0.15, alpha=0.7)
    ax.set_xlabel('Confidence Score')
    ax.set_ylabel('Mean Absolute Error ($)')
    ax.set_title('Confidence Calibration: Does Higher Confidence = Lower Error?')
    # Keep the full confidence axis labelled, even where a bucket is empty
    ax.set_xticks(conf_mids)
    ax.set_xticklabels(conf_labels)

    # Add count labels above each bar
    for bar, count in zip(bars, counts):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
                f'n={int(count)}', ha='center', va='bottom', fontsize=9)

    plt.tight_layout()
    plt.show()

    # Calibration assessment: compares the lowest and the highest confidence
    # bucket that contain data (first and last row of conf_metrics).
    if len(maes) >= 2:
        is_calibrated = maes[-1] < maes[0]  # High confidence should have lower MAE
        print(f"\nCalibration Assessment: {'PASS' if is_calibrated else 'FAIL'}")
        print(f"  Low confidence MAE: ${maes[0]:.2f}")
        print(f"  High confidence MAE: ${maes[-1]:.2f}")

## 4. OrderBook vs Sales Fallback

In [None]:
if 'df' in dir():
    # One metrics dict per prediction source, expanded into one column per metric
    per_source = df.groupby('source').apply(calculate_metrics)
    source_metrics = per_source.apply(pd.Series)

    report_cols = ['count', 'mae', 'rmse', 'median_abs_error', 'median_pct_error', 'overestimate_pct']
    print("Accuracy by Prediction Source")
    print("="*70)
    print(source_metrics[report_cols].round(2).to_string())

    # Visualization: MAE bars on the left, absolute-error box plots on the right
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    ax1, ax2 = axes

    # MAE comparison, each bar annotated with its value
    sources = source_metrics.index.tolist()
    maes = source_metrics['mae'].values
    ax1.bar(sources, maes, color=['#4ecdc4', '#ff6b6b'])
    ax1.set_ylabel('Mean Absolute Error ($)')
    ax1.set_title('MAE by Source')
    for i, mae in enumerate(maes):
        ax1.text(i, mae + 0.3, f'${mae:.2f}', ha='center')

    # Box plots; the axis title is set after plotting and the figure-level
    # suptitle is blanked afterwards
    df.boxplot(column='absolute_error', by='source', ax=ax2)
    ax2.set_ylabel('Absolute Error ($)')
    ax2.set_title('Error Distribution by Source')
    plt.suptitle('')

    plt.tight_layout()
    plt.show()

## 5. Price-Dependent Accuracy

Is accuracy different for high-value vs low-value cards?

In [None]:
if 'df' in dir():
    # Bucket rows by the actual sale price. Bins are right-closed:
    # (0, 10], (10, 25], (25, 50], (50, 100], (100, inf).
    df['price_bucket'] = pd.cut(
        df['next_sale_price'],
        bins=[0, 10, 25, 50, 100, float('inf')],
        labels=['$0-10', '$10-25', '$25-50', '$50-100', '$100+']
    )

    price_metrics = df.groupby('price_bucket', observed=True).agg({
        'absolute_error': ['count', 'mean', 'median'],
        'percentage_error': 'median',
    }).round(2)
    price_metrics.columns = ['Count', 'MAE', 'Median Abs Err', 'Median % Err']

    print("Accuracy by Price Range")
    print("="*60)
    print(price_metrics.to_string())

    # Scatter plot of absolute error against sale price, coloured by confidence
    fig, ax = plt.subplots(figsize=(10, 6))
    scatter = ax.scatter(
        df['next_sale_price'],
        df['absolute_error'],
        c=df['confidence'],
        cmap='viridis',
        alpha=0.5,
        s=20
    )
    ax.set_xlabel('Actual Sale Price ($)')
    ax.set_ylabel('Absolute Error ($)')
    ax.set_title('Error vs Price (colored by confidence)')
    plt.colorbar(scatter, ax=ax, label='Confidence')

    # Linear trend line. np.polyfit cannot handle NaN inputs and needs at
    # least two distinct x values for a degree-1 fit, so fit on complete rows
    # only and skip the line (and its legend) when that is not possible.
    trend_data = df[['next_sale_price', 'absolute_error']].dropna()
    if trend_data['next_sale_price'].nunique() >= 2:
        z = np.polyfit(trend_data['next_sale_price'], trend_data['absolute_error'], 1)
        p = np.poly1d(z)
        x_line = np.linspace(
            trend_data['next_sale_price'].min(),
            trend_data['next_sale_price'].max(),
            100,
        )
        ax.plot(x_line, p(x_line), 'r--', alpha=0.8, label='Trend')
        ax.legend()

    plt.tight_layout()
    plt.show()

## 6. Listing Count Impact

How many listings are needed for accurate predictions?

In [None]:
if 'df' in dir():
    # Bucket rows by the total_listings column. Bins are right-closed:
    # (0, 3], (3, 5], (5, 10], (10, 20], (20, inf).
    # A count of exactly 20 belongs to '11-20', so the open-ended bucket is
    # labelled '21+' to keep the labels non-overlapping.
    # Rows with total_listings of 0 (or missing) fall outside every bin, get a
    # NaN bucket, and are therefore not part of the table or the chart below.
    df['listing_bucket'] = pd.cut(
        df['total_listings'],
        bins=[0, 3, 5, 10, 20, float('inf')],
        labels=['1-3', '4-5', '6-10', '11-20', '21+']
    )

    listing_metrics = df.groupby('listing_bucket', observed=True).agg({
        'absolute_error': ['count', 'mean', 'median'],
        'percentage_error': 'median',
    }).round(2)
    listing_metrics.columns = ['Count', 'MAE', 'Median Abs Err', 'Median % Err']

    print("Accuracy by Number of Listings")
    print("="*60)
    print(listing_metrics.to_string())

    # Visualization: one bar per bucket that contains data (observed=True),
    # positioned by rank so empty buckets leave no gap.
    fig, ax = plt.subplots(figsize=(10, 5))
    buckets = listing_metrics.index.tolist()
    maes = listing_metrics['MAE'].values
    counts = listing_metrics['Count'].values

    bars = ax.bar(range(len(buckets)), maes, color='#4ecdc4')
    ax.set_xlabel('Number of Listings')
    ax.set_ylabel('Mean Absolute Error ($)')
    ax.set_title('Does More Data = Better Predictions?')
    ax.set_xticks(range(len(buckets)))
    ax.set_xticklabels(buckets)

    # Annotate each bar with its sample size
    for bar, count in zip(bars, counts):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.3,
                f'n={int(count)}', ha='center', va='bottom', fontsize=9)

    plt.tight_layout()
    plt.show()

## 7. Time-to-Sale Impact

Are predictions less accurate when sales happen days/weeks later?

In [None]:
if 'df' in dir():
    # Bucket rows by days_to_sale. Bins are right-closed, so the first bin
    # (-1, 0] captures a value of 0 and the last bin is everything above 14.
    day_edges = [-1, 0, 1, 3, 7, 14, float('inf')]
    day_labels = ['Same day', '1 day', '2-3 days', '4-7 days', '8-14 days', '14+ days']
    df['days_bucket'] = pd.cut(df['days_to_sale'], bins=day_edges, labels=day_labels)

    # Per-bucket sample size and error summaries (empty buckets are omitted)
    grouped_by_days = df.groupby('days_bucket', observed=True)
    time_metrics = grouped_by_days.agg({
        'absolute_error': ['count', 'mean', 'median'],
        'percentage_error': 'median',
    }).round(2)
    time_metrics.columns = ['Count', 'MAE', 'Median Abs Err', 'Median % Err']

    print("Accuracy by Time to Next Sale")
    print("="*60)
    print(time_metrics.to_string())

    # Visualization: MAE per time-to-sale bucket
    buckets = time_metrics.index.tolist()
    maes = time_metrics['MAE'].values
    positions = range(len(buckets))

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.bar(positions, maes, color='#ff6b6b')
    ax.set(
        xlabel='Days Until Next Sale',
        ylabel='Mean Absolute Error ($)',
        title='Prediction Accuracy Over Time',
    )
    ax.set_xticks(positions)
    ax.set_xticklabels(buckets, rotation=45)

    plt.tight_layout()
    plt.show()

## 8. Worst Predictions (Edge Cases)

What went wrong in the biggest failures?

In [None]:
if 'df' in dir():
    # The ten rows with the largest absolute error, limited to the columns
    # useful for diagnosing what went wrong
    report_cols = [
        'card_name', 'treatment', 'predicted_floor', 'next_sale_price',
        'error', 'confidence', 'source', 'total_listings'
    ]
    worst = df.nlargest(10, 'absolute_error')[report_cols]

    print("Top 10 Worst Predictions")
    print("="*80)
    print(worst.to_string(index=False))

    # Common patterns in failures: rows at or above the 80th percentile of
    # absolute error
    print("\n\nFailure Mode Analysis (Top 20% worst predictions):")
    print("="*60)
    threshold = df['absolute_error'].quantile(0.80)
    failures = df[df['absolute_error'] >= threshold]

    # (description, boolean mask over `failures`); each is reported as a
    # count and as a share of all failures
    failure_flags = [
        ("\nFailures with low listing count (<5)", failures['total_listings'] < 5),
        ("Failures with sales_fallback", failures['source'] == 'sales_fallback'),
        ("Failures with high confidence (>0.6)", failures['confidence'] > 0.6),
        ("Overestimates", failures['error'] > 0),
    ]
    for description, flag in failure_flags:
        print(f"{description}: {flag.sum()} ({flag.mean()*100:.1f}%)")

## 9. Conclusions & Recommendations

In [None]:
if 'df' in dir():
    # Final summary. Relies on `overall` computed in the overall-metrics cell.
    print("="*60)
    print("SUMMARY & RECOMMENDATIONS")
    print("="*60)

    # Key findings: grade the median percentage error (<15 good, <30 needs
    # improvement, otherwise poor)
    print("\n1. OVERALL ACCURACY")
    print(f"   - MAE: ${overall['mae']:.2f}")
    print(f"   - Median % Error: {overall['median_pct_error']:.1f}%")
    median_pct = overall['median_pct_error']
    if median_pct < 15:
        quality = "GOOD"
    elif median_pct < 30:
        quality = "NEEDS IMPROVEMENT"
    else:
        quality = "POOR"
    print(f"   - Quality: {quality}")

    # Confidence calibration: MAE of confidence >= 0.6 vs confidence < 0.4
    print("\n2. CONFIDENCE CALIBRATION")
    high_conf = df[df['confidence'] >= 0.6]['absolute_error'].mean()
    low_conf = df[df['confidence'] < 0.4]['absolute_error'].mean()
    # The mean of an empty group is NaN and any comparison with NaN is False.
    # Without this check a missing group would be reported as poor calibration
    # and trigger the "fix confidence algorithm" recommendation below.
    calibration_known = not (pd.isna(high_conf) or pd.isna(low_conf))
    is_calibrated = bool(calibration_known and high_conf < low_conf)
    print(f"   - High confidence MAE: ${high_conf:.2f}")
    print(f"   - Low confidence MAE: ${low_conf:.2f}")
    if calibration_known:
        print(f"   - Calibration: {'GOOD' if is_calibrated else 'POOR - High confidence predictions are not better!'}")
    else:
        print("   - Calibration: UNKNOWN - no observations in the high or low confidence group")

    # Source comparison (only when both sources have observations)
    print("\n3. ORDERBOOK VS SALES FALLBACK")
    ob_data = df[df['source'] == 'order_book']
    sf_data = df[df['source'] == 'sales_fallback']
    if len(ob_data) > 0 and len(sf_data) > 0:
        ob_mae = ob_data['absolute_error'].mean()
        sf_mae = sf_data['absolute_error'].mean()
        print(f"   - OrderBook MAE: ${ob_mae:.2f} (n={len(ob_data)})")
        print(f"   - Sales Fallback MAE: ${sf_mae:.2f} (n={len(sf_data)})")
        better = "OrderBook" if ob_mae < sf_mae else "Sales Fallback"
        print(f"   - Better source: {better}")

    # Recommendations
    print("\n4. RECOMMENDATIONS")
    if calibration_known and not is_calibrated:
        print("   - [ ] Fix confidence algorithm - high confidence should = low error")
    if overall['median_pct_error'] > 20:
        print("   - [ ] Improve bucket algorithm for better floor estimation")
    if overall['overestimate_pct'] > 60:
        print("   - [ ] Algorithm tends to OVERESTIMATE - consider bias correction")
    elif overall['overestimate_pct'] < 40:
        print("   - [ ] Algorithm tends to UNDERESTIMATE - consider bias correction")
    # NOTE(review): the two recommendations below are printed unconditionally,
    # regardless of which source section 3 found to be better - confirm that
    # this is intended.
    print("   - [ ] Keep OrderBook as FALLBACK, not primary pricing source")
    print("   - [ ] Use sales-based floor as primary when available")