# Backtest Results Analysis

This notebook analyzes the backtest results from all models:
- 3 Baselines: PCA+Ridge, Autoencoder, MLP
- 3 QCML variants: Full, No-Ranking, Real-Only

In [None]:
import json
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Notebook-wide plotting look: matplotlib style sheet plus seaborn color palette
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# Default figure size and base font size for figures that do not override them
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 11,
})

## 1. Load Results

In [None]:
# Input files, relative to the notebook's working directory
results_path = Path('../outputs/backtest/backtest_results.json')
processed_path = Path('../data/processed/processed_data.pkl')

# Backtest output: a dict keyed by model name
with results_path.open('r') as results_file:
    results = json.load(results_file)

# Processed dataset (provides the price table used for the SPY benchmark).
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with processed_path.open('rb') as data_file:
    data = pickle.load(data_file)

prices = data['prices']

# The 'pca_ridge' date list is used as the test calendar for the whole notebook
test_dates = results['pca_ridge']['dates']

print(f"Test period: {test_dates[0]} to {test_dates[-1]}")
print(f"Models: {list(results.keys())}")

## 2. Model Comparison Table

In [None]:
# Formatters for the two display conventions used in the table
def _as_percent(value):
    """Render a fraction as a percentage with two decimals."""
    return f"{value*100:.2f}%"


def _as_ratio(value):
    """Render a ratio with three decimals."""
    return f"{value:.3f}"


# (column label, key in the model's 'metrics' dict, formatter), in display order
table_columns = [
    ('Total Return', 'total_return', _as_percent),
    ('Ann. Return', 'annual_return', _as_percent),
    ('Ann. Volatility', 'annual_volatility', _as_percent),
    ('Sharpe', 'sharpe_ratio', _as_ratio),
    ('Sortino', 'sortino_ratio', _as_ratio),
    ('Max DD', 'max_drawdown', _as_percent),
    ('Calmar', 'calmar_ratio', _as_ratio),
    ('Hit Rate', 'hit_rate', _as_percent),
    ('Avg Turnover', 'avg_turnover', _as_percent),
]

# One pre-formatted row per model
comparison = []
for model_name, model_results in results.items():
    metrics = model_results['metrics']
    row = {'Model': model_name}
    row.update({label: fmt(metrics[key]) for label, key, fmt in table_columns})
    comparison.append(row)

comparison_df = pd.DataFrame(comparison).set_index('Model')
comparison_df

## 3. Equity Curves

In [None]:
# Test calendar as datetimes; reused as the x-axis by later cells
dates = pd.to_datetime(test_dates)

# One fixed color per model; later cells look models up in this dict as well
colors = {
    'pca_ridge': '#e74c3c',
    'autoencoder': '#3498db',
    'mlp': '#2ecc71',
    'qcml_full': '#9b59b6',
    'qcml_no_ranking': '#f39c12',
    'qcml_real_only': '#1abc9c'
}

fig, ax = plt.subplots(figsize=(14, 7))

# One equity line per model
for model_name, model_results in results.items():
    equity = model_results['equity_curve']
    ax.plot(
        dates,
        equity,
        color=colors[model_name],
        linewidth=2,
        label=model_name,
    )

# SPY benchmark, rebased so its first value on the test calendar equals 1.0
spy_prices = prices['SPY'].loc[dates]
spy_normalized = spy_prices.div(spy_prices.iloc[0])
ax.plot(
    dates,
    spy_normalized,
    color='black',
    linestyle='--',
    linewidth=1.5,
    alpha=0.7,
    label='SPY (benchmark)',
)

# Reference line at the starting portfolio value
ax.axhline(1.0, color='gray', linestyle=':', alpha=0.5)

ax.set(
    xlabel='Date',
    ylabel='Portfolio Value (normalized)',
    title='Equity Curves: All Models vs SPY Benchmark',
)
ax.legend(loc='upper left')
ax.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig('../outputs/backtest/equity_curves.png', dpi=150)
plt.show()

## 4. Drawdown Analysis

In [None]:
def compute_drawdown(equity_curve):
    """Return the fractional drawdown at every point of an equity curve.

    Each element is ``(value - running_peak) / running_peak``, where the
    running peak is the maximum of the curve up to and including that point.
    For positive equity values the result is 0 at a new high and negative
    whenever the curve sits below its previous high.

    Args:
        equity_curve: Sequence (or array) of portfolio values.

    Returns:
        numpy array of drawdowns with the same length as the input.
    """
    values = np.asarray(equity_curve)
    running_peak = np.maximum.accumulate(values)
    return (values - running_peak) / running_peak

fig, ax = plt.subplots(figsize=(14, 6))

# Shade the region between zero and each model's drawdown (shown in percent)
for model_name, model_results in results.items():
    dd = compute_drawdown(model_results['equity_curve'])
    ax.fill_between(
        dates,
        100 * dd,
        0,
        color=colors[model_name],
        alpha=0.3,
        label=model_name,
    )

ax.set(
    xlabel='Date',
    ylabel='Drawdown (%)',
    title='Drawdown Analysis by Model',
)
ax.legend(loc='lower left')
ax.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig('../outputs/backtest/drawdowns.png', dpi=150)
plt.show()

## 5. Monthly Returns Heatmap

In [None]:
# Model highlighted in this cell and in the turnover cell further down.
# NOTE(review): the choice is hard-coded here, not derived from the metrics.
best_model = 'qcml_real_only'
returns = np.array(results[best_model]['returns'])
dates_dt = pd.to_datetime(test_dates)


def _compound(period_returns):
    """Compound a series of simple returns into one return for the period."""
    return (1 + period_returns).prod() - 1


# Per-period returns tagged with calendar year and month
returns_df = pd.DataFrame({'date': dates_dt, 'return': returns}).assign(
    year=lambda frame: frame['date'].dt.year,
    month=lambda frame: frame['date'].dt.month,
)

# Compound within each (year, month), then pivot to a year x month grid in percent
monthly = returns_df.groupby(['year', 'month'])['return'].apply(_compound)
monthly_pivot = monthly.unstack(level=1) * 100

fig, ax = plt.subplots(figsize=(12, 5))
sns.heatmap(
    monthly_pivot,
    annot=True,
    fmt='.1f',
    cmap='RdYlGn',
    center=0,
    cbar_kws={'label': 'Return (%)'},
    ax=ax,
)
# Set labels after the heatmap call so they are the ones displayed
ax.set(
    title=f'Monthly Returns Heatmap: {best_model}',
    xlabel='Month',
    ylabel='Year',
)

fig.tight_layout()
fig.savefig('../outputs/backtest/monthly_returns.png', dpi=150)
plt.show()

## 6. Hit Rate Analysis

In [None]:
# Hit rate per model, converted from a fraction to percent
hit_rates = {name: results[name]['metrics']['hit_rate'] * 100 for name in results}
model_names = list(hit_rates.keys())
hit_values = list(hit_rates.values())

fig, ax = plt.subplots(figsize=(10, 5))
bars = ax.bar(model_names, hit_values, color=[colors[k] for k in model_names])
ax.axhline(y=50, color='red', linestyle='--', label='Random (50%)')
ax.set_ylabel('Hit Rate (%)')
ax.set_title('Hit Rate by Model (% of top picks outperforming SPY)')

# Keep the 40-55% window as the default view, but widen it whenever a model
# falls outside (with a 2-point margin) so no bar or value label is clipped.
y_low = min(40, min(hit_values, default=40) - 2)
y_high = max(55, max(hit_values, default=55) + 2)
ax.set_ylim(y_low, y_high)
ax.legend()

# Print each bar's value just above its top edge
for bar, val in zip(bars, hit_values):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.3, f'{val:.1f}%',
            ha='center', va='bottom', fontsize=10)

plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('../outputs/backtest/hit_rates.png', dpi=150)
plt.show()

## 7. Risk-Return Scatter

In [None]:
fig, ax = plt.subplots(figsize=(10, 8))

# One labelled point per model: x = annual volatility, y = annual return (both in %)
for model_name, model_results in results.items():
    metrics = model_results['metrics']
    sharpe = metrics['sharpe_ratio']
    vol = metrics['annual_volatility'] * 100
    ret = metrics['annual_return'] * 100
    point = (vol, ret)

    ax.scatter(
        *point,
        s=200,
        c=colors[model_name],
        edgecolors='black',
        label=f'{model_name} (SR={sharpe:.2f})',
    )
    ax.annotate(model_name, point, xytext=(5, 5), textcoords='offset points', fontsize=9)

# Reference lines: zero return, and a vertical marker at 6% volatility
# NOTE(review): the 6% level is hard-coded; its rationale is not stated in this notebook
ax.axhline(0, color='gray', linestyle='--', alpha=0.5)
ax.axvline(6, color='gray', linestyle=':', alpha=0.5)

ax.set(
    xlabel='Annual Volatility (%)',
    ylabel='Annual Return (%)',
    title='Risk-Return Profile by Model',
)
ax.legend(loc='upper left')
ax.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig('../outputs/backtest/risk_return.png', dpi=150)
plt.show()

## 8. Turnover Analysis

In [None]:
# Uses `best_model`, `dates` and `colors` defined in earlier cells
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_avg, ax_ts = axes

# Left panel: average turnover of every model, in percent
turnovers = {name: results[name]['metrics']['avg_turnover'] * 100 for name in results}
ax_avg.bar(turnovers.keys(), turnovers.values(), color=[colors[k] for k in turnovers.keys()])
ax_avg.set(
    ylabel='Average Weekly Turnover (%)',
    title='Average Turnover by Model',
)
ax_avg.tick_params(axis='x', rotation=45)

# Right panel: turnover time series of the highlighted model, with its mean marked
turnover_ts = np.array(results[best_model]['turnovers']) * 100
mean_turnover = turnovers[best_model]
ax_ts.plot(dates, turnover_ts, color=colors[best_model], alpha=0.7)
ax_ts.axhline(y=mean_turnover, color='red', linestyle='--', label=f'Mean: {mean_turnover:.1f}%')
ax_ts.set(
    xlabel='Date',
    ylabel='Weekly Turnover (%)',
    title=f'Turnover Over Time: {best_model}',
)
ax_ts.legend()

fig.tight_layout()
fig.savefig('../outputs/backtest/turnover_analysis.png', dpi=150)
plt.show()

## 9. Key Findings Summary

In [None]:
# Find the best model for each headline metric.
# All four metrics are ranked with max(): for max_drawdown "less negative is
# better", so the largest value is also the best one. The original special-case
# branch for max_drawdown was identical to the general branch and is removed.
# NOTE(review): assumes max_drawdown is stored as a non-positive fraction;
# if it is stored as a positive magnitude, it needs min() instead - confirm.
metrics_to_compare = ['sharpe_ratio', 'total_return', 'max_drawdown', 'hit_rate']
best_by_metric = {}

for metric in metrics_to_compare:
    best_model_name = max(results.keys(), key=lambda x: results[x]['metrics'][metric])
    # Store (model name, metric value) for the report below
    best_by_metric[metric] = (best_model_name, results[best_model_name]['metrics'][metric])

print("="*60)
print("KEY FINDINGS")
print("="*60)
print(f"\nBest by Sharpe Ratio: {best_by_metric['sharpe_ratio'][0]} ({best_by_metric['sharpe_ratio'][1]:.3f})")
print(f"Best by Total Return: {best_by_metric['total_return'][0]} ({best_by_metric['total_return'][1]*100:.2f}%)")
print(f"Best by Max Drawdown: {best_by_metric['max_drawdown'][0]} ({best_by_metric['max_drawdown'][1]*100:.2f}%)")
print(f"Best by Hit Rate: {best_by_metric['hit_rate'][0]} ({best_by_metric['hit_rate'][1]*100:.2f}%)")

# NOTE(review): the observations below are hand-written text, not computed from
# `results`; re-check them whenever the backtest is re-run.
print("\n" + "="*60)
print("OBSERVATIONS")
print("="*60)
print("""
1. qcml_real_only is the only model with positive returns (+4.0%)
2. Complex embeddings (qcml_full) underperform real-only embeddings in live trading
3. All baseline models have negative returns in the test period (2023-2025)
4. QCML models generally have lower volatility than baselines
5. Ranking loss helps in prediction correlation but not in trading returns
""")