# Prediction Analysis

Analyze model predictions vs actual returns:
- Scatter plots (predictions vs actuals)
- Directional accuracy
- Error distributions
- Ranking agreement

In [None]:
import sys
sys.path.insert(0, '..')

import json
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr, pearsonr
import torch

# Notebook-wide plotting defaults: whitegrid theme, then a 12x6 inch default
# figure size (cells that pass their own figsize override it).
plt.style.use('seaborn-v0_8-whitegrid')
plt.rc('figure', figsize=(12, 6))

## 1. Load Models and Data

In [None]:
def _load_pickle(path):
    """Return the object deserialized from the pickle file at *path*."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE: unpickling can execute arbitrary code, so these artifacts should only
# ever be files written by this project's own pipeline.
data = _load_pickle('../data/processed/processed_data.pkl')
baseline_models = _load_pickle('../outputs/baselines/baseline_models.pkl')
qcml_models = _load_pickle('../outputs/qcml/qcml_models.pkl')

# Pull the split container and the feature column names out of the bundle.
feature_cols = data['feature_cols']
splits = data['splits']

# Test split: feature matrix and the excess-return target as NumPy arrays.
test_split = splits.test
X_test = test_split[feature_cols].values
y_test = test_split['excess_return'].values

print(f"Test samples: {len(X_test)}")
print(f"Features: {feature_cols}")

## 2. Generate Predictions

In [None]:
# Run PyTorch inference on the MPS backend when available, otherwise on CPU.
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')

# Build the float32 input tensor once and reuse it for every PyTorch model
# instead of re-creating and re-uploading it inside each loop iteration.
X_tensor = torch.as_tensor(X_test, dtype=torch.float32).to(device)


def _predict_torch(model):
    """Return the flattened NumPy predictions of a PyTorch *model* on the test set."""
    # The model has to live on the same device as its input; without this a
    # CPU-resident baseline fails with a device mismatch whenever `device`
    # is MPS.
    model.to(device)
    model.eval()
    with torch.no_grad():
        return model(X_tensor).cpu().numpy().flatten()


# Maps model name -> 1-D array of predictions on the test set.
predictions = {}

# Baseline predictions: objects exposing `.predict` are called directly on
# the NumPy feature matrix; anything else is treated as a PyTorch model.
for name, model in baseline_models.items():
    if hasattr(model, 'predict'):
        predictions[name] = model.predict(X_test)
    else:
        predictions[name] = _predict_torch(model)

# QCML predictions (all treated as PyTorch models).
for name, model in qcml_models.items():
    predictions[name] = _predict_torch(model)

print(f"Models: {list(predictions.keys())}")

## 3. Predictions vs Actuals Scatter

In [None]:
# One scatter panel per model, three panels per row. The grid is sized from
# the number of models: a fixed 2x3 grid raised IndexError with more than
# six models and left empty axes on screen with fewer.
n_models = len(predictions)
n_cols = 3
n_rows = max(1, -(-n_models // n_cols))  # ceiling division, at least one row
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
axes = axes.flatten()

for idx, (name, preds) in enumerate(predictions.items()):
    ax = axes[idx]

    # Scatter plot, both axes scaled by 100 to display percent.
    ax.scatter(y_test * 100, preds * 100, alpha=0.3, s=10)

    # Reference line y = x spanning the joint range of actuals and
    # predictions, padded by one percentage point on each side.
    lims = [min(y_test.min(), preds.min()) * 100 - 1, max(y_test.max(), preds.max()) * 100 + 1]
    ax.plot(lims, lims, 'r--', alpha=0.5, label='Perfect prediction')

    # Linear (Pearson) and rank (Spearman) correlation over the full test set.
    corr = np.corrcoef(y_test, preds)[0, 1]
    spearman = spearmanr(y_test, preds)[0]

    ax.set_xlabel('Actual Excess Return (%)')
    ax.set_ylabel('Predicted Excess Return (%)')
    ax.set_title(f'{name}\nPearson: {corr:.3f}, Spearman: {spearman:.3f}')
    ax.legend(loc='upper left')
    ax.grid(True, alpha=0.3)

# Hide grid cells that have no model assigned to them.
for unused_ax in axes[n_models:]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.savefig('../outputs/backtest/predictions_scatter.png', dpi=150)
plt.show()

## 4. Directional Accuracy

In [None]:
# Directional accuracy: percentage of test samples whose predicted sign equals
# the sign of the actual return. Samples whose actual return is exactly zero
# are always counted as hits.
actual_sign = np.sign(y_test)
zero_actual = y_test == 0
directional_acc = {
    name: ((np.sign(preds) == actual_sign) | zero_actual).mean() * 100
    for name, preds in predictions.items()
}

# Bar chart: one bar per model plus a 50% reference line.
model_names = list(directional_acc)
acc_values = list(directional_acc.values())

fig, ax = plt.subplots(figsize=(10, 5))
colors = plt.cm.Set2(np.linspace(0, 1, len(model_names)))
bars = ax.bar(model_names, acc_values, color=colors)
ax.axhline(y=50, color='red', linestyle='--', label='Random (50%)')
ax.set_title('Sign Prediction Accuracy by Model')
ax.set_ylabel('Directional Accuracy (%)')
ax.legend()

# Write each accuracy value just above its bar.
for bar, val in zip(bars, acc_values):
    x_center = bar.get_x() + bar.get_width()/2
    ax.text(x_center, bar.get_height() + 0.3, f'{val:.1f}%',
            ha='center', va='bottom', fontsize=10)

plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('../outputs/backtest/directional_accuracy.png', dpi=150)
plt.show()

## 5. Error Distribution

In [None]:
# One error histogram per model, three panels per row. The grid is sized from
# the number of models: a fixed 2x3 grid raised IndexError with more than
# six models and left empty axes on screen with fewer.
n_models = len(predictions)
n_cols = 3
n_rows = max(1, -(-n_models // n_cols))  # ceiling division, at least one row
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False)
axes = axes.flatten()

for idx, (name, preds) in enumerate(predictions.items()):
    ax = axes[idx]
    errors = (preds - y_test) * 100  # In percentage points

    ax.hist(errors, bins=50, alpha=0.7, edgecolor='black')
    # Red dashed line marks zero error; green line marks the mean error (bias).
    ax.axvline(x=0, color='red', linestyle='--', alpha=0.7)
    ax.axvline(x=errors.mean(), color='green', linestyle='-', label=f'Mean: {errors.mean():.3f}%')

    ax.set_xlabel('Prediction Error (%)')
    ax.set_ylabel('Frequency')
    ax.set_title(f'{name}\nStd: {errors.std():.3f}%, MAE: {np.abs(errors).mean():.3f}%')
    ax.legend()

# Hide grid cells that have no model assigned to them.
for unused_ax in axes[n_models:]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.savefig('../outputs/backtest/error_distributions.png', dpi=150)
plt.show()

## 6. Ranking Agreement Analysis

In [None]:
# Cross-sectional ranking agreement: for every distinct `date` in the test
# split (presumably one per week -- confirm against the data pipeline),
# compute the Spearman correlation between actual and predicted returns.
test_df = splits.test.copy()
test_df['week_idx'] = test_df.groupby('date').ngroup()

# Add predictions to dataframe
for name, preds in predictions.items():
    test_df[f'pred_{name}'] = preds

# Model name -> list of per-period Spearman correlations.
weekly_corrs = {name: [] for name in predictions}

# A single groupby pass replaces re-filtering the whole frame once per period;
# sort=False keeps the periods in order of first appearance.
for _week, week_data in test_df.groupby('week_idx', sort=False):
    if len(week_data) <= 2:  # Need at least 3 points for ranking
        continue
    actuals = week_data['excess_return'].values

    for name in predictions:
        corr, _ = spearmanr(actuals, week_data[f'pred_{name}'].values)
        # spearmanr yields NaN when either input is constant; skip those.
        if not np.isnan(corr):
            weekly_corrs[name].append(corr)

# Plot distribution of weekly ranking correlations
fig, ax = plt.subplots(figsize=(12, 5))

model_names = list(predictions.keys())
data_to_plot = [weekly_corrs[name] for name in model_names]
# Tick labels are set on the axes rather than through boxplot's `labels=`
# keyword, which newer matplotlib releases deprecate in favour of
# `tick_labels=`; this form works with either.
bp = ax.boxplot(data_to_plot, patch_artist=True)
ax.set_xticklabels(model_names)

colors = plt.cm.Set2(np.linspace(0, 1, len(predictions)))
for patch, color in zip(bp['boxes'], colors):
    patch.set_facecolor(color)

ax.axhline(y=0, color='red', linestyle='--', alpha=0.5)
ax.set_ylabel('Weekly Spearman Correlation')
ax.set_title('Distribution of Weekly Ranking Correlations')
plt.xticks(rotation=45, ha='right')

# Add mean values. A model with no valid periods gets NaN (without numpy's
# empty-mean warning) and no annotation.
means = [np.mean(weekly_corrs[name]) if weekly_corrs[name] else np.nan for name in model_names]
for i, mean in enumerate(means, 1):
    if np.isnan(mean):
        continue
    # Boxes sit at x = 1..n, hence the enumerate start of 1.
    ax.annotate(f'μ={mean:.3f}', xy=(i, mean), xytext=(5, 0), textcoords='offset points', fontsize=9)

plt.tight_layout()
plt.savefig('../outputs/backtest/ranking_correlations.png', dpi=150)
plt.show()

## 7. Performance by Magnitude

In [None]:
# Directional accuracy broken down by the size of the actual move: test
# samples are split into five equal-count buckets of |actual return|.
abs_returns = np.abs(y_test)
quintiles = pd.qcut(abs_returns, 5, labels=['Q1 (small)', 'Q2', 'Q3', 'Q4', 'Q5 (large)'])

# Same hit definition as the Directional Accuracy section: signs agree, or
# the actual return is exactly zero. Without the zero rule the quintile
# figures would not be comparable with that section's overall accuracy.
actual_sign = np.sign(y_test)
zero_actual = y_test == 0

accuracy_by_quintile = {}
for name, preds in predictions.items():
    sign_match = (np.sign(preds) == actual_sign) | zero_actual
    # observed=False is stated explicitly: it keeps every quintile label in
    # the result and avoids pandas' warning about the changing default for
    # categorical groupers.
    accuracy_by_quintile[name] = (
        pd.Series(sign_match).groupby(quintiles, observed=False).mean() * 100
    )

# Plot: one bar group per quintile, one bar per model.
acc_df = pd.DataFrame(accuracy_by_quintile)

fig, ax = plt.subplots(figsize=(12, 6))
acc_df.plot(kind='bar', ax=ax, width=0.8)
ax.axhline(y=50, color='red', linestyle='--', label='Random')
ax.set_xlabel('Return Magnitude Quintile')
ax.set_ylabel('Directional Accuracy (%)')
ax.set_title('Prediction Accuracy by Return Magnitude')
ax.legend(loc='upper right')
plt.xticks(rotation=0)

plt.tight_layout()
plt.savefig('../outputs/backtest/accuracy_by_magnitude.png', dpi=150)
plt.show()

## 8. Summary Statistics

In [None]:
# Compile one row of headline metrics per model. Values are pre-formatted
# strings, so the resulting table is for display rather than further math.
actual_sign = np.sign(y_test)
zero_actual = y_test == 0

summary = []
for name, preds in predictions.items():
    errors = preds - y_test
    # Same hit definition as the Directional Accuracy section (an exactly-zero
    # actual return counts as a hit), so this column matches that chart.
    sign_match = (np.sign(preds) == actual_sign) | zero_actual
    summary.append({
        'Model': name,
        'Pearson Corr': f"{np.corrcoef(y_test, preds)[0,1]:.4f}",
        'Spearman Corr': f"{spearmanr(y_test, preds)[0]:.4f}",
        'Sign Accuracy': f"{sign_match.mean()*100:.2f}%",
        'MSE': f"{np.mean(errors**2):.6f}",           # in raw (squared) return units
        'MAE': f"{np.mean(np.abs(errors))*100:.4f}%",  # scaled by 100 to percent
        'Bias': f"{np.mean(errors)*100:.4f}%"          # mean signed error, in percent
    })

summary_df = pd.DataFrame(summary).set_index('Model')
summary_df