## Cell 1: Environment Setup and Imports

In [None]:
"""
Phase 8.5: Environment Setup and Imports

This cell configures the environment and imports all necessary modules
for selective prediction evaluation metrics.
"""

import os
import sys
import warnings
from pathlib import Path
from datetime import datetime

# Silence library warnings so the notebook output stays readable
warnings.filterwarnings('ignore')

# Resolve the project root: step up one level when launched from notebooks/
if Path.cwd().name == 'notebooks':
    PROJECT_ROOT = Path.cwd().parent
else:
    PROJECT_ROOT = Path.cwd()

# Work relative to the project root and put it first on the import path
os.chdir(PROJECT_ROOT)
sys.path.insert(0, str(PROJECT_ROOT))

# Core imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json

# PyTorch imports
import torch
import torch.nn.functional as F

# Phase 8.5: Selective Metrics imports
from src.selection import (
    # Data classes
    SelectiveMetrics,
    RiskCoverageCurve,
    
    # Core metric functions
    compute_coverage,
    compute_selective_accuracy,
    compute_selective_risk,
    compute_risk_on_rejected,
    compute_improvement,
    compute_rejection_quality,
    compute_rejection_precision_recall,
    
    # Risk-coverage
    compute_risk_coverage_curve,
    compute_aurc,
    
    # Calibration
    compute_ece_post_selection,
    
    # Main entry point
    compute_selective_metrics,
    
    # Comparison
    compare_strategies,
    
    # Visualization
    plot_risk_coverage_curve,
    plot_accuracy_coverage_curve,
    plot_strategy_comparison,
    
    # Utilities
    find_threshold_for_coverage,
    compute_metrics_at_coverage,
    validate_hypothesis_h3a,
)

# Plot configuration: one shared style and font sizing for every figure
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
    'axes.labelsize': 14,
    'axes.titlesize': 16,
    'legend.fontsize': 11,
})

# Create results directory (parents included; no error if it already exists)
RESULTS_DIR = PROJECT_ROOT / 'results' / 'phase_8_5'
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Device configuration: CUDA when PyTorch reports it available, otherwise CPU
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Environment banner, useful for reproducing a run later
print("="*60)
print("PHASE 8.5: SELECTIVE PREDICTION EVALUATION METRICS")
print("="*60)
print(f"Project Root: {PROJECT_ROOT}")
print(f"Results Dir:  {RESULTS_DIR}")
print(f"Device:       {DEVICE}")
print(f"PyTorch:      {torch.__version__}")
print(f"NumPy:        {np.__version__}")
print(f"Timestamp:    {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("="*60)
# Status marker restored from mis-decoded bytes to the intended check mark
print("✅ Environment setup complete!")

## Cell 2: Load Selective Prediction Data

Load predictions, labels, confidence scores, and stability scores from Phase 8.4.

In [None]:
"""
Phase 8.5: Load Selective Prediction Data

Load the prediction data from previous phases or generate synthetic data
for demonstration purposes.
"""

# Configuration
USE_REAL_DATA = False  # Set to True when real data is available
DATA_PATH = PROJECT_ROOT / 'results' / 'phase_8' / 'stability_scores.csv'

if USE_REAL_DATA and DATA_PATH.exists():
    print("Loading real data from Phase 8.4...")
    data = pd.read_csv(DATA_PATH)
    predictions = data['prediction'].values
    labels = data['label'].values
    confidences = data['confidence'].values
    stability = data['stability'].values
    print(f"Loaded {len(predictions)} samples")
else:
    print("Generating synthetic data for demonstration...")
    np.random.seed(42)
    
    # Parameters
    N_SAMPLES = 1000
    N_CLASSES = 3
    ERROR_RATE = 0.15  # 15% error rate
    
    # Generate ground truth labels
    labels = np.random.randint(0, N_CLASSES, size=N_SAMPLES)
    
    # Generate predictions (with controlled error rate)
    predictions = labels.copy()
    n_errors = int(N_SAMPLES * ERROR_RATE)
    error_indices = np.random.choice(N_SAMPLES, size=n_errors, replace=False)
    predictions[error_indices] = (labels[error_indices] + np.random.randint(1, N_CLASSES, size=n_errors)) % N_CLASSES
    
    # Generate confidence scores (errors have lower confidence)
    confidences = np.random.uniform(0.75, 0.99, size=N_SAMPLES)
    confidences[error_indices] = np.random.uniform(0.40, 0.70, size=n_errors)
    
    # Generate stability scores (errors have lower stability)
    stability = np.random.uniform(0.70, 0.95, size=N_SAMPLES)
    stability[error_indices] = np.random.uniform(0.30, 0.60, size=n_errors)
    
    print(f"Generated {N_SAMPLES} synthetic samples")

# Compute derived quantities
is_correct = predictions == labels
overall_accuracy = np.mean(is_correct)

# Summary statistics
print("\n" + "="*60)
print("DATA SUMMARY")
print("="*60)
print(f"Total Samples:      {len(predictions):,}")
print(f"Overall Accuracy:   {overall_accuracy:.2%}")
print(f"Error Rate:         {1-overall_accuracy:.2%}")
print(f"\nConfidence Scores:")
print(f"  Mean:   {confidences.mean():.3f}")
print(f"  Std:    {confidences.std():.3f}")
print(f"  Range:  [{confidences.min():.3f}, {confidences.max():.3f}]")
print(f"\nStability Scores:")
print(f"  Mean:   {stability.mean():.3f}")
print(f"  Std:    {stability.std():.3f}")
print(f"  Range:  [{stability.min():.3f}, {stability.max():.3f}]")
print("="*60)

## Cell 3: Compute Core Selective Metrics

Compute all core metrics for a given selection threshold.

In [None]:
"""
Phase 8.5: Compute Core Selective Metrics

Compute comprehensive selective prediction metrics using combined
confidence + stability scoring at 90% coverage target.
"""

# Configuration
TARGET_COVERAGE = 0.90
CONFIDENCE_WEIGHT = 0.5
STABILITY_WEIGHT = 0.5

# Compute combined scores
combined_scores = CONFIDENCE_WEIGHT * confidences + STABILITY_WEIGHT * stability

# Find threshold for target coverage
threshold = find_threshold_for_coverage(combined_scores, TARGET_COVERAGE)
is_accepted = combined_scores >= threshold

print(f"Target Coverage: {TARGET_COVERAGE:.0%}")
print(f"Score Threshold: {threshold:.4f}")
print(f"Actual Coverage: {np.mean(is_accepted):.2%}")

# Compute comprehensive metrics with bootstrap CIs
print("\nComputing metrics with bootstrap confidence intervals...")
metrics = compute_selective_metrics(
    predictions=predictions,
    labels=labels,
    is_accepted=is_accepted,
    confidences=confidences,
    scores=combined_scores,
    compute_ci=True,
    n_bootstrap=1000,
    confidence_level=0.95,
    metadata={
        "target_coverage": TARGET_COVERAGE,
        "threshold": threshold,
        "confidence_weight": CONFIDENCE_WEIGHT,
        "stability_weight": STABILITY_WEIGHT,
    }
)

# Display summary
print(metrics.summary())

## Cell 4: Hypothesis H3a Validation

Validate hypothesis H3a: ≥4pp improvement at 90% coverage.

In [None]:
"""
Phase 8.5: Hypothesis H3a Validation

Validate the primary hypothesis:
H3a: Selective prediction achieves ‚â•4pp improvement at 90% coverage
"""

# Validate H3a
h3a_result = validate_hypothesis_h3a(
    metrics,
    target_improvement=0.04,  # 4 percentage points
    target_coverage=0.90
)

# Display results
print("="*60)
print("HYPOTHESIS H3a VALIDATION")
print("="*60)
print(f"\nHypothesis: {h3a_result['description']}")
print(f"\n{'='*40}")
print(f"  Coverage:           {h3a_result['coverage']:.1%}")
print(f"  Selective Accuracy: {metrics.selective_accuracy:.2%}")
print(f"  Overall Accuracy:   {metrics.overall_accuracy:.2%}")
print(f"  Improvement:        {h3a_result['improvement_pp']:+.2f}pp")
print(f"  Target:             ‚â•{h3a_result['target_improvement']*100:.0f}pp")
print(f"  Margin:             {h3a_result['margin']*100:+.2f}pp")
print(f"{'='*40}")

if h3a_result['passed']:
    print(f"\n‚úÖ HYPOTHESIS H3a: PASSED")
    print(f"   Selective prediction achieves {h3a_result['improvement_pp']:.2f}pp improvement")
    print(f"   at {h3a_result['coverage']:.1%} coverage (exceeds target by {h3a_result['margin']*100:.2f}pp)")
else:
    print(f"\n‚ùå HYPOTHESIS H3a: NOT PASSED")
    print(f"   Improvement of {h3a_result['improvement_pp']:.2f}pp is below target of 4pp")

# Confidence interval for improvement
if 'improvement' in metrics.confidence_intervals:
    ci_low, ci_high = metrics.confidence_intervals['improvement']
    print(f"\n95% CI for Improvement: [{ci_low*100:.2f}pp, {ci_high*100:.2f}pp]")
    if ci_low >= 0.04:
        print("   ‚úÖ Entire CI above target threshold - strong evidence!")

print("="*60)

## Cell 5: Strategy Comparison (H3b Validation)

Compare different gating strategies to validate H3b:
Combined gating outperforms single-signal approaches.

In [None]:
"""
Phase 8.5: Strategy Comparison for H3b Validation

Compare four gating strategies:
1. Confidence-only: Accept if confidence ‚â• threshold
2. Stability-only: Accept if stability ‚â• threshold
3. Combined (AND): Accept if both thresholds met
4. Combined Score: Accept based on weighted average score
"""

# Configuration for comparison
CONFIDENCE_THRESHOLD = 0.85
STABILITY_THRESHOLD = 0.75

# Compare strategies
strategy_results = compare_strategies(
    predictions=predictions,
    labels=labels,
    confidence_scores=confidences,
    stability_scores=stability,
    confidence_threshold=CONFIDENCE_THRESHOLD,
    stability_threshold=STABILITY_THRESHOLD,
    target_coverage=TARGET_COVERAGE
)

# Display comparison table
print("="*80)
print("STRATEGY COMPARISON (H3b VALIDATION)")
print("="*80)
print(f"\nThresholds: œÑ_conf={CONFIDENCE_THRESHOLD}, œÑ_stab={STABILITY_THRESHOLD}")
print(f"Target Coverage: {TARGET_COVERAGE:.0%}\n")

print(f"{'Strategy':<20} {'Coverage':>10} {'Sel.Acc':>10} {'Improvement':>12} {'AURC':>10} {'E-AURC':>10}")
print("-"*80)

for name, m in strategy_results.items():
    print(f"{name:<20} {m.coverage:>10.1%} {m.selective_accuracy:>10.1%} {m.improvement*100:>+11.2f}pp {m.aurc:>10.4f} {m.e_aurc:>10.4f}")

print("-"*80)

# H3b Validation: Does combined outperform single-signal?
conf_only = strategy_results['confidence_only']
stab_only = strategy_results['stability_only']
combined_score = strategy_results['combined_score']

print("\n" + "="*60)
print("H3b VALIDATION: Combined vs Single-Signal")
print("="*60)

# Compare at similar coverage levels
print(f"\nAt ~{TARGET_COVERAGE:.0%} coverage (combined_score strategy):")
print(f"  Combined Score Improvement:   {combined_score.improvement*100:+.2f}pp")
print(f"  Combined Score AURC:          {combined_score.aurc:.4f}")

# Check if combined is better
if combined_score.aurc <= min(conf_only.aurc, stab_only.aurc):
    print(f"\n‚úÖ HYPOTHESIS H3b: SUPPORTED")
    print(f"   Combined scoring achieves lower AURC than single-signal approaches")
else:
    print(f"\n‚ö†Ô∏è HYPOTHESIS H3b: MIXED RESULTS")
    print(f"   Further investigation needed")

print("="*60)

## Cell 6: Risk-Coverage Curves

Generate publication-ready risk-coverage curves for all strategies.

In [None]:
"""
Phase 8.5: Risk-Coverage Curves

Generate risk-coverage curves showing the trade-off between
coverage (fraction of samples accepted) and risk (error rate).

Lower curves indicate better selective prediction performance.
"""

# Compute risk-coverage curves for each strategy
curves = {}

# Confidence-only
curves['Confidence'] = compute_risk_coverage_curve(
    predictions, labels, confidences
)

# Stability-only
curves['Stability'] = compute_risk_coverage_curve(
    predictions, labels, stability
)

# Combined score
curves['Combined'] = compute_risk_coverage_curve(
    predictions, labels, combined_scores
)

# Print AURC comparison
print("="*60)
print("RISK-COVERAGE CURVE ANALYSIS")
print("="*60)
print(f"\n{'Strategy':<15} {'AURC':>10} {'E-AURC':>10} {'Better than Conf?':>20}")
print("-"*60)

conf_aurc = curves['Confidence'].aurc
for name, curve in curves.items():
    better = "‚úÖ Yes" if curve.aurc < conf_aurc else ("- Same" if curve.aurc == conf_aurc else "‚ùå No")
    if name == 'Confidence':
        better = "- Baseline"
    print(f"{name:<15} {curve.aurc:>10.4f} {curve.e_aurc:>10.4f} {better:>20}")

print("="*60)

# Plot risk-coverage curves
fig = plot_risk_coverage_curve(
    curves,
    title="Risk-Coverage Curves: Strategy Comparison",
    save_path=RESULTS_DIR / 'risk_coverage_curves.png',
    show_optimal=True,
    figsize=(12, 8)
)
plt.show()
print(f"\n‚úÖ Saved: {RESULTS_DIR / 'risk_coverage_curves.png'}")

## Cell 7: Accuracy-Coverage Curves

Generate accuracy-coverage curves showing selective accuracy vs coverage.

In [None]:
"""
Phase 8.5: Accuracy-Coverage Curves

Generate accuracy-coverage curves showing how selective accuracy
improves as we reduce coverage (accept fewer samples).
"""

# Plot accuracy-coverage curves
fig = plot_accuracy_coverage_curve(
    curves,
    title="Accuracy-Coverage Curves: Strategy Comparison",
    save_path=RESULTS_DIR / 'accuracy_coverage_curves.png',
    figsize=(12, 8)
)
plt.show()
print(f"\n‚úÖ Saved: {RESULTS_DIR / 'accuracy_coverage_curves.png'}")

## Cell 8: Strategy Comparison Visualization

Generate bar plots comparing metrics across strategies.

In [None]:
"""
Phase 8.5: Strategy Comparison Visualization

Generate bar plots comparing coverage, selective accuracy,
and improvement across gating strategies.
"""

# Plot strategy comparison
fig = plot_strategy_comparison(
    strategy_results,
    title="Selective Prediction Strategy Comparison",
    save_path=RESULTS_DIR / 'strategy_comparison.png',
    figsize=(14, 6)
)
plt.show()
print(f"\n‚úÖ Saved: {RESULTS_DIR / 'strategy_comparison.png'}")

## Cell 9: Coverage Sweep Analysis

Analyze metrics across different coverage levels.

In [None]:
"""
Phase 8.5: Coverage Sweep Analysis

Analyze how selective accuracy and improvement vary
across different coverage levels (50% to 100%).
"""

# Coverage levels to analyze
coverage_levels = [0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 1.00]

# Compute metrics at each coverage level
sweep_results = []

print("="*70)
print("COVERAGE SWEEP ANALYSIS")
print("="*70)
print(f"\n{'Coverage':>10} {'Sel.Acc':>10} {'Improvement':>12} {'Rej.Precision':>15} {'AURC':>10}")
print("-"*70)

for target_cov in coverage_levels:
    m = compute_metrics_at_coverage(
        predictions, labels, combined_scores,
        target_coverage=target_cov,
        confidences=confidences
    )
    sweep_results.append({
        'coverage': m.coverage,
        'selective_accuracy': m.selective_accuracy,
        'improvement': m.improvement,
        'rejection_precision': m.rejection_precision,
        'aurc': m.aurc
    })
    
    prec_str = f"{m.rejection_precision:.1%}" if not np.isnan(m.rejection_precision) else "N/A"
    print(f"{m.coverage:>10.1%} {m.selective_accuracy:>10.2%} {m.improvement*100:>+11.2f}pp {prec_str:>15} {m.aurc:>10.4f}")

print("-"*70)

# Find coverage where improvement ‚â• 4pp
for r in sweep_results:
    if r['improvement'] >= 0.04:
        print(f"\n‚úÖ H3a satisfied at {r['coverage']:.0%} coverage: {r['improvement']*100:.2f}pp improvement")
        break

# Plot coverage sweep
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

coverages = [r['coverage'] for r in sweep_results]
sel_accs = [r['selective_accuracy'] for r in sweep_results]
improvements = [r['improvement'] * 100 for r in sweep_results]
aurcs = [r['aurc'] for r in sweep_results]

# Plot 1: Selective Accuracy vs Coverage
axes[0].plot(coverages, sel_accs, 'bo-', linewidth=2, markersize=8)
axes[0].axhline(y=overall_accuracy, color='r', linestyle='--', label=f'Overall Acc ({overall_accuracy:.1%})')
axes[0].set_xlabel('Coverage')
axes[0].set_ylabel('Selective Accuracy')
axes[0].set_title('Selective Accuracy vs Coverage')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Plot 2: Improvement vs Coverage
axes[1].plot(coverages, improvements, 'go-', linewidth=2, markersize=8)
axes[1].axhline(y=4, color='r', linestyle='--', label='H3a Target (4pp)')
axes[1].axhline(y=0, color='gray', linestyle='-', alpha=0.5)
axes[1].set_xlabel('Coverage')
axes[1].set_ylabel('Improvement (pp)')
axes[1].set_title('Accuracy Improvement vs Coverage')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

# Plot 3: AURC vs Coverage
axes[2].plot(coverages, aurcs, 'mo-', linewidth=2, markersize=8)
axes[2].set_xlabel('Coverage')
axes[2].set_ylabel('AURC')
axes[2].set_title('AURC vs Coverage')
axes[2].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(RESULTS_DIR / 'coverage_sweep.png', dpi=300, bbox_inches='tight')
plt.show()
print(f"\n‚úÖ Saved: {RESULTS_DIR / 'coverage_sweep.png'}")

## Cell 10: Rejection Analysis

Analyze the quality of rejection decisions.

In [None]:
"""
Phase 8.5: Rejection Analysis

Analyze the quality of rejection decisions:
- What fraction of rejected samples are actually errors?
- What fraction of errors are successfully rejected?
"""

# Rejection analysis
precision, recall = compute_rejection_precision_recall(predictions, labels, is_accepted)
rejection_quality = compute_rejection_quality(metrics.risk_on_rejected, metrics.selective_risk)

print("="*60)
print("REJECTION ANALYSIS")
print("="*60)

print(f"\nüìä Sample Breakdown:")
print(f"   Total Samples:       {metrics.n_total:,}")
print(f"   Accepted:            {metrics.n_accepted:,} ({metrics.coverage:.1%})")
print(f"   Rejected:            {metrics.n_rejected:,} ({1-metrics.coverage:.1%})")

print(f"\nüéØ Rejection Quality:")
print(f"   Correct Rejected (Type I Error):     {metrics.n_correct_rejected:,}")
print(f"   Incorrect Rejected (Good Rejection): {metrics.n_incorrect_rejected:,}")
print(f"   Correct Accepted:                    {metrics.n_correct_accepted:,}")
print(f"   Incorrect Accepted (Missed):         {metrics.n_incorrect_accepted:,}")

print(f"\nüìà Rejection Metrics:")
print(f"   Rejection Precision: {precision:.1%}")
print(f"      (Fraction of rejected that are errors)")
print(f"   Rejection Recall:    {recall:.1%}")
print(f"      (Fraction of errors that are rejected)")

if not np.isnan(rejection_quality):
    print(f"   Rejection Quality:   {rejection_quality:.2f}x")
    print(f"      (Rejected samples are {rejection_quality:.1f}x more likely to be errors)")

# Visualize rejection quality
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Confusion matrix style plot
confusion_data = np.array([
    [metrics.n_correct_accepted, metrics.n_incorrect_accepted],
    [metrics.n_correct_rejected, metrics.n_incorrect_rejected]
])

sns.heatmap(
    confusion_data, annot=True, fmt='d', cmap='Blues',
    xticklabels=['Correct', 'Incorrect'],
    yticklabels=['Accepted', 'Rejected'],
    ax=axes[0]
)
axes[0].set_xlabel('Prediction Correctness')
axes[0].set_ylabel('Acceptance Decision')
axes[0].set_title('Rejection Decision Matrix')

# Risk comparison
risks = [metrics.selective_risk, metrics.risk_on_rejected]
labels_risk = ['Accepted (Selective Risk)', 'Rejected (Risk on Rejected)']
colors = ['green', 'red']

bars = axes[1].bar(labels_risk, risks, color=colors, alpha=0.7)
axes[1].set_ylabel('Error Rate')
axes[1].set_title('Error Rate Comparison: Accepted vs Rejected')
axes[1].set_ylim(0, max(risks) * 1.2)

for bar, risk in zip(bars, risks):
    if not np.isnan(risk):
        axes[1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02,
                    f'{risk:.1%}', ha='center', fontsize=12)

plt.tight_layout()
plt.savefig(RESULTS_DIR / 'rejection_analysis.png', dpi=300, bbox_inches='tight')
plt.show()
print(f"\n‚úÖ Saved: {RESULTS_DIR / 'rejection_analysis.png'}")
print("="*60)

## Cell 11: Calibration Post-Selection

Analyze calibration of accepted predictions using ECE.

In [None]:
"""
Phase 8.5: Calibration Post-Selection

Analyze Expected Calibration Error (ECE) on accepted samples
to verify confidence reliability after selective prediction.
"""

# Compute ECE for different subsets
ece_overall = compute_ece_post_selection(
    predictions, labels, confidences, np.ones(len(predictions), dtype=bool)
)
ece_accepted = compute_ece_post_selection(
    predictions, labels, confidences, is_accepted
)

print("="*60)
print("CALIBRATION ANALYSIS (POST-SELECTION)")
print("="*60)

print(f"\nüìê Expected Calibration Error (ECE):")
print(f"   Overall (all samples):  {ece_overall:.4f}")
print(f"   Post-Selection:         {ece_accepted:.4f}")

if ece_accepted < ece_overall:
    improvement = (ece_overall - ece_accepted) / ece_overall * 100
    print(f"\n‚úÖ Calibration improved by {improvement:.1f}% after selection")
else:
    print(f"\n‚ö†Ô∏è Calibration did not improve (may need recalibration)")

# Reliability diagram
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for idx, (subset_name, mask) in enumerate([('All Samples', np.ones(len(predictions), dtype=bool)), 
                                            ('Accepted Only', is_accepted)]):
    ax = axes[idx]
    
    # Get subset data
    sub_preds = predictions[mask]
    sub_labels = labels[mask]
    sub_confs = confidences[mask]
    
    # Bin confidences
    n_bins = 10
    bin_boundaries = np.linspace(0, 1, n_bins + 1)
    bin_accs = []
    bin_confs = []
    bin_counts = []
    
    for i in range(n_bins):
        in_bin = (sub_confs > bin_boundaries[i]) & (sub_confs <= bin_boundaries[i+1])
        if np.sum(in_bin) > 0:
            bin_accs.append(np.mean(sub_preds[in_bin] == sub_labels[in_bin]))
            bin_confs.append(np.mean(sub_confs[in_bin]))
            bin_counts.append(np.sum(in_bin))
        else:
            bin_accs.append(0)
            bin_confs.append((bin_boundaries[i] + bin_boundaries[i+1]) / 2)
            bin_counts.append(0)
    
    # Plot reliability diagram
    ax.bar(range(n_bins), bin_accs, width=0.8, alpha=0.5, label='Accuracy')
    ax.plot([0, n_bins-1], [0, 1], 'r--', label='Perfect Calibration')
    ax.set_xlabel('Confidence Bin')
    ax.set_ylabel('Accuracy')
    ax.set_title(f'Reliability Diagram: {subset_name}')
    ax.legend()
    ax.set_ylim(0, 1.1)

plt.tight_layout()
plt.savefig(RESULTS_DIR / 'calibration_analysis.png', dpi=300, bbox_inches='tight')
plt.show()
print(f"\n‚úÖ Saved: {RESULTS_DIR / 'calibration_analysis.png'}")
print("="*60)

## Cell 12: Export Results

Export all metrics and results for dissertation reporting.

In [None]:
"""
Phase 8.5: Export Results

Export all computed metrics and validation results for
dissertation Chapter 7 and publication.
"""

# Export main metrics to JSON
metrics_path = RESULTS_DIR / 'selective_metrics.json'
metrics.to_json(metrics_path)
print(f"‚úÖ Saved: {metrics_path}")

# Export H3a validation results
h3a_path = RESULTS_DIR / 'h3a_validation.json'
with open(h3a_path, 'w') as f:
    json.dump(h3a_result, f, indent=2, default=str)
print(f"‚úÖ Saved: {h3a_path}")

# Export strategy comparison
strategy_data = {
    name: m.to_dict() for name, m in strategy_results.items()
}
strategy_path = RESULTS_DIR / 'strategy_comparison.json'
with open(strategy_path, 'w') as f:
    json.dump(strategy_data, f, indent=2)
print(f"‚úÖ Saved: {strategy_path}")

# Export coverage sweep
sweep_df = pd.DataFrame(sweep_results)
sweep_path = RESULTS_DIR / 'coverage_sweep.csv'
sweep_df.to_csv(sweep_path, index=False)
print(f"‚úÖ Saved: {sweep_path}")

# Export risk-coverage curve data
rc_data = {name: curve.to_dict() for name, curve in curves.items()}
rc_path = RESULTS_DIR / 'risk_coverage_curves.json'
with open(rc_path, 'w') as f:
    json.dump(rc_data, f, indent=2)
print(f"‚úÖ Saved: {rc_path}")

# Summary table for dissertation
print("\n" + "="*60)
print("DISSERTATION SUMMARY TABLE")
print("="*60)
print(f"""
| Metric | Value | Target | Status |
|--------|-------|--------|--------|
| Coverage | {metrics.coverage:.1%} | ‚â•90% | {'‚úÖ' if metrics.coverage >= 0.90 else '‚ùå'} |
| Selective Accuracy | {metrics.selective_accuracy:.2%} | - | - |
| Overall Accuracy | {metrics.overall_accuracy:.2%} | - | - |
| Improvement | {metrics.improvement*100:+.2f}pp | ‚â•4pp | {'‚úÖ' if metrics.improvement >= 0.04 else '‚ùå'} |
| AURC | {metrics.aurc:.4f} | Lower | - |
| E-AURC | {metrics.e_aurc:.4f} | ‚âà0 | {'‚úÖ' if metrics.e_aurc < 0.01 else '‚ùå'} |
| Rejection Precision | {metrics.rejection_precision:.1%} | High | {'‚úÖ' if metrics.rejection_precision >= 0.8 else '‚ö†Ô∏è'} |
| ECE Post-Selection | {metrics.ece_post_selection:.4f} | Low | {'‚úÖ' if metrics.ece_post_selection < 0.1 else '‚ö†Ô∏è'} |
""")
print("="*60)

## Cell 13: Final Summary

Complete summary of Phase 8.5 evaluation.

In [None]:
"""
Phase 8.5: Final Summary

Complete summary of all Phase 8.5 selective metrics evaluation.
"""

print("="*70)
print("PHASE 8.5: SELECTIVE PREDICTION EVALUATION METRICS - COMPLETE")
print("="*70)

print(f"""
üìä CORE METRICS
   Coverage:            {metrics.coverage:.1%}
   Selective Accuracy:  {metrics.selective_accuracy:.2%}
   Overall Accuracy:    {metrics.overall_accuracy:.2%}
   Improvement:         {metrics.improvement*100:+.2f}pp

üìà HYPOTHESIS VALIDATION
   H3a (‚â•4pp at 90%):   {'‚úÖ PASSED' if h3a_result['passed'] else '‚ùå NOT PASSED'}
   H3b (Combined > Single): {'‚úÖ SUPPORTED' if combined_score.aurc <= min(conf_only.aurc, stab_only.aurc) else '‚ö†Ô∏è MIXED'}

üéØ REJECTION QUALITY
   Precision:           {metrics.rejection_precision:.1%}
   Recall:              {metrics.rejection_recall:.1%}
   AURC:                {metrics.aurc:.4f}
   E-AURC:              {metrics.e_aurc:.4f}

üìê CALIBRATION
   ECE Post-Selection:  {metrics.ece_post_selection:.4f}

üìÅ OUTPUTS SAVED
   {RESULTS_DIR / 'selective_metrics.json'}
   {RESULTS_DIR / 'h3a_validation.json'}
   {RESULTS_DIR / 'strategy_comparison.json'}
   {RESULTS_DIR / 'coverage_sweep.csv'}
   {RESULTS_DIR / 'risk_coverage_curves.json'}
   {RESULTS_DIR / 'risk_coverage_curves.png'}
   {RESULTS_DIR / 'accuracy_coverage_curves.png'}
   {RESULTS_DIR / 'strategy_comparison.png'}
   {RESULTS_DIR / 'coverage_sweep.png'}
   {RESULTS_DIR / 'rejection_analysis.png'}
   {RESULTS_DIR / 'calibration_analysis.png'}
""")

print("="*70)
print("‚úÖ PHASE 8.5 COMPLETE - Ready for Dissertation Chapter 7")
print("="*70)