# Comprehensive Results Analysis

This notebook provides an in-depth analysis of the model results, including:
- Detailed performance metrics (precision, recall, F1-score)
- ROC curves and AUC analysis
- Occlusion sensitivity analysis
- Statistical significance testing
- Impact of synthetic data per class

In [None]:
# Import necessary libraries
import sys
import os


def _find_src_parent(start_dir):
    """Return the nearest directory at or above ``start_dir`` that contains ``src/``.

    Returns None when no ancestor (up to the filesystem root) has a ``src``
    sub-directory.
    """
    current = start_dir
    while True:
        if os.path.isdir(os.path.join(current, "src")):
            return current
        parent = os.path.dirname(current)
        if parent == current:  # reached the filesystem root
            return None
        current = parent


_notebook_dir = os.path.abspath(os.getcwd())

# Legacy entry, kept first so existing setups resolve exactly as before:
# the directory two levels above the working directory.
_extra_paths = [os.path.dirname(os.path.dirname(_notebook_dir))]

# The `from src.kew_synthetic...` imports below need the directory that CONTAINS
# `src/` on sys.path. The rest of this notebook reads "../configs", "../models"
# and "../data", which suggests the project root is ONE level up, not two, so
# also add the nearest ancestor that actually holds a `src/` directory.
_src_parent = _find_src_parent(_notebook_dir)
if _src_parent is not None:
    _extra_paths.append(_src_parent)

for _extra_path in _extra_paths:
    if _extra_path not in sys.path:
        sys.path.append(_extra_path)

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from pathlib import Path
import tensorflow as tf
from sklearn.metrics import (
    confusion_matrix, classification_report, 
    precision_recall_fscore_support, roc_curve, auc
)
from sklearn.preprocessing import label_binarize
import warnings
warnings.filterwarnings('ignore')

# Import custom modules
from src.kew_synthetic.data.loader import KewMNISTLoader
from src.kew_synthetic.evaluation.metrics import ModelEvaluator
from src.kew_synthetic.evaluation.occlusion import OcclusionAnalyzer
from src.kew_synthetic.evaluation.visualization import ResultVisualizer
from src.kew_synthetic.utils.config import load_config

# Set style
# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8-*";
# older releases only ship "seaborn-darkgrid". Use whichever is installed and
# fall back to seaborn's own dark-grid theme if neither name is available.
for _style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
else:
    sns.set_style('darkgrid')
print("Analysis environment ready!")

## 1. Load Models and Data

First, let's load the trained models and test data for comprehensive analysis.

In [None]:
# Read both YAML configuration files from the shared config directory.
config_path = Path("../configs/")
model_config, training_config = (
    load_config(config_path / config_file)
    for config_file in ("model_config.yaml", "training_config.yaml")
)

# Restore the two trained Keras models that this notebook compares.
model_dir = Path("../models")
model_original, model_synthetic = (
    tf.keras.models.load_model(model_dir / model_file)
    for model_file in ("kew_mnist_original.h5", "kew_mnist_synthetic.h5")
)

print("✓ Models loaded successfully!")

# Previously saved evaluation metrics and training histories (pickle files in model_dir).
import pickle

# NOTE(review): unpickling executes code embedded in the file - only load
# artefacts produced by this project.
saved_metrics = pickle.loads((model_dir / "evaluation_metrics.pkl").read_bytes())
training_histories = pickle.loads((model_dir / "training_histories.pkl").read_bytes())

print("✓ Saved metrics loaded successfully!")

In [None]:
# Build the dataset loader rooted at the project's data directory.
data_dir = Path("../data")
loader = KewMNISTLoader(data_dir=data_dir)

# Only the test split and the class names are needed here; the training
# split returned alongside them is discarded.
original_splits = loader.load_original_data()
(_, _), (X_test, y_test), class_names = original_splits

print(f"✓ Test data loaded: {X_test.shape[0]} images")
print(f"✓ Classes: {', '.join(class_names)}")

# Score the test set with both models, then reduce each row of scores to the
# index of its highest-scoring class.
print("\nGenerating predictions...")
y_pred_original = model_original.predict(X_test, verbose=0)
y_pred_synthetic = model_synthetic.predict(X_test, verbose=0)

y_pred_original_classes = np.argmax(y_pred_original, axis=1)
y_pred_synthetic_classes = np.argmax(y_pred_synthetic, axis=1)

print("✓ Predictions generated!")

## 2. Precision, Recall, and F1-Score Analysis

Let's perform a detailed analysis of precision, recall, and F1-scores for both models.

In [None]:
# Calculate detailed metrics
from sklearn.metrics import classification_report, precision_recall_fscore_support

# Print a banner plus the text report for each model and keep the dict form
# of the same report for the later per-class tables.
_report_rule = "=" * 60


def _report_model(title, predicted_classes, leading=""):
    """Print a titled classification report and return it as a dict."""
    print(leading + _report_rule)
    print(title)
    print(_report_rule)
    report_dict = classification_report(
        y_test, predicted_classes, target_names=class_names, output_dict=True
    )
    print(classification_report(y_test, predicted_classes, target_names=class_names))
    return report_dict


report_original = _report_model(
    "CLASSIFICATION REPORT - ORIGINAL MODEL", y_pred_original_classes
)
report_synthetic = _report_model(
    "CLASSIFICATION REPORT - SYNTHETIC MODEL", y_pred_synthetic_classes, leading="\n"
)

# Per-class precision / recall / F1 arrays for the plots (support is not needed).
_per_class_orig = precision_recall_fscore_support(
    y_test, y_pred_original_classes, average=None
)
_per_class_synth = precision_recall_fscore_support(
    y_test, y_pred_synthetic_classes, average=None
)
precision_orig, recall_orig, f1_orig = _per_class_orig[:3]
precision_synth, recall_synth, f1_synth = _per_class_synth[:3]

In [None]:
# One panel per metric, with original and synthetic bars side by side for each class.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

metrics = ['Precision', 'Recall', 'F1-Score']
orig_values = [precision_orig, recall_orig, f1_orig]
synth_values = [precision_synth, recall_synth, f1_synth]

x = np.arange(len(class_names))
width = 0.35


def _annotate_bar_heights(target_ax, bar_container):
    """Write each bar's height (three decimals) just above the bar."""
    for bar in bar_container:
        height = bar.get_height()
        target_ax.annotate(f'{height:.3f}',
                           xy=(bar.get_x() + bar.get_width() / 2, height),
                           xytext=(0, 3),
                           textcoords="offset points",
                           ha='center', va='bottom', fontsize=8)


for ax, metric, orig_val, synth_val in zip(axes, metrics, orig_values, synth_values):
    # Paired bars, shifted half a bar width either side of the class tick.
    bars1 = ax.bar(x - width/2, orig_val, width, label='Original', color='skyblue', alpha=0.8)
    bars2 = ax.bar(x + width/2, synth_val, width, label='Synthetic', color='lightgreen', alpha=0.8)

    # Axis cosmetics.
    ax.set_title(f'{metric} Comparison', fontsize=14, fontweight='bold')
    ax.set_xlabel('Class', fontsize=12)
    ax.set_ylabel(metric, fontsize=12)
    ax.set_xticks(x)
    ax.set_xticklabels(class_names, rotation=45, ha='right')
    ax.set_ylim(0, 1.05)
    ax.grid(True, axis='y', alpha=0.3)
    ax.legend(loc='lower right')

    # Numeric labels: original bars first, then synthetic.
    _annotate_bar_heights(ax, bars1)
    _annotate_bar_heights(ax, bars2)

plt.suptitle('Precision, Recall, and F1-Score Comparison', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# Per-class deltas (synthetic minus original), computed once as arrays and
# then printed row by row.
_precision_delta = precision_synth - precision_orig
_recall_delta = recall_synth - recall_orig
_f1_delta = f1_synth - f1_orig

_heavy_rule = "=" * 50
print("\n" + _heavy_rule)
print("METRIC IMPROVEMENTS (Synthetic - Original)")
print(_heavy_rule)
print(f"{'Class':<15} {'Precision':<12} {'Recall':<12} {'F1-Score':<12}")
print("-" * 50)

for i, class_name in enumerate(class_names):
    prec_diff = _precision_delta[i]
    rec_diff = _recall_delta[i]
    f1_diff = _f1_delta[i]
    print(f"{class_name:<15} {prec_diff:+.4f}      {rec_diff:+.4f}      {f1_diff:+.4f}")

## 3. ROC Curve Analysis

Let's analyze the ROC curves and AUC scores for both models to understand their discriminative ability.

In [None]:
# ROC Curve Analysis
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
from sklearn.multiclass import OneVsRestClassifier

# One-vs-rest ROC: turn the integer labels into one indicator column per class.
y_test_bin = label_binarize(y_test, classes=np.arange(len(class_names)))
n_classes = y_test_bin.shape[1]

# Left panel: original model. Right panel: synthetic model.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 7))

# One colour per class, shared by both panels.
colors = plt.cm.Set3(np.linspace(0, 1, n_classes))

# Per-class curve points and AUC values, keyed by class index.
fpr_orig, tpr_orig, roc_auc_orig = {}, {}, {}
fpr_synth, tpr_synth, roc_auc_synth = {}, {}, {}

_roc_panels = (
    (ax1, 'ROC Curves - Original Model', y_pred_original,
     fpr_orig, tpr_orig, roc_auc_orig),
    (ax2, 'ROC Curves - Synthetic Model', y_pred_synthetic,
     fpr_synth, tpr_synth, roc_auc_synth),
)

for _panel_ax, _panel_title, _scores, _fpr, _tpr, _roc_auc in _roc_panels:
    # One curve per class, using that class's score column.
    for i in range(n_classes):
        _fpr[i], _tpr[i], _ = roc_curve(y_test_bin[:, i], _scores[:, i])
        _roc_auc[i] = auc(_fpr[i], _tpr[i])
        _panel_ax.plot(_fpr[i], _tpr[i], color=colors[i], lw=2,
                       label=f'{class_names[i]} (AUC = {_roc_auc[i]:.3f})')

    # Dashed diagonal as a visual baseline.
    _panel_ax.plot([0, 1], [0, 1], 'k--', lw=2)
    _panel_ax.set_xlim([0.0, 1.0])
    _panel_ax.set_ylim([0.0, 1.05])
    _panel_ax.set_xlabel('False Positive Rate', fontsize=12)
    _panel_ax.set_ylabel('True Positive Rate', fontsize=12)
    _panel_ax.set_title(_panel_title, fontsize=14, fontweight='bold')
    _panel_ax.legend(loc="lower right", fontsize=10)
    _panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Micro-average: flatten the indicator matrix and the score matrix so every
# (sample, class) pair becomes one binary decision, then compute a single curve.
_y_true_flat = y_test_bin.ravel()
for _fpr, _tpr, _roc_auc, _scores in (
    (fpr_orig, tpr_orig, roc_auc_orig, y_pred_original),
    (fpr_synth, tpr_synth, roc_auc_synth, y_pred_synthetic),
):
    _fpr["micro"], _tpr["micro"], _ = roc_curve(_y_true_flat, _scores.ravel())
    _roc_auc["micro"] = auc(_fpr["micro"], _tpr["micro"])

# Side-by-side AUC table: one row per class, then the micro-average row.
_thin_rule = "-" * 50
print("\n" + "=" * 50)
print("AUC COMPARISON")
print("=" * 50)
print(f"{'Class':<15} {'Original AUC':<15} {'Synthetic AUC':<15} {'Improvement':<15}")
print(_thin_rule)

for i in range(n_classes):
    improvement = roc_auc_synth[i] - roc_auc_orig[i]
    print(f"{class_names[i]:<15} {roc_auc_orig[i]:<15.4f} {roc_auc_synth[i]:<15.4f} {improvement:+.4f}")

print(_thin_rule)
improvement_micro = roc_auc_synth["micro"] - roc_auc_orig["micro"]
print(f"{'Micro-average':<15} {roc_auc_orig['micro']:<15.4f} {roc_auc_synth['micro']:<15.4f} {improvement_micro:+.4f}")

## 4. Occlusion Sensitivity Analysis

Let's use occlusion sensitivity to see which regions of an image each model relies on when making its prediction.

In [None]:
# Occlusion Sensitivity Analysis
analyzer = OcclusionAnalyzer(occlusion_size=50, stride=25)

# Find interesting examples - where synthetic model corrected original's mistakes
orig_wrong = y_pred_original_classes != y_test
synth_correct = y_pred_synthetic_classes == y_test
interesting_mask = orig_wrong & synth_correct
interesting_indices = np.where(interesting_mask)[0]

print(f"Found {len(interesting_indices)} cases where synthetic model corrected original's mistakes")

# Pick the first corrected example of each true class, capped at
# MAX_OCCLUSION_EXAMPLES rows so the figure stays readable.
MAX_OCCLUSION_EXAMPLES = 3
analyzed_classes = set()
examples_to_analyze = []

for idx in interesting_indices:
    if len(examples_to_analyze) >= MAX_OCCLUSION_EXAMPLES:
        break
    true_class = y_test[idx]
    if true_class not in analyzed_classes:
        analyzed_classes.add(true_class)
        examples_to_analyze.append(idx)

print(f"\nAnalyzing {len(examples_to_analyze)} examples...")

if not examples_to_analyze:
    # plt.subplots(0, 4) raises, so skip the figure when the synthetic model
    # corrected none of the original model's mistakes.
    print("No corrected examples to visualise - skipping occlusion maps.")
else:
    # squeeze=False always returns a 2-D axes array, so a single example row
    # needs no special-case reshape.
    fig, axes = plt.subplots(len(examples_to_analyze), 4,
                             figsize=(16, 4*len(examples_to_analyze)),
                             squeeze=False)

    for i, idx in enumerate(examples_to_analyze):
        image = X_test[idx]
        true_label = y_test[idx]

        # Calculate occlusion maps for the TRUE class under both models
        print(f"\nAnalyzing example {i+1}/{len(examples_to_analyze)}...")
        occlusion_map_orig = analyzer.generate_occlusion_map(model_original, image, true_label)
        occlusion_map_synth = analyzer.generate_occlusion_map(model_synthetic, image, true_label)

        # Original image. imshow rejects (H, W, 1) arrays, so drop any
        # length-1 axes for display only; the models still receive `image` unchanged.
        axes[i, 0].imshow(np.squeeze(image), cmap='gray')
        axes[i, 0].set_title(f'Original Image\nTrue: {class_names[true_label]}', fontsize=12)
        axes[i, 0].axis('off')

        # Original model occlusion map (this model got the example wrong)
        im1 = axes[i, 1].imshow(occlusion_map_orig, cmap='hot', interpolation='bilinear')
        axes[i, 1].set_title(f'Original Model Focus\nPred: {class_names[y_pred_original_classes[idx]]} (✗)',
                             fontsize=12)
        axes[i, 1].axis('off')
        plt.colorbar(im1, ax=axes[i, 1], fraction=0.046)

        # Synthetic model occlusion map (this model got the example right)
        im2 = axes[i, 2].imshow(occlusion_map_synth, cmap='hot', interpolation='bilinear')
        axes[i, 2].set_title(f'Synthetic Model Focus\nPred: {class_names[y_pred_synthetic_classes[idx]]} (✓)',
                             fontsize=12)
        axes[i, 2].axis('off')
        plt.colorbar(im2, ax=axes[i, 2], fraction=0.046)

        # Difference map: positive where the synthetic model's map is larger
        diff_map = occlusion_map_synth - occlusion_map_orig
        im3 = axes[i, 3].imshow(diff_map, cmap='RdBu_r', interpolation='bilinear')
        axes[i, 3].set_title('Focus Difference\n(Synth - Orig)', fontsize=12)
        axes[i, 3].axis('off')
        plt.colorbar(im3, ax=axes[i, 3], fraction=0.046)

    plt.suptitle('Occlusion Sensitivity Analysis: Model Focus Comparison',
                 fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()

## 5. Statistical Significance Testing

Let's perform statistical tests to determine if the improvements are statistically significant.

In [None]:
# Statistical Significance Testing
from scipy import stats

# McNemar's test for paired samples
def mcnemar_test(y_true, y_pred1, y_pred2):
    """
    Compare two classifiers evaluated on the same samples with McNemar's test.

    Only the discordant pairs matter: samples that exactly one of the two
    classifiers got right. The statistic uses the continuity correction
    ``(|n12 - n21| - 1)**2 / (n12 + n21)`` and is referred to a chi-square
    distribution with one degree of freedom.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred1, y_pred2 : array-like
        Predicted labels of the first and second classifier, same shape as
        ``y_true``. Plain Python sequences are accepted.

    Returns
    -------
    statistic : float
        Continuity-corrected test statistic (0 when there are no discordant pairs).
    p_value : float
        Upper-tail chi-square probability (1.0 when there are no discordant pairs).
    n12 : int
        Samples classifier 1 got right and classifier 2 got wrong.
    n21 : int
        Samples classifier 1 got wrong and classifier 2 got right.

    Raises
    ------
    ValueError
        If the three inputs do not share one shape.
    """
    # Coerce to arrays first: `==` on plain lists compares the whole lists and
    # yields a single bool, which would silently report zero discordant pairs.
    y_true = np.asarray(y_true)
    y_pred1 = np.asarray(y_pred1)
    y_pred2 = np.asarray(y_pred2)
    if not (y_true.shape == y_pred1.shape == y_pred2.shape):
        raise ValueError(
            "y_true, y_pred1 and y_pred2 must have the same shape, got "
            f"{y_true.shape}, {y_pred1.shape} and {y_pred2.shape}"
        )

    correct1 = y_pred1 == y_true
    correct2 = y_pred2 == y_true

    # Discordant cells of the 2x2 contingency table
    n12 = int(np.sum(correct1 & ~correct2))
    n21 = int(np.sum(~correct1 & correct2))
    n_discordant = n12 + n21

    if n_discordant > 0:
        statistic = (abs(n12 - n21) - 1)**2 / n_discordant
        # sf() is the upper tail computed directly; 1 - cdf() rounds to 0 for tiny p-values.
        p_value = float(stats.chi2.sf(statistic, df=1))
    else:
        # The classifiers never disagree on correctness: no evidence of a difference.
        statistic = 0
        p_value = 1.0

    return statistic, p_value, n12, n21

# Perform McNemar's test
# `p_value` stays reserved for this overall McNemar result; the per-class loop
# below uses its own names so later cells still see the McNemar p-value.
statistic, p_value, n12, n21 = mcnemar_test(y_test, y_pred_original_classes, y_pred_synthetic_classes)

print("="*50)
print("McNEMAR'S TEST RESULTS")
print("="*50)
print(f"Test statistic: {statistic:.4f}")
print(f"P-value: {p_value:.6f}")
print(f"Original correct, Synthetic wrong: {n12}")
print(f"Original wrong, Synthetic correct: {n21}")
print(f"Net improvement: {n21 - n12} predictions")

if p_value < 0.05:
    print("\n✓ The improvement is statistically significant (p < 0.05)")
else:
    print("\n✗ The improvement is not statistically significant (p >= 0.05)")

# Per-class statistical analysis using binomial test
# NOTE(review): this treats the original model's per-class accuracy as a fixed,
# known null proportion and ignores the pairing between the models' predictions;
# read the per-class p-values as indicative only.
print("\n" + "="*50)
print("PER-CLASS SIGNIFICANCE TESTING")
print("="*50)
print(f"{'Class':<15} {'Orig Acc':<10} {'Synth Acc':<10} {'P-value':<10} {'Significant':<12}")
print("-"*50)

for i in range(len(class_names)):
    # Restrict to the test samples whose true label is class i
    class_mask = y_test == i
    class_total = int(np.sum(class_mask))
    if class_total == 0:
        continue

    class_orig_correct = int(np.sum(y_pred_original_classes[class_mask] == i))
    class_synth_correct = int(np.sum(y_pred_synthetic_classes[class_mask] == i))
    class_orig_acc = class_orig_correct / class_total
    class_synth_acc = class_synth_correct / class_total

    # Binomial test (skipped when both models have the same correct count)
    if class_orig_correct != class_synth_correct:
        if hasattr(stats, "binomtest"):
            # SciPy >= 1.7; the older binom_test has been removed from recent releases.
            class_p_value = stats.binomtest(
                class_synth_correct, class_total, class_orig_acc, alternative='two-sided'
            ).pvalue
        else:
            class_p_value = stats.binom_test(
                class_synth_correct, class_total, class_orig_acc, alternative='two-sided'
            )
    else:
        class_p_value = 1.0

    class_significant = "Yes" if class_p_value < 0.05 else "No"

    print(f"{class_names[i]:<15} {class_orig_acc:<10.4f} {class_synth_acc:<10.4f} {class_p_value:<10.4f} {class_significant:<12}")

# Bootstrap confidence intervals
print("\n" + "="*50)
print("BOOTSTRAP CONFIDENCE INTERVALS (95%)")
print("="*50)

def bootstrap_accuracy(y_true, y_pred, n_bootstrap=1000, random_state=None):
    """Estimate accuracy and its 95% percentile-bootstrap confidence interval.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, same length as ``y_true``.
    n_bootstrap : int, default 1000
        Number of resamples (with replacement, each the size of the input).
    random_state : int or None, default None
        Seed for a private ``np.random.RandomState``. With ``None`` the draws
        come from NumPy's global generator, so ``np.random.seed`` still
        controls them as it did before this parameter existed.

    Returns
    -------
    mean, lower, upper
        Mean of the resampled accuracies and their 2.5th / 97.5th percentiles.

    Raises
    ------
    ValueError
        If the inputs are empty, differ in length, or ``n_bootstrap < 1``.
    """
    # Arrays are required for the fancy indexing below; lists would raise TypeError.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    n_samples = len(y_true)

    if n_samples == 0:
        raise ValueError("bootstrap_accuracy requires at least one sample")
    if len(y_pred) != n_samples:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {n_samples} and {len(y_pred)}"
        )
    if n_bootstrap < 1:
        raise ValueError("n_bootstrap must be at least 1")

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Per-sample correctness does not change between resamples, so compute it once.
    is_correct = y_true == y_pred

    accuracies = []
    for _ in range(n_bootstrap):
        # Sample with replacement
        indices = rng.choice(n_samples, n_samples, replace=True)
        accuracies.append(np.mean(is_correct[indices]))

    # Calculate 95% confidence interval
    lower = np.percentile(accuracies, 2.5)
    upper = np.percentile(accuracies, 97.5)
    mean = np.mean(accuracies)

    return mean, lower, upper

# Calculate bootstrap CIs
# Seed NumPy's global generator before each call so the intervals are the same
# on every run. Re-seeding with the same value also means both models are
# scored on the same sequence of resampled indices.
BOOTSTRAP_SEED = 42

np.random.seed(BOOTSTRAP_SEED)
orig_mean, orig_lower, orig_upper = bootstrap_accuracy(y_test, y_pred_original_classes)

np.random.seed(BOOTSTRAP_SEED)
synth_mean, synth_lower, synth_upper = bootstrap_accuracy(y_test, y_pred_synthetic_classes)

print(f"Original Model:  {orig_mean:.4f} [{orig_lower:.4f}, {orig_upper:.4f}]")
print(f"Synthetic Model: {synth_mean:.4f} [{synth_lower:.4f}, {synth_upper:.4f}]")

# Check if confidence intervals overlap
# (a conservative heuristic: disjoint intervals indicate a difference, but
# overlapping intervals do not prove there is none)
if orig_upper < synth_lower:
    print("\n✓ Confidence intervals do not overlap - strong evidence of improvement")
elif orig_lower > synth_upper:
    print("\n✗ Original model significantly better than synthetic model")
else:
    print("\n⚠ Confidence intervals overlap - improvement may not be significant")

## 6. Impact of Synthetic Data Per Class

Let's analyze how synthetic data affected each class specifically.

In [None]:
# Load training data information to analyze synthetic data distribution
(X_train_orig, y_train_orig), _, _ = loader.load_original_data()
(X_train_synth, y_train_synth), _, _ = loader.load_synthetic_enhanced_data()

# Calculate class distributions
# value_counts() omits classes that never occur in a split, which would make
# the columns below shorter than class_names or shift rows onto the wrong
# class. Reindexing on every class id keeps one row per class (count 0 when
# absent). This assumes labels are the integer indices 0..n_classes-1, as the
# ROC and per-class significance cells of this notebook already do.
class_ids = np.arange(len(class_names))
orig_counts = pd.Series(y_train_orig).value_counts().reindex(class_ids, fill_value=0)
synth_counts = pd.Series(y_train_synth).value_counts().reindex(class_ids, fill_value=0)
synthetic_added = synth_counts - orig_counts

# Relative growth is undefined for a class with no original samples: report
# NaN rather than dividing by zero.
increase_pct = synthetic_added / orig_counts.where(orig_counts > 0) * 100

# Create comprehensive analysis dataframe
# (the per-class "accuracy" columns are that class's recall from the classification report)
impact_analysis = pd.DataFrame({
    'Class': class_names,
    'Original Count': orig_counts.values,
    'Synthetic Added': synthetic_added.values,
    'Total Count': synth_counts.values,
    'Increase %': increase_pct.values,
    'Original Accuracy': [report_original[cn]['recall'] for cn in class_names],
    'Synthetic Accuracy': [report_synthetic[cn]['recall'] for cn in class_names],
    'Accuracy Change': [report_synthetic[cn]['recall'] - report_original[cn]['recall'] for cn in class_names],
    'F1 Original': [report_original[cn]['f1-score'] for cn in class_names],
    'F1 Synthetic': [report_synthetic[cn]['f1-score'] for cn in class_names],
    'F1 Change': [report_synthetic[cn]['f1-score'] - report_original[cn]['f1-score'] for cn in class_names]
})

print("IMPACT OF SYNTHETIC DATA PER CLASS")
print("="*100)
print(impact_analysis.to_string(index=False, float_format='%.4f'))

# 2x2 figure: three labelled scatter plots relating training-data changes to
# performance changes, plus a bar chart of the per-class accuracy change.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))

# (axes, x column, y column, x label, y label, title) for panels 1-3
_scatter_panels = (
    (ax1, 'Synthetic Added', 'Accuracy Change',
     'Number of Synthetic Images Added', 'Accuracy Improvement',
     'Synthetic Data Volume vs Accuracy Improvement'),
    (ax2, 'Original Count', 'Accuracy Change',
     'Original Training Samples', 'Accuracy Improvement',
     'Original Class Size vs Improvement'),
    (ax3, 'Increase %', 'F1 Change',
     'Percentage Increase in Training Data', 'F1-Score Improvement',
     'Data Increase Percentage vs F1 Improvement'),
)

for _panel_ax, _x_col, _y_col, _x_label, _y_label, _panel_title in _scatter_panels:
    _panel_ax.scatter(impact_analysis[_x_col], impact_analysis[_y_col],
                      s=100, alpha=0.7, c=range(len(class_names)), cmap='Set3')
    # Label every point with its class name, nudged off the marker.
    for i, txt in enumerate(class_names):
        _panel_ax.annotate(txt, (impact_analysis[_x_col].iloc[i],
                                 impact_analysis[_y_col].iloc[i]),
                           xytext=(5, 5), textcoords='offset points', fontsize=10)
    _panel_ax.set_xlabel(_x_label, fontsize=12)
    _panel_ax.set_ylabel(_y_label, fontsize=12)
    _panel_ax.set_title(_panel_title, fontsize=14, fontweight='bold')
    _panel_ax.grid(True, alpha=0.3)
    # Zero line separates improvement from degradation.
    _panel_ax.axhline(y=0, color='r', linestyle='--', alpha=0.5)

# Panel 4: net accuracy change per class, green for gains and red otherwise.
colors = ['green' if change > 0 else 'red' for change in impact_analysis['Accuracy Change']]
bars = ax4.bar(class_names, impact_analysis['Accuracy Change'], color=colors, alpha=0.7)
ax4.set_title('Net Accuracy Change by Class', fontsize=14, fontweight='bold')
ax4.set_xlabel('Class', fontsize=12)
ax4.set_ylabel('Accuracy Change', fontsize=12)
ax4.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
ax4.tick_params(axis='x', rotation=45)
ax4.grid(True, axis='y', alpha=0.3)

# Label each bar with its change and the number of synthetic images added;
# the label goes above positive bars and below negative ones.
for bar, val, synth_count in zip(bars, impact_analysis['Accuracy Change'], impact_analysis['Synthetic Added']):
    _is_gain = val >= 0
    ax4.annotate(f'{val:.3f}\n(+{int(synth_count)})',
                 xy=(bar.get_x() + bar.get_width() / 2, val),
                 xytext=(0, 3 if _is_gain else -20),
                 textcoords="offset points",
                 ha='center', va='bottom' if _is_gain else 'top', fontsize=9)

plt.suptitle('Impact of Synthetic Data on Model Performance', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# Correlation analysis
# Pearson correlation (pandas default) between how much data a class
# had or gained and how much its per-class metrics changed.
print("\n" + "="*50)
print("CORRELATION ANALYSIS")
print("="*50)

correlations = {
    'Synthetic Added vs Accuracy Change': impact_analysis['Synthetic Added'].corr(impact_analysis['Accuracy Change']),
    'Original Count vs Accuracy Change': impact_analysis['Original Count'].corr(impact_analysis['Accuracy Change']),
    'Increase % vs F1 Change': impact_analysis['Increase %'].corr(impact_analysis['F1 Change'])
}

for metric, corr in correlations.items():
    print(f"{metric}: {corr:.4f}")

# Identify most and least benefited classes
most_improved = impact_analysis.nlargest(3, 'Accuracy Change')
least_improved = impact_analysis.nsmallest(3, 'Accuracy Change')

# The `+` format flag prints the real sign. A hard-coded "+" prefix rendered
# negative changes as "+-0.0123" when even the top classes had degraded.
print("\n" + "="*50)
print("CLASSES THAT BENEFITED MOST:")
print("="*50)
for _, row in most_improved.iterrows():
    print(f"{row['Class']}: {row['Accuracy Change']:+.4f} accuracy "
          f"(added {int(row['Synthetic Added'])} synthetic images)")

print("\n" + "="*50)
print("CLASSES THAT BENEFITED LEAST:")
print("="*50)
for _, row in least_improved.iterrows():
    print(f"{row['Class']}: {row['Accuracy Change']:+.4f} accuracy "
          f"(added {int(row['Synthetic Added'])} synthetic images)")

## 7. Comprehensive Summary and Insights

Let's summarize all findings from our comprehensive analysis.

In [ ]:
# Generate comprehensive summary report
print("="*80)
print("COMPREHENSIVE RESULTS ANALYSIS SUMMARY")
print("="*80)

# 1. Overall Performance (read from the saved evaluation metrics)
print("\n1. OVERALL MODEL PERFORMANCE:")
print("-"*40)
orig_acc = saved_metrics['original']['accuracy']
synth_acc = saved_metrics['synthetic']['accuracy']
improvement = synth_acc - orig_acc
print(f"   Original Model Accuracy: {orig_acc:.4f}")
print(f"   Synthetic Model Accuracy: {synth_acc:.4f}")
print(f"   Absolute Improvement: {improvement:.4f}")
print(f"   Relative Improvement: {improvement/orig_acc*100:.2f}%")

# 2. Statistical Significance
print("\n2. STATISTICAL SIGNIFICANCE:")
print("-"*40)
# Recompute McNemar's test here rather than trusting a global `p_value` left
# behind by an earlier cell: that name is generic and may have been reassigned
# since the McNemar run, which would misreport the overall significance.
_mcnemar_stat, mcnemar_p_value, _mcnemar_n12, _mcnemar_n21 = mcnemar_test(
    y_test, y_pred_original_classes, y_pred_synthetic_classes
)
print(f"   McNemar's test p-value: {mcnemar_p_value:.6f}")
print(f"   Result: {'Statistically significant' if mcnemar_p_value < 0.05 else 'Not statistically significant'}")
print(f"   95% CI Original: [{orig_lower:.4f}, {orig_upper:.4f}]")
print(f"   95% CI Synthetic: [{synth_lower:.4f}, {synth_upper:.4f}]")

# 3. Metric Summary (unweighted mean of the per-class differences)
print("\n3. AVERAGE METRIC IMPROVEMENTS:")
print("-"*40)
avg_prec_improvement = np.mean(precision_synth - precision_orig)
avg_recall_improvement = np.mean(recall_synth - recall_orig)
avg_f1_improvement = np.mean(f1_synth - f1_orig)
print(f"   Average Precision Improvement: {avg_prec_improvement:.4f}")
print(f"   Average Recall Improvement: {avg_recall_improvement:.4f}")
print(f"   Average F1-Score Improvement: {avg_f1_improvement:.4f}")

# 4. ROC-AUC Summary (mean of the per-class one-vs-rest AUCs)
print("\n4. ROC-AUC ANALYSIS:")
print("-"*40)
avg_auc_orig = np.mean([roc_auc_orig[i] for i in range(n_classes)])
avg_auc_synth = np.mean([roc_auc_synth[i] for i in range(n_classes)])
print(f"   Average AUC Original: {avg_auc_orig:.4f}")
print(f"   Average AUC Synthetic: {avg_auc_synth:.4f}")
print(f"   Average AUC Improvement: {avg_auc_synth - avg_auc_orig:.4f}")

# 5. Synthetic Data Impact
print("\n5. SYNTHETIC DATA IMPACT:")
print("-"*40)
total_synthetic = impact_analysis['Synthetic Added'].sum()
print(f"   Total Synthetic Images Added: {total_synthetic:,}")
print(f"   Average Synthetic per Class: {total_synthetic/len(class_names):.0f}")
print(f"   Dataset Size Increase: {total_synthetic/len(y_train_orig)*100:.1f}%")

# 6. Class-Specific Insights
print("\n6. CLASS-SPECIFIC INSIGHTS:")
print("-"*40)
improved_classes = impact_analysis[impact_analysis['Accuracy Change'] > 0]
degraded_classes = impact_analysis[impact_analysis['Accuracy Change'] < 0]
print(f"   Classes Improved: {len(improved_classes)} / {len(class_names)}")
print(f"   Classes Degraded: {len(degraded_classes)} / {len(class_names)}")
# `+` format flag: a hard-coded "+" printed "+-0.0123" when the best class still got worse.
print(f"   Most Improved: {most_improved.iloc[0]['Class']} ({most_improved.iloc[0]['Accuracy Change']:+.4f})")
if len(degraded_classes) > 0:
    print(f"   Most Degraded: {least_improved.iloc[0]['Class']} ({least_improved.iloc[0]['Accuracy Change']:.4f})")

# 7. Key Findings
print("\n7. KEY FINDINGS:")
print("-"*40)
findings = []

if improvement > 0:
    findings.append("✓ Synthetic data augmentation successfully improved overall model accuracy")

if mcnemar_p_value < 0.05:
    findings.append("✓ The improvement is statistically significant")

if len(improved_classes) > len(degraded_classes):
    findings.append("✓ More classes benefited from synthetic data than were harmed")

if correlations['Original Count vs Accuracy Change'] < 0:
    findings.append("✓ Underrepresented classes benefited more from synthetic augmentation")

# Only claim "across all classes" when every per-class AUC went up; a higher
# mean alone can hide classes that got worse.
auc_improved_everywhere = all(roc_auc_synth[i] > roc_auc_orig[i] for i in range(n_classes))
if auc_improved_everywhere:
    findings.append("✓ Model's discriminative ability improved across all classes")
elif avg_auc_synth > avg_auc_orig:
    findings.append("✓ Model's discriminative ability improved on average (mean per-class AUC)")

for finding in findings:
    print(f"   {finding}")

# 8. Recommendations
print("\n8. RECOMMENDATIONS:")
print("-"*40)
recommendations = []

if len(degraded_classes) > 0:
    recommendations.append("• Consider adjusting synthetic data generation for classes that degraded")

# A spread of more than 50 percentage points in per-class growth flags uneven augmentation.
if impact_analysis['Increase %'].std() > 50:
    recommendations.append("• Consider more uniform synthetic data distribution across classes")

recommendations.append("• Continue monitoring model performance on new data")
recommendations.append("• Consider iterative synthetic data generation based on error analysis")

for rec in recommendations:
    print(f"   {rec}")

# Final 2x2 summary figure: overall metrics, per-class change, training-set
# composition and per-class F1.
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
_ax_overall, _ax_change = axes[0, 0], axes[0, 1]
_ax_train, _ax_f1 = axes[1, 0], axes[1, 1]

# Panel 1: overall metrics, read from the saved evaluation results.
metrics_names = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
_metric_keys = ('accuracy', 'precision', 'recall', 'f1_score')
orig_metrics = [saved_metrics['original'][key] for key in _metric_keys]
synth_metrics = [saved_metrics['synthetic'][key] for key in _metric_keys]

x = np.arange(len(metrics_names))
width = 0.35

bars1 = _ax_overall.bar(x - width/2, orig_metrics, width, label='Original', color='skyblue')
bars2 = _ax_overall.bar(x + width/2, synth_metrics, width, label='Synthetic', color='lightgreen')

_ax_overall.set_title('Overall Performance Metrics')
_ax_overall.set_xlabel('Metric')
_ax_overall.set_ylabel('Score')
_ax_overall.set_xticks(x)
_ax_overall.set_xticklabels(metrics_names)
_ax_overall.set_ylim(0, 1)
_ax_overall.legend()

# Value label above every bar: original bars first, then synthetic.
for bar in [*bars1, *bars2]:
    height = bar.get_height()
    _ax_overall.annotate(f'{height:.3f}',
                         xy=(bar.get_x() + bar.get_width() / 2, height),
                         xytext=(0, 3),
                         textcoords="offset points",
                         ha='center', va='bottom')

# Panel 2: per-class accuracy change, green for gains and red otherwise.
_change_colors = ['green' if change > 0 else 'red'
                  for change in impact_analysis['Accuracy Change']]
_ax_change.bar(class_names, impact_analysis['Accuracy Change'], color=_change_colors)
_ax_change.axhline(y=0, color='black', linestyle='-')
_ax_change.set_title('Performance Change by Class')
_ax_change.set_xlabel('Class')
_ax_change.set_ylabel('Accuracy Change')
_ax_change.tick_params(axis='x', rotation=45)

# Panel 3: original vs. added training images per class.
train_data = pd.DataFrame({
    'Original': orig_counts.values,
    'Synthetic Added': synthetic_added.values
})
train_data.plot(kind='bar', ax=_ax_train, color=['skyblue', 'lightgreen'])
_ax_train.set_title('Training Data Distribution')
_ax_train.set_xlabel('Class')
_ax_train.set_ylabel('Number of Images')
_ax_train.set_xticklabels(class_names, rotation=45)
_ax_train.legend()

# Panel 4: per-class F1 before and after augmentation.
for _f1_column, _f1_label, _f1_color in (
    ('F1 Original', 'Original', 'blue'),
    ('F1 Synthetic', 'Synthetic', 'green'),
):
    _ax_f1.plot(class_names, impact_analysis[_f1_column], 'o-', label=_f1_label, color=_f1_color)
_ax_f1.set_title('F1-Score Comparison')
_ax_f1.set_xlabel('Class')
_ax_f1.set_ylabel('F1-Score')
_ax_f1.tick_params(axis='x', rotation=45)
_ax_f1.legend()
_ax_f1.grid(True, alpha=0.3)

plt.suptitle('Comprehensive Results Summary', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

print("\n" + "="*80)
print("Analysis complete! Results have been thoroughly evaluated.")
print("="*80)