# Model Evaluation and Comparison Script
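Comprehensively evaluates and compares the baseline (classical ML) and CNN models: loads their saved metrics, builds a comparison table, and writes the comparison figures and a LaTeX table.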

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import json
import pickle
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, roc_curve, auc, precision_recall_curve
import torch

In [None]:
# Set style
# The 'seaborn-v0_8-*' style names only exist in newer matplotlib releases;
# older ones ship the same sheet as 'seaborn-darkgrid'. Use whichever name is
# installed, and keep matplotlib's default style if neither is available,
# instead of failing before any analysis runs.
_style_candidates = ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
_plot_style = next(
    (name for name in _style_candidates if name in plt.style.available),
    None,
)
if _plot_style is not None:
    plt.style.use(_plot_style)
sns.set_palette("husl")

In [None]:
# Set project paths
# The project root can be overridden with the IATE_PROJECT_ROOT environment
# variable so the notebook runs on other machines; when the variable is not
# set, the original hard-coded location is used unchanged.
PROJECT_ROOT = Path(
    os.environ.get('IATE_PROJECT_ROOT', '/home/tony/research_project/iate_project')
)
# All inputs and outputs of this notebook live under <root>/results.
METRICS_DIR = PROJECT_ROOT / 'results' / 'metrics'
FIGURES_DIR = PROJECT_ROOT / 'results' / 'figures'
MODELS_DIR = PROJECT_ROOT / 'results' / 'models'

In [None]:
# Create directories
# Only the figures directory is created here (including missing parents; an
# already-existing directory is not an error). METRICS_DIR is read from before
# anything is written to it, so it is expected to exist already.
FIGURES_DIR.mkdir(parents=True, exist_ok=True)

# 1. LOADING MODEL RESULTS

In [None]:
# Parse the baseline and CNN metrics JSON files found in METRICS_DIR.
baseline_results = json.loads((METRICS_DIR / 'baseline_results.json').read_text())
cnn_results = json.loads((METRICS_DIR / 'cnn_results.json').read_text())

# Confirmation messages are only printed once both files have been parsed.
for loaded_label in ("baseline", "CNN"):
    print(f"Loaded {loaded_label} results")

# 2. MODEL PERFORMANCE COMPARISON

In [None]:
def _comparison_row(model, model_type, metrics, key_prefix='', train_time='-'):
    """Build one row of the model comparison table.

    Args:
        model: Label shown in the 'Model' column.
        model_type: Label shown in the 'Type' column.
        metrics: Mapping holding the '<key_prefix>accuracy', '...precision',
            '...recall', '...f1_score' and '...auc' entries.
        key_prefix: Prefix of the metric keys inside *metrics* (the CNN
            per-model entries use 'val_').
        train_time: Training time in seconds, or the '-' placeholder for
            rows that have none (the test-set rows).

    Returns:
        A dict keyed by the comparison table's column names.

    Raises:
        KeyError: If *metrics* lacks one of the expected entries.
    """
    return {
        'Model': model,
        'Type': model_type,
        'Accuracy': metrics[f'{key_prefix}accuracy'],
        'Precision': metrics[f'{key_prefix}precision'],
        'Recall': metrics[f'{key_prefix}recall'],
        'F1-Score': metrics[f'{key_prefix}f1_score'],
        'AUC': metrics[f'{key_prefix}auc'],
        'Train Time (s)': train_time,
    }


comparison_data = []

# Baseline models (validation metrics)
for model_name, metrics in baseline_results['validation_results'].items():
    comparison_data.append(_comparison_row(
        f'Baseline-{model_name}', 'Classical ML', metrics,
        train_time=metrics['train_time'],
    ))

# Add baseline test results
best_baseline = baseline_results['test_results']['model']
comparison_data.append(_comparison_row(
    f'{best_baseline} (Test)', 'Classical ML', baseline_results['test_results'],
))

# CNN models (validation metrics, stored under 'val_'-prefixed keys)
for model_name, metrics in cnn_results['models'].items():
    comparison_data.append(_comparison_row(
        f'CNN-{model_name}', 'Deep Learning', metrics,
        key_prefix='val_', train_time=metrics['train_time'],
    ))

# Add CNN test results
best_cnn = cnn_results['best_model']
comparison_data.append(_comparison_row(
    f'{best_cnn} (Test)', 'Deep Learning', cnn_results['test_results'],
))

# Create DataFrame
comparison_df = pd.DataFrame(comparison_data)
comparison_df = comparison_df.round(4)
# The '-' placeholders make 'Train Time (s)' an object column, which
# DataFrame.round leaves untouched; round its float entries explicitly so the
# whole table uses the same precision. Placeholders pass through unchanged.
comparison_df['Train Time (s)'] = comparison_df['Train Time (s)'].map(
    lambda value: round(value, 4) if isinstance(value, float) else value
)

print("\nModel Comparison Table:")
print(comparison_df.to_string(index=False))

# Save comparison table
comparison_csv_path = METRICS_DIR / 'model_comparison.csv'
comparison_df.to_csv(comparison_csv_path, index=False)
print(f"\nSaved to: {comparison_csv_path}")

# 3. STATISTICAL ANALYSIS

In [None]:
# Compare the test-set F1 of the best baseline against the best CNN.
baseline_f1 = baseline_results['test_results']['f1_score']
cnn_f1 = cnn_results['test_results']['f1_score']

# Relative change of the CNN over the baseline, in percent. A baseline F1 of
# zero has no meaningful relative change (and would raise ZeroDivisionError),
# so report NaN in that case.
if baseline_f1:
    improvement = ((cnn_f1 - baseline_f1) / baseline_f1) * 100
else:
    improvement = float('nan')

print(f"Best Baseline F1-Score: {baseline_f1:.4f}")
print(f"Best CNN F1-Score: {cnn_f1:.4f}")
print(f"Relative Improvement: {improvement:.2f}%")

# 4. CREATING VISUALIZATIONS

In [None]:
# Figure 1: Performance comparison (2x3 grid)
#   row 0: test-metric bars | baseline confusion matrix | CNN confusion matrix
#   row 1: CNN loss curves  | CNN accuracy curves       | radar chart
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Model Performance Comparison', fontsize=16, fontweight='bold')

# Metrics comparison
metrics_to_plot = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC']
# Test rows are labelled '<model> (Test)'. Match that suffix literally so a
# validation row whose model name merely contains 'Test' is never selected.
test_models = comparison_df[comparison_df['Model'].str.endswith(' (Test)')]

ax = axes[0, 0]
x = np.arange(len(metrics_to_plot))
width = 0.35

# A single row taken from a frame with string and numeric columns is object
# dtype; cast so matplotlib and numpy receive plain floats.
baseline_metrics = (
    test_models[test_models['Type'] == 'Classical ML']
    .iloc[0][metrics_to_plot]
    .to_numpy(dtype=float)
)
cnn_metrics = (
    test_models[test_models['Type'] == 'Deep Learning']
    .iloc[0][metrics_to_plot]
    .to_numpy(dtype=float)
)

bars1 = ax.bar(x - width/2, baseline_metrics, width, label='Baseline', color='#3498db')
bars2 = ax.bar(x + width/2, cnn_metrics, width, label='CNN', color='#e74c3c')

ax.set_xlabel('Metrics')
ax.set_ylabel('Score')
ax.set_title('Test Set Performance Comparison')
ax.set_xticks(x)
ax.set_xticklabels(metrics_to_plot, rotation=45)
ax.legend()
ax.grid(True, alpha=0.3)

# Add value labels on bars
for bars in [bars1, bars2]:
    for bar in bars:
        height = bar.get_height()
        ax.annotate(f'{height:.3f}',
                    xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, 3),
                    textcoords="offset points",
                    ha='center', va='bottom', fontsize=8)

# Confusion matrices (rows are labelled 'Actual', columns 'Predicted')
baseline_cm = np.array(baseline_results['test_results']['confusion_matrix'])
cnn_cm = np.array(cnn_results['test_results']['confusion_matrix'])

ax = axes[0, 1]
sns.heatmap(baseline_cm, annot=True, fmt='d', cmap='Blues', ax=ax,
            xticklabels=['Normal', 'Defect'],
            yticklabels=['Normal', 'Defect'])
ax.set_title(f'Baseline Confusion Matrix\n({best_baseline})')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')

ax = axes[0, 2]
sns.heatmap(cnn_cm, annot=True, fmt='d', cmap='Reds', ax=ax,
            xticklabels=['Normal', 'Defect'],
            yticklabels=['Normal', 'Defect'])
ax.set_title(f'CNN Confusion Matrix\n({best_cnn})')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')

# Training history for CNN. Look the best model up defensively: a missing
# entry or missing history should skip these two panels, not abort the figure.
best_cnn_entry = cnn_results['models'].get(best_cnn, {})
if 'train_history' in best_cnn_entry and 'val_history' in best_cnn_entry:
    train_history = best_cnn_entry['train_history']
    val_history = best_cnn_entry['val_history']

    # Loss curves
    ax = axes[1, 0]
    epochs = range(1, len(train_history['loss']) + 1)
    ax.plot(epochs, train_history['loss'], 'b-', label='Train Loss')
    ax.plot(epochs, val_history['loss'], 'r-', label='Val Loss')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.set_title('CNN Training History - Loss')
    ax.legend()
    ax.grid(True, alpha=0.3)

    # Accuracy curves
    ax = axes[1, 1]
    ax.plot(epochs, train_history['acc'], 'b-', label='Train Acc')
    ax.plot(epochs, val_history['acc'], 'r-', label='Val Acc')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Accuracy')
    ax.set_title('CNN Training History - Accuracy')
    ax.legend()
    ax.grid(True, alpha=0.3)
else:
    # No history recorded: hide the two panels rather than show empty grids.
    for unused_ax in (axes[1, 0], axes[1, 1]):
        unused_ax.axis('off')

# Model comparison radar chart
categories = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC']

# Repeat the first point at the end so each polygon closes on itself.
angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
baseline_values = np.concatenate((baseline_metrics, [baseline_metrics[0]]))
cnn_values = np.concatenate((cnn_metrics, [cnn_metrics[0]]))
angles += angles[:1]

# plt.subplots already placed a Cartesian axes in this slot. Remove it before
# adding the polar axes: newer matplotlib releases no longer delete an
# overlapped axes, so it would otherwise stay drawn underneath the radar chart.
axes[1, 2].remove()
ax = fig.add_subplot(2, 3, 6, projection='polar')
ax.plot(angles, baseline_values, 'o-', linewidth=2, label='Baseline', color='#3498db')
ax.fill(angles, baseline_values, alpha=0.25, color='#3498db')
ax.plot(angles, cnn_values, 'o-', linewidth=2, label='CNN', color='#e74c3c')
ax.fill(angles, cnn_values, alpha=0.25, color='#e74c3c')
ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories)
ax.set_ylim(0, 1)
ax.set_title('Performance Radar Chart', y=1.08)
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0))
ax.grid(True)

plt.tight_layout()
plt.savefig(FIGURES_DIR / 'model_comparison.png', dpi=300, bbox_inches='tight')
print(f"Saved: {FIGURES_DIR / 'model_comparison.png'}")

# Figure 2: Detailed metrics (2x2 grid)
#   row 0: per-class recall | per-class precision
#   row 1: error counts     | text summary


def _safe_ratio(numerator, denominator):
    """Divide element-wise with broadcasting, yielding 0.0 where the denominator is 0.

    A class with no samples (recall) or no predictions (precision) would
    otherwise produce NaN/inf values and a RuntimeWarning.
    """
    numerator, denominator = np.broadcast_arrays(
        np.asarray(numerator, dtype=float),
        np.asarray(denominator, dtype=float),
    )
    return np.divide(
        numerator,
        denominator,
        out=np.zeros(numerator.shape),
        where=denominator != 0,
    )


def _annotate_bars(target_ax, bar_groups, make_label):
    """Write a label just above every bar; *make_label* maps a bar height to text."""
    for group in bar_groups:
        for bar in group:
            height = bar.get_height()
            target_ax.annotate(make_label(height),
                               xy=(bar.get_x() + bar.get_width() / 2, height),
                               xytext=(0, 3),
                               textcoords="offset points",
                               ha='center', va='bottom')


fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Detailed Performance Analysis', fontsize=16, fontweight='bold')

# Per-class metrics
ax = axes[0, 0]
class_names = ['Normal', 'Defect']
# Row-normalised confusion matrices: each row (actual class) sums to 1, so the
# diagonal holds the per-class recall.
baseline_cm_norm = _safe_ratio(baseline_cm, baseline_cm.sum(axis=1)[:, np.newaxis])
cnn_cm_norm = _safe_ratio(cnn_cm, cnn_cm.sum(axis=1)[:, np.newaxis])

x = np.arange(len(class_names))
width = 0.35

baseline_recall = np.diag(baseline_cm_norm)
cnn_recall = np.diag(cnn_cm_norm)

bars1 = ax.bar(x - width/2, baseline_recall, width, label='Baseline', color='#3498db')
bars2 = ax.bar(x + width/2, cnn_recall, width, label='CNN', color='#e74c3c')

ax.set_xlabel('Class')
ax.set_ylabel('Recall (True Positive Rate)')
ax.set_title('Per-Class Recall Comparison')
ax.set_xticks(x)
ax.set_xticklabels(class_names)
ax.legend()
ax.grid(True, alpha=0.3)
_annotate_bars(ax, [bars1, bars2], lambda height: f'{height:.3f}')

# Precision comparison: diagonal over column sums (predicted-class totals).
ax = axes[0, 1]
baseline_precision = _safe_ratio(np.diag(baseline_cm), baseline_cm.sum(axis=0))
cnn_precision = _safe_ratio(np.diag(cnn_cm), cnn_cm.sum(axis=0))

bars1 = ax.bar(x - width/2, baseline_precision, width, label='Baseline', color='#3498db')
bars2 = ax.bar(x + width/2, cnn_precision, width, label='CNN', color='#e74c3c')

ax.set_xlabel('Class')
ax.set_ylabel('Precision')
ax.set_title('Per-Class Precision Comparison')
ax.set_xticks(x)
ax.set_xticklabels(class_names)
ax.legend()
ax.grid(True, alpha=0.3)
_annotate_bars(ax, [bars1, bars2], lambda height: f'{height:.3f}')

# Error analysis: off-diagonal cells of the 2x2 confusion matrices.
ax = axes[1, 0]
error_types = ['False Positives\n(Normal→Defect)', 'False Negatives\n(Defect→Normal)']
baseline_errors = [baseline_cm[0,1], baseline_cm[1,0]]
cnn_errors = [cnn_cm[0,1], cnn_cm[1,0]]

x = np.arange(len(error_types))
bars1 = ax.bar(x - width/2, baseline_errors, width, label='Baseline', color='#3498db')
bars2 = ax.bar(x + width/2, cnn_errors, width, label='CNN', color='#e74c3c')

ax.set_xlabel('Error Type')
ax.set_ylabel('Number of Errors')
ax.set_title('Error Analysis')
ax.set_xticks(x)
ax.set_xticklabels(error_types)
ax.legend()
ax.grid(True, alpha=0.3)
_annotate_bars(ax, [bars1, bars2], lambda height: f'{int(height)}')

# Summary statistics
ax = axes[1, 1]
ax.axis('off')

baseline_test = baseline_results['test_results']
cnn_test = cnn_results['test_results']
# Absolute accuracy difference in percentage points; `improvement` (computed
# in the statistical-analysis cell) is the relative F1 change in percent.
accuracy_gain = (cnn_test['accuracy'] - baseline_test['accuracy']) * 100

# State the headline finding from the data instead of assuming the CNN wins.
# No significance test is run in this notebook, so none is claimed.
if cnn_test['f1_score'] > baseline_test['f1_score']:
    headline_finding = 'CNN outperforms classical ML (test F1)'
elif cnn_test['f1_score'] < baseline_test['f1_score']:
    headline_finding = 'Classical ML outperforms CNN (test F1)'
else:
    headline_finding = 'CNN and classical ML tie on test F1'

# NOTE(review): the remaining "Key Findings" bullets are static author
# conclusions and are not derived from the loaded metrics.
# The '+' format flag prints the real sign, so a regression shows as '-x'
# rather than '+-x'.
summary_text = f"""
EVALUATION SUMMARY

Best Classical ML Model: {best_baseline}
• Test Accuracy: {baseline_test['accuracy']:.4f}
• Test F1-Score: {baseline_test['f1_score']:.4f}
• Test AUC: {baseline_test['auc']:.4f}

Best Deep Learning Model: {best_cnn}
• Test Accuracy: {cnn_test['accuracy']:.4f}
• Test F1-Score: {cnn_test['f1_score']:.4f}
• Test AUC: {cnn_test['auc']:.4f}

Performance Improvement:
• Accuracy: {accuracy_gain:+.1f} pts (absolute)
• F1-Score: {improvement:+.1f}% (relative)

Key Findings:
• {headline_finding}
• Both models show good defect detection
• Low false negative rate is critical
• Model suitable for deployment
"""

ax.text(0.1, 0.5, summary_text, fontsize=11, verticalalignment='center',
        bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5),
        family='monospace')

plt.tight_layout()
plt.savefig(FIGURES_DIR / 'detailed_analysis.png', dpi=300, bbox_inches='tight')
print(f"Saved: {FIGURES_DIR / 'detailed_analysis.png'}")

# 5. GENERATING LATEX TABLE FOR PAPER

In [None]:
# Export the test-set rows of the comparison table as a LaTeX table.
# Test rows are labelled '<model> (Test)'; match that suffix literally.
latex_rows = comparison_df[comparison_df['Model'].str.endswith(' (Test)')]
# Test rows only hold the '-' placeholder in 'Train Time (s)', so the column
# carries no information here and is left out of the exported table.
latex_rows = latex_rows.drop(columns=['Train Time (s)'], errors='ignore')

# Derive the column spec from the actual column count (two left-aligned label
# columns, the rest centred). A fixed spec that disagrees with the number of
# columns produces a tabular that does not compile.
latex_column_format = '|l|l|' + 'c|' * (latex_rows.shape[1] - 2)

latex_table = latex_rows.to_latex(
    index=False,
    float_format='%.4f',
    column_format=latex_column_format,
    caption='Performance comparison of classical ML and deep learning models on coffee bean defect detection test set.',
    label='tab:model_comparison'
)

with open(METRICS_DIR / 'model_comparison.tex', 'w', encoding='utf-8') as f:
    f.write(latex_table)

print(f"LaTeX table saved to: {METRICS_DIR / 'model_comparison.tex'}")

# Final Summary

In [None]:
# Headline numbers: best model of each family with its test F1, followed by
# the relative improvement.
key_results = (
    f"• Best Classical ML: {best_baseline} (F1={baseline_f1:.4f})",
    f"• Best Deep Learning: {best_cnn} (F1={cnn_f1:.4f})",
    f"• Relative Improvement: {improvement:.1f}%",
)
print("\nKey Results:")
for result_line in key_results:
    print(result_line)

# Files written by the earlier cells, in the order they were produced.
generated_files = (
    METRICS_DIR / 'model_comparison.csv',
    METRICS_DIR / 'model_comparison.tex',
    FIGURES_DIR / 'model_comparison.png',
    FIGURES_DIR / 'detailed_analysis.png',
)
print("\nGenerated Files:")
for artifact_path in generated_files:
    print(f"• {artifact_path}")