# Franca Core ML Model Evaluation

This notebook provides comprehensive evaluation of Franca Core ML models including:
- Accuracy comparison with PyTorch reference
- Performance benchmarking
- Visual analysis of features
- Model inspection and debugging

In [1]:
# Setup and imports
import sys
import os
sys.path.append('..')

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import torch
import coremltools as ct
from pathlib import Path
import time
import json

# Import our evaluation script
from scripts.evaluate_coreml import FrancaCoreMLEvaluator

# Set style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("✅ Setup complete!")

ModuleNotFoundError: No module named 'seaborn'

## 1. Model Loading and Basic Info

In [None]:
# Configuration
COREML_MODEL_PATH = "../coreml_models/franca_vitb14_in21k_fp32.mlpackage"
MODEL_NAME = "vitb14"
WEIGHTS = "IN21K"

# Check if model exists
if not os.path.exists(COREML_MODEL_PATH):
    print(f"❌ Model not found: {COREML_MODEL_PATH}")
    print("Run ../scripts/export_coreml_hub.py first to create the model")
else:
    print(f"✅ Model found: {COREML_MODEL_PATH}")
    
    # Create evaluator
    evaluator = FrancaCoreMLEvaluator(COREML_MODEL_PATH, MODEL_NAME, WEIGHTS)
    
    # Display model info
    spec = evaluator.coreml_model.get_spec()
    print(f"\n📋 Model Information:")
    print(f"  Input: {spec.description.input[0].name} - {spec.description.input[0].type}")
    print(f"  Output: {spec.description.output[0].name} - {spec.description.output[0].type}")
    print(f"  Model type: {type(evaluator.coreml_model)}")

## 2. Single Image Evaluation

In [None]:
# Test with sample image
sample_image_path = "../assets/dog.jpg"

if os.path.exists(sample_image_path):
    # Load and display image
    image = Image.open(sample_image_path)
    
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    
    # Original image
    axes[0].imshow(image)
    axes[0].set_title(f"Original Image\n{image.size}")
    axes[0].axis('off')
    
    # Resized image (model input)
    image_resized = image.resize((518, 518), Image.Resampling.BILINEAR)
    axes[1].imshow(image_resized)
    axes[1].set_title(f"Model Input\n{image_resized.size}")
    axes[1].axis('off')
    
    plt.tight_layout()
    plt.show()
    
    # Evaluate
    print("🔍 Evaluating image...")
    result = evaluator.evaluate_single_image(sample_image_path)
    
    # Display results
    print(f"\n📊 Evaluation Results:")
    print(f"  Cosine Similarity: {result['cosine_similarity']:.6f}")
    print(f"  MSE: {result['mse']:.8f}")
    print(f"  MAE: {result['mae']:.8f}")
    print(f"  Max Difference: {result['max_diff']:.8f}")
    print(f"  PyTorch Time: {result['torch_time']*1000:.1f} ms")
    print(f"  Core ML Time: {result['coreml_time']*1000:.1f} ms")
    print(f"  Speedup: {result['speedup']:.2f}x")
    
else:
    print(f"❌ Sample image not found: {sample_image_path}")

## 3. Feature Vector Analysis

In [None]:
if 'result' in locals():
    # Get feature vectors for analysis
    image = Image.open(sample_image_path)
    
    # PyTorch features
    torch_input = evaluator.preprocess_image_torch(image)
    with torch.no_grad():
        torch_features = evaluator.torch_model(torch_input).numpy().flatten()
    
    # Core ML features
    coreml_input = evaluator.preprocess_image_coreml(image)
    coreml_result = evaluator.coreml_model.predict({"image": coreml_input})
    coreml_features = list(coreml_result.values())[0].flatten()
    
    # Visualize feature distributions
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    
    # Feature histograms
    axes[0, 0].hist(torch_features, bins=50, alpha=0.7, label='PyTorch', density=True)
    axes[0, 0].hist(coreml_features, bins=50, alpha=0.7, label='Core ML', density=True)
    axes[0, 0].set_title('Feature Value Distributions')
    axes[0, 0].set_xlabel('Feature Value')
    axes[0, 0].set_ylabel('Density')
    axes[0, 0].legend()
    
    # Feature comparison scatter
    axes[0, 1].scatter(torch_features, coreml_features, alpha=0.6, s=1)
    axes[0, 1].plot([torch_features.min(), torch_features.max()], 
                    [torch_features.min(), torch_features.max()], 'r--', alpha=0.8)
    axes[0, 1].set_title('PyTorch vs Core ML Features')
    axes[0, 1].set_xlabel('PyTorch Features')
    axes[0, 1].set_ylabel('Core ML Features')
    
    # Feature differences
    diff = torch_features - coreml_features
    axes[1, 0].hist(diff, bins=50, alpha=0.7)
    axes[1, 0].set_title('Feature Differences (PyTorch - Core ML)')
    axes[1, 0].set_xlabel('Difference')
    axes[1, 0].set_ylabel('Count')
    
    # Feature magnitudes
    indices = np.arange(len(torch_features))
    axes[1, 1].plot(indices, np.abs(torch_features), label='PyTorch', alpha=0.7)
    axes[1, 1].plot(indices, np.abs(coreml_features), label='Core ML', alpha=0.7)
    axes[1, 1].set_title('Feature Magnitudes')
    axes[1, 1].set_xlabel('Feature Index')
    axes[1, 1].set_ylabel('Absolute Value')
    axes[1, 1].legend()
    
    plt.tight_layout()
    plt.show()
    
    # Statistics
    print(f"\n📈 Feature Statistics:")
    print(f"  PyTorch - Mean: {torch_features.mean():.4f}, Std: {torch_features.std():.4f}")
    print(f"  Core ML - Mean: {coreml_features.mean():.4f}, Std: {coreml_features.std():.4f}")
    print(f"  Difference - Mean: {diff.mean():.6f}, Std: {diff.std():.6f}")
    print(f"  Correlation: {np.corrcoef(torch_features, coreml_features)[0,1]:.6f}")

## 4. Performance Benchmarking

In [None]:
# Run performance benchmark
print("⚡ Running performance benchmark...")
benchmark_results = evaluator.benchmark_performance(num_runs=20)

# Visualize timing results
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Timing comparison
torch_times = np.array(benchmark_results['torch_times']) * 1000  # Convert to ms
coreml_times = np.array(benchmark_results['coreml_times']) * 1000

axes[0].boxplot([torch_times, coreml_times], labels=['PyTorch', 'Core ML'])
axes[0].set_title('Inference Time Comparison')
axes[0].set_ylabel('Time (ms)')
axes[0].grid(True, alpha=0.3)

# Speedup distribution
speedups = torch_times / coreml_times
axes[1].hist(speedups, bins=15, alpha=0.7, edgecolor='black')
axes[1].axvline(speedups.mean(), color='red', linestyle='--', 
                label=f'Mean: {speedups.mean():.2f}x')
axes[1].set_title('Speedup Distribution')
axes[1].set_xlabel('Speedup Factor')
axes[1].set_ylabel('Count')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Print summary
print(f"\n📊 Performance Summary:")
print(f"  PyTorch: {benchmark_results['torch_mean_time']*1000:.1f} ± {benchmark_results['torch_std_time']*1000:.1f} ms")
print(f"  Core ML: {benchmark_results['coreml_mean_time']*1000:.1f} ± {benchmark_results['coreml_std_time']*1000:.1f} ms")
print(f"  Average Speedup: {benchmark_results['speedup_mean']:.2f}x")
print(f"  Speedup Range: {speedups.min():.2f}x - {speedups.max():.2f}x")

## 5. Model Size and Memory Analysis

In [None]:
import os
from pathlib import Path

# Analyze model files
coreml_path = Path(COREML_MODEL_PATH)
if coreml_path.exists():
    # Get model size
    if coreml_path.is_dir():  # .mlpackage is a directory
        total_size = sum(f.stat().st_size for f in coreml_path.rglob('*') if f.is_file())
    else:
        total_size = coreml_path.stat().st_size
    
    size_mb = total_size / (1024 * 1024)
    
    print(f"📁 Model File Analysis:")
    print(f"  Path: {coreml_path}")
    print(f"  Size: {size_mb:.1f} MB")
    print(f"  Type: {'Directory' if coreml_path.is_dir() else 'File'}")
    
    # Check for other model variants
    model_dir = coreml_path.parent
    print(f"\n📂 Available Models in {model_dir}:")
    for model_file in model_dir.glob("*.mlpackage"):
        if model_file.is_dir():
            file_size = sum(f.stat().st_size for f in model_file.rglob('*') if f.is_file())
        else:
            file_size = model_file.stat().st_size
        print(f"  {model_file.name}: {file_size/(1024*1024):.1f} MB")

# Memory usage estimation
print(f"\n🧠 Memory Usage Estimation:")
print(f"  Input tensor (518x518x3 float32): {518*518*3*4/(1024*1024):.1f} MB")
print(f"  Output tensor (768 float16): {768*2/1024:.1f} KB")
print(f"  Model parameters: ~{size_mb:.1f} MB")
print(f"  Estimated peak memory: ~{size_mb + 518*518*3*4/(1024*1024) + 50:.1f} MB")

## 6. Batch Evaluation (if you have multiple images)

In [None]:
# Example: evaluate multiple images if available
# You can modify this to point to your own image directory

# Check for common image directories
possible_dirs = ["../assets", "../test_images", "./images"]
image_dir = None

for dir_path in possible_dirs:
    if os.path.exists(dir_path):
        # Check if it contains images
        image_files = list(Path(dir_path).glob("*.jpg")) + list(Path(dir_path).glob("*.png"))
        if image_files:
            image_dir = dir_path
            break

if image_dir:
    print(f"📁 Found images in: {image_dir}")
    
    # Evaluate directory (limit to 5 images for demo)
    batch_results = evaluator.evaluate_directory(image_dir, max_images=5)
    
    if batch_results.get('num_images', 0) > 0:
        # Plot batch results
        individual_results = batch_results['individual_results']
        
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        
        # Cosine similarities
        cos_sims = [r['cosine_similarity'] for r in individual_results]
        axes[0, 0].bar(range(len(cos_sims)), cos_sims)
        axes[0, 0].set_title('Cosine Similarities')
        axes[0, 0].set_xlabel('Image Index')
        axes[0, 0].set_ylabel('Cosine Similarity')
        axes[0, 0].set_ylim([0, 1])
        
        # Inference times
        torch_times = [r['torch_time']*1000 for r in individual_results]
        coreml_times = [r['coreml_time']*1000 for r in individual_results]
        x = np.arange(len(torch_times))
        width = 0.35
        axes[0, 1].bar(x - width/2, torch_times, width, label='PyTorch')
        axes[0, 1].bar(x + width/2, coreml_times, width, label='Core ML')
        axes[0, 1].set_title('Inference Times')
        axes[0, 1].set_xlabel('Image Index')
        axes[0, 1].set_ylabel('Time (ms)')
        axes[0, 1].legend()
        
        # MSE values
        mse_values = [r['mse'] for r in individual_results]
        axes[1, 0].bar(range(len(mse_values)), mse_values)
        axes[1, 0].set_title('Mean Squared Errors')
        axes[1, 0].set_xlabel('Image Index')
        axes[1, 0].set_ylabel('MSE')
        
        # Speedups
        speedups = [r['speedup'] for r in individual_results]
        axes[1, 1].bar(range(len(speedups)), speedups)
        axes[1, 1].set_title('Speedup Factors')
        axes[1, 1].set_xlabel('Image Index')
        axes[1, 1].set_ylabel('Speedup (x)')
        
        plt.tight_layout()
        plt.show()
        
        # Summary statistics
        print(f"\n📊 Batch Evaluation Summary ({batch_results['num_images']} images):")
        print(f"  Average Cosine Similarity: {batch_results['cosine_similarity_mean']:.4f} ± {batch_results['cosine_similarity_std']:.4f}")
        print(f"  Average MSE: {batch_results['mse_mean']:.6f} ± {batch_results['mse_std']:.6f}")
        print(f"  Average Speedup: {batch_results['speedup_mean']:.2f}x ± {batch_results['speedup_std']:.2f}x")
        
else:
    print("📁 No image directories found for batch evaluation")
    print("   You can add images to ../assets/ or modify the paths above")

## 7. Export Results

In [None]:
# Generate comprehensive report
print("📋 Generating comprehensive evaluation report...")
report = evaluator.generate_report("../evaluation_report.json")

# Display key findings
print(f"\n🎯 Key Findings:")
perf = report['performance_benchmark']
print(f"  ⚡ Performance: {perf['speedup_mean']:.2f}x speedup over PyTorch")
print(f"  🎯 Core ML Time: {perf['coreml_mean_time']*1000:.1f} ± {perf['coreml_std_time']*1000:.1f} ms")

if 'sample_image_test' in report:
    sample = report['sample_image_test']
    print(f"  🔍 Accuracy: {sample['cosine_similarity']:.4f} cosine similarity")
    print(f"  📏 Precision: {sample['mse']:.6f} MSE")

print(f"\n✅ Evaluation complete! Report saved to: evaluation_report.json")
print(f"\n🚀 The Core ML model is ready for production use in iOS/macOS apps!")