# ⚡ Performance Benchmarking

## Overview
Comprehensive performance benchmarking for all system components:
- Model inference speed
- Data processing throughput
- Memory usage analysis
- Scalability testing

---

In [None]:
import time
import psutil
import torch
import pandas as pd
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Shared container for benchmark output; starts empty and is keyed by model label
benchmark_results = dict()

### 🚀 Model Inference Benchmarking

In [None]:
def benchmark_model_inference(model_name, test_texts, iterations=100):
    """Time tokenization plus a model forward pass over ``test_texts``.

    The tokenizer and a token-classification model are loaded once for
    ``model_name`` (outside the timed region). Each iteration then tokenizes
    every text and runs it through the model with gradient tracking disabled.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.
        test_texts: Sequence of strings; it is iterated once per iteration,
            so it must be re-iterable (e.g. a list).
        iterations: Number of timed passes over ``test_texts``; must be >= 1.

    Returns:
        dict with ``avg_time``, ``min_time`` and ``max_time`` (seconds per
        pass over ``test_texts``) and ``avg_memory`` (mean change in process
        RSS, in MB, across a pass -- this is a per-pass delta, not the total
        footprint of the loaded model).

    Raises:
        ValueError: If ``iterations`` is less than 1.
    """
    # Fail fast, before any model files are loaded, instead of dividing by
    # zero when the statistics are computed.
    if iterations < 1:
        raise ValueError("iterations must be at least 1")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name)
    model.eval()  # inference mode: disables dropout and similar training-only layers

    process = psutil.Process()  # one handle reused for every RSS reading
    bytes_per_mb = 1024 * 1024

    times = []
    memory_usage = []

    with torch.no_grad():  # no autograd bookkeeping while benchmarking
        for _ in range(iterations):
            # Memory is sampled outside the timed window so the psutil calls
            # do not inflate the measured latency.
            start_memory = process.memory_info().rss / bytes_per_mb
            start_time = time.perf_counter()

            for text in test_texts:
                inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
                model(**inputs)

            end_time = time.perf_counter()
            end_memory = process.memory_info().rss / bytes_per_mb

            times.append(end_time - start_time)
            memory_usage.append(end_memory - start_memory)

    return {
        'avg_time': sum(times) / len(times),
        'min_time': min(times),
        'max_time': max(times),
        'avg_memory': sum(memory_usage) / len(memory_usage)
    }

# Sample Amharic messages that every model is benchmarked on
test_texts = [
    "ዋጋ 2500 ብር አድራሻ አዲስ አበባ",
    "የምርት ዋጋ 1500 ብር ለሱቅና ብዛት ተረካቢወች",
    "አድራሻ አዲስ አበባ ሀያሁለት ዋጋ 3000 ብር"
]

# Display label -> model identifier handed to benchmark_model_inference
models = dict([
    ('XLM-RoBERTa', 'xlm-roberta-base'),
    ('DistilBERT', 'distilbert-base-multilingual-cased'),
    ('BERT-tiny', 'rasyosef/bert-tiny-amharic'),
])

# Benchmark each model in turn, file its statistics under the display label,
# and report the mean time per pass
for name, model_name in models.items():
    print(f"Benchmarking {name}...")
    stats = benchmark_model_inference(model_name, test_texts)
    benchmark_results[name] = stats
    print(f"Average time: {stats['avg_time']:.4f}s")

### 📊 Data Processing Throughput

In [None]:
def benchmark_data_processing(data_size=1000):
    """Measure throughput of a simplified clean-and-tokenize pass.

    Generates ``data_size`` synthetic messages, replaces the Amharic word
    'ዋጋ' with 'Price' in each one, splits the result on whitespace, and
    reports how many messages were handled per second.

    Args:
        data_size: Number of synthetic messages to generate and process;
            must be non-negative.

    Returns:
        Messages processed per second as a float. ``0.0`` when nothing was
        processed; ``inf`` if the clock did not advance during a non-empty
        run.

    Raises:
        ValueError: If ``data_size`` is negative.
    """
    import os
    import sys

    if data_size < 0:
        raise ValueError("data_size must be non-negative")

    # Make ../src importable (as before), but only once: appending on every
    # call would keep growing sys.path with duplicates.
    src_dir = os.path.abspath('../src')
    if src_dir not in sys.path:
        sys.path.append(src_dir)

    # Generate test data
    test_data = [f"ዋጋ {i*100} ብር አድራሻ አዲስ አበባ" for i in range(data_size)]

    # perf_counter is monotonic and high-resolution; time.time() can report
    # zero elapsed time for short runs on coarse clocks.
    start_time = time.perf_counter()

    # Process data (simplified): simulate text cleaning and tokenization
    processed = [text.replace('ዋጋ', 'Price').split() for text in test_data]

    elapsed = time.perf_counter() - start_time

    if elapsed <= 0:
        # Clock did not advance: avoid ZeroDivisionError.
        return float("inf") if processed else 0.0
    return len(processed) / elapsed

# Throughput measured at several workload sizes, keyed by the size used
throughput_results = {}
data_sizes = [100, 500, 1000, 5000]

for size in data_sizes:
    # Record the measurement and keep it in `throughput` for the progress line
    throughput = throughput_results[size] = benchmark_data_processing(size)
    print(f"Data size {size}: {throughput:.2f} messages/second")

### 📈 Visualization

In [None]:
# Create performance comparison charts (2x2 grid).
# Reads `benchmark_results` and `throughput_results` filled in by the
# benchmarking cells above.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('⚡ Performance Benchmarking Results', fontsize=16, fontweight='bold')

# Colours and F1 scores are looked up by model label instead of by list
# position, so each value stays attached to the right model even if the set
# or order of benchmarked models changes.
model_colors = {
    'XLM-RoBERTa': '#1f77b4',
    'DistilBERT': '#ff7f0e',
    'BERT-tiny': '#2ca02c',
}
known_f1_scores = {
    'XLM-RoBERTa': 96.97,
    'DistilBERT': 95.74,
    'BERT-tiny': 94.23,
}  # From model comparison

# NOTE: this rebinds `models` (a label -> model-id dict in the benchmarking
# cell) to a plain list of labels; the summary cell reads it as a list.
models = list(benchmark_results.keys())
avg_times = [benchmark_results[model]['avg_time'] for model in models]
# Models without a registered colour fall back to neutral grey.
bar_colors = [model_colors.get(model, '#7f7f7f') for model in models]

# Model inference times
ax1.bar(models, avg_times, color=bar_colors)
ax1.set_title('Model Inference Time')
ax1.set_ylabel('Time (seconds)')
ax1.tick_params(axis='x', rotation=45)

# Memory usage
memory_usage = [benchmark_results[model]['avg_memory'] for model in models]
ax2.bar(models, memory_usage, color=bar_colors)
ax2.set_title('Memory Usage')
ax2.set_ylabel('Memory (MB)')
ax2.tick_params(axis='x', rotation=45)

# Data processing throughput
sizes = list(throughput_results.keys())
throughputs = list(throughput_results.values())

ax3.plot(sizes, throughputs, marker='o', linewidth=2, markersize=6)
ax3.set_title('Data Processing Throughput')
ax3.set_xlabel('Data Size')
ax3.set_ylabel('Messages/Second')
ax3.grid(True, alpha=0.3)

# Performance vs Accuracy trade-off
# Models with no recorded F1 score get NaN so list lengths always match.
f1_scores = [known_f1_scores.get(model, float('nan')) for model in models]
ax4.scatter(avg_times, f1_scores, s=200, alpha=0.7, c=bar_colors)
ax4.set_title('Performance vs Accuracy Trade-off')
ax4.set_xlabel('Inference Time (seconds)')
ax4.set_ylabel('F1-Score (%)')

# Label every scatter point with its model name, nudged off the marker
for model, avg_time, f1_score in zip(models, avg_times, f1_scores):
    ax4.annotate(model, (avg_time, f1_score), xytext=(5, 5), textcoords='offset points')

plt.tight_layout()
plt.show()

### 📋 Performance Summary

In [None]:
# Generate performance report
# Relies on `models`, `avg_times`, `memory_usage` and `f1_scores` left in
# scope by the visualization cell.
from pathlib import Path

performance_df = pd.DataFrame({
    'Model': models,
    'Avg_Inference_Time_ms': [t*1000 for t in avg_times],  # seconds -> milliseconds
    'Memory_Usage_MB': memory_usage,
    'F1_Score': f1_scores
})

print("🏆 Performance Benchmarking Summary:")
print("=" * 50)
print(performance_df.to_string(index=False))

# Save results
# Create the reports directory first: to_csv does not create missing parent
# directories and would otherwise raise on a fresh checkout.
report_path = Path('../reports/performance_benchmark.csv')
report_path.parent.mkdir(parents=True, exist_ok=True)
performance_df.to_csv(report_path, index=False)
print("\n💾 Results saved to: ../reports/performance_benchmark.csv")

# Recommendations (static guidance; not derived from the numbers above)
print("\n💡 Recommendations:")
print("• XLM-RoBERTa: Best accuracy, use for production")
print("• DistilBERT: Balanced performance, good for real-time")
print("• BERT-tiny: Fastest inference, suitable for mobile/edge")