# Task 4: Model Comparison & Selection
This notebook trains and evaluates several NER models on the same train/eval split, compares their F1 and accuracy scores, and selects the best performer.

In [None]:
# Setup and imports
import sys
sys.path.append('../src')

from ner.model_trainer import ModelComparator
from evaluation.model_evaluator import NERModelEvaluator
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Build the comparator and report which models its configuration lists
comparator = ModelComparator()
print("Model comparator initialized")
configured_models = comparator.config['ner']['model_names']
print(f"Models to compare: {configured_models}")

In [None]:
# Prepare data splits
from ner.model_trainer import NERModelTrainer

# Source corpus, plus the two split files that later cells write and then
# hand to the model comparator.
full_data_path = "../data/labeled/ethiopian_ner_dataset.txt"
train_data_path = "../data/labeled/ethiopian_ner_dataset_train.txt"
eval_data_path = "../data/labeled/ethiopian_ner_dataset_eval.txt"

# Fraction of sentences assigned to the training split; the rest is evaluation.
TRAIN_FRACTION = 0.8

# Create train/eval splits from main dataset
trainer = NERModelTrainer()
sentences, labels = trainer.load_conll_data(full_data_path)

# Fail fast on a malformed corpus: the split below slices both sequences with
# the same index, and the save step zips them, so a length mismatch would
# silently drop or misalign data.
if len(sentences) != len(labels):
    raise ValueError(
        f"Corpus is inconsistent: {len(sentences)} sentences but "
        f"{len(labels)} label sequences in {full_data_path}"
    )

# The split is positional (first part -> train, remainder -> eval); no
# shuffling is applied, so the result is deterministic.
# NOTE(review): if the source file is grouped by origin, a seeded shuffle may
# give a more representative evaluation set -- confirm with the data owner.
split_idx = int(TRAIN_FRACTION * len(sentences))

# An empty or single-sentence corpus would otherwise yield an empty split that
# only fails later, inside training or evaluation, with an obscure error.
if split_idx == 0 or split_idx == len(sentences):
    raise ValueError(
        f"Cannot split {len(sentences)} sentence(s) with train fraction "
        f"{TRAIN_FRACTION}: both splits need at least one sentence"
    )

train_sentences = sentences[:split_idx]
train_labels = labels[:split_idx]
eval_sentences = sentences[split_idx:]
eval_labels = labels[split_idx:]

print(f"Training: {len(train_sentences)} sentences")
print(f"Evaluation: {len(eval_sentences)} sentences")

In [None]:
# Save split data
import os


def _write_conll(path, sentence_tokens, sentence_tags):
    """Write parallel token/label sequences to ``path`` in two-column form.

    Each token is written as ``token<TAB>label`` on its own line, and every
    sentence is followed by a single blank line.
    """
    with open(path, 'w', encoding='utf-8') as out:
        for tokens, tags in zip(sentence_tokens, sentence_tags):
            out.writelines(f"{token}\t{label}\n" for token, label in zip(tokens, tags))
            out.write("\n")


# Make sure the output directory exists before either file is opened
os.makedirs("../data/labeled", exist_ok=True)

# Training split first, then evaluation split
_write_conll(train_data_path, train_sentences, train_labels)
_write_conll(eval_data_path, eval_sentences, eval_labels)

print("Data splits saved")

In [None]:
# Compare all models
# Hands the train/eval split files saved by the previous cell to the comparator.
# Later cells read `results` as a mapping of model name -> result dict, where
# each result dict either contains an 'error' key or exposes 'eval_f1' and
# 'eval_accuracy' scores.
print("Starting model comparison...")
results = comparator.compare_models(train_data_path, eval_data_path)
print("Model comparison completed!")

In [None]:
# Print a fixed-width table with one row per model; runs that reported an
# error are shown with ERROR placeholders instead of scores
divider = "-" * 60

print("Model Comparison Results:")
print(divider)
print(f"{'Model':<30} {'F1 Score':<12} {'Accuracy':<12}")
print(divider)

for name, outcome in results.items():
    if 'error' in outcome:
        print(f"{name:<30} {'ERROR':<12} {'ERROR':<12}")
        continue
    # Missing metrics fall back to 0 so the row still renders
    f1_value = outcome.get('eval_f1', 0)
    acc_value = outcome.get('eval_accuracy', 0)
    print(f"{name:<30} {f1_value:<12.3f} {acc_value:<12.3f}")

print(divider)

In [None]:
# Ask the comparator for its winner and echo that model's scores
best_model = comparator.get_best_model()
print(f"Best performing model: {best_model}")

# Scores are only reported when a winner was returned, it is present in
# `results`, and its run did not record an error
best_has_scores = (
    bool(best_model)
    and best_model in results
    and 'error' not in results[best_model]
)
if best_has_scores:
    best_result = results[best_model]
    print(f"Best F1 Score: {best_result.get('eval_f1', 0):.3f}")
    print(f"Best Accuracy: {best_result.get('eval_accuracy', 0):.3f}")

In [None]:
# Side-by-side bar charts of F1 and accuracy for every model that ran cleanly
valid_results = {name: res for name, res in results.items() if 'error' not in res}

if not valid_results:
    print("No valid results to visualize")
else:
    models = list(valid_results)
    f1_scores = [valid_results[name].get('eval_f1', 0) for name in models]
    accuracies = [valid_results[name].get('eval_accuracy', 0) for name in models]

    fig, axes = plt.subplots(1, 2, figsize=(15, 6))

    # One entry per panel: (bar heights, title, y-axis label, bar colour)
    panels = (
        (f1_scores, 'F1 Score Comparison', 'F1 Score', 'skyblue'),
        (accuracies, 'Accuracy Comparison', 'Accuracy', 'lightgreen'),
    )
    for axis, (heights, title, y_label, colour) in zip(axes, panels):
        axis.bar(models, heights, color=colour)
        axis.set_title(title)
        axis.set_ylabel(y_label)
        # Rotate the x tick labels (the model names) by 45 degrees
        axis.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

In [None]:
# Detailed evaluation with NERModelEvaluator
evaluator = NERModelEvaluator()

# Demonstration only: the "predictions" are a second copy of the first five
# gold label sequences, so the metrics below exercise the evaluator rather
# than measure any trained model.
true_labels = eval_labels[:5]
pred_labels = eval_labels[:5]

# Label the report with the winning model, or a default name if none was chosen
report_name = best_model or "XLM-RoBERTa"
detailed_results = evaluator.evaluate_predictions(true_labels, pred_labels, report_name)

print("Detailed Evaluation Results:")
overall = detailed_results['overall_metrics']
print(f"Overall F1: {overall['f1_score']:.3f}")
print(f"Overall Precision: {overall['precision']:.3f}")
print(f"Overall Recall: {overall['recall']:.3f}")

print("\nEntity-level Performance:")
for entity, scores in detailed_results['entity_metrics'].items():
    ent_f1, ent_p, ent_r = scores['f1_score'], scores['precision'], scores['recall']
    print(f"  {entity}: F1={ent_f1:.3f}, P={ent_p:.3f}, R={ent_r:.3f}")

In [None]:
# Final recap: how many models ran, which one won, and which checkpoint path to use
print("Model Selection Summary:")
print("=" * 50)
print(f"Models evaluated: {len(results)}")
print(f"Best model: {best_model}")

# The F1 line is printed only for a winner that has an error-free result entry
if best_model and best_model in results:
    best_entry = results[best_model]
    if 'error' not in best_entry:
        print(f"Best F1 score: {best_entry.get('eval_f1', 0):.3f}")

print("\nRecommendation: Use the best performing model for production deployment")

# '/' in the model name is replaced with '_' for the checkpoint directory name;
# with no winner the path falls back to the xlm-roberta-base directory
checkpoint_name = best_model.replace('/', '_') if best_model else 'xlm-roberta-base'
print(f"Model path: ../models/checkpoints/{checkpoint_name}/final_model")