### Import libraries

In [12]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
import pandas as pd
import json
import sys
import os
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from transformers.trainer_utils import EvalPrediction
from seqeval.metrics import precision_score, recall_score, f1_score
import torch

In [14]:
# Make the project's source and script directories importable from this notebook.
# The membership check keeps the cell idempotent: re-running it does not pile up
# duplicate entries in sys.path.
for _extra_dir in ('../src/', '../scripts/'):
    _abs_dir = os.path.abspath(_extra_dir)
    if _abs_dir not in sys.path:
        sys.path.append(_abs_dir)

In [15]:
from compare.compare_ner_models import load_conll, logger, fine_tune_model

In [None]:
# --- Configuration: input corpus, per-model output directories, result file ---

# Annotated corpus in CoNLL format
dataset_file = '../conLL/amharic_ner.conll'

# One output directory per candidate checkpoint
output_dir1 = "../models/amharic_ner_xlmr"
output_dir2 = "../models/amharic_ner_mbert"
output_dir3 = "../models/amharic_ner_distilbert"

# (checkpoint name, output directory) pairs, iterated in this order
models = list(zip(
    (
        "Davlan/afro-xlmr-base",
        "google-bert/bert-base-multilingual-cased",
        "distilbert/distilbert-base-multilingual-cased",
    ),
    (output_dir1, output_dir2, output_dir3),
))

# CSV file that receives the per-model comparison table
output_comparison_file = "../data/model_comparison.csv"



In [17]:
# Tag string -> integer id; each tag's id is its position in the list below
label2id = {
    tag: index
    for index, tag in enumerate(
        ["O", "B-Product", "I-Product", "B-PRICE", "I-PRICE", "B-LOC", "I-LOC"]
    )
}

In [18]:
# Reverse lookup: integer id -> tag string
id2label = dict(zip(label2id.values(), label2id.keys()))
logger.info(f"Label mappings: {label2id}")

2025-07-22 10:13:01,377 - INFO - Label mappings: {'O': 0, 'B-Product': 1, 'I-Product': 2, 'B-PRICE': 3, 'I-PRICE': 4, 'B-LOC': 5, 'I-LOC': 6}


In [19]:
# Load the CoNLL file; the result is indexed by 'ner_tags' below and passed to
# Dataset.from_dict, so it is expected to be a dict of columns.
data = load_conll(dataset_file)

# Verify label consistency: every tag found in the corpus must have an id in
# label2id. Fail fast here rather than continuing into training with tags that
# cannot be mapped to a class id.
unique_labels = {label for sent_labels in data['ner_tags'] for label in sent_labels}
missing_labels = unique_labels - set(label2id)
if missing_labels:
    logger.error(f"Labels in dataset not in label2id: {missing_labels}")
    raise ValueError(f"Labels in dataset not in label2id: {sorted(missing_labels)}")

# Wrap the loaded columns in a datasets.Dataset for the fine-tuning step
dataset = Dataset.from_dict(data)
        

2025-07-22 10:13:01,642 - INFO - Loaded 200 sentences from ../conLL/amharic_ner.conll
2025-07-22 10:13:01,642 - INFO - Unique labels: {'B-Product', 'B-PRICE', 'O', 'I-Product', 'I-PRICE', 'B-LOC'}
2025-07-22 10:13:01,642 - INFO - Label counts: {'B-Product': 48, 'B-PRICE': 167, 'O': 5099, 'I-Product': 271, 'I-PRICE': 343, 'B-LOC': 398}


In [20]:
# Run fine_tune_model once per (checkpoint, output directory) pair and collect
# whatever each call returns, in the same order as `models`.
results = []
for model_name, output_dir in models:
    logger.info(f"Starting fine-tuning for {model_name}")
    results.append(
        fine_tune_model(model_name, dataset, label2id, id2label, output_dir)
    )

2025-07-22 10:13:01,808 - INFO - Starting fine-tuning for Davlan/afro-xlmr-base
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-07-22 10:13:05,973 - INFO - Loaded Davlan/afro-xlmr-base on cpu
2025-07-22 10:13:05,973 - INFO - Dataset split: 160 train, 40 validation
Map: 100%|██████████| 160/160 [00:00<00:00, 2539.67 examples/s]
Map: 100%|██████████| 40/40 [00:00<00:00, 2574.02 examples/s]
2025-07-22 10:13:06,156 - INFO - Tokenized datasets for Davlan/afro-xlmr-base


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.6366,0.509468,0.493671,0.246835,0.329114
2,0.3464,0.223549,0.835912,0.873418,0.853691
3,0.2398,0.152906,0.869066,0.905063,0.886241
4,0.1693,0.127259,0.913694,0.936709,0.924895
5,0.119,0.115943,0.907694,0.93038,0.918734


2025-07-22 10:18:34,487 - INFO - Evaluation metrics: {'LOC': {'precision': np.float64(1.0), 'recall': np.float64(0.5), 'f1-score': np.float64(0.6666666666666666), 'support': np.int64(78)}, 'PRICE': {'precision': np.float64(0.0), 'recall': np.float64(0.0), 'f1-score': np.float64(0.0), 'support': np.int64(73)}, 'Product': {'precision': np.float64(0.0), 'recall': np.float64(0.0), 'f1-score': np.float64(0.0), 'support': np.int64(7)}, 'micro avg': {'precision': np.float64(0.5735294117647058), 'recall': np.float64(0.2468354430379747), 'f1-score': np.float64(0.34513274336283184), 'support': np.int64(158)}, 'macro avg': {'precision': np.float64(0.3333333333333333), 'recall': np.float64(0.16666666666666666), 'f1-score': np.float64(0.2222222222222222), 'support': np.int64(158)}, 'weighted avg': {'precision': np.float64(0.4936708860759494), 'recall': np.float64(0.2468354430379747), 'f1-score': np.float64(0.3291139240506329), 'support': np.int64(158)}}
2025-07-22 10:21:41,281 - INFO - Evaluation m

2025-07-22 10:32:33,523 - INFO - Evaluation metrics: {'LOC': {'precision': np.float64(1.0), 'recall': np.float64(1.0), 'f1-score': np.float64(1.0), 'support': np.int64(78)}, 'PRICE': {'precision': np.float64(0.8961038961038961), 'recall': np.float64(0.9452054794520548), 'f1-score': np.float64(0.9199999999999999), 'support': np.int64(73)}, 'Product': {'precision': np.float64(0.0), 'recall': np.float64(0.0), 'f1-score': np.float64(0.0), 'support': np.int64(7)}, 'micro avg': {'precision': np.float64(0.9130434782608695), 'recall': np.float64(0.930379746835443), 'f1-score': np.float64(0.9216300940438872), 'support': np.int64(158)}, 'macro avg': {'precision': np.float64(0.6320346320346321), 'recall': np.float64(0.6484018264840182), 'f1-score': np.float64(0.64), 'support': np.int64(158)}, 'weighted avg': {'precision': np.float64(0.9076935722505343), 'recall': np.float64(0.930379746835443), 'f1-score': np.float64(0.9187341772151899), 'support': np.int64(158)}}
2025-07-22 10:32:33,523 - INFO - 

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.5664,0.424004,0.612903,0.297468,0.400195
2,0.3948,0.30242,0.810768,0.841772,0.824993
3,0.3157,0.255224,0.831305,0.85443,0.842485
4,0.2695,0.215193,0.849075,0.873418,0.860843
5,0.1909,0.21008,0.851741,0.886076,0.868123


2025-07-22 10:36:47,759 - INFO - Evaluation metrics: {'LOC': {'precision': np.float64(1.0), 'recall': np.float64(0.5), 'f1-score': np.float64(0.6666666666666666), 'support': np.int64(78)}, 'PRICE': {'precision': np.float64(0.25806451612903225), 'recall': np.float64(0.1095890410958904), 'f1-score': np.float64(0.15384615384615383), 'support': np.int64(73)}, 'Product': {'precision': np.float64(0.0), 'recall': np.float64(0.0), 'f1-score': np.float64(0.0), 'support': np.int64(7)}, 'micro avg': {'precision': np.float64(0.6714285714285714), 'recall': np.float64(0.2974683544303797), 'f1-score': np.float64(0.41228070175438597), 'support': np.int64(158)}, 'macro avg': {'precision': np.float64(0.41935483870967744), 'recall': np.float64(0.20319634703196346), 'f1-score': np.float64(0.2735042735042735), 'support': np.int64(158)}, 'weighted avg': {'precision': np.float64(0.6129032258064516), 'recall': np.float64(0.2974683544303797), 'f1-score': np.float64(0.40019474196689386), 'support': np.int64(158

2025-07-22 10:43:42,920 - INFO - Evaluation metrics: {'LOC': {'precision': np.float64(1.0), 'recall': np.float64(1.0), 'f1-score': np.float64(1.0), 'support': np.int64(78)}, 'PRICE': {'precision': np.float64(0.775), 'recall': np.float64(0.8493150684931506), 'f1-score': np.float64(0.8104575163398693), 'support': np.int64(73)}, 'Product': {'precision': np.float64(0.0), 'recall': np.float64(0.0), 'f1-score': np.float64(0.0), 'support': np.int64(7)}, 'micro avg': {'precision': np.float64(0.8860759493670886), 'recall': np.float64(0.8860759493670886), 'f1-score': np.float64(0.8860759493670886), 'support': np.int64(158)}, 'macro avg': {'precision': np.float64(0.5916666666666667), 'recall': np.float64(0.6164383561643835), 'f1-score': np.float64(0.6034858387799564), 'support': np.int64(158)}, 'weighted avg': {'precision': np.float64(0.8517405063291138), 'recall': np.float64(0.8860759493670886), 'f1-score': np.float64(0.868122776536775), 'support': np.int64(158)}}
2025-07-22 10:43:42,920 - INFO 

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.6795,0.626131,0.493671,0.246835,0.329114
2,0.4691,0.381109,0.534406,0.639241,0.569955
3,0.3556,0.264875,0.789367,0.797468,0.793363
4,0.2614,0.241264,0.793687,0.810127,0.801688
5,0.2017,0.222806,0.79194,0.816456,0.803714


2025-07-22 10:45:15,609 - INFO - Evaluation metrics: {'LOC': {'precision': np.float64(1.0), 'recall': np.float64(0.5), 'f1-score': np.float64(0.6666666666666666), 'support': np.int64(78)}, 'PRICE': {'precision': np.float64(0.0), 'recall': np.float64(0.0), 'f1-score': np.float64(0.0), 'support': np.int64(73)}, 'Product': {'precision': np.float64(0.0), 'recall': np.float64(0.0), 'f1-score': np.float64(0.0), 'support': np.int64(7)}, 'micro avg': {'precision': np.float64(1.0), 'recall': np.float64(0.2468354430379747), 'f1-score': np.float64(0.39593908629441626), 'support': np.int64(158)}, 'macro avg': {'precision': np.float64(0.3333333333333333), 'recall': np.float64(0.16666666666666666), 'f1-score': np.float64(0.2222222222222222), 'support': np.int64(158)}, 'weighted avg': {'precision': np.float64(0.4936708860759494), 'recall': np.float64(0.2468354430379747), 'f1-score': np.float64(0.3291139240506329), 'support': np.int64(158)}}
2025-07-22 10:46:19,115 - INFO - Evaluation metrics: {'LOC':

2025-07-22 10:51:44,013 - INFO - Evaluation metrics: {'LOC': {'precision': np.float64(1.0), 'recall': np.float64(1.0), 'f1-score': np.float64(1.0), 'support': np.int64(78)}, 'PRICE': {'precision': np.float64(0.6455696202531646), 'recall': np.float64(0.6986301369863014), 'f1-score': np.float64(0.6710526315789473), 'support': np.int64(73)}, 'Product': {'precision': np.float64(0.0), 'recall': np.float64(0.0), 'f1-score': np.float64(0.0), 'support': np.int64(7)}, 'micro avg': {'precision': np.float64(0.821656050955414), 'recall': np.float64(0.8164556962025317), 'f1-score': np.float64(0.8190476190476191), 'support': np.int64(158)}, 'macro avg': {'precision': np.float64(0.5485232067510548), 'recall': np.float64(0.5662100456621005), 'f1-score': np.float64(0.5570175438596491), 'support': np.int64(158)}, 'weighted avg': {'precision': np.float64(0.7919403941676013), 'recall': np.float64(0.8164556962025317), 'f1-score': np.float64(0.8037141905396402), 'support': np.int64(158)}}
2025-07-22 10:51:4

In [21]:
# Save comparison results: one row per entry in `results`.
df_results = pd.DataFrame(results)

# Create the destination directory if needed so to_csv does not fail on a
# fresh checkout; a bare filename has an empty dirname and needs no directory.
output_parent = os.path.dirname(output_comparison_file)
if output_parent:
    os.makedirs(output_parent, exist_ok=True)

df_results.to_csv(output_comparison_file, index=False, encoding='utf-8')
logger.info(f"Saved model comparison to {output_comparison_file}")


2025-07-22 10:52:44,312 - INFO - Saved model comparison to model_comparison.csv


In [22]:
# Pick the winner: rank by F1 first; on equal F1 the lower inference time wins
# (the time is negated so that max() prefers the faster entry).
best_model = max(
    results,
    key=lambda candidate: (candidate["f1"], -candidate["inference_time"]),
)
logger.info(
    f"Best model: {best_model['model_name']} "
    f"(F1: {best_model['f1']:.4f}, "
    f"Inference Time: {best_model['inference_time']:.4f}s)"
)

2025-07-22 10:52:44,760 - INFO - Best model: Davlan/afro-xlmr-base (F1: 0.9187, Inference Time: 0.5105s)
