### Import libraries

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import json
import sys
import os
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from transformers.trainer_utils import EvalPrediction
from seqeval.metrics import precision_score, recall_score, f1_score
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
sys.path.append(os.path.abspath('../src/'))
sys.path.append(os.path.abspath('../scripts/'))

In [4]:
from ner_model.finetune_ner_model import load_conll, tokenize_and_align_labels, compute_metrics, logger

2025-07-21 16:58:35,882 - INFO - Using transformers version: 4.53.2
2025-07-21 16:58:35,882 - INFO - Device available: CPU


In [5]:
# Define paths and model
dataset_file = '../conLL/amharic_ner.conll'  # Conll format saved dataset
model_name = "Davlan/afro-xlmr-base"   # Using afroxmlr for Amharic support
output_dir = "../models/amharic_ner_model" 

In [6]:
# Define label mappings (matching Task 2 labels exactly)
label2id = {
    "O": 0,
    "B-Product": 1,
    "I-Product": 2,
    "B-PRICE": 3,
    "I-PRICE": 4,
    "B-LOC": 5,
    "I-LOC": 6
}

In [7]:
id2label = {v: k for k, v in label2id.items()}

        
logger.info("Starting fine-tuning process")

2025-07-21 16:58:36,622 - INFO - Starting fine-tuning process


In [8]:
# Load dataset
data = load_conll(dataset_file)
# Verify label consistency
unique_labels = set(label for sent_labels in data['ner_tags'] for label in sent_labels)
missing_labels = unique_labels - set(label2id.keys())
if missing_labels:
    logger.error(f"Labels in dataset not in label2id: {missing_labels}")

dataset = Dataset.from_dict(data)
        

2025-07-21 16:58:37,082 - INFO - Loaded 200 sentences from ../conLL/amharic_ner.conll
2025-07-21 16:58:37,082 - INFO - Unique labels in dataset: {'B-LOC', 'I-PRICE', 'I-Product', 'O', 'B-Product', 'B-PRICE'}
2025-07-21 16:58:37,082 - INFO - Label counts: {'B-LOC': 398, 'I-PRICE': 343, 'I-Product': 271, 'O': 5099, 'B-Product': 48, 'B-PRICE': 167}


In [9]:
# Split dataset
train_size = int(0.8 * len(dataset))
train_dataset = dataset.select(range(train_size))
val_dataset = dataset.select(range(train_size, len(dataset)))
logger.info(f"Split dataset: {len(train_dataset)} train, {len(val_dataset)} validation")

2025-07-21 16:58:38,002 - INFO - Split dataset: 160 train, 40 validation


In [10]:
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id
)
logger.info(f"Loaded model and tokenizer: {model_name}")

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-07-21 16:58:50,765 - INFO - Loaded model and tokenizer: Davlan/afro-xlmr-base


In [14]:
# Tokenize datasets
tokenized_train = train_dataset.map(
    lambda x: tokenize_and_align_labels(x, tokenizer, label2id),
    batched=True
)
tokenized_val = val_dataset.map(
    lambda x: tokenize_and_align_labels(x, tokenizer, label2id),
    batched=True
)
logger.info("Tokenized datasets successfully")

Map: 100%|██████████| 160/160 [00:00<00:00, 2709.84 examples/s]
Map: 100%|██████████| 40/40 [00:00<00:00, 3196.94 examples/s]
2025-07-21 16:59:00,530 - INFO - Tokenized datasets successfully


In [15]:
# Training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,  # Reduced for small dataset
    per_device_eval_batch_size=8,
    num_train_epochs=3,  # Increased to improve training
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    )

In [16]:
# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics(id2label)
)
logger.info("Initialized trainer")

2025-07-21 16:59:13,949 - INFO - Initialized trainer


In [17]:
# Train model
trainer.train()
logger.info("Training completed")



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.7153,0.589528,0.0,0.0,0.0
2,0.453,0.318186,0.858428,0.873418,0.865772
3,0.3273,0.237595,0.851741,0.886076,0.868123


2025-07-21 17:05:17,208 - INFO - Evaluation metrics: {'LOC': {'precision': np.float64(0.0), 'recall': np.float64(0.0), 'f1-score': np.float64(0.0), 'support': np.int64(78)}, 'PRICE': {'precision': np.float64(0.0), 'recall': np.float64(0.0), 'f1-score': np.float64(0.0), 'support': np.int64(73)}, 'Product': {'precision': np.float64(0.0), 'recall': np.float64(0.0), 'f1-score': np.float64(0.0), 'support': np.int64(7)}, 'micro avg': {'precision': np.float64(0.0), 'recall': np.float64(0.0), 'f1-score': np.float64(0.0), 'support': np.int64(158)}, 'macro avg': {'precision': np.float64(0.0), 'recall': np.float64(0.0), 'f1-score': np.float64(0.0), 'support': np.int64(158)}, 'weighted avg': {'precision': np.float64(0.0), 'recall': np.float64(0.0), 'f1-score': np.float64(0.0), 'support': np.int64(158)}}
2025-07-21 17:12:12,707 - INFO - Evaluation metrics: {'LOC': {'precision': np.float64(1.0), 'recall': np.float64(1.0), 'f1-score': np.float64(1.0), 'support': np.int64(78)}, 'PRICE': {'precision': 

In [18]:
 # Save model
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
logger.info(f"Model and tokenizer saved to {output_dir}")
        

2025-07-21 17:19:46,386 - INFO - Model and tokenizer saved to ../models/amharic_ner_model
