In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification
from datasets import Dataset
import numpy as np
import evaluate
from pathlib import Path

# 1. CONLL File Parser
def parse_conll(file_path):
    """Read a tab-separated CoNLL file and group its rows into sentences.

    Each non-blank line is expected to hold exactly two tab-separated
    fields, ``token<TAB>label``. Blank lines end the current sentence.
    Lines that do not split into exactly two fields are skipped.

    Args:
        file_path: Path of the CoNLL file, opened as UTF-8 text.

    Returns:
        A dict with two parallel lists: ``"tokens"`` (one list of tokens
        per sentence) and ``"ner_tags"`` (the matching list of labels).
    """
    sentences, tag_sequences = [], []
    words, tags = [], []

    def flush():
        # Close the sentence being built; empty sentences are never emitted,
        # so consecutive blank lines do not produce empty entries.
        nonlocal words, tags
        if words:
            sentences.append(words)
            tag_sequences.append(tags)
            words, tags = [], []

    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw in handle:
            row = raw.strip()
            if row == "":
                flush()
                continue
            fields = row.split('\t')
            if len(fields) == 2:
                words.append(fields[0])
                tags.append(fields[1])

    # The file may end without a trailing blank line.
    flush()
    return {"tokens": sentences, "ner_tags": tag_sequences}

# 2. Load Dataset
conll_path = Path("../CoNLL/amharic_ner.conll")
# Parse only when the file is present; otherwise stop with a message that
# names the path that was looked up.
if conll_path.exists():
    conll_data = parse_conll(conll_path)
else:
    raise FileNotFoundError(f"CONLL file not found at: {conll_path}")

# One dataset row per sentence, with "tokens" and "ner_tags" columns.
dataset = Dataset.from_dict(conll_data)

# 3. Define Labels
# The position of a tag in this list is its integer class id.
label_list = [
    "O",
    "B-PRODUCT", "I-PRODUCT",
    "B-PRICE", "I-PRICE",
    "B-LOC", "I-LOC",
]
label2id = {tag: idx for idx, tag in enumerate(label_list)}
id2label = dict(enumerate(label_list))

# 4. Tokenizer
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# 5. Tokenization & Label Alignment
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and build per-token label ids.

    The module-level ``tokenizer`` is applied to ``examples["tokens"]`` with
    truncation and padding to 128 positions. For every sentence, the first
    sub-token of each word receives the id of that word's tag (looked up in
    ``label2id``); every other position gets -100, the value that
    ``compute_metrics`` in this file filters out.

    Args:
        examples: A batch with ``"tokens"`` (lists of words) and
            ``"ner_tags"`` (parallel lists of tag strings).

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one
        list of label ids per sentence.

    Raises:
        KeyError: If a tag is not a key of ``label2id``.
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=128,
        padding='max_length',
    )

    def align(tags, word_ids):
        # Walk the sub-token -> word mapping once, labelling only the
        # position where a new word starts.
        aligned = []
        last_word = None
        for wid in word_ids:
            starts_new_word = wid is not None and wid != last_word
            aligned.append(label2id[tags[wid]] if starts_new_word else -100)
            last_word = wid
        return aligned

    encoded["labels"] = [
        align(tags, encoded.word_ids(batch_index=row))
        for row, tags in enumerate(examples["ner_tags"])
    ]
    return encoded

# Tokenize all sentences in batches; the raw text columns are dropped so only
# the tokenizer outputs and the aligned "labels" column remain.
tokenized_dataset = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=['tokens', 'ner_tags']
)

# 6. Split Dataset
# Hold out 20% for evaluation. The seed is fixed so the same sentences land
# in train/test on every run; without it the split (and therefore the
# reported metrics) changes each time the script is executed.
split_datasets = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

# 7. Model
# Pretrained encoder with a token-classification head sized to label_list.
# The label maps are passed along so class ids can be translated to tag names.
model = AutoModelForTokenClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

# 8. Training Args
training_args = TrainingArguments(
    output_dir="../results/amharic-ner-results",
    # Evaluate at the end of every epoch.
    # NOTE(review): older transformers releases name this argument
    # `evaluation_strategy` -- confirm against the installed version.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='../logs',
    logging_steps=50,
    # Checkpoint on the same schedule as evaluation, as
    # load_best_model_at_end expects.
    save_strategy="epoch",
    # NOTE(review): the recorded run of this cell ended with a StorageFull
    # error while serializing; if disk space is tight, consider keeping
    # fewer checkpoints.
    save_total_limit=3,
    # Mixed precision only when a CUDA device is present.
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,
    metric_for_best_model="f1",  # the "f1" key returned by compute_metrics
    # Disable all reporting integrations. This must be the string "none":
    # passing None is treated as "not specified" and does not turn
    # reporting off.
    report_to="none",
)

# 9. Data Collator
# Batches examples for token classification using the same tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer)

# 10. Metrics
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    """Turn raw model outputs into seqeval precision/recall/F1/accuracy.

    Args:
        p: A ``(predictions, labels)`` pair. The predictions are reduced
            with ``argmax`` over axis 2, so they are expected to carry one
            score per label on that axis; labels hold class ids, with -100
            marking positions to ignore.

    Returns:
        A dict with the keys ``"precision"``, ``"recall"``, ``"f1"`` and
        ``"accuracy"``, taken from seqeval's ``overall_*`` results.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_preds, true_labels = [], []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions with a real label, converting ids to tag names.
        kept = [
            (label_list[guess], label_list[gold])
            for guess, gold in zip(pred_row, gold_row)
            if gold != -100
        ]
        true_preds.append([pred_tag for pred_tag, _ in kept])
        true_labels.append([gold_tag for _, gold_tag in kept])

    results = seqeval.compute(predictions=true_preds, references=true_labels)
    metric_names = ("precision", "recall", "f1", "accuracy")
    return {metric: results[f"overall_{metric}"] for metric in metric_names}

# 11. Trainer
# NOTE(review): the recorded output shows a warning pointing at this call --
# possibly about the `tokenizer=` argument; confirm against the installed
# transformers version before changing it.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=split_datasets["train"],
    eval_dataset=split_datasets["test"],
)

# 12. Train
print("Starting training...")
trainer.train()

# 13. Save
# Write the trained model and the tokenizer to the same directory.
output_dir = "../models/amharic-ner-model"
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

# 14. Evaluate
# Run one evaluation pass on the held-out split and report each metric
# with three decimals.
print("\nFinal Evaluation:")
eval_results = trainer.evaluate()
for title, key in (
    ("Precision", "eval_precision"),
    ("Recall", "eval_recall"),
    ("F1 Score", "eval_f1"),
    ("Accuracy", "eval_accuracy"),
):
    print(f"{title}: {eval_results[key]:.3f}")


Map: 100%|██████████| 50/50 [00:00<00:00, 834.19 examples/s]
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Starting training...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,2.057569,0.0,0.0,0.0,0.12766
2,No log,1.669259,0.016949,0.111111,0.029412,0.322695


  _warn_prf(average, modifier, msg_start, len(result))


SafetensorError: Error while serializing: IoError(Os { code: 112, kind: StorageFull, message: "There is not enough space on the disk." })