In [24]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
!pip install evaluate seqeval
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from evaluate import load  # use `evaluate` instead of deprecated `datasets.load_metric`
import numpy as np

from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Step 2: Read and parse CoNLL-format file
def read_conll_file(file_path):
    sentences, labels = [], []
    sentence, label_seq = [], []

    with open(file_path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                if sentence:
                    sentences.append(sentence)
                    labels.append(label_seq)
                    sentence, label_seq = [], []
            else:
                token, tag = line.split()
                sentence.append(token)
                label_seq.append(tag)

    if sentence:
        sentences.append(sentence)
        labels.append(label_seq)

    return sentences, labels

tokens, ner_tags = read_conll_file("drive/MyDrive/amharic-ner/ner_auto_labels.conll")

# Step 3: Create label mappings
unique_tags = sorted(set(tag for seq in ner_tags for tag in seq))
label2id = {label: i for i, label in enumerate(unique_tags)}
id2label = {i: label for label, i in label2id.items()}

# Step 4: Convert labels to ID format
ner_ids = [[label2id[tag] for tag in seq] for seq in ner_tags]

# Step 5: Split into train and test
train_tokens, test_tokens, train_labels, test_labels = train_test_split(
    tokens, ner_ids, test_size=0.2, random_state=42
)

# Step 6: Create Hugging Face datasets
dataset_dict = DatasetDict({
    "train": Dataset.from_dict({"tokens": train_tokens, "ner_tags": train_labels}),
    "test": Dataset.from_dict({"tokens": test_tokens, "ner_tags": test_labels})
})

# ✅ Done
print("✅ Dataset loaded and split successfully!")
print(dataset_dict)
print("Label2ID mapping:", label2id)

✅ Dataset loaded and split successfully!
DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 79
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 20
    })
})
Label2ID mapping: {'B-LOC': 0, 'B-PRICE': 1, 'B-PRODUCT': 2, 'I-LOC': 3, 'I-PRICE': 4, 'I-PRODUCT': 5, 'O': 6}


In [26]:

# Use tokenizer and model (can be replaced with "bert-tiny-amharic" or "afroxlmr" as needed)
model_name = "Davlan/afro-xlmr-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and align labels
def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
    labels = []

    word_ids = tokenized_inputs.word_ids()
    prev_word_id = None
    for word_id in word_ids:
        if word_id is None:
            labels.append(-100)
        elif word_id != prev_word_id:
            labels.append(example["ner_tags"][word_id])
        else:
            labels.append(example["ner_tags"][word_id])
        prev_word_id = word_id

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply preprocessing to both train and test
tokenized_dataset = dataset_dict.map(tokenize_and_align_labels, batched=False)


tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Map:   0%|          | 0/79 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [27]:
# Load model with label mappings
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="drive/MyDrive/models/afroxlmr",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=8,
    weight_decay=0.01,
)

# Load metric
metric = load("seqeval")
label_list = list(label2id.keys())

config.json:   0%|          | 0.00/714 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:

# Compute evaluation metrics
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForTokenClassification(tokenizer),
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [29]:
# Train the model
trainer.train()

Step,Training Loss


TrainOutput(global_step=80, training_loss=0.29173526763916013, metrics={'train_runtime': 131.4441, 'train_samples_per_second': 4.808, 'train_steps_per_second': 0.609, 'total_flos': 234132452907144.0, 'train_loss': 0.29173526763916013, 'epoch': 8.0})

In [30]:
metrics = trainer.evaluate(
    eval_dataset=tokenized_dataset["test"],
    metric_key_prefix="test"
)
print(metrics)

{'test_loss': 0.13441325724124908, 'test_precision': 0.47692307692307695, 'test_recall': 0.5636363636363636, 'test_f1': 0.5166666666666667, 'test_accuracy': 0.959584926269798, 'test_runtime': 0.6706, 'test_samples_per_second': 29.826, 'test_steps_per_second': 4.474, 'epoch': 8.0}


In [31]:
# Save final model
model.save_pretrained("drive/MyDrive/models/afroxlmr/final")
tokenizer.save_pretrained("drive/MyDrive/models/afroxlmr/final")

('drive/MyDrive/models/afroxlmr/final/tokenizer_config.json',
 'drive/MyDrive/models/afroxlmr/final/special_tokens_map.json',
 'drive/MyDrive/models/afroxlmr/final/sentencepiece.bpe.model',
 'drive/MyDrive/models/afroxlmr/final/added_tokens.json',
 'drive/MyDrive/models/afroxlmr/final/tokenizer.json')