In [1]:

from google.colab import drive
drive.mount('/content/drive')

# Install necessary libraries
!pip install transformers datasets accelerate seqeval -q
!pip install torch # Ensure torch is installed if not already
!pip install seqeval -q
import os
import pandas as pd
from datasets import load_dataset, Dataset, Features, Value, ClassLabel
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from seqeval.metrics import classification_report
import numpy as np
import torch

Mounted at /content/drive
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m52.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[

In [2]:
import os
import pandas as pd
from datasets import load_dataset, Dataset, Features, Value, Sequence, ClassLabel
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, pipeline
from seqeval.metrics import classification_report
import numpy as np
import torch
from google.colab import drive
import time

drive.mount('/content/drive', force_remount=True)

conll_file_path = '/content/drive/MyDrive/labeled_amharic_ner_data.conll'

# Check if the file exists
if not os.path.exists(conll_file_path):
    print(f"Error: CoNLL file not found at {conll_file_path}")
    print("Please ensure 'labeled_amharic_ner_data.conll' is in your Google Drive's root and the path is correct.")
else:
    print(f"CoNLL file found at {conll_file_path}")

# Definelabels
label_list = ['O', 'B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE']
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}

# Function to read CoNLL file
def read_conll_file(file_path):
    tokens = []
    ner_tags = []
    current_tokens = []
    current_tags = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                parts = line.split('\t')
                if len(parts) == 2:
                    current_tokens.append(parts[0])
                    current_tags.append(parts[1].strip())
            else:
                if current_tokens:
                    tokens.append(current_tokens)
                    ner_tags.append(current_tags)
                current_tokens = []
                current_tags = []
        if current_tokens:
            tokens.append(current_tokens)
            ner_tags.append(current_tags)
    return {'tokens': tokens, 'ner_tags': ner_tags}

# Load custom dataset
raw_data_for_hf = read_conll_file(conll_file_path)

# Define features for the dataset
features_for_dataset = Features({
    'tokens': Sequence(Value('string')),
    'ner_tags': Sequence(ClassLabel(names=label_list))
})

# Create Hugging Face Dataset
full_dataset = Dataset.from_dict(raw_data_for_hf, features=features_for_dataset)

print(f"Dataset loaded. Number of examples: {len(full_dataset)}")
print("First example:", full_dataset[0])

# Train-test split
train_test_split = full_dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

print(f"Train dataset size: {len(train_dataset)}")
print(f"Eval dataset size: {len(eval_dataset)}")

from transformers import DataCollatorForTokenClassification

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    report = classification_report(true_labels, true_predictions, output_dict=True, zero_division=0)
    overall_metrics = report.get('micro avg', {})
    f1_score = overall_metrics.get('f1', overall_metrics.get('f1-score', 0.0))
    return {
        "precision": overall_metrics.get('precision', 0.0),
        "recall": overall_metrics.get('recall', 0.0),
        "f1": f1_score,
        "accuracy": overall_metrics.get('precision', 0.0)
    }

!rm -rf ./results_*
!rm -rf ./logs_*
!rm -rf ~/.cache/huggingface/
!rm -rf /tmp/*

model_checkpoints_to_compare = {
    "XLM-R_Amharic_NER": "mbeukman/xlm-roberta-base-finetuned-ner-amharic",
    "mBERT": "bert-base-multilingual-cased",
    "DistilBERT_Multi": "distilbert-base-multilingual-cased",
}

results = {}

for model_name, checkpoint in model_checkpoints_to_compare.items():
    print(f"\n--- Fine-tuning {model_name} ({checkpoint}) ---")
    !rm -rf /tmp/*

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForTokenClassification.from_pretrained(
        checkpoint,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id,
        ignore_mismatched_sizes=True
    )
    def tokenize_and_align_labels_for_current_model(examples):
        tokenized_inputs = tokenizer(
            examples["tokens"], truncation=True, is_split_into_words=True
        )
        labels = []
        for i, label_ids_for_example in enumerate(examples["ner_tags"]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            current_label_ids = []
            for word_idx in word_ids:
                if word_idx is None:
                    current_label_ids.append(-100)
                elif word_idx != previous_word_idx:
                    current_label_ids.append(label_ids_for_example[word_idx])
                else:
                    original_word_label_id = label_ids_for_example[word_idx]
                    original_word_label_name = id2label[original_word_label_id]
                    if original_word_label_name.startswith("B-"):
                        new_label_name = f"I-{original_word_label_name[2:]}"
                        current_label_ids.append(label2id.get(new_label_name, original_word_label_id))
                    else:
                        current_label_ids.append(original_word_label_id)
                previous_word_idx = word_idx
            labels.append(current_label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    current_tokenized_train_dataset = train_dataset.map(tokenize_and_align_labels_for_current_model, batched=True)
    current_tokenized_eval_dataset = eval_dataset.map(tokenize_and_align_labels_for_current_model, batched=True)

    current_data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

    current_output_dir = f"./results_{model_name}"
    current_logging_dir = f"./logs_{model_name}"
    if os.path.exists(current_output_dir):
        !rm -rf {current_output_dir}
    if os.path.exists(current_logging_dir):
        !rm -rf {current_logging_dir}
    os.makedirs(current_output_dir, exist_ok=True)
    os.makedirs(current_logging_dir, exist_ok=True)

    training_args_current = TrainingArguments(
        output_dir=current_output_dir,
        eval_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=7,
        weight_decay=0.01,
        logging_dir=current_logging_dir,
        logging_steps=10,
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        report_to="none"
    )

    trainer_current = Trainer(
        model=model,
        args=training_args_current,
        train_dataset=current_tokenized_train_dataset,
        eval_dataset=current_tokenized_eval_dataset,
        tokenizer=tokenizer,
        data_collator=current_data_collator,
        compute_metrics=compute_metrics
    )

    train_start_time = time.time()
    trainer_current.train()
    train_end_time = time.time()
    training_time = train_end_time - train_start_time

    eval_metrics = trainer_current.evaluate()

    inference_start_time = time.time()
    sample_text = "አዲስ አበባ ላይ ቴሌቪዥን በ1000 ብር ይሸጣል"
    ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
    _ = ner_pipeline(sample_text)
    inference_end_time = time.time()
    inference_time = inference_end_time - inference_start_time

    results[model_name] = {
        "checkpoint": checkpoint,
        "eval_metrics": eval_metrics,
        "training_time_seconds": training_time,
        "inference_time_per_sample_seconds": inference_time / len(sample_text.split())
    }

    output_model_dir_specific = f"/content/drive/MyDrive/fine_tuned_amharic_ner_model_{model_name}"
    os.makedirs(output_model_dir_specific, exist_ok=True)

    # Save only the best model (which the trainer already loaded at the end)
    trainer_current.save_model(output_model_dir_specific)
    tokenizer.save_pretrained(output_model_dir_specific)
    print(f"Model {model_name} saved to: {output_model_dir_specific}")


# Print all results in a comparison table
print("\n--- Model Comparison Results ---")
for model_name, data in results.items():
    print(f"\nModel: {model_name}")
    print(f"  Checkpoint: {data['checkpoint']}")
    print(f"  Evaluation Metrics (on validation set):")
    for metric, value in data['eval_metrics'].items():
        print(f"    {metric}: {value:.4f}")
    print(f"  Training Time: {data['training_time_seconds']:.2f} seconds")
    print(f"  Approx. Inference Time per Sample: {data['inference_time_per_sample_seconds']:.4f} seconds (for a rough sentence length)")

Mounted at /content/drive
CoNLL file found at /content/drive/MyDrive/labeled_amharic_ner_data.conll
Dataset loaded. Number of examples: 46
First example: {'tokens': ['Electric', 'Charcoal', 'Burner', 'በቀላሉ', 'ከሰል', 'ለማያያዝ', 'የሚሆን', 'አነስ', 'ያለ', 'ቦታ', 'የማይዝ', 'የሚሰራ', 'ሻይ፣', 'ቡና', 'ለማፍላት', 'የሚሆን', 'ዋጋ፦', '1600', 'ብር', 'ውስን', 'ፍሬ', 'ነው', 'ያለው', 'አድራሻ', 'መገናኛ_መሰረት_ደፋር_ሞል_ሁለተኛ_ፎቅ', 'ቢሮ', 'ቁ', 'S05S06', '0902660722', '0928460606', 'በTelegram', 'ለማዘዝ', 'ይጠቀሙ', 'zemencallcenter', 'zemenexpressadmin', 'ለተጨማሪ', 'ማብራሪያ', 'የቴሌግራም', 'ገፃችን', 'httpstelegrammezemenexpress'], 'ner_tags': [1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 6, 0, 0, 0, 0, 3, 4, 4, 4, 4, 4, 4, 4, 0, 0, 4, 4, 0, 0, 4, 4, 4]}
Train dataset size: 36
Eval dataset size: 10
rm: cannot remove '/tmp/colab_runtime.sock': Device or resource busy

--- Fine-tuning XLM-R_Amharic_NER (mbeukman/xlm-roberta-base-finetuned-ner-amharic) ---
rm: cannot remove '/tmp/colab_runtime.sock': Device or resource busy


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at mbeukman/xlm-roberta-base-finetuned-ner-amharic and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([7]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/36 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

  trainer_current = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.066432,0.35,0.466667,0.4,0.35
2,1.133100,0.540458,0.7,0.466667,0.56,0.7
3,1.133100,0.415589,0.7,0.466667,0.56,0.7
4,0.500100,0.329908,0.583333,0.466667,0.518519,0.583333
5,0.500100,0.285082,0.642857,0.6,0.62069,0.642857
6,0.350700,0.254206,0.625,0.666667,0.645161,0.625
7,0.350700,0.238934,0.625,0.666667,0.645161,0.625


Device set to use cuda:0


Model XLM-R_Amharic_NER saved to: /content/drive/MyDrive/fine_tuned_amharic_ner_model_XLM-R_Amharic_NER

--- Fine-tuning mBERT (bert-base-multilingual-cased) ---
rm: cannot remove '/tmp/colab_runtime.sock': Device or resource busy


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/36 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

  trainer_current = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.977332,0.09375,0.2,0.12766,0.09375
2,1.087300,0.55076,0.176471,0.2,0.1875,0.176471
3,1.087300,0.368005,0.4,0.266667,0.32,0.4
4,0.471000,0.304377,0.5,0.4,0.444444,0.5
5,0.471000,0.263074,0.545455,0.4,0.461538,0.545455
6,0.337600,0.242792,0.583333,0.466667,0.518519,0.583333
7,0.337600,0.236026,0.363636,0.266667,0.307692,0.363636


Device set to use cuda:0


Model mBERT saved to: /content/drive/MyDrive/fine_tuned_amharic_ner_model_mBERT

--- Fine-tuning DistilBERT_Multi (distilbert-base-multilingual-cased) ---
rm: cannot remove '/tmp/colab_runtime.sock': Device or resource busy


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/36 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

  trainer_current = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.22084,0.0,0.0,0.0,0.0
2,1.308200,0.955018,0.0625,0.066667,0.064516,0.0625
3,1.308200,0.758722,0.294118,0.333333,0.3125,0.294118
4,0.715000,0.622204,0.210526,0.266667,0.235294,0.210526
5,0.715000,0.537085,0.214286,0.2,0.206897,0.214286
6,0.492100,0.483804,0.416667,0.333333,0.37037,0.416667
7,0.492100,0.465328,0.583333,0.466667,0.518519,0.583333


Device set to use cuda:0


Model DistilBERT_Multi saved to: /content/drive/MyDrive/fine_tuned_amharic_ner_model_DistilBERT_Multi

--- Model Comparison Results ---

Model: XLM-R_Amharic_NER
  Checkpoint: mbeukman/xlm-roberta-base-finetuned-ner-amharic
  Evaluation Metrics (on validation set):
    eval_loss: 0.2542
    eval_precision: 0.6250
    eval_recall: 0.6667
    eval_f1: 0.6452
    eval_accuracy: 0.6250
    eval_runtime: 0.0954
    eval_samples_per_second: 104.8340
    eval_steps_per_second: 20.9670
    epoch: 7.0000
  Training Time: 813.13 seconds
  Approx. Inference Time per Sample: 0.0055 seconds (for a rough sentence length)

Model: mBERT
  Checkpoint: bert-base-multilingual-cased
  Evaluation Metrics (on validation set):
    eval_loss: 0.2428
    eval_precision: 0.5833
    eval_recall: 0.4667
    eval_f1: 0.5185
    eval_accuracy: 0.5833
    eval_runtime: 0.0776
    eval_samples_per_second: 128.8200
    eval_steps_per_second: 25.7640
    epoch: 7.0000
  Training Time: 731.85 seconds
  Approx. Inference