In [None]:

# Mount Google Drive at /content/drive so files stored there can be read and
# written through the normal filesystem API.
from google.colab import drive
drive.mount('/content/drive')

# Install necessary libraries
!pip install transformers datasets accelerate seqeval -q
!pip install torch # Ensure torch is installed if not already
# NOTE(review): seqeval is already listed in the first pip command of this
# cell, so this second install looks redundant.
!pip install seqeval -q
import os
import pandas as pd
from datasets import load_dataset, Dataset, Features, Value, ClassLabel
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from seqeval.metrics import classification_report
import numpy as np
import torch

In [None]:
import os
import pandas as pd
from datasets import load_dataset, Dataset, Features, Value, Sequence, ClassLabel
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, pipeline
from seqeval.metrics import classification_report
import numpy as np
import torch
from google.colab import drive
import time

# Mount Google Drive at /content/drive; force_remount=True asks for a fresh
# mount even when the drive was already mounted earlier in the session.
drive.mount('/content/drive', force_remount=True)

conll_file_path = '/content/drive/MyDrive/labeled_amharic_ner_data.conll'

# Stop immediately when the labeled data is missing. Printing a message and
# continuing only delays the failure until the file is opened later, where
# the traceback no longer explains what the user has to fix.
if not os.path.exists(conll_file_path):
    raise FileNotFoundError(
        f"Error: CoNLL file not found at {conll_file_path}. "
        "Please ensure 'labeled_amharic_ner_data.conll' is in your Google Drive's root and the path is correct."
    )
print(f"CoNLL file found at {conll_file_path}")

# Define labels
# The position of each tag in label_list is its integer class id; the two
# dictionaries below are the forward and reverse views of that mapping.
label_list = ['O', 'B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE']
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}

# Function to read CoNLL file
def read_conll_file(file_path):
    """Parse a tab-separated CoNLL file into parallel token/tag lists.

    Each non-blank line is expected to hold exactly two tab-separated
    columns: the token and its tag. Blank lines end the current sentence.
    Non-blank lines that do not split into exactly two columns are ignored.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        dict: ``{'tokens': [...], 'ner_tags': [...]}`` where both values are
        lists with one inner list per sentence, in file order.
    """
    sentences = []
    tag_sequences = []
    pending_tokens = []
    pending_tags = []

    def _flush_sentence():
        # Store the sentence collected so far (if any) and start a new one.
        nonlocal pending_tokens, pending_tags
        if pending_tokens:
            sentences.append(pending_tokens)
            tag_sequences.append(pending_tags)
        pending_tokens = []
        pending_tags = []

    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped:
                # Blank line: sentence boundary.
                _flush_sentence()
                continue
            columns = stripped.split('\t')
            if len(columns) != 2:
                # Not a token/tag pair; skip the line.
                continue
            token, tag = columns
            pending_tokens.append(token)
            pending_tags.append(tag.strip())

    # The file may end without a trailing blank line.
    _flush_sentence()
    return {'tokens': sentences, 'ner_tags': tag_sequences}

# Load custom dataset
raw_data_for_hf = read_conll_file(conll_file_path)

# The reader skips lines that are not tab-separated token/tag pairs, so a file
# in another layout parses to zero sentences. Report that here instead of
# letting it surface as an IndexError when the first example is printed.
if not raw_data_for_hf['tokens']:
    raise ValueError(
        f"No labeled sentences were parsed from {conll_file_path}. "
        "Expected one tab-separated token/tag pair per line, with blank lines between sentences."
    )

# Define features for the dataset
# ner_tags is declared as a sequence of ClassLabel built from label_list, so
# the string tags read from the file are stored as integer class ids.
features_for_dataset = Features({
    'tokens': Sequence(Value('string')),
    'ner_tags': Sequence(ClassLabel(names=label_list))
})

# Create Hugging Face Dataset
full_dataset = Dataset.from_dict(raw_data_for_hf, features=features_for_dataset)

print(f"Dataset loaded. Number of examples: {len(full_dataset)}")
print("First example:", full_dataset[0])

# Train-test split
# A fixed seed keeps the held-out 20% identical across reruns, so metrics
# from different sessions are measured on the same examples.
train_test_split = full_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

print(f"Train dataset size: {len(train_dataset)}")
print(f"Eval dataset size: {len(eval_dataset)}")

from transformers import DataCollatorForTokenClassification

def compute_metrics(p):
    """Turn raw predictions into precision, recall, F1 and accuracy.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-label
            scores and is reduced with ``argmax`` over axis 2; ``labels``
            holds label ids, with ``-100`` marking positions to skip.

    Returns:
        dict: ``precision``, ``recall`` and ``f1`` taken from the
        ``'micro avg'`` row of seqeval's ``classification_report``, plus
        ``accuracy``, the fraction of non-skipped positions whose predicted
        label id equals the gold label id (0.0 when nothing is scored).
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = []
    true_predictions = []
    correct = 0
    total = 0
    for prediction, label in zip(predictions, labels):
        # Keep only positions that carry a real label.
        kept_pairs = [
            (pred_id, gold_id)
            for pred_id, gold_id in zip(prediction, label)
            if gold_id != -100
        ]
        true_labels.append([id2label[gold_id] for _, gold_id in kept_pairs])
        true_predictions.append([id2label[pred_id] for pred_id, _ in kept_pairs])
        correct += sum(1 for pred_id, gold_id in kept_pairs if pred_id == gold_id)
        total += len(kept_pairs)

    report = classification_report(true_labels, true_predictions, output_dict=True, zero_division=0)
    overall_metrics = report.get('micro avg', {})
    # Accept either spelling of the F1 key in the report row.
    f1_score = overall_metrics.get('f1', overall_metrics.get('f1-score', 0.0))
    return {
        "precision": overall_metrics.get('precision', 0.0),
        "recall": overall_metrics.get('recall', 0.0),
        "f1": f1_score,
        # Previously this repeated the precision value; report real accuracy.
        "accuracy": correct / total if total else 0.0
    }

# Remove artifacts of earlier runs before starting the comparison: per-model
# result/log directories in the working directory, the Hugging Face cache in
# the home directory, and everything under /tmp.
# NOTE(review): `rm -rf /tmp/*` deletes every entry in /tmp, not only files
# this notebook created — confirm nothing else in the runtime relies on them.
# NOTE(review): deleting ~/.cache/huggingface/ presumably forces the
# checkpoints below to be downloaded again — confirm that is intended.
!rm -rf ./results_*
!rm -rf ./logs_*
!rm -rf ~/.cache/huggingface/
!rm -rf /tmp/*

# Display name -> checkpoint identifier for every model to fine-tune. The
# display name is also used to build directory names later in the notebook.
model_checkpoints_to_compare = {
    "XLM-R_Amharic_NER": "mbeukman/xlm-roberta-base-finetuned-ner-amharic",
    "mBERT": "bert-base-multilingual-cased",
    "DistilBERT_Multi": "distilbert-base-multilingual-cased",
}

# Per-model summary keyed by display name; one entry is added per iteration.
results = {}

# Fine-tune every configured checkpoint in turn on the same train/eval split.
for model_name, checkpoint in model_checkpoints_to_compare.items():
    print(f"\n--- Fine-tuning {model_name} ({checkpoint}) ---")
    # Empty /tmp again before loading the next model.
    !rm -rf /tmp/*

    # Load the checkpoint's tokenizer and a token-classification model that
    # uses this notebook's label set.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForTokenClassification.from_pretrained(
        checkpoint,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id,
        # Presumably needed because a checkpoint's existing classification
        # head can have a different number of labels — TODO confirm.
        ignore_mismatched_sizes=True
    )
    def tokenize_and_align_labels_for_current_model(examples):
        """Tokenize a batch of pre-split sentences and align tags to tokens.

        Args:
            examples: Batch with ``"tokens"`` (one word list per sentence)
                and ``"ner_tags"`` (one label-id list per sentence).

        Returns:
            The tokenizer output with an added ``"labels"`` entry holding,
            per sentence, one label id per produced token: ``-100`` where
            the token maps to no word, the word's own label on the first
            token of a word, and on later tokens of the same word the
            ``I-`` counterpart of a ``B-`` label (other labels unchanged).
        """
        encoded = tokenizer(
            examples["tokens"], truncation=True, is_split_into_words=True
        )

        def _continuation_label(word_label_id):
            # Later pieces of a word that starts an entity continue it.
            name = id2label[word_label_id]
            if not name.startswith("B-"):
                return word_label_id
            return label2id.get(f"I-{name[2:]}", word_label_id)

        aligned_labels = []
        for batch_idx, word_labels in enumerate(examples["ner_tags"]):
            row = []
            last_word = None
            for word in encoded.word_ids(batch_index=batch_idx):
                if word is None:
                    row.append(-100)
                elif word == last_word:
                    row.append(_continuation_label(word_labels[word]))
                else:
                    row.append(word_labels[word])
                last_word = word
            aligned_labels.append(row)

        encoded["labels"] = aligned_labels
        return encoded

    current_tokenized_train_dataset = train_dataset.map(tokenize_and_align_labels_for_current_model, batched=True)
    current_tokenized_eval_dataset = eval_dataset.map(tokenize_and_align_labels_for_current_model, batched=True)

    current_data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

    current_output_dir = f"./results_{model_name}"
    current_logging_dir = f"./logs_{model_name}"
    if os.path.exists(current_output_dir):
        !rm -rf {current_output_dir}
    if os.path.exists(current_logging_dir):
        !rm -rf {current_logging_dir}
    os.makedirs(current_output_dir, exist_ok=True)
    os.makedirs(current_logging_dir, exist_ok=True)

    training_args_current = TrainingArguments(
        output_dir=current_output_dir,
        eval_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=7,
        weight_decay=0.01,
        logging_dir=current_logging_dir,
        logging_steps=10,
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        report_to="none"
    )

    trainer_current = Trainer(
        model=model,
        args=training_args_current,
        train_dataset=current_tokenized_train_dataset,
        eval_dataset=current_tokenized_eval_dataset,
        tokenizer=tokenizer,
        data_collator=current_data_collator,
        compute_metrics=compute_metrics
    )

    train_start_time = time.time()
    trainer_current.train()
    train_end_time = time.time()
    training_time = train_end_time - train_start_time

    eval_metrics = trainer_current.evaluate()

    inference_start_time = time.time()
    sample_text = "አዲስ አበባ ላይ ቴሌቪዥን በ1000 ብር ይሸጣል"
    ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
    _ = ner_pipeline(sample_text)
    inference_end_time = time.time()
    inference_time = inference_end_time - inference_start_time

    results[model_name] = {
        "checkpoint": checkpoint,
        "eval_metrics": eval_metrics,
        "training_time_seconds": training_time,
        "inference_time_per_sample_seconds": inference_time / len(sample_text.split())
    }

    output_model_dir_specific = f"/content/drive/MyDrive/fine_tuned_amharic_ner_model_{model_name}"
    os.makedirs(output_model_dir_specific, exist_ok=True)

    # Save only the best model (which the trainer already loaded at the end)
    trainer_current.save_model(output_model_dir_specific)
    tokenizer.save_pretrained(output_model_dir_specific)
    print(f"Model {model_name} saved to: {output_model_dir_specific}")


# Report, for every fine-tuned model, its checkpoint, each evaluation metric
# (four decimals), the training time and the measured inference time.
print("\n--- Model Comparison Results ---")
for model_name, data in results.items():
    # Header lines for this model, emitted one per line.
    print(
        f"\nModel: {model_name}",
        f"  Checkpoint: {data['checkpoint']}",
        "  Evaluation Metrics (on validation set):",
        sep="\n",
    )
    for metric, value in data['eval_metrics'].items():
        print(f"    {metric}: {value:.4f}")
    # Timing lines for this model.
    print(
        f"  Training Time: {data['training_time_seconds']:.2f} seconds",
        f"  Approx. Inference Time per Sample: {data['inference_time_per_sample_seconds']:.4f} seconds (for a rough sentence length)",
        sep="\n",
    )