In [3]:


# model_comparison.py
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification
from datasets import Dataset
import numpy as np
import evaluate
import pandas as pd
from pathlib import Path

def load_and_prepare_data(conll_path):
    """Load and prepare CONLL format data"""
    def parse_conll(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        
        tokens, labels = [], []
        current_tokens, current_labels = [], []
        
        for line in lines:
            line = line.strip()
            if not line:  # Sentence boundary
                if current_tokens:
                    tokens.append(current_tokens)
                    labels.append(current_labels)
                    current_tokens, current_labels = [], []
                continue
            
            parts = line.split('\t')
            if len(parts) == 2:
                current_tokens.append(parts[0])
                current_labels.append(parts[1])
        
        if current_tokens:
            tokens.append(current_tokens)
            labels.append(current_labels)
        
        return {'tokens': tokens, 'ner_tags': labels}

    # Load and parse data
    conll_data = parse_conll(conll_path)
    dataset = Dataset.from_dict({
        'tokens': conll_data['tokens'],
        'ner_tags': conll_data['ner_tags']
    })

    # Define label mappings
    label_list = ["O", "B-PRODUCT", "I-PRODUCT", "B-PRICE", "I-PRICE", "B-LOC", "I-LOC"]
    label2id = {l: i for i, l in enumerate(label_list)}
    id2label = {i: l for i, l in enumerate(label_list)}

    return dataset, label_list, label2id, id2label

def tokenize_and_align(dataset, tokenizer, label2id):
    """Tokenize and align labels with tokens"""
    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples["tokens"],
            truncation=True,
            is_split_into_words=True,
            padding='max_length',
            max_length=128
        )
        
        labels = []
        for i, label in enumerate(examples["ner_tags"]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            
            for word_idx in word_ids:
                if word_idx is None:
                    label_ids.append(-100)
                elif word_idx != previous_word_idx:
                    label_ids.append(label2id[label[word_idx]])
                else:
                    label_ids.append(-100)
                previous_word_idx = word_idx
            
            labels.append(label_ids)
        
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    return dataset.map(tokenize_and_align_labels, batched=True)

def get_compute_metrics(label_list):
    """Return metrics computation function"""
    seqeval = evaluate.load("seqeval")
    
    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        true_predictions = [
            [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        true_labels = [
            [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]

        results = seqeval.compute(predictions=true_predictions, references=true_labels)
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }
    
    return compute_metrics

def compare_models(conll_path, models_to_compare):
    """Compare multiple NER models"""
    # Load and prepare data
    dataset, label_list, label2id, id2label = load_and_prepare_data(conll_path)
    
    # Tokenize dataset
    tokenized_datasets = tokenize_and_align(dataset, AutoTokenizer.from_pretrained("xlm-roberta-base"), label2id)
    split_datasets = tokenized_datasets.train_test_split(test_size=0.2)
    
    # Training arguments
    training_args = TrainingArguments(
        output_dir="../comparison_results",
        eval_strategy="epoch",
        per_device_eval_batch_size=16,
        fp16=torch.cuda.is_available(),
        report_to=None
    )
    
    # Data collator
    data_collator = DataCollatorForTokenClassification(AutoTokenizer.from_pretrained("xlm-roberta-base"))
    
    # Compute metrics
    compute_metrics = get_compute_metrics(label_list)
    
    # Compare models
    results = {}
    
    for model_name in models_to_compare:
        print(f"\n=== Evaluating {model_name} ===")
        
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForTokenClassification.from_pretrained(
                model_name,
                num_labels=len(label_list),
                id2label=id2label,
                label2id=label2id
            )
            
            trainer = Trainer(
                model=model,
                args=training_args,
                eval_dataset=split_datasets["test"],
                tokenizer=tokenizer,
                data_collator=data_collator,
                compute_metrics=compute_metrics,
            )
            
            metrics = trainer.evaluate()
            results[model_name] = {
                'f1': metrics['eval_f1'],
                'precision': metrics['eval_precision'],
                'recall': metrics['eval_recall'],
                'accuracy': metrics['eval_accuracy'],
                'speed': metrics['eval_runtime']
            }
            
        except Exception as e:
            print(f"Error evaluating {model_name}: {str(e)}")
            results[model_name] = {
                'f1': None,
                'precision': None,
                'recall': None,
                'accuracy': None,
                'speed': None,
                'error': str(e)
            }
    
    return pd.DataFrame(results).T.sort_values('f1', ascending=False)

if __name__ == "__main__":
    # Configuration
    CONLL_PATH = "../CoNLL/amharic_ner.conll"  # Update with your path
    MODELS_TO_COMPARE = [
        "xlm-roberta-base",
        "bert-base-multilingual-cased",
        "Davlan/bert-base-multilingual-cased-ner-hrl",
        "afro-xlmr-base"
    ]
    
    # Run comparison
    results_df = compare_models(CONLL_PATH, MODELS_TO_COMPARE)
    
    # Save and display results
    print("\nModel Comparison Results:")
    print(results_df)
    
    results_df.to_csv("../comparison_result/model_comparison_results.csv")
    print("\nResults saved to model_comparison_results.csv")



Map: 100%|██████████| 50/50 [00:00<00:00, 3235.05 examples/s]



=== Evaluating xlm-roberta-base ===


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


  _warn_prf(average, modifier, msg_start, len(result))



=== Evaluating bert-base-multilingual-cased ===


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Error evaluating bert-base-multilingual-cased: [Errno 28] No space left on device

=== Evaluating Davlan/bert-base-multilingual-cased-ner-hrl ===


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Error evaluating Davlan/bert-base-multilingual-cased-ner-hrl: [Errno 28] No space left on device

=== Evaluating afro-xlmr-base ===
Error evaluating afro-xlmr-base: afro-xlmr-base is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

Model Comparison Results:
                                                   f1 precision    recall  \
xlm-roberta-base                             0.005587  0.002976  0.045455   
bert-base-multilingual-cased                     None      None      None   
Davlan/bert-base-multilingual-cased-ner-hrl      None      None      None   
afro-xlmr-base                                   None      None      None   

                                            accuracy  speed  \
xlm-roberta-base                             0.01462  2.192   
bert-b