In [16]:
#!pip install transformers datasets seqeval -q

!pip install seqeval



In [17]:
import os
import pandas as pd
from datasets import Dataset, DatasetDict, load_metric
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
import torch
import shutil
from google.colab import files

In [18]:
# Ask for a file upload through google.colab's `files.upload()` and keep the
# returned value in `uploaded`.
# NOTE(review): presumably this is where ner_train.conll is provided — confirm.
uploaded = files.upload()

#df = load_conll_file("/content/ner_train.conll")

KeyboardInterrupt: 

# Load and parse the CoNLL file

In [None]:
def load_conll_file(path):
    """Read a two-column CoNLL-style file into a sentence-level DataFrame.

    Every non-blank line must contain exactly two whitespace-separated
    fields, ``token label``. Blank lines mark sentence boundaries.

    Args:
        path: Path of the UTF-8 encoded file to read.

    Returns:
        pandas.DataFrame with the columns ``tokens`` and ``ner_tags``; each
        row holds the token list and the label list of one sentence.

    Raises:
        ValueError: If a non-blank line does not have exactly two fields.
    """
    all_tokens, all_labels = [], []
    tokens, labels = [], []

    with open(path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # Blank line: close the current sentence (if any).
                if tokens:
                    all_tokens.append(tokens)
                    all_labels.append(labels)
                    tokens, labels = [], []
                continue
            if len(fields) != 2:
                raise ValueError(
                    f"{path}, line {line_number}: expected 'token label', got {line.strip()!r}"
                )
            token, label = fields
            tokens.append(token)
            labels.append(label)

    # A file that does not end with a blank line still has one open sentence;
    # without this flush the last sentence would be silently dropped.
    if tokens:
        all_tokens.append(tokens)
        all_labels.append(labels)

    return pd.DataFrame({'tokens': all_tokens, 'ner_tags': all_labels})

# Parse the training file into a DataFrame (one row per sentence, with the
# columns `tokens` and `ner_tags`) and preview the first rows.
df = load_conll_file("ner_train.conll")
df.head()


# Prepare the dataset

In [None]:
# Shuffle the sentences reproducibly, then keep the first 80% for training
# and the remaining 20% for validation.
split = df.sample(frac=1, random_state=42)
split_point = int(0.8 * len(split))
train_df = split[:split_point]
val_df = split[split_point:]

# preserve_index=False stops the shuffled pandas index from being stored as an
# extra `__index_level_0__` column in the resulting datasets.
train_ds = Dataset.from_pandas(train_df, preserve_index=False)
val_ds = Dataset.from_pandas(val_df, preserve_index=False)

dataset = DatasetDict({"train": train_ds, "validation": val_ds})
# Show the split sizes and columns instead of repeating the raw-frame preview.
dataset


# Label mapping and models

In [None]:
# Tag inventory; the position of a tag in this list is its integer class id.
label_list = ['O', 'B-Product', 'I-Product', 'B-PRICE', 'I-PRICE', 'B-LOC', 'I-LOC', 'CONTACT_INFO']
id_to_label = dict(enumerate(label_list))
label_to_id = {tag: idx for idx, tag in id_to_label.items()}

# ---- Models to Compare ----
# Maps each Hugging Face checkpoint name to the local output directory used
# for its fine-tuned copy.
# NOTE(review): the third directory is called "bert-tiny-..." although its
# checkpoint name is a bert-base multilingual NER model — confirm the name.
models = dict([
    ("xlm-roberta-base", "xlm-roberta-amharic-ner"),
    ("bert-base-multilingual-cased", "mbert-amharic-ner"),
    ("Davlan/bert-base-multilingual-cased-ner-hrl", "bert-tiny-amharic-ner"),
])

# Accumulates one summary row per fine-tuned model.
results = list()


In [None]:
import transformers
# Print the installed transformers version so the run can be reproduced.
print(transformers.__version__)

# Fine-tune the NER models and collect evaluation results

In [None]:
# Reset the accumulator so that re-running this cell does not append a second
# set of rows for the same models.
results = []

def compute_metrics(p):
    """Score predictions with the notebook-level `metric` object.

    The predicted class id of each position is the argmax over the last axis
    of ``p.predictions``. Positions whose gold id in ``p.label_ids`` is -100
    are dropped, as are positions whose gold tag is ``CONTACT_INFO``. The
    remaining ids are converted to tag strings through `id_to_label`; an
    unknown predicted id falls back to ``"O"``, an unknown gold id to ``""``.

    Relies on the module-level names `metric` and `id_to_label`.

    Args:
        p: Object exposing ``predictions`` and ``label_ids``.
           # assumes the shape Trainer passes to compute_metrics — TODO confirm

    Returns:
        Whatever ``metric.compute`` returns for the filtered tag sequences.
    """
    predicted_ids = p.predictions.argmax(-1)
    gold_ids = p.label_ids

    true_preds = []
    true_labels = []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep (predicted tag, gold tag) pairs only for positions that count.
        kept_pairs = [
            (id_to_label.get(pred_id, "O"), id_to_label.get(gold_id, ""))
            for pred_id, gold_id in zip(pred_row, gold_row)
            if gold_id != -100 and id_to_label.get(gold_id, "") != "CONTACT_INFO"
        ]
        true_preds.append([pred_tag for pred_tag, _ in kept_pairs])
        true_labels.append([gold_tag for _, gold_tag in kept_pairs])

    return metric.compute(predictions=true_preds, references=true_labels, zero_division=0)

# Neither the W&B opt-out nor the seqeval metric depends on the model, so set
# them up once, before any TrainingArguments / Trainer object is created.
os.environ["WANDB_DISABLED"] = "true"
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package — confirm the installed version.
metric = load_metric("seqeval")

for model_name, save_dir in models.items():

    print(f"\n Fine-tuning {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize_and_align_labels(example):
        """Tokenize one pre-split sentence and align its tags to sub-word pieces.

        Only the first piece of every word receives the word's label id.
        Special/padding positions and the continuation pieces of a word get
        -100 so the loss and `compute_metrics` skip them. Labelling every
        piece would repeat `B-` tags inside one word and distort the
        entity-level seqeval scores.

        Args:
            example: One dataset row with the keys "tokens" and "ner_tags".

        Returns:
            The tokenizer output with an added "labels" entry.
        """
        tokenized = tokenizer(example["tokens"], is_split_into_words=True, padding='max_length', truncation=True, max_length=128)
        labels = []
        previous_word_id = None
        for word_id in tokenized.word_ids():
            if word_id is None or word_id == previous_word_id:
                labels.append(-100)
            else:
                labels.append(label_to_id[example["ner_tags"][word_id]])
            previous_word_id = word_id
        tokenized["labels"] = labels
        return tokenized

    tokenized_dataset = dataset.map(tokenize_and_align_labels)

    # ignore_mismatched_sizes lets a checkpoint that already carries a
    # classification head of a different size be loaded with a fresh head.
    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(label_list),
        ignore_mismatched_sizes=True)

    args = TrainingArguments(
        output_dir=save_dir,
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        save_total_limit=2,
        logging_dir=f"{save_dir}/logs",
        eval_strategy="epoch",
        report_to=[]
    )

    data_collator = DataCollatorForTokenClassification(tokenizer)

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    trainer.train()

    eval_result = trainer.evaluate()
    f1_score = eval_result["eval_overall_f1"]

    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)

    # Measure the directory only after the final model and tokenizer have been
    # written; measuring earlier would leave the saved model out of the total.
    # The walk is recursive, so anything else stored under save_dir is counted.
    size_mb = sum(
        os.path.getsize(os.path.join(root, filename))
        for root, _, filenames in os.walk(save_dir)
        for filename in filenames
    ) / 1e6

    #shutil.make_archive(save_dir, 'zip', save_dir)

    results.append({
        "Model": model_name,
        "F1-Score": round(f1_score * 100, 2),
        "Size (MB)": round(size_mb, 2),
        "Dir": f"{save_dir}.zip"
    })

# AFTER the loop: create a zip archive of each saved model directory.
for model_name, save_dir in models.items():
    print(f"Creating zip archive for {save_dir}")
    shutil.make_archive(save_dir, 'zip', save_dir)


# Download all zipped models

In [None]:
# Sanity check: show the tag inventory whose length was used as `num_labels`.
print(label_list)



In [None]:
# Build the archive names first, then hand each one to the Colab download helper.
archive_paths = [f"{save_dir}.zip" for save_dir in models.values()]
for zip_path in archive_paths:
    print(f"Downloading {zip_path}...")
    files.download(zip_path)

In [None]:
# Download the archive recorded in each result row.
# NOTE(review): each row's "Dir" is f"{save_dir}.zip", i.e. the same paths the
# previous cell already downloads — running both fetches every archive twice.
for res in results:
    files.download(res["Dir"])

# Show the comparison table

In [None]:
# Plain-text view of the per-model summary rows collected in `results`.
print(pd.DataFrame(results))

In [None]:
import pandas as pd

# Rank the models by F1 and keep the sorted frame: the bare
# `pd.DataFrame(results)` / `.sort_values(...)` expressions used before were
# discarded, so the CSV was written in insertion order and nothing was shown.
comparison_df = pd.DataFrame(results).sort_values(by="F1-Score", ascending=False)
comparison_df.to_csv("model_comparison.csv", index=False)
files.download("model_comparison.csv")

# Last expression of the cell, so the ranked table is displayed.
comparison_df


In [None]:
#!git clone https://github.com/tnsay/ethioMart-telegram-ner.git
# Copy the ModelComparison_EthioMart notebook from Google Drive into the
# ethioMart-telegram-ner/ directory.
# NOTE(review): assumes Drive is mounted at /content/drive and that the target
# directory already exists (the clone above is commented out) — confirm.
!cp "/content/drive/MyDrive/Colab_Notebooks/ModelComparison_EthioMart.ipynb" ethioMart-telegram-ner/