In [1]:
!pip install transformers datasets torch seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==

In [2]:
import transformers, datasets, torch, seqeval

In [3]:
!mkdir -p data/processed
!mkdir -p results/fine_tuned_ner_model

In [4]:
with open('fine_tune_ner.py', 'w') as f:
    f.write('''
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['WANDB_MODE'] = 'disabled'
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from seqeval.metrics import classification_report
import numpy as np
import torch
import logging
import argparse
import time

# Timestamped INFO-level logging for every stage of the run.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Command-line interface: both options are mandatory string arguments.
parser = argparse.ArgumentParser(description='Train NER model')
for option, help_text in (
    ('--model_name', 'Model name'),
    ('--output_dir', 'Output directory'),
):
    parser.add_argument(option, type=str, required=True, help=help_text)
args = parser.parse_args()

def load_conll(file_path):
    """Read a CoNLL-style file into a ``Dataset`` of sentences and tags.

    Every non-blank line describes one token as whitespace-separated columns:
    the first column is the token and the last column is its label, so files
    with extra middle columns are accepted. Blank lines end a sentence.

    Args:
        file_path: Path to the UTF-8 encoded CoNLL file.

    Returns:
        A ``Dataset`` built from a ``tokens`` column (one list of words per
        sentence) and a parallel ``ner_tags`` column (one list of string
        labels per sentence).

    Raises:
        ValueError: If a non-blank line has fewer than two columns; the
            message names the file and line number.
        Exception: Any error is logged and then re-raised unchanged.
    """
    sentences, labels = [], []
    current_sentence, current_labels = [], []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line_number, line in enumerate(f, start=1):
                columns = line.split()
                if not columns:
                    # Blank line: close the sentence in progress, if any.
                    if current_sentence:
                        sentences.append(current_sentence)
                        labels.append(current_labels)
                        current_sentence, current_labels = [], []
                    continue
                if len(columns) < 2:
                    raise ValueError(
                        f"{file_path}:{line_number}: expected 'token label' columns, got {line.strip()!r}"
                    )
                current_sentence.append(columns[0])
                current_labels.append(columns[-1])
        # The file may end without a trailing blank line.
        if current_sentence:
            sentences.append(current_sentence)
            labels.append(current_labels)
        return Dataset.from_dict({'tokens': sentences, 'ner_tags': labels})
    except Exception as e:
        logging.error(f"Error loading CoNLL: {e}")
        raise

logging.info("Loading CoNLL dataset")
dataset = load_conll('/content/labeled_data.conll')

# Tag vocabulary: every distinct label in the corpus, sorted so that the
# label <-> id assignment is the same on every run over the same data.
label_list = sorted({tag for sentence_tags in dataset['ner_tags'] for tag in sentence_tags})
label2id = {tag: position for position, tag in enumerate(label_list)}
id2label = dict(enumerate(label_list))

def convert_labels_to_ids(example):
    """Replace the string tags of one example with their integer ids.

    Looks each entry of ``example['ner_tags']`` up in the module-level
    ``label2id`` mapping, stores the resulting list back under the same key
    and returns the (mutated) example.
    """
    tag_ids = []
    for tag in example['ner_tags']:
        tag_ids.append(label2id[tag])
    example['ner_tags'] = tag_ids
    return example

dataset = dataset.map(convert_labels_to_ids)

# 80% train / 10% validation / remainder test, expressed as absolute example counts.
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size
train_val_test = dataset.train_test_split(train_size=train_size, test_size=val_size + test_size, seed=42)
# Split the held-out part by integer counts rather than by the float fraction
# val_size / (val_size + test_size): a fraction is multiplied back and floored
# by the library, so floating-point rounding could yield val_size - 1 examples.
val_test = train_val_test['test'].train_test_split(train_size=val_size, test_size=test_size, seed=42)
dataset_dict = DatasetDict({
    'train': train_val_test['train'],
    'validation': val_test['train'],
    'test': val_test['test']
})

logging.info(f"Loading tokenizer for {args.model_name}")
tokenizer = AutoTokenizer.from_pretrained(args.model_name)

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and build per-sub-token labels.

    The module-level ``tokenizer`` is applied to ``examples['tokens']`` with
    truncation and padding enabled. For each sentence, every sub-token gets
    the tag of the word it belongs to, and positions whose word id is
    ``None`` get ``-100``. The label rows are stored under ``'labels'`` on the
    tokenizer output, which is returned.
    """
    tokenized_inputs = tokenizer(
        examples['tokens'],
        truncation=True,
        is_split_into_words=True,
        padding=True,
    )
    aligned_rows = []
    for batch_idx, word_tags in enumerate(examples['ner_tags']):
        row = []
        for word_id in tokenized_inputs.word_ids(batch_index=batch_idx):
            if word_id is None:
                row.append(-100)
            else:
                row.append(word_tags[word_id])
        aligned_rows.append(row)
    tokenized_inputs['labels'] = aligned_rows
    return tokenized_inputs

logging.info("Tokenizing dataset")
tokenized_dataset = dataset_dict.map(tokenize_and_align_labels, batched=True)

logging.info(f"Saving tokenized dataset for {args.model_name}")
# One directory per model, named after the last "/"-separated part of the model name.
tokenized_dir = f'/content/data/processed/tokenized_dataset_{args.model_name.split("/")[-1]}'
try:
    os.makedirs(tokenized_dir, exist_ok=True)
    tokenized_dataset.save_to_disk(tokenized_dir)
except Exception as err:
    logging.error(f"Error saving tokenized dataset: {err}")
    raise

logging.info(f"Loading model {args.model_name}")
# The classification head is sized to the tag vocabulary and carries both label mappings.
model = AutoModelForTokenClassification.from_pretrained(
    args.model_name,
    label2id=label2id,
    id2label=id2label,
    num_labels=len(label_list),
)

def compute_metrics(p):
    """Turn raw predictions into weighted precision / recall / F1.

    ``p`` unpacks into ``(predictions, labels)``. The arg-max over axis 2 of
    the predictions gives one class id per position; positions whose gold
    label is ``-100`` are dropped from both sequences. The remaining ids are
    mapped to tag strings with the module-level ``id2label`` and scored with
    ``classification_report``; the ``weighted avg`` row is returned under the
    keys ``precision``, ``recall`` and ``f1``.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_labels, pred_labels = [], []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        gold_tags, pred_tags = [], []
        for pred_id, gold_id in zip(pred_row, gold_row):
            if gold_id == -100:
                continue
            gold_tags.append(id2label[gold_id])
            pred_tags.append(id2label[pred_id])
        true_labels.append(gold_tags)
        pred_labels.append(pred_tags)

    results = classification_report(true_labels, pred_labels, output_dict=True, zero_division=0)
    weighted = results['weighted avg']
    return {
        'precision': weighted['precision'],
        'recall': weighted['recall'],
        'f1': weighted['f1-score']
    }

def measure_inference_time(model, tokenizer, text="Adidas SAMBAROSE ዋጋ 3300 ብር መገናኛ"):
    """Time one forward pass of ``model`` on ``text``.

    Args:
        model: A torch module whose output exposes ``.logits``; it is put in
            eval mode.
        tokenizer: Callable returning the model inputs for ``text``; its
            result must support ``.to(device)`` and ``**`` unpacking.
        text: Sentence to run through the model.

    Returns:
        Wall-clock seconds spent in the forward pass, as a float.
    """
    model.eval()
    # Follow the device the weights are on instead of assuming a GPU is present.
    device = next(model.parameters()).device
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    on_gpu = device.type == 'cuda'
    if on_gpu:
        # Drain previously queued GPU work so it is not charged to this measurement.
        torch.cuda.synchronize(device)
    start_time = time.perf_counter()
    with torch.no_grad():
        model(**inputs).logits
    if on_gpu:
        # CUDA kernels are launched asynchronously; wait for them before stopping the clock.
        torch.cuda.synchronize(device)
    return time.perf_counter() - start_time

# Evaluation and checkpointing both happen once per epoch; the checkpoint with
# the highest 'f1' (as returned by compute_metrics) is reloaded when training ends.
training_args = TrainingArguments(
    output_dir=args.output_dir,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir='/content/logs',
    logging_steps=10,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
)

trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
)

logging.info(f"Starting training for {args.model_name}")
trainer.train()

# Metrics on the validation split (the trainer's eval_dataset).
logging.info(f"Evaluating {args.model_name}")
eval_results = trainer.evaluate()
print(f"Evaluation Results for {args.model_name}:")
print(eval_results)

# Timing of one forward pass on the default sample sentence.
inference_time = measure_inference_time(model, tokenizer)
print(f"Inference Time for {args.model_name}: {inference_time:.4f} seconds")

logging.info(f"Saving model to {args.output_dir}")
trainer.save_model(args.output_dir)

# Metrics on the held-out test split.
logging.info(f"Generating test set predictions for {args.model_name}")
predictions = trainer.predict(tokenized_dataset['test'])
print(f"Test Set Predictions for {args.model_name}:")
print(predictions.metrics)
    ''')

In [19]:
# Fine-tune xlm-roberta-base: create the output directory, then run the training script with it as --output_dir.
!mkdir -p /content/results/fine_tuned_ner_model
!python fine_tune_ner.py --model_name xlm-roberta-base --output_dir /content/results/fine_tuned_ner_model

E0000 00:00:1750797558.318748   25418 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750797558.364455   25418 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Map: 100% 50/50 [00:00<00:00, 4224.81 examples/s]
Map: 100% 40/40 [00:00<00:00, 776.64 examples/s]
Map: 100% 5/5 [00:00<00:00, 466.08 examples/s]
Map: 100% 5/5 [00:00<00:00, 511.01 examples/s]
Saving the dataset (1/1 shards): 100% 40/40 [00:00<00:00, 8770.11 examples/s]
Saving the dataset (1/1 shards): 100% 5/5 [00:00<00:00, 1210.97 examples/s]
Saving the dataset (1/1 shards): 100% 5/5 [00:00<00:00, 1301.61 examples/s]
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on 

In [20]:
# Fine-tune bert-base-multilingual-cased: create the output directory, then run the training script with it as --output_dir.
!mkdir -p /content/results/fine_tuned_mbert
!python fine_tune_ner.py --model_name bert-base-multilingual-cased --output_dir /content/results/fine_tuned_mbert

E0000 00:00:1750799034.141852   31292 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750799034.153644   31292 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Map: 100% 50/50 [00:00<00:00, 4010.54 examples/s]
Map: 100% 40/40 [00:00<00:00, 828.07 examples/s]
Map: 100% 5/5 [00:00<00:00, 496.18 examples/s]
Map: 100% 5/5 [00:00<00:00, 553.54 examples/s]
Saving the dataset (1/1 shards): 100% 40/40 [00:00<00:00, 7803.72 examples/s]
Saving the dataset (1/1 shards): 100% 5/5 [00:00<00:00, 1261.29 examples/s]
Saving the dataset (1/1 shards): 100% 5/5 [00:00<00:00, 1239.74 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this mod

In [21]:
# Fine-tune distilbert-base-multilingual-cased: create the output directory, then run the training script with it as --output_dir.
!mkdir -p /content/results/fine_tuned_distilbert
!python fine_tune_ner.py --model_name distilbert-base-multilingual-cased --output_dir /content/results/fine_tuned_distilbert

E0000 00:00:1750800081.474612   35466 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750800081.493098   35466 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Map: 100% 50/50 [00:00<00:00, 4036.87 examples/s]
Map: 100% 40/40 [00:00<00:00, 796.50 examples/s]
Map: 100% 5/5 [00:00<00:00, 531.61 examples/s]
Map: 100% 5/5 [00:00<00:00, 533.52 examples/s]
Saving the dataset (1/1 shards): 100% 40/40 [00:00<00:00, 6931.59 examples/s]
Saving the dataset (1/1 shards): 100% 5/5 [00:00<00:00, 1013.21 examples/s]
Saving the dataset (1/1 shards): 100% 5/5 [00:00<00:00, 1030.04 examples/s]
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TR

In [22]:
with open('compare_models.py', 'w') as f:
    f.write('''import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['WANDB_MODE'] = 'disabled'
from transformers import AutoTokenizer, AutoModelForTokenClassification
from datasets import load_from_disk
from seqeval.metrics import classification_report
import torch
import time
import pandas as pd
import logging
import glob

# Timestamped INFO-level logging for the comparison run.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

def measure_inference_time(model, tokenizer, text="Adidas SAMBAROSE ዋጋ 3300 ብር መገናኛ", num_runs=100):
    """Average the forward-pass time of ``model`` on ``text`` over several runs.

    Args:
        model: A torch module whose output exposes ``.logits``; it is put in
            eval mode.
        tokenizer: Callable returning the model inputs for ``text``; its
            result must support ``.to(device)`` and ``**`` unpacking.
        text: Sentence to run through the model.
        num_runs: Number of timed forward passes (at least 1).

    Returns:
        Mean wall-clock seconds per forward pass, as a float.

    Raises:
        ValueError: If ``num_runs`` is smaller than 1.
    """
    if num_runs < 1:
        raise ValueError(f"num_runs must be at least 1, got {num_runs}")
    model.eval()
    # Follow the device the weights are on instead of assuming a GPU is present.
    device = next(model.parameters()).device
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    on_gpu = device.type == 'cuda'
    total_time = 0.0
    with torch.no_grad():
        for _ in range(num_runs):
            if on_gpu:
                # Drain previously queued GPU work so it is not charged to this run.
                torch.cuda.synchronize(device)
            start_time = time.perf_counter()
            model(**inputs).logits
            if on_gpu:
                # CUDA kernels are launched asynchronously; wait for them before stopping the clock.
                torch.cuda.synchronize(device)
            total_time += time.perf_counter() - start_time
    return total_time / num_runs

def evaluate_model(model, tokenizer, dataset, id2label):
    """Score ``model`` on ``dataset`` with weighted precision / recall / F1.

    Each example is re-tokenized with ``tokenizer`` and run through the model
    one at a time. When the example carries word-level ``ner_tags``, every
    sub-token is compared against the tag of the word it belongs to (looked up
    through the tokenizer's word ids), so the comparison stays aligned no
    matter which tokenizer produced the stored ``labels`` column. Examples
    without ``ner_tags`` fall back to the stored ``labels``, skipping ``-100``.

    Args:
        model: Token-classification model whose output exposes ``.logits``.
        tokenizer: Tokenizer matching ``model``; its output must provide
            ``word_ids()`` and ``.to(device)``.
        dataset: Iterable of dict-like examples with ``tokens`` and either
            ``ner_tags`` or ``labels``.
        id2label: Mapping from integer class id to tag string.

    Returns:
        Dict with the ``precision``, ``recall`` and ``f1`` of the
        ``weighted avg`` row of ``classification_report``.
    """
    model.eval()
    # Follow the device the weights are on instead of assuming a GPU is present.
    device = next(model.parameters()).device
    true_labels, pred_labels = [], []
    for example in dataset:
        inputs = tokenizer(example['tokens'], is_split_into_words=True, return_tensors="pt", truncation=True, padding=True)
        word_ids = inputs.word_ids()
        with torch.no_grad():
            outputs = model(**inputs.to(device)).logits
        predictions = torch.argmax(outputs, dim=2)[0].tolist()
        example_true, example_pred = [], []
        if 'ner_tags' in example:
            word_tags = example['ner_tags']
            for pred, word_id in zip(predictions, word_ids):
                if word_id is None:
                    # Position with no source word: nothing to score.
                    continue
                tag = word_tags[word_id]
                example_true.append(tag if isinstance(tag, str) else id2label[tag])
                example_pred.append(id2label[pred])
        else:
            # No word-level tags available: use the stored per-position labels as they are.
            for pred, label, word_id in zip(predictions, example['labels'], word_ids):
                if word_id is not None and label != -100:
                    example_true.append(id2label[label])
                    example_pred.append(id2label[pred])
        true_labels.append(example_true)
        pred_labels.append(example_pred)
    results = classification_report(true_labels, pred_labels, output_dict=True, zero_division=0)
    return {
        'precision': results['weighted avg']['precision'],
        'recall': results['weighted avg']['recall'],
        'f1': results['weighted avg']['f1-score']
    }

def get_model_size(model_dir):
    """Return the combined size, in MiB, of the files directly inside ``model_dir``.

    Only regular files matched by ``<model_dir>/*`` are counted. Sub-directories
    (for example per-epoch checkpoint folders) are skipped, because
    ``os.path.getsize`` on a directory reports the directory entry itself and
    not its contents.

    Args:
        model_dir: Directory holding the saved model files.

    Returns:
        Total size in bytes divided by 1024 ** 2; 0.0 when nothing matches.
    """
    total_size = 0
    for file in glob.glob(f"{model_dir}/*"):
        if os.path.isfile(file):
            total_size += os.path.getsize(file)
    return total_size / (1024 ** 2)

logging.info("Loading tokenized dataset")
tokenized_dataset = load_from_disk('file:///content/data/processed/tokenized_dataset_xlm-roberta-base')

# One entry per fine-tuned model. 'path' is the --output_dir that fine_tune_ner.py
# was run with (under /content/results), 'model_name' the base checkpoint whose
# tokenizer is loaded for evaluation.
models = [
    {'name': 'XLM-RoBERTa', 'path': '/content/results/fine_tuned_ner_model', 'model_name': 'xlm-roberta-base'},
    {'name': 'mBERT', 'path': '/content/results/fine_tuned_mbert', 'model_name': 'bert-base-multilingual-cased'},
    {'name': 'DistilBERT', 'path': '/content/results/fine_tuned_distilbert', 'model_name': 'distilbert-base-multilingual-cased'}
]

comparison = []
for config in models:
logging.info(f"Evaluating {config['name']}")
tokenizer = AutoTokenizer.from_pretrained(config['model_name'])
model = AutoModelForTokenClassification.from_pretrained(config['path']).to('cuda')
eval_results = evaluate_model(model, tokenizer, tokenized_dataset['validation'], model.config.id2label)
inference_time = measure_inference_time(model, tokenizer)
model_size = get_model_size(config['path'])
comparison.append({
'Model': config['name'],
'F1-Score': eval_results['f1'],
'Precision': eval_results['precision'],
'Recall': eval_results['recall'],
'Inference Time (s)': inference_time,
'Model Size (MB)': model_size
})

comparison_df = pd.DataFrame(comparison)
print("Model Comparison Table:")
print(comparison_df)
comparison_df.to_csv('/results/model_comparison.csv', index=False)
logging.info("Comparison table saved to /results/model_comparison.csv")''')

In [23]:
# Run the comparison script written by the previous cell; it prints the comparison table and saves it as CSV.
!python compare_models.py

2025-06-24 21:30:44,277 - INFO - Loading tokenized dataset
2025-06-24 21:30:44,305 - INFO - Evaluating XLM-RoBERTa
2025-06-24 21:30:46,597 - INFO - Fallback to latest checkpoint: /content/results/fine_tuned_ner_model/checkpoint-25
E0000 00:00:1750800650.515069   37743 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750800650.526954   37743 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-24 21:31:23,429 - INFO - Evaluating mBERT
2025-06-24 21:31:24,538 - INFO - Fallback to latest checkpoint: /content/results/fine_tuned_mbert/checkpoint-25
2025-06-24 21:31:41,118 - INFO - Evaluating DistilBERT
2025-06-24 21:31:41,803 - INFO - Fallback to latest checkpoint: /content/results/fine_tuned_distilbert/checkpoint-25
Model Comparison Table:
         Model  F1-Score  ...  Inference Time (s)  Model S