In [1]:
!pip install transformers datasets torch
!pip install huggingface_hub



In [2]:
!cp /content/drive/MyDrive/merged_amharic_ner_data.conll /content/

In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer
import pandas as pd

# Load the dataset
dataset = load_dataset('text', data_files='/content/drive/MyDrive/merged_amharic_ner_data.conll', split='train')

# Split the dataset into train and test sets
dataset_dict = dataset.train_test_split(test_size=0.2, seed=42)

Generating train split: 0 examples [00:00, ? examples/s]

In [4]:
# Define a label-to-ID mapping
label_to_id = {
    "O": 0,
    "B-LOC": 1,
    "I-LOC": 2,
    "B-PRICE": 3,
    "I-PRICE": 4,
    "B-Product": 5,
    "I-Product": 6,
}

In [5]:
# Extract tokens and map labels to integers
def extract_tokens_and_labels(examples):
    tokens, labels = [], []
    for line in examples["text"].split("\n"):
        if line.strip():  # Only process non-empty lines
            parts = line.split("\t")
            if len(parts) == 2:  # Only process lines with both a token and a label
                token, label = parts
                tokens.append(token)
                labels.append(label_to_id[label])  # Convert string labels to integer IDs
            # else:
            #     print(f"Skipping malformed line: {line}")  # Log malformed lines
    return {"tokens": tokens, "labels": labels}

# Apply the extraction function to the dataset
dataset_dict = dataset_dict.map(extract_tokens_and_labels, batched=False)


Map:   0%|          | 0/87196 [00:00<?, ? examples/s]

Map:   0%|          | 0/21800 [00:00<?, ? examples/s]

In [6]:
# Initialize the tokenizer
model_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenization and label alignment
def tokenize_and_align_labels(examples):
    # Tokenize the input, setting truncation and padding to True
    tokenized_inputs = tokenizer(
        examples['tokens'],
        truncation=True,
        padding=True,               # Ensure padding to have the same sequence length
        is_split_into_words=True    # Indicates that the input is already split into words (tokens)
    )

    labels = []
    for i, label in enumerate(examples['labels']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Special tokens (e.g., [CLS], [SEP], padding tokens)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])  # Assign label to the first token in the word
            else:
                label_ids.append(-100)  # Ignore subword tokens
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



In [7]:
# Apply tokenization and alignment to both train and test sets
tokenized_train = dataset_dict["train"].map(tokenize_and_align_labels, batched=True)
tokenized_test = dataset_dict["test"].map(tokenize_and_align_labels, batched=True)

# Check the tokenized data
print(tokenized_train[0])

Map:   0%|          | 0/87196 [00:00<?, ? examples/s]

Map:   0%|          | 0/21800 [00:00<?, ? examples/s]

{'text': 'ውሃ O', 'tokens': [], 'labels': [-100, -100], 'input_ids': [101, 102], 'token_type_ids': [0, 0], 'attention_mask': [1, 1]}


In [8]:
from transformers import TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",           # Output directory for model and logs
    evaluation_strategy="epoch",      # Evaluate the model after every epoch
    learning_rate=2e-5,               # Learning rate
    per_device_train_batch_size=16,   # Batch size for training
    per_device_eval_batch_size=16,    # Batch size for evaluation
    num_train_epochs=3,               # Number of training epochs
    weight_decay=0.01,                # Weight decay for regularization
    logging_dir='./logs',             # Directory for logs
    logging_steps=10,                 # Log every 10 steps
    save_strategy="epoch",            # Save the model after each epoch
    load_best_model_at_end=True,      # Load the best model (according to evaluation metric) at the end
)




In [9]:
from transformers import AutoModelForTokenClassification, Trainer

# Load pre-trained model for NER
model = AutoModelForTokenClassification.from_pretrained(
    model_name,                        # Use the same model_name used for the tokenizer
    num_labels=len(label_to_id)        # Number of unique labels
)

# Initialize Trainer
trainer = Trainer(
    model=model,                       # The pre-trained model
    args=training_args,                # Training arguments we defined above
    train_dataset=tokenized_train,     # Training dataset
    eval_dataset=tokenized_test,       # Validation dataset
    tokenizer=tokenizer                # The tokenizer
)




model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Evaluate the model on the validation set
evaluation_results = trainer.evaluate()

In [11]:
# Print the evaluation results
print(evaluation_results)

{'eval_loss': nan, 'eval_model_preparation_time': 0.0034, 'eval_runtime': 15.4936, 'eval_samples_per_second': 1407.035, 'eval_steps_per_second': 87.972}


In [12]:
# Load different models for comparison
from transformers import AutoModelForTokenClassification

# List of models to fine-tune
models_to_compare = [
    "xlm-roberta-base",
    "distilbert-base-cased",
    "bert-base-multilingual-cased"
]

for model_name in models_to_compare:
    # Load pre-trained model for NER
    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(label_to_id)
    )

    # Initialize Trainer with different models
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        tokenizer=tokenizer
    )

    # Train and evaluate the model
    trainer.train()
    eval_results = trainer.evaluate()

    # Save the evaluation results for comparison
    print(f"Results for {model_name}: {eval_results}")

    # Save the model for later use
    model.save_pretrained(f"./fine_tuned_{model_name}")
    tokenizer.save_pretrained(f"./fine_tuned_{model_name}")


config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.0,
2,0.0,
3,0.0,


Results for xlm-roberta-base: {'eval_loss': nan, 'eval_runtime': 16.313, 'eval_samples_per_second': 1336.36, 'eval_steps_per_second': 83.553, 'epoch': 3.0}


config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.0,
2,0.0,
3,0.0,


Results for distilbert-base-cased: {'eval_loss': nan, 'eval_runtime': 10.0645, 'eval_samples_per_second': 2166.021, 'eval_steps_per_second': 135.426, 'epoch': 3.0}


config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.0,
2,0.0,
3,0.0,


Results for bert-base-multilingual-cased: {'eval_loss': nan, 'eval_runtime': 12.8782, 'eval_samples_per_second': 1692.788, 'eval_steps_per_second': 105.838, 'epoch': 3.0}


In [21]:
from sklearn.metrics import classification_report
import numpy as np

# Get the output of the predictions
predictions_output = trainer.predict(tokenized_test)

# Extract the predictions (logits) and true labels (label_ids)
predictions = np.argmax(predictions_output.predictions, axis=-1)
true_labels = predictions_output.label_ids

# Initialize lists to store the flattened predictions and true labels
predictions_flat = []
true_labels_flat = []

# Iterate over each example
for i in range(len(true_labels)):
    for j in range(len(true_labels[i])):
        # Ignore padding and special tokens (-100 in true labels)
        if true_labels[i][j] != -100:
            true_labels_flat.append(true_labels[i][j])
            predictions_flat.append(predictions[i][j])

# Now, predictions_flat and true_labels_flat should have the same length

# Compute the classification report
def compute_metrics(preds, labels):
    # Check if labels is empty
    if not labels:
        print("No labels found. Check your data and model predictions.")
        return ""

    # Get the list of unique labels from the true labels
    unique_labels = set(labels)

    # Generate target names from the label_to_id mapping if available
    if label_to_id:
        target_names = [label_to_id[i] for i in unique_labels]
    else:
        print("label_to_id mapping not found. Using generic target names.")
        target_names = [str(i) for i in unique_labels]

    return classification_report(labels, preds, target_names=target_names, zero_division=0)

# Generate the classification report
report = compute_metrics(predictions_flat, true_labels_flat)
print(report)

No labels found. Check your data and model predictions.



In [2]:
!pip install shap

Collecting shap
  Downloading shap-0.46.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (24 kB)
Collecting slicer==0.0.8 (from shap)
  Downloading slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Downloading shap-0.46.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (540 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m540.1/540.1 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading slicer-0.0.8-py3-none-any.whl (15 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.46.0 slicer-0.0.8


In [16]:
import torch
import numpy as np
import shap

# Custom prediction function for SHAP to work with Hugging Face model
def model_predict(texts):
    # Tokenize the input texts (this should be raw text, not tokenized form)
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(model.device)
    outputs = model(**inputs)

    # Convert logits to probabilities using softmax
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)

    # Get the predicted class for each token (the one with the highest probability)
    predictions = torch.argmax(probabilities, dim=-1).cpu().numpy()

    return predictions, probabilities.detach().cpu().numpy()  # Return both predictions and probabilities for SHAP





In [17]:
# Test input sentence
test_texts = "መቷል ኦሪጅናል የልጆች ማሊያ ከነቁምጣው 2324 2 18 ዋጋ 1950 ብር ስልክ"

# Get the predictions and probabilities
predictions, probabilities = model_predict(test_texts)

print("Predictions:", predictions)
print("Probabilities:", probabilities)

Predictions: [[4 6 6 6 6 6 6 6 6 6 6 6 6 5 6]]
Probabilities: [[[0.13106269 0.09289607 0.18753609 0.09613483 0.19862632 0.15622173
   0.13752232]
  [0.10674487 0.09321373 0.12218568 0.09051397 0.14061795 0.21081367
   0.23591006]
  [0.103833   0.09652261 0.1259723  0.07867047 0.12414709 0.22869408
   0.2421605 ]
  [0.10256352 0.09353056 0.12000894 0.07777754 0.1294791  0.2270109
   0.24962945]
  [0.10427475 0.08468983 0.12681429 0.07429598 0.12878053 0.2082905
   0.2728541 ]
  [0.12454062 0.08821789 0.13084875 0.07121082 0.11873052 0.186746
   0.27970538]
  [0.13182755 0.10741124 0.10726987 0.11734512 0.15153615 0.15718748
   0.22742262]
  [0.15884942 0.13452207 0.12360926 0.10322435 0.12634835 0.17417896
   0.17926763]
  [0.12966856 0.08864462 0.10982378 0.10000443 0.1912998  0.18446565
   0.19609316]
  [0.16282555 0.12957688 0.11339311 0.11397454 0.12217534 0.16742535
   0.19062929]
  [0.12845851 0.07890881 0.11273124 0.0797976  0.15136234 0.20687403
   0.24186753]
  [0.12975597 0.12