In [6]:
!pip install transformers datasets seqeval torch evaluate


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [7]:
from transformers import XLMRobertaTokenizer, XLMRobertaForTokenClassification, TrainingArguments, Trainer
from datasets import Dataset
from evaluate import load
import torch


In [12]:
from google.colab import files

# This will prompt you to upload the file
uploaded = files.upload()


Saving labeled_data_custom.conll to labeled_data_custom.conll


In [17]:
import pandas as pd

# Get the filename from the uploaded dictionary
filename = list(uploaded.keys())[0]
print(f'Loading file: {filename}')

# Initialize lists to hold tokens and labels
tokens = []
labels = []

# Read the file line by line
with open(filename, 'r') as file:
    for line in file:
        line = line.strip()
        if line:  # Skip empty lines
            parts = line.split(maxsplit=1)  # Split by the first space only
            if len(parts) == 2:  # Ensure there are exactly 2 parts
                token, label = parts
                tokens.append(token)
                labels.append(label)
            else:
                print(f"Skipping line: {line}")  # Debugging output for lines that don't match the format

# Create a DataFrame
df = pd.DataFrame({'Token': tokens, 'Label': labels})

# Display the first few rows of the DataFrame to ensure it was loaded correctly
print(df.head(10))


Loading file: labeled_data_custom.conll
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping line: O
Skipping

In [18]:
# Create a label mapping
label_mapping = {
    "O": 0,
    "I-LOC": 1,
    "I-PRICE": 2,
    "I-Product": 3,
    # Add other labels as needed
}

# Map the labels in the DataFrame
df['Label'] = df['Label'].map(label_mapping)

# Drop any rows with NaN values after mapping
df.dropna(inplace=True)

# Convert labels to integer type
df['Label'] = df['Label'].astype(int)

# Display the DataFrame after mapping
print(df.head(10))


                                  Token  Label
0                                     💥      0
1                                     💥      0
2   ...................................      0
3                                     💥      0
4                                     💥      0
5                                     📌      3
6                           Replacement      3
8                                Nipple      3
9                                     👍      0
10                                  የጡት      0


In [19]:
from transformers import XLMRobertaTokenizerFast, XLMRobertaForTokenClassification, TrainingArguments, Trainer
from datasets import Dataset
from evaluate import load
import torch

# Load the fast tokenizer and model for XLM-Roberta
xlm_roberta_tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")
xlm_roberta_model = XLMRobertaForTokenClassification.from_pretrained("xlm-roberta-base", num_labels=len(label_mapping))

def tokenize_and_align_labels_xlm_roberta(examples):
    tokenized_inputs = xlm_roberta_tokenizer(
        examples['Token'].tolist(),
        padding=True,
        truncation=True,
        is_split_into_words=True,
    )

    labels = []
    word_ids = tokenized_inputs.word_ids(batch_index=0)  # Assuming batch_size is 1
    previous_word_idx = None
    label_ids = []
    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            label_ids.append(examples['Label'].values[word_idx])
        else:
            label_ids.append(-100)
        previous_word_idx = word_idx
    tokenized_inputs["labels"] = label_ids
    return tokenized_inputs

# Apply the function to each sentence
tokenized_datasets_xlm_roberta = [tokenize_and_align_labels_xlm_roberta(sentence) for _, sentence in df.groupby(df.index // 10)]

# Convert lists to PyTorch tensors
input_ids_xlm_roberta = [torch.tensor(item['input_ids'], dtype=torch.long) for item in tokenized_datasets_xlm_roberta]
attention_mask_xlm_roberta = [torch.tensor(item['attention_mask'], dtype=torch.long) for item in tokenized_datasets_xlm_roberta]
labels_xlm_roberta = [torch.tensor(item['labels'], dtype=torch.long) for item in tokenized_datasets_xlm_roberta]

# Pad sequences to ensure they have the same length
input_ids_xlm_roberta = torch.nn.utils.rnn.pad_sequence(input_ids_xlm_roberta, batch_first=True, padding_value=xlm_roberta_tokenizer.pad_token_id)
attention_mask_xlm_roberta = torch.nn.utils.rnn.pad_sequence(attention_mask_xlm_roberta, batch_first=True, padding_value=0)
labels_xlm_roberta = torch.nn.utils.rnn.pad_sequence(labels_xlm_roberta, batch_first=True, padding_value=-100)

# Combine into a dataset
dataset_xlm_roberta = Dataset.from_dict({
    'input_ids': input_ids_xlm_roberta,
    'attention_mask': attention_mask_xlm_roberta,
    'labels': labels_xlm_roberta
})

# Set up training arguments for XLM-Roberta
training_args_xlm_roberta = TrainingArguments(
    output_dir='./results_xlm_roberta',
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Fine-tune the model
trainer_xlm_roberta = Trainer(
    model=xlm_roberta_model,
    args=training_args_xlm_roberta,
    train_dataset=dataset_xlm_roberta,
    eval_dataset=dataset_xlm_roberta,
)

trainer_xlm_roberta.train()

# Evaluate the model
results_xlm_roberta = trainer_xlm_roberta.evaluate()
print(results_xlm_roberta)

# Save the model
xlm_roberta_model.save_pretrained('./fine-tuned-xlm-roberta-model')
xlm_roberta_tokenizer.save_pretrained('./fine-tuned-xlm-roberta-model')


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss
1,No log,0.371981
2,No log,0.267065
3,No log,0.224168


{'eval_loss': 0.22416801750659943, 'eval_runtime': 59.1227, 'eval_samples_per_second': 5.599, 'eval_steps_per_second': 0.355, 'epoch': 3.0}


('./fine-tuned-xlm-roberta-model/tokenizer_config.json',
 './fine-tuned-xlm-roberta-model/special_tokens_map.json',
 './fine-tuned-xlm-roberta-model/sentencepiece.bpe.model',
 './fine-tuned-xlm-roberta-model/added_tokens.json',
 './fine-tuned-xlm-roberta-model/tokenizer.json')

In [20]:
from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification, TrainingArguments, Trainer
from datasets import Dataset
import torch

# Define the label mapping
label_mapping = {
    "O": 0,
    "I-LOC": 1,
    "I-PRICE": 2,
    "I-Product": 3,
    # Add other labels as needed
}

# Load the fast tokenizer and model for DistilBERT
distilbert_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-multilingual-cased")
distilbert_model = DistilBertForTokenClassification.from_pretrained("distilbert-base-multilingual-cased", num_labels=len(label_mapping))

def tokenize_and_align_labels_distilbert(examples):
    tokenized_inputs = distilbert_tokenizer(
        examples['Token'].tolist(),
        padding=True,
        truncation=True,
        is_split_into_words=True,
    )

    labels = []
    word_ids = tokenized_inputs.word_ids(batch_index=0)  # Assuming batch_size is 1
    previous_word_idx = None
    label_ids = []
    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            label_ids.append(examples['Label'].values[word_idx])
        else:
            label_ids.append(-100)
        previous_word_idx = word_idx
    tokenized_inputs["labels"] = label_ids
    return tokenized_inputs

# Apply the function to each sentence
tokenized_datasets_distilbert = [tokenize_and_align_labels_distilbert(sentence) for _, sentence in df.groupby(df.index // 10)]

# Convert lists to PyTorch tensors
input_ids_distilbert = [torch.tensor(item['input_ids'], dtype=torch.long) for item in tokenized_datasets_distilbert]
attention_mask_distilbert = [torch.tensor(item['attention_mask'], dtype=torch.long) for item in tokenized_datasets_distilbert]
labels_distilbert = [torch.tensor(item['labels'], dtype=torch.long) for item in tokenized_datasets_distilbert]

# Pad sequences to ensure they have the same length
input_ids_distilbert = torch.nn.utils.rnn.pad_sequence(input_ids_distilbert, batch_first=True, padding_value=distilbert_tokenizer.pad_token_id)
attention_mask_distilbert = torch.nn.utils.rnn.pad_sequence(attention_mask_distilbert, batch_first=True, padding_value=0)
labels_distilbert = torch.nn.utils.rnn.pad_sequence(labels_distilbert, batch_first=True, padding_value=-100)

# Combine into a dataset
dataset_distilbert = Dataset.from_dict({
    'input_ids': input_ids_distilbert,
    'attention_mask': attention_mask_distilbert,
    'labels': labels_distilbert
})

# Set up training arguments for DistilBERT
training_args_distilbert = TrainingArguments(
    output_dir='./results_distilbert',
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Fine-tune the model
trainer_distilbert = Trainer(
    model=distilbert_model,
    args=training_args_distilbert,
    train_dataset=dataset_distilbert,
    eval_dataset=dataset_distilbert,
)

trainer_distilbert.train()

# Evaluate the model
results_distilbert = trainer_distilbert.evaluate()
print(results_distilbert)

# Save the model
distilbert_model.save_pretrained('./fine-tuned-distilbert-model')
distilbert_tokenizer.save_pretrained('./fine-tuned-distilbert-model')


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.416068
2,No log,0.346628
3,No log,0.32146


{'eval_loss': 0.32145988941192627, 'eval_runtime': 59.5921, 'eval_samples_per_second': 5.554, 'eval_steps_per_second': 0.352, 'epoch': 3.0}


('./fine-tuned-distilbert-model/tokenizer_config.json',
 './fine-tuned-distilbert-model/special_tokens_map.json',
 './fine-tuned-distilbert-model/vocab.txt',
 './fine-tuned-distilbert-model/added_tokens.json',
 './fine-tuned-distilbert-model/tokenizer.json')

In [21]:
from transformers import BertTokenizerFast, BertForTokenClassification, TrainingArguments, Trainer
from datasets import Dataset
import torch

# Define the label mapping
label_mapping = {
    "O": 0,
    "I-LOC": 1,
    "I-PRICE": 2,
    "I-Product": 3,
    # Add other labels as needed
}

# Load the fast tokenizer and model for mBERT
mbert_tokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")
mbert_model = BertForTokenClassification.from_pretrained("bert-base-multilingual-cased", num_labels=len(label_mapping))

def tokenize_and_align_labels_mbert(examples):
    tokenized_inputs = mbert_tokenizer(
        examples['Token'].tolist(),
        padding=True,
        truncation=True,
        is_split_into_words=True,
    )

    labels = []
    word_ids = tokenized_inputs.word_ids(batch_index=0)  # Assuming batch_size is 1
    previous_word_idx = None
    label_ids = []
    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            label_ids.append(examples['Label'].values[word_idx])
        else:
            label_ids.append(-100)
        previous_word_idx = word_idx
    tokenized_inputs["labels"] = label_ids
    return tokenized_inputs

# Apply the function to each sentence
tokenized_datasets_mbert = [tokenize_and_align_labels_mbert(sentence) for _, sentence in df.groupby(df.index // 10)]

# Convert lists to PyTorch tensors
input_ids_mbert = [torch.tensor(item['input_ids'], dtype=torch.long) for item in tokenized_datasets_mbert]
attention_mask_mbert = [torch.tensor(item['attention_mask'], dtype=torch.long) for item in tokenized_datasets_mbert]
labels_mbert = [torch.tensor(item['labels'], dtype=torch.long) for item in tokenized_datasets_mbert]

# Pad sequences to ensure they have the same length
input_ids_mbert = torch.nn.utils.rnn.pad_sequence(input_ids_mbert, batch_first=True, padding_value=mbert_tokenizer.pad_token_id)
attention_mask_mbert = torch.nn.utils.rnn.pad_sequence(attention_mask_mbert, batch_first=True, padding_value=0)
labels_mbert = torch.nn.utils.rnn.pad_sequence(labels_mbert, batch_first=True, padding_value=-100)

# Combine into a dataset
dataset_mbert = Dataset.from_dict({
    'input_ids': input_ids_mbert,
    'attention_mask': attention_mask_mbert,
    'labels': labels_mbert
})

# Set up training arguments for mBERT
training_args_mbert = TrainingArguments(
    output_dir='./results_mbert',
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Fine-tune the model
trainer_mbert = Trainer(
    model=mbert_model,
    args=training_args_mbert,
    train_dataset=dataset_mbert,
    eval_dataset=dataset_mbert,
)

trainer_mbert.train()

# Evaluate the model
results_mbert = trainer_mbert.evaluate()
print(results_mbert)

# Save the model
mbert_model.save_pretrained('./fine-tuned-mbert-model')
mbert_tokenizer.save_pretrained('./fine-tuned-mbert-model')


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.374801
2,No log,0.32171
3,No log,0.287782


{'eval_loss': 0.28778183460235596, 'eval_runtime': 119.5336, 'eval_samples_per_second': 2.769, 'eval_steps_per_second': 0.176, 'epoch': 3.0}


('./fine-tuned-mbert-model/tokenizer_config.json',
 './fine-tuned-mbert-model/special_tokens_map.json',
 './fine-tuned-mbert-model/vocab.txt',
 './fine-tuned-mbert-model/added_tokens.json',
 './fine-tuned-mbert-model/tokenizer.json')

In [23]:
!pip install evaluate




In [27]:
import numpy as np
from evaluate import load
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Define a function to compute metrics
def compute_metrics(eval_pred):
    labels = eval_pred.label_ids
    preds = np.argmax(eval_pred.predictions, axis=2)

    # Remove ignored index (special tokens)
    true_labels = [
        [label for label in label_seq if label != -100]
        for label_seq in labels
    ]
    true_preds = [
        [pred for (pred, label) in zip(pred_seq, label_seq) if label != -100]
        for pred_seq, label_seq in zip(preds, labels)
    ]

    # Flatten the lists
    true_labels_flat = [item for sublist in true_labels for item in sublist]
    true_preds_flat = [item for sublist in true_preds for item in sublist]

    precision = precision_score(true_labels_flat, true_preds_flat, average='weighted')
    recall = recall_score(true_labels_flat, true_preds_flat, average='weighted')
    f1 = f1_score(true_labels_flat, true_preds_flat, average='weighted')
    accuracy = accuracy_score(true_labels_flat, true_preds_flat)

    return {
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'accuracy': accuracy,
    }

# Evaluate each model
for model_name, trainer in [('XLM-Roberta', trainer_xlm_roberta), ('DistilBERT', trainer_distilbert), ('mBERT', trainer_mbert)]:
    eval_results = trainer.evaluate()
    print(f"\n{model_name} Evaluation Results:")
    print(eval_results)

    predictions, label_ids, metrics = trainer.predict(trainer.eval_dataset)
    metrics = compute_metrics(eval_pred=trainer.predict(trainer.eval_dataset))
    print(f"{model_name} Metrics: {metrics}")




XLM-Roberta Evaluation Results:
{'eval_loss': 0.22416801750659943, 'eval_runtime': 62.0325, 'eval_samples_per_second': 5.336, 'eval_steps_per_second': 0.339, 'epoch': 3.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


XLM-Roberta Metrics: {'precision': 0.9021798778302322, 'recall': 0.9278508101498013, 'f1': 0.9147196134503752, 'accuracy': 0.9278508101498013}



DistilBERT Evaluation Results:
{'eval_loss': 0.32145988941192627, 'eval_runtime': 58.0454, 'eval_samples_per_second': 5.702, 'eval_steps_per_second': 0.362, 'epoch': 3.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


DistilBERT Metrics: {'precision': 0.8624250519313915, 'recall': 0.8929990828492815, 'f1': 0.8766552897998986, 'accuracy': 0.8929990828492815}



mBERT Evaluation Results:
{'eval_loss': 0.28778183460235596, 'eval_runtime': 131.6019, 'eval_samples_per_second': 2.515, 'eval_steps_per_second': 0.16, 'epoch': 3.0}


mBERT Metrics: {'precision': 0.8814262873429588, 'recall': 0.9104249464995414, 'f1': 0.8952119940563171, 'accuracy': 0.9104249464995414}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [28]:
# Evaluate each model and store metrics
results_summary = {}

for model_name, trainer in [('XLM-Roberta', trainer_xlm_roberta), ('DistilBERT', trainer_distilbert), ('mBERT', trainer_mbert)]:
    eval_results = trainer.evaluate()
    metrics = compute_metrics(eval_pred=trainer.predict(trainer.eval_dataset))

    results_summary[model_name] = {
        'precision': metrics['precision'],
        'recall': metrics['recall'],
        'f1': metrics['f1'],
        'accuracy': metrics['accuracy'],
        'eval_loss': eval_results['eval_loss'],
        'eval_runtime': eval_results['eval_runtime'],
        'eval_samples_per_second': eval_results['eval_samples_per_second'],
        'eval_steps_per_second': eval_results['eval_steps_per_second']
    }

# Determine the best-performing model
best_model_name = max(results_summary, key=lambda k: results_summary[k]['f1'])  # Example criterion: highest F1 score

print(f"Best Performing Model: {best_model_name}")

# Print results summary for all models
for model_name, metrics in results_summary.items():
    print(f"\n{model_name} Metrics:")
    for metric, value in metrics.items():
        print(f"{metric}: {value}")

# Select the best model based on the best_model_name
best_model = None
if best_model_name == 'XLM-Roberta':
    best_model = trainer_xlm_roberta.model
elif best_model_name == 'DistilBERT':
    best_model = trainer_distilbert.model
elif best_model_name == 'mBERT':
    best_model = trainer_mbert.model


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best Performing Model: XLM-Roberta

XLM-Roberta Metrics:
precision: 0.9021798778302322
recall: 0.9278508101498013
f1: 0.9147196134503752
accuracy: 0.9278508101498013
eval_loss: 0.22416801750659943
eval_runtime: 66.5048
eval_samples_per_second: 4.977
eval_steps_per_second: 0.316

DistilBERT Metrics:
precision: 0.8624250519313915
recall: 0.8929990828492815
f1: 0.8766552897998986
accuracy: 0.8929990828492815
eval_loss: 0.32145988941192627
eval_runtime: 60.5372
eval_samples_per_second: 5.468
eval_steps_per_second: 0.347

mBERT Metrics:
precision: 0.8814262873429588
recall: 0.9104249464995414
f1: 0.8952119940563171
accuracy: 0.9104249464995414
eval_loss: 0.28778183460235596
eval_runtime: 117.6536
eval_samples_per_second: 2.813
eval_steps_per_second: 0.178


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
