In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import sys
import os
import pandas as pd
import numpy as np
sys.path.append('/content/gdrive/MyDrive')

In [9]:
import warnings
warnings.filterwarnings("ignore")

**- Install Requirements**

In [17]:
!pip install -q transformers datasets evaluate accelerate 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [18]:
import os
import sys

from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification
)

### Load Data

In [19]:
data = load_dataset("csv", data_files="/kaggle/input/ethiomart/final_tokens_labels.csv")
train_test_datasets = data['train'].train_test_split(train_size=0.8, seed=42)

In [20]:
train_test_datasets

DatasetDict({
    train: Dataset({
        features: ['Token', 'Label'],
        num_rows: 121209
    })
    test: Dataset({
        features: ['Token', 'Label'],
        num_rows: 30303
    })
})

In [21]:
data['train']['Token'][:10]

['የሞተ', 'ቆዳን', 'እንዲሁም', 'ቆሻሻን', 'ለማፅዳት', 'ተመራጭ', 'ዋጋ', '200', 'ብር', 'ውስን']

In [22]:
data['train']['Label'][:10]

['O', 'O', 'O', 'O', 'O', 'O', 'B-PRICE', 'I-PRICE', 'I-PRICE', 'O']

In [52]:
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [53]:
labels = list(set(data['train']['Label']))
label_mapping = {label: i for i, label in enumerate(labels)}

In [54]:
def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(example['Token'], truncation=True, padding='max_length', max_length=128)

    # Initialize labels with -100 for tokens that should be ignored
    labels = [-100] * len(tokenized_inputs['input_ids'])

    # Align labels with tokens
    label_index = label_mapping[example["Label"]]

    # Fill labels according to the example
    for i in range(len(tokenized_inputs['input_ids'])):
        # Assuming example['Token'] is not empty and corresponds to the label
        if example['Token'] == example['Token']:
            labels[i] = label_index

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [55]:
# Tokenize the dataset
tokenized_dataset = train_test_datasets.map(tokenize_and_align_labels)

Map:   0%|          | 0/121209 [00:00<?, ? examples/s]

Map:   0%|          | 0/30303 [00:00<?, ? examples/s]

In [27]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['Token', 'Label', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 121209
    })
    test: Dataset({
        features: ['Token', 'Label', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 30303
    })
})

In [56]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    weight_decay=0.1,
    evaluation_strategy="epoch",
#     logging_steps=100,
#     eval_steps=500,
#     save_steps=500,
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,
    seed=42,
)


In [63]:
# Load the model
model = AutoModelForTokenClassification.from_pretrained("xlm-roberta-base", num_labels=len(labels))

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [64]:
# Initialize the data collator
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True, return_tensors="pt")

In [65]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

In [66]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Flatten the arrays and ensure they are of type int32
    return metric.compute(predictions=predictions.flatten().astype(np.int32), references=labels.flatten().astype(np.int32))

In [67]:
small_train_dataset = tokenized_dataset["train"].shuffle(seed=42).select(range(10000))
small_eval_dataset = tokenized_dataset["test"].shuffle(seed=42).select(range(10000))

In [68]:
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [69]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
0,0.2826,0.016204,0.998398
1,0.0145,0.015103,0.999
2,0.0104,0.014852,0.9991


TrainOutput(global_step=468, training_loss=0.10249834641432151, metrics={'train_runtime': 1107.9146, 'train_samples_per_second': 27.078, 'train_steps_per_second': 0.422, 'total_flos': 1956643182673920.0, 'train_loss': 0.10249834641432151, 'epoch': 2.9952})

In [70]:
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

Evaluation results: {'eval_loss': 0.014851914718747139, 'eval_accuracy': 0.9991, 'eval_runtime': 100.4293, 'eval_samples_per_second': 99.573, 'eval_steps_per_second': 6.223, 'epoch': 2.9952}


In [51]:
# Save the fine-tuned model and tokenizer
trainer.save_model('./fine_tuned_models')
tokenizer.save_pretrained('./fine_tuned_models')

('./fine_tuned_models/tokenizer_config.json',
 './fine_tuned_models/special_tokens_map.json',
 './fine_tuned_models/vocab.txt',
 './fine_tuned_models/added_tokens.json',
 './fine_tuned_models/tokenizer.json')