Import the cleaned dataset

In [None]:
import pandas as pd
import torch
import pandas as pd
import numpy as np
from datasets import load_dataset
import optuna
from transformers import (
    AutoTokenizer, AutoModelForMaskedLM,
    AutoModel,
    AutoConfig,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
    AutoModelForSequenceClassification, get_scheduler
)
from peft import LoraConfig, get_peft_model
import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, multilabel_confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List, Tuple
import warnings
warnings.filterwarnings('ignore')

# The first CSV column is unnamed and holds 0, 1, 2, ... (it looks like a previously
# saved row index). Reading it as the index keeps it from appearing as a spurious
# "Unnamed: 0" data column.
df = pd.read_csv('processed_data_it3.csv', index_col=0)
df.head()

Unnamed: 0,id,text,anger,disgust,fear,joy,sadness,surprise,emotions,tokens,stemmed_tokens
0,hau_train_track_a_00001,kotu ta yi hukunci kan shari'ar zaben dan maja...,0,0,0,0,0,1,"('surprise',)","['kotu', 'hukunci', 'kan', ""shari'ar"", 'zaben'...","['kotu', 'hukunci', 'kan', ""shari'ar"", 'zaben'..."
1,hau_train_track_a_00002,"toh fah inji 'yan magana suka ce """"""""ana wata ...",0,0,0,0,0,1,"('surprise',)","['toh', 'fah', 'inji', ""'yan"", 'magana', '``',...","['toh', 'fah', 'inji', ""'yan"", 'magana', ""''"",..."
2,hau_train_track_a_00003,bincike ya nuna yan najeriya sun fi damuwa da ...,0,0,1,0,1,0,"('fear', 'sadness')","['bincike', 'nuna', 'yan', 'najeriya', 'fi', '...","['bincik', 'nuna', 'yan', 'najeriya', 'fi', 'd..."
3,hau_train_track_a_00004,kwamishina ya musanta rahoton masari ya cire k...,0,0,0,0,0,0,(),"['kwamishina', 'musanta', 'rahoton', 'masari',...","['kwamishina', 'musanta', 'rahoton', 'masari',..."
4,hau_train_track_a_00005,innalillahi wa inna ilaihir raji'un: allah ya ...,0,0,0,0,1,0,"('sadness',)","['innalillahi', 'wa', 'inna', 'ilaihir', ""raji...","['innalillahi', 'wa', 'inna', 'ilaihir', ""raji..."


In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.1-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.1-py3-none-any.whl (242 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.5/242.5 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.1 colorlog-6.9.0 optuna-4.3.0


In [None]:
# Names of the six binary label columns (one column per emotion).
emotion_labels = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']

# Cast the label columns first so that everything derived from them sees integers.
df[emotion_labels] = df[emotion_labels].astype(int)

# NOTE(review): `text` and `labels` are not read by the cells visible in this notebook
# (the train/validation split uses df['text'] and df[emotion_labels].values directly).
text = df['stemmed_tokens']
labels = df[emotion_labels].values.tolist()


In [None]:
# Checkpoint shared by the tokenizer and the model below.
model_name = "Davlan/bert-base-multilingual-cased-finetuned-hausa"

# Use the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(model_name)

# NOTE(review): this masked-LM instance is not referenced by the cells visible in this
# notebook (the search builds its own classifier in `model_init`) — confirm it is needed.
model = AutoModelForMaskedLM.from_pretrained(model_name)
model = model.to(device)

tokenizer_config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/712M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/712M [00:00<?, ?B/s]

In [None]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
# Inputs are the raw `text` column, targets are the six emotion columns as an array.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(),
    df[emotion_labels].values,
    test_size=0.2,
    random_state=42,
)

# Reuse the checkpoint name held in `model_name` instead of repeating the literal, so
# the tokenizer and the checkpoint cannot drift apart.
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [None]:
class EmotionDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and attaches its label vector.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of per-sample label vectors, aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=..., padding=..., max_length=..., return_tensors='pt')``
            that returns a mapping of tensors carrying a leading batch dimension of 1.
        max_length: Length every sequence is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        """Return the tokenizer outputs for sample ``idx`` plus a float ``labels`` tensor."""
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        # Drop only the leading batch dimension. A bare ``squeeze()`` would also remove
        # the sequence dimension whenever it has size 1 (e.g. ``max_length=1``), which
        # yields 0-d tensors that can no longer be collated into (batch, seq) batches.
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        # Labels are stored as floats — custom_loss in this notebook feeds them to BCEWithLogitsLoss.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        """Number of samples, taken from ``texts``."""
        return len(self.texts)


In [None]:
# Wrap each split in an EmotionDataset (default max_length) using the shared tokenizer.
train_dataset, val_dataset = (
    EmotionDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
    )
)

In [None]:
def model_init(trial):
    """Build a fresh multi-label sequence classifier from the Hausa BERT checkpoint.

    Args:
        trial: Accepted but not used by this function.

    Returns:
        The model returned by ``AutoModelForSequenceClassification.from_pretrained``,
        configured with one output per entry of ``emotion_labels``.
    """
    checkpoint = "Davlan/bert-base-multilingual-cased-finetuned-hausa"
    head_options = dict(
        num_labels=len(emotion_labels),
        problem_type="multi_label_classification",
    )
    return AutoModelForSequenceClassification.from_pretrained(checkpoint, **head_options)
# model = AutoModelForSequenceClassification.from_pretrained(
#     "Davlan/bert-base-multilingual-cased-finetuned-hausa",
#     num_labels=len(emotion_labels),
#     problem_type="multi_label_classification"
# )

In [None]:
# training_args = TrainingArguments(
#     output_dir="./results",
#     num_train_epochs=2,
#     per_device_train_batch_size=24,
#     per_device_eval_batch_size=24,
#     save_strategy="no",
#     logging_dir='./logs',
#     logging_steps=10,
#     load_best_model_at_end=True,
#     optim="adamw_torch"
# )

# training_args = TrainingArguments(
#     output_dir="./optuna_results", # Use a different output directory for tuning
#     per_device_train_batch_size=4, # You might tune this
#     per_device_eval_batch_size=16,  # You might tune this
#     save_strategy="no",
#     logging_dir='./optuna_logs',    # Use a different logging directory
#     logging_steps=10,
#     load_best_model_at_end=True,
#     metric_for_best_model="f1", # Specify the metric to optimize
#     greater_is_better=True, # Specify if a higher metric is better
#     learning_rate=1.6788168340923137e-05,
#     num_train_epochs=4,
#     seed=3
# )

# Base arguments for the hyperparameter search. The trial log in this notebook shows the
# search overriding learning_rate, num_train_epochs, seed and per_device_train_batch_size
# for each trial, so the train batch size below is only a starting value.
training_args = TrainingArguments(
    output_dir="./optuna_results",   # Use a different output directory for tuning
    per_device_train_batch_size=16,  # You might tune this
    per_device_eval_batch_size=16,   # You might tune this
    save_strategy="no",
    logging_dir='./optuna_logs',     # Use a different logging directory
    logging_steps=10,
    # No checkpoints are written (save_strategy="no"), so there is never a "best" one to
    # reload; the flag is switched off instead of being left as a silent no-op.
    load_best_model_at_end=False,
    # compute_metrics reports f1_macro / f1_micro / f1_samples / accuracy; there is no
    # plain "f1" key, so the previous value could not be resolved against the metrics.
    metric_for_best_model="f1_macro",
    greater_is_better=True,          # Specify if a higher metric is better
)



In [None]:
def compute_metrics(pred):
    """Turn raw logits into multi-label predictions and score them.

    Args:
        pred: Object that unpacks into ``(logits, labels)``; both parts are converted
            with ``torch.tensor``. Presumably the prediction object that ``Trainer``
            passes to ``compute_metrics`` — confirm against the caller.

    Returns:
        dict with:
            ``f1_macro``, ``f1_micro``, ``f1_samples``: ``f1_score`` under the
                corresponding averaging mode (undefined cases count as 0).
            ``accuracy``: fraction of individual (sample, label) cells predicted
                correctly. This is element-wise agreement, not exact-match accuracy,
                so it stays high whenever most labels are 0.
    """
    logits, labels = pred
    # Each label is decided independently: sigmoid probability above 0.5 means "present".
    preds = (torch.sigmoid(torch.tensor(logits)) > 0.5).int()
    labels = torch.tensor(labels).int()
    # The previously computed classification_report was never used and has been dropped.
    return {
        'f1_macro': f1_score(labels, preds, average='macro', zero_division=0),
        'f1_micro': f1_score(labels, preds, average='micro', zero_division=0),
        'f1_samples': f1_score(labels, preds, average='samples', zero_division=0),
        'accuracy': (preds == labels).float().mean().item(),
    }

def custom_loss(outputs, labels):
    """Binary cross-entropy (with logits) between ``outputs.logits`` and ``labels``.

    Args:
        outputs: Object exposing a ``logits`` tensor.
        labels: Target tensor passed to ``BCEWithLogitsLoss`` alongside the logits.

    Returns:
        The loss tensor produced by a default-configured ``BCEWithLogitsLoss``.
    """
    criterion = BCEWithLogitsLoss()
    raw_scores = outputs.logits
    return criterion(raw_scores, labels)

In [None]:
def f1_macro_objective(metrics):
    """Objective for the search: the macro-averaged F1 of the evaluation run.

    Without an explicit objective the search maximises the sum of all reported metrics
    (the trial values above 1.0 in this notebook's log show that), which mixes the
    element-wise accuracy into the score. ``metrics`` carries the ``eval_`` prefix that
    ``Trainer.evaluate`` adds to the keys returned by ``compute_metrics``.
    """
    return metrics["eval_f1_macro"]


# A fresh model is created for every trial through `model_init`.
trainer = Trainer(
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    model_init=model_init,
)

# Search with Optuna over the backend's default hyperparameter space.
best_trial = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    n_trials=20,
    compute_objective=f1_macro_objective,
)

print("Best trial:")
print(best_trial)

# trainer.train()

# metrics = trainer.evaluate()
# print("Evaluation metrics:", metrics)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-06-05 09:48:38,351] A new study created in memory with name: no-name-23095715-3169-4655-8862-9dff7a6de687
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1_macro,▁
eval/f1_micro,▁
eval/f1_samples,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇████

0,1
eval/accuracy,0.87063
eval/f1_macro,0.61516
eval/f1_micro,0.61324
eval/f1_samples,0.48205
eval/loss,0.30906
eval/runtime,3.1056
eval/samples_per_second,138.137
eval/steps_per_second,8.694
total_flos,338636089718784.0
train/epoch,3.0


Step,Training Loss
10,0.6391
20,0.5725
30,0.5363
40,0.4879
50,0.4873
60,0.4747
70,0.479
80,0.4777
90,0.4666
100,0.4556


[I 2025-06-05 09:50:27,788] Trial 0 finished with value: 1.4702054669547713 and parameters: {'learning_rate': 6.452773686425308e-06, 'num_train_epochs': 2, 'seed': 6, 'per_device_train_batch_size': 8}. Best is trial 0 with value: 1.4702054669547713.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1_macro,▁
eval/f1_micro,▁
eval/f1_samples,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇█████
train/global_step,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████

0,1
eval/accuracy,0.838
eval/f1_macro,0.21446
eval/f1_micro,0.27478
eval/f1_samples,0.14297
eval/loss,0.40046
eval/runtime,3.2271
eval/samples_per_second,132.937
eval/steps_per_second,8.367
total_flos,225757393145856.0
train/epoch,2.0


Step,Training Loss
10,0.6741
20,0.6571
30,0.6357
40,0.6176
50,0.6041
60,0.5924
70,0.5886
80,0.5781
90,0.5576
100,0.5647


[I 2025-06-05 09:51:57,269] Trial 1 finished with value: 0.8185703158378601 and parameters: {'learning_rate': 1.1020769609162e-06, 'num_train_epochs': 2, 'seed': 20, 'per_device_train_batch_size': 16}. Best is trial 0 with value: 1.4702054669547713.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1_macro,▁
eval/f1_micro,▁
eval/f1_samples,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇███
train/global_step,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇███

0,1
eval/accuracy,0.81857
eval/f1_macro,0.0
eval/f1_micro,0.0
eval/f1_samples,0.0
eval/loss,0.50258
eval/runtime,3.166
eval/samples_per_second,135.504
eval/steps_per_second,8.528
total_flos,225757393145856.0
train/epoch,2.0


Step,Training Loss
10,0.6295
20,0.5296
30,0.4804
40,0.4591
50,0.452
60,0.4646
70,0.4518
80,0.4645
90,0.4283
100,0.4375


[I 2025-06-05 09:54:07,830] Trial 2 finished with value: 2.0127897232149055 and parameters: {'learning_rate': 1.2638376572667607e-05, 'num_train_epochs': 3, 'seed': 1, 'per_device_train_batch_size': 16}. Best is trial 2 with value: 2.0127897232149055.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1_macro,▁
eval/f1_micro,▁
eval/f1_samples,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇████

0,1
eval/accuracy,0.85703
eval/f1_macro,0.40634
eval/f1_micro,0.46667
eval/f1_samples,0.28275
eval/loss,0.36063
eval/runtime,3.1602
eval/samples_per_second,135.752
eval/steps_per_second,8.544
total_flos,338636089718784.0
train/epoch,3.0


Step,Training Loss
10,0.6165
20,0.5267
30,0.4658
40,0.4783
50,0.459
60,0.4706
70,0.4703
80,0.4676
90,0.4624
100,0.4658


[I 2025-06-05 09:56:17,329] Trial 3 finished with value: 1.860045173624026 and parameters: {'learning_rate': 1.0833120408638217e-05, 'num_train_epochs': 3, 'seed': 39, 'per_device_train_batch_size': 16}. Best is trial 2 with value: 2.0127897232149055.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1_macro,▁
eval/f1_micro,▁
eval/f1_samples,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇████

0,1
eval/accuracy,0.8512
eval/f1_macro,0.34806
eval/f1_micro,0.42232
eval/f1_samples,0.23846
eval/loss,0.37229
eval/runtime,3.1169
eval/samples_per_second,137.635
eval/steps_per_second,8.662
total_flos,338636089718784.0
train/epoch,3.0


Step,Training Loss
10,0.6297
20,0.5556
30,0.5204
40,0.4924
50,0.4766
60,0.4806
70,0.4749
80,0.4655
90,0.4657
100,0.4586


[I 2025-06-05 09:57:41,136] Trial 4 finished with value: 0.8185703158378601 and parameters: {'learning_rate': 6.924863871461867e-06, 'num_train_epochs': 2, 'seed': 11, 'per_device_train_batch_size': 32}. Best is trial 2 with value: 2.0127897232149055.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1_macro,▁
eval/f1_micro,▁
eval/f1_samples,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▂▂▃▄▅▅▆▇▇██
train/global_step,▁▂▂▃▄▅▅▆▇▇██

0,1
eval/accuracy,0.81857
eval/f1_macro,0.0
eval/f1_micro,0.0
eval/f1_samples,0.0
eval/loss,0.45705
eval/runtime,3.1263
eval/samples_per_second,137.225
eval/steps_per_second,8.637
total_flos,225757393145856.0
train/epoch,2.0


Step,Training Loss
10,0.6717
20,0.6213
30,0.5808
40,0.5604
50,0.5337
60,0.5121
70,0.507
80,0.4904
90,0.485
100,0.457


Step,Training Loss
10,0.6717
20,0.6213
30,0.5808
40,0.5604
50,0.5337
60,0.5121
70,0.507
80,0.4904
90,0.485
100,0.457


In [None]:
def _occurrences(column):
    """Return how many rows of ``df`` hold the value 1 in ``column`` (0 when none do)."""
    tally = df[column].value_counts()
    return tally.get(1, 0)


# Dataset size followed by the number of positive rows for each emotion label.
print("Number of records : ", len(df))

count_anger = _occurrences('anger')
print("Occurrences of 'anger':", count_anger)

count_disgust = _occurrences('disgust')
print("Occurrences of 'disgust':", count_disgust)

count_fear = _occurrences('fear')
print("Occurrences of 'fear':", count_fear)

count_joy = _occurrences('joy')
print("Occurrences of 'joy':", count_joy)

count_sadness = _occurrences('sadness')
print("Occurrences of 'sadness':", count_sadness)

count_surprise = _occurrences('surprise')
print("Occurrences of 'surprise':", count_surprise)


Number of records :  2145
Occurrences of 'anger': 408
Occurrences of 'disgust': 329
Occurrences of 'fear': 327
Occurrences of 'joy': 320
Occurrences of 'sadness': 647
Occurrences of 'surprise': 349
