<a href="https://colab.research.google.com/github/u21598012/COS-760-EA/blob/BERT-fine-tuned/run_bert_fine_tuned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import the cleaned dataset

In [3]:
import pandas as pd
import torch
import pandas as pd
import numpy as np
from datasets import load_dataset
import optuna
from transformers import (
    AutoTokenizer, AutoModelForMaskedLM,
    AutoModel,
    AutoConfig,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
    AutoModelForSequenceClassification, get_scheduler
)
from peft import LoraConfig, get_peft_model
import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, multilabel_confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List, Tuple
import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv('processed_data.csv')
df.head()

Unnamed: 0,id,text,anger,disgust,fear,joy,sadness,surprise,emotions,tokens,stemmed_tokens
0,hau_train_track_a_00001,kotu ta yi hukunci kan shari'ar zaben dan maja...,0,0,0,0,0,1,"('surprise',)","['kotu', 'hukunci', 'kan', ""shari'ar"", 'zaben'...","['kotu', 'hukunci', 'kan', 'shari ar', 'zaben'..."
1,hau_train_track_a_00002,"toh fah inji 'yan magana suka ce """"""""ana wata ...",0,0,0,0,0,1,"('surprise',)","['toh', 'fah', 'inji', ""'yan"", 'magana', '``',...","['toh', 'fah', 'inji', ' yan', 'magana', ' ', ..."
2,hau_train_track_a_00003,bincike ya nuna yan najeriya sun fi damuwa da ...,0,0,1,0,1,0,"('fear', 'sadness')","['bincike', 'nuna', 'yan', 'najeriya', 'fi', '...","['bincik', 'nuna', 'yan', 'najeriya', 'fi', 'd..."
3,hau_train_track_a_00004,kwamishina ya musanta rahoton masari ya cire k...,0,0,0,0,0,0,(),"['kwamishina', 'musanta', 'rahoton', 'masari',...","['kwamishina', 'musanta', 'rahoton', 'masari',..."
4,hau_train_track_a_00005,innalillahi wa inna ilaihir raji'un: allah ya ...,0,0,0,0,1,0,"('sadness',)","['innalillahi', 'wa', 'inna', 'ilaihir', ""raji...","['innalillahi', 'wa', 'inna', 'ilaihir', 'raji..."


In [5]:
text = df['stemmed_tokens']
emotion_labels = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']
labels = df[emotion_labels].values.tolist()
df[emotion_labels] = df[emotion_labels].astype(int)


In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "Davlan/bert-base-multilingual-cased-finetuned-hausa"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name).to(device)

tokenizer_config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/712M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/712M [00:00<?, ?B/s]

In [7]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), df[emotion_labels].values, test_size=0.2, random_state=42
)

tokenizer = AutoTokenizer.from_pretrained("Davlan/bert-base-multilingual-cased-finetuned-hausa")


In [8]:
class EmotionDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        item = {key: val.squeeze() for key, val in encoding.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.texts)


In [9]:
train_dataset = EmotionDataset(train_texts, train_labels, tokenizer)
val_dataset = EmotionDataset(val_texts, val_labels, tokenizer)

In [10]:
def model_init(trial):
    return AutoModelForSequenceClassification.from_pretrained(
        "Davlan/bert-base-multilingual-cased-finetuned-hausa",
        num_labels=len(emotion_labels),
        problem_type="multi_label_classification"
    )
# model = AutoModelForSequenceClassification.from_pretrained(
#     "Davlan/bert-base-multilingual-cased-finetuned-hausa",
#     num_labels=len(emotion_labels),
#     problem_type="multi_label_classification"
# )

In [4]:
# training_args = TrainingArguments(
#     output_dir="./results",
#     num_train_epochs=20,
#     per_device_train_batch_size=24,
#     per_device_eval_batch_size=24,
#     save_strategy="no",
#     logging_dir='./logs',
#     logging_steps=10,
#     load_best_model_at_end=True,
#     optim="adamw_torch"
# )
training_args = TrainingArguments(
    output_dir="./optuna_results", # Use a different output directory for tuning
    per_device_train_batch_size=16, # You might tune this
    per_device_eval_batch_size=16,  # You might tune this
    save_strategy="no",
    logging_dir='./optuna_logs',    # Use a different logging directory
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1", # Specify the metric to optimize
    greater_is_better=True # Specify if a higher metric is better
)

In [11]:
def compute_metrics(pred):
    logits, labels = pred
    preds = (torch.sigmoid(torch.tensor(logits)) > 0.5).int()
    labels = torch.tensor(labels).int()
    report = classification_report(labels, preds, output_dict=True, zero_division=0)
    return {
        'f1': f1_score(labels, preds, average='macro'),
        'accuracy': (preds == labels).float().mean().item(),
    }

def custom_loss(outputs, labels):
    return BCEWithLogitsLoss()(outputs.logits, labels)

In [None]:
trainer = Trainer(
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    model_init=model_init # Pass the model_init function
)

best_trial = trainer.hyperparameter_search(
    direction="maximize", # Maximize the metric_for_best_model (f1 in this case)
    backend="optuna",
    n_trials=50 # Number of trials to run
)

print("Best trial:")
print(best_trial)

# # Train
# trainer.train()

# # Evaluate
# metrics = trainer.evaluate()
# print("Evaluation metrics:", metrics)

# model_save_path = "./m1"
# model.save_pretrained(model_save_path)
# tokenizer.save_pretrained(model_save_path)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-06-03 09:43:33,714] A new study created in memory with name: no-name-8fc1fbc0-15c9-458d-93e5-61cd601b77a7
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▁▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇█████
train/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
train/grad_norm,▁▁▁▁▁▅▁▁▁▁▁▁▁▃▁▁▁█▁▁▁▁▁▂▁▁▁▂▂▁▁▁▂▁▁▁▂▂▂▁
train/learning_rate,████▇▇▇▆▆▆▅▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁

0,1
eval/accuracy,0.84887
eval/f1,0.45731
eval/loss,0.3546
eval/runtime,3.1262
eval/samples_per_second,137.227
eval/steps_per_second,8.637
total_flos,225757393145856.0
train/epoch,2.0
train/global_step,858.0
train/grad_norm,1.44266


Step,Training Loss
10,0.6174
20,0.5238
30,0.4862
40,0.4997
50,0.4852
60,0.4361
70,0.4501
80,0.4879
90,0.4318
100,0.4455


[I 2025-06-03 09:48:53,840] Trial 0 finished with value: 1.516666937936325 and parameters: {'learning_rate': 1.3873092825936864e-05, 'num_train_epochs': 5, 'seed': 14, 'per_device_train_batch_size': 4}. Best is trial 0 with value: 1.516666937936325.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▁▁▁▂▂▂▂▃▃▃▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇███
train/global_step,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇▇▇▇██
train/grad_norm,▁▃▂▃▂▂▅▃▅▅▄▃▃▂▆▆▁▃▃▂▄▃▅▄▃▃▃▅▅▃▃▅▅▂▂█▆▁▂▇
train/learning_rate,████▇▇▇▇▆▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▁▁▁▁

0,1
eval/accuracy,0.87568
eval/f1,0.64099
eval/loss,0.3187
eval/runtime,3.0835
eval/samples_per_second,139.128
eval/steps_per_second,8.756
total_flos,564393482864640.0
train/epoch,5.0
train/global_step,2145.0
train/grad_norm,7.48022


Step,Training Loss
10,0.5041
20,0.4623
30,0.4736
40,0.4411
50,0.3864
60,0.414
70,0.3777
80,0.3731
90,0.3537
100,0.358


[I 2025-06-03 09:51:06,053] Trial 1 finished with value: 1.48400633398454 and parameters: {'learning_rate': 9.176591817409464e-05, 'num_train_epochs': 3, 'seed': 19, 'per_device_train_batch_size': 16}. Best is trial 0 with value: 1.516666937936325.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇████
train/grad_norm,▁▁▂▂▂▄▄▂▄▄▆▄▅▄▄▅▄▄▄█▆▃▂▃▆▇▆▂▄▃▅▃
train/learning_rate,███▇▇▇▇▆▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▃▂▂▂▂▁▁▁

0,1
eval/accuracy,0.87024
eval/f1,0.61377
eval/loss,0.31836
eval/runtime,3.152
eval/samples_per_second,136.105
eval/steps_per_second,8.566
total_flos,338636089718784.0
train/epoch,3.0
train/global_step,324.0
train/grad_norm,1.26281


Step,Training Loss
10,0.6022
20,0.5068
30,0.495
40,0.4909
50,0.4766
60,0.4568
70,0.4313
80,0.4457
90,0.4256
100,0.4174


[I 2025-06-03 09:54:24,664] Trial 2 finished with value: 1.501044354081575 and parameters: {'learning_rate': 1.9464728476037724e-05, 'num_train_epochs': 4, 'seed': 1, 'per_device_train_batch_size': 8}. Best is trial 0 with value: 1.516666937936325.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇█████
train/global_step,▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇████
train/grad_norm,▂▁▁▁▁▂▁▃▂▃▂▃▄▅█▃▄▄▄▂▃▃▃▆▂▃▃▅▃▂▄▄▇▄▃▄▃▄▄▆
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▁▁

0,1
eval/accuracy,0.87296
eval/f1,0.62808
eval/loss,0.31605
eval/runtime,3.1067
eval/samples_per_second,138.089
eval/steps_per_second,8.691
total_flos,451514786291712.0
train/epoch,4.0
train/global_step,860.0
train/grad_norm,4.23495


Step,Training Loss
10,0.5672
20,0.4908
30,0.443
40,0.477
50,0.4772
60,0.4386
70,0.4264
80,0.4457
90,0.3985
100,0.4348


[I 2025-06-03 09:57:45,164] Trial 3 finished with value: 1.5081048537349777 and parameters: {'learning_rate': 2.590008636226438e-05, 'num_train_epochs': 4, 'seed': 15, 'per_device_train_batch_size': 8}. Best is trial 0 with value: 1.516666937936325.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇█
train/global_step,▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▇▇▇▇▇██
train/grad_norm,▂▁█▃▂▃▂▂▁▂▂▃▂▃▁▂▂▃▂▃▃▂▃▄▄▂▂▄▁▂▁▃▂▂▄▃▂▂▃▂
train/learning_rate,████▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▅▅▄▄▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁

0,1
eval/accuracy,0.87413
eval/f1,0.63398
eval/loss,0.31509
eval/runtime,3.1367
eval/samples_per_second,136.766
eval/steps_per_second,8.608
total_flos,451514786291712.0
train/epoch,4.0
train/global_step,860.0
train/grad_norm,2.39404


Step,Training Loss
10,0.6743
20,0.6231
30,0.5919
40,0.5429
50,0.5299
60,0.5223
70,0.4971
80,0.4861
90,0.4727
100,0.4784


[I 2025-06-03 10:00:01,631] Trial 4 finished with value: 0.8185703158378601 and parameters: {'learning_rate': 3.546928477209678e-06, 'num_train_epochs': 3, 'seed': 34, 'per_device_train_batch_size': 16}. Best is trial 0 with value: 1.516666937936325.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇████
train/grad_norm,█▇▅▄▄▂▃▁▃▂▄▁▅▁▃▃▂▃▅▂▂▁▃▂▁▄▅▄▃▃▂▂
train/learning_rate,███▇▇▇▇▆▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▃▂▂▂▂▁▁▁

0,1
eval/accuracy,0.81857
eval/f1,0.0
eval/loss,0.44438
eval/runtime,3.1479
eval/samples_per_second,136.281
eval/steps_per_second,8.577
total_flos,338636089718784.0
train/epoch,3.0
train/global_step,324.0
train/grad_norm,0.89321


Step,Training Loss
10,0.5027
20,0.4493
30,0.4026
40,0.3693
50,0.3406
60,0.3197
70,0.2881
80,0.2717
90,0.2343
100,0.2334


[I 2025-06-03 10:02:23,665] Trial 5 finished with value: 1.5187164025880433 and parameters: {'learning_rate': 8.490062429364895e-05, 'num_train_epochs': 4, 'seed': 19, 'per_device_train_batch_size': 64}. Best is trial 5 with value: 1.5187164025880433.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▂▂▃▄▅▅▆▇▇██
train/global_step,▁▂▂▃▄▅▅▆▇▇██
train/grad_norm,▁▂▂▆▅▃▄▄▄█
train/learning_rate,█▇▆▆▅▄▃▃▂▁

0,1
eval/accuracy,0.87685
eval/f1,0.64187
eval/loss,0.31184
eval/runtime,3.1236
eval/samples_per_second,137.343
eval/steps_per_second,8.644
total_flos,451514786291712.0
train/epoch,4.0
train/global_step,108.0
train/grad_norm,1.92859


Step,Training Loss
10,0.6448
20,0.5994
30,0.5641
40,0.5387
50,0.5283
60,0.5108
70,0.5095
80,0.4924
90,0.4953
100,0.4945


[I 2025-06-03 10:03:41,523] Trial 6 finished with value: 0.8185703158378601 and parameters: {'learning_rate': 3.4127717367993324e-06, 'num_train_epochs': 2, 'seed': 19, 'per_device_train_batch_size': 32}. Best is trial 5 with value: 1.5187164025880433.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▂▂▃▄▅▅▆▇▇██
train/global_step,▁▂▂▃▄▅▅▆▇▇██
train/grad_norm,▇█▅▃▁▁▁▂▂▂
train/learning_rate,█▇▆▆▅▄▃▃▂▁

0,1
eval/accuracy,0.81857
eval/f1,0.0
eval/loss,0.47723
eval/runtime,3.1314
eval/samples_per_second,136.999
eval/steps_per_second,8.622
total_flos,225757393145856.0
train/epoch,2.0
train/global_step,108.0
train/grad_norm,1.05362


Step,Training Loss
10,0.6236
20,0.5264
30,0.4873
40,0.4753
50,0.4613
60,0.4616
70,0.4576
80,0.4602
90,0.4456
100,0.4563


[I 2025-06-03 10:06:02,540] Trial 7 finished with value: 0.8207657920913795 and parameters: {'learning_rate': 9.054856138052428e-06, 'num_train_epochs': 4, 'seed': 13, 'per_device_train_batch_size': 64}. Best is trial 5 with value: 1.5187164025880433.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▂▂▃▄▅▅▆▇▇██
train/global_step,▁▂▂▃▄▅▅▆▇▇██
train/grad_norm,█▄▃▂▂▁▁▂▂▂
train/learning_rate,█▇▆▆▅▄▃▃▂▁

0,1
eval/accuracy,0.81818
eval/f1,0.00258
eval/loss,0.43936
eval/runtime,3.1414
eval/samples_per_second,136.564
eval/steps_per_second,8.595
total_flos,451514786291712.0
train/epoch,4.0
train/global_step,108.0
train/grad_norm,0.48095


Step,Training Loss
10,0.5595
20,0.4883
30,0.4675
40,0.4643
50,0.4698


[I 2025-06-03 10:06:45,048] Trial 8 finished with value: 0.8185703158378601 and parameters: {'learning_rate': 2.6603370412421628e-05, 'num_train_epochs': 1, 'seed': 22, 'per_device_train_batch_size': 32}. Best is trial 5 with value: 1.5187164025880433.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▃▄▆▇██
train/global_step,▁▃▄▆▇██
train/grad_norm,█▇▄▄▁
train/learning_rate,█▆▅▃▁

0,1
eval/accuracy,0.81857
eval/f1,0.0
eval/loss,0.45352
eval/runtime,3.1068
eval/samples_per_second,138.084
eval/steps_per_second,8.691
total_flos,112878696572928.0
train/epoch,1.0
train/global_step,54.0
train/grad_norm,0.52848


Step,Training Loss
10,0.5228
20,0.4565
30,0.4123
40,0.4046
50,0.3774
60,0.3521
70,0.3134
80,0.3343
90,0.2945
100,0.2964


[I 2025-06-03 10:08:03,073] Trial 9 finished with value: 1.4724458319956661 and parameters: {'learning_rate': 9.820749462907982e-05, 'num_train_epochs': 2, 'seed': 1, 'per_device_train_batch_size': 32}. Best is trial 5 with value: 1.5187164025880433.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▂▂▃▄▅▅▆▇▇██
train/global_step,▁▂▂▃▄▅▅▆▇▇██
train/grad_norm,▁▂█▃▄▄▄█▇▆
train/learning_rate,█▇▆▆▅▄▃▃▂▁

0,1
eval/accuracy,0.86946
eval/f1,0.60298
eval/loss,0.31735
eval/runtime,3.0889
eval/samples_per_second,138.885
eval/steps_per_second,8.741
total_flos,225757393145856.0
train/epoch,2.0
train/global_step,108.0
train/grad_norm,1.49223


Step,Training Loss
10,0.5246
20,0.4509
30,0.4143
40,0.3742
50,0.3699
60,0.3278
70,0.2987
80,0.2935
90,0.2529
100,0.252


[I 2025-06-03 10:10:57,859] Trial 10 finished with value: 1.505985565126013 and parameters: {'learning_rate': 4.8450402615805846e-05, 'num_train_epochs': 5, 'seed': 31, 'per_device_train_batch_size': 64}. Best is trial 5 with value: 1.5187164025880433.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▂▂▃▃▄▄▅▅▆▇▇███
train/global_step,▁▂▂▃▃▄▄▅▅▆▇▇███
train/grad_norm,▁▁▂▂▅▃▂▃▄▃▃█▅
train/learning_rate,█▇▇▆▆▅▅▄▃▃▂▂▁

0,1
eval/accuracy,0.87685
eval/f1,0.62914
eval/loss,0.31048
eval/runtime,3.0935
eval/samples_per_second,138.679
eval/steps_per_second,8.728
total_flos,564393482864640.0
train/epoch,5.0
train/global_step,135.0
train/grad_norm,1.57493


Step,Training Loss
10,0.6336
20,0.5397
30,0.531
40,0.5154
50,0.4759
60,0.4583
70,0.4983
80,0.529
90,0.4532
100,0.4601


[I 2025-06-03 10:16:15,184] Trial 11 finished with value: 1.4398800766092807 and parameters: {'learning_rate': 8.277623152947889e-06, 'num_train_epochs': 5, 'seed': 8, 'per_device_train_batch_size': 4}. Best is trial 5 with value: 1.5187164025880433.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▄▄▄▅▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
train/global_step,▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇███
train/grad_norm,▁▁▁▁▁▁▂▁▁▁▁▁▁▁▂▂▁▁▂▁▁▁▂▁▁▁▂▁█▂▁▂▂▂▂▂▁▂▁▂
train/learning_rate,█████▇▇▇▇▇▆▆▅▅▅▄▄▄▄▄▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▁▁▁▁

0,1
eval/accuracy,0.86053
eval/f1,0.57935
eval/loss,0.32244
eval/runtime,3.1472
eval/samples_per_second,136.313
eval/steps_per_second,8.579
total_flos,564393482864640.0
train/epoch,5.0
train/global_step,2145.0
train/grad_norm,6.40687


Step,Training Loss
10,0.6793
20,0.6597
30,0.6381
40,0.6272
50,0.5989
60,0.5806
70,0.5487
80,0.5408
90,0.5561
100,0.5409


[I 2025-06-03 10:21:32,052] Trial 12 finished with value: 0.9408406362288766 and parameters: {'learning_rate': 1.3772421528489252e-06, 'num_train_epochs': 5, 'seed': 26, 'per_device_train_batch_size': 4}. Best is trial 5 with value: 1.5187164025880433.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▁▁▁▁▁▂▂▃▃▃▃▃▄▄▄▄▄▄▄▄▄▅▅▅▆▆▆▆▆▆▇▇▇▇███
train/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇█████
train/grad_norm,▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▂▂▁▁▁▁▁▁▃▂▁▁▁▂▁▁
train/learning_rate,██▇▇▇▇▇▇▇▆▆▆▆▆▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▃▂▂▂▂▁▁▁▁

0,1
eval/accuracy,0.82712
eval/f1,0.11372
eval/loss,0.41927
eval/runtime,3.1039
eval/samples_per_second,138.215
eval/steps_per_second,8.699
total_flos,564393482864640.0
train/epoch,5.0
train/global_step,2145.0
train/grad_norm,2.32379


Step,Training Loss
10,0.5366
20,0.4642
30,0.4395
40,0.4059
50,0.3795
60,0.3569
70,0.3195
80,0.311
90,0.2955
100,0.281


[I 2025-06-03 10:23:58,194] Trial 13 finished with value: 1.4006879825932192 and parameters: {'learning_rate': 5.096667706931299e-05, 'num_train_epochs': 4, 'seed': 10, 'per_device_train_batch_size': 64}. Best is trial 5 with value: 1.5187164025880433.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▂▂▃▄▅▅▆▇▇██
train/global_step,▁▂▂▃▄▅▅▆▇▇██
train/grad_norm,▁▃▂▃▃█▄▃▆▄
train/learning_rate,█▇▆▆▅▄▃▃▂▁

0,1
eval/accuracy,0.86286
eval/f1,0.53783
eval/loss,0.33175
eval/runtime,3.083
eval/samples_per_second,139.15
eval/steps_per_second,8.758
total_flos,451514786291712.0
train/epoch,4.0
train/global_step,108.0
train/grad_norm,0.89949


Step,Training Loss
10,0.6786
20,0.6299
30,0.5979
40,0.5464
50,0.5447
60,0.5002
70,0.4948
80,0.4959
90,0.4677
100,0.4864


[I 2025-06-03 10:29:15,757] Trial 14 finished with value: 1.3665165645664397 and parameters: {'learning_rate': 4.616547000021472e-06, 'num_train_epochs': 5, 'seed': 40, 'per_device_train_batch_size': 4}. Best is trial 5 with value: 1.5187164025880433.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇██
train/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇██
train/grad_norm,▁▁▃▁▁▁▁▂▁▁▁▁▂▁▂▃▂▁█▂▂▂▂▁▂▃▃▄▃▃▂▃▂▄▂▄▂▂▂▄
train/learning_rate,████▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▄▄▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁

0,1
eval/accuracy,0.85975
eval/f1,0.50677
eval/loss,0.3392
eval/runtime,3.126
eval/samples_per_second,137.234
eval/steps_per_second,8.637
total_flos,564393482864640.0
train/epoch,5.0
train/global_step,2145.0
train/grad_norm,4.90765


Step,Training Loss
10,0.6167
20,0.5029
30,0.4752
40,0.4713
50,0.4588
60,0.4513
70,0.4474
80,0.4483
90,0.4404
100,0.4338


[I 2025-06-03 10:31:41,489] Trial 15 finished with value: 0.8394036491711935 and parameters: {'learning_rate': 1.2356753234589839e-05, 'num_train_epochs': 4, 'seed': 24, 'per_device_train_batch_size': 64}. Best is trial 5 with value: 1.5187164025880433.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▂▂▃▄▅▅▆▇▇██
train/global_step,▁▂▂▃▄▅▅▆▇▇██
train/grad_norm,█▆▁▁▂▂▃▃▃▂
train/learning_rate,█▇▆▆▅▄▃▃▂▁

0,1
eval/accuracy,0.81857
eval/f1,0.02083
eval/loss,0.42354
eval/runtime,3.1113
eval/samples_per_second,137.885
eval/steps_per_second,8.678
total_flos,451514786291712.0
train/epoch,4.0
train/global_step,108.0
train/grad_norm,0.52328


Step,Training Loss
10,0.6715
20,0.6501
30,0.643
40,0.6366
50,0.6256
60,0.6105
70,0.6073
80,0.6091
90,0.5724
100,0.5705


[I 2025-06-03 10:36:59,077] Trial 16 finished with value: 0.8363118282070866 and parameters: {'learning_rate': 1.1389290796491654e-06, 'num_train_epochs': 5, 'seed': 8, 'per_device_train_batch_size': 4}. Best is trial 5 with value: 1.5187164025880433.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇█████
train/grad_norm,▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▃▁▁▁▁▃▁▁▂▁▁▁▁▁▆▁▁▁▁▁▁▂▁▁
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁

0,1
eval/accuracy,0.81779
eval/f1,0.01852
eval/loss,0.42354
eval/runtime,3.1102
eval/samples_per_second,137.934
eval/steps_per_second,8.681
total_flos,564393482864640.0
train/epoch,5.0
train/global_step,2145.0
train/grad_norm,2.21197


Step,Training Loss
10,0.5364
20,0.4675
30,0.4449
40,0.4135
50,0.3864
60,0.3758
70,0.3394
80,0.3466


[I 2025-06-03 10:38:51,184] Trial 17 finished with value: 1.289187483838233 and parameters: {'learning_rate': 4.930133545738415e-05, 'num_train_epochs': 3, 'seed': 28, 'per_device_train_batch_size': 64}. Best is trial 5 with value: 1.5187164025880433.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▂▃▄▅▆▇███
train/global_step,▁▂▃▄▅▆▇███
train/grad_norm,▁▂▃▃▂▆▅█
train/learning_rate,█▇▆▅▄▃▂▁

0,1
eval/accuracy,0.85781
eval/f1,0.43138
eval/loss,0.35363
eval/runtime,3.1205
eval/samples_per_second,137.478
eval/steps_per_second,8.652
total_flos,338636089718784.0
train/epoch,3.0
train/global_step,81.0
train/grad_norm,1.34354


Step,Training Loss
10,0.6087
20,0.5301
30,0.4647
40,0.4611
50,0.463
60,0.4715
70,0.4865
80,0.4565
90,0.4571
100,0.4522


[I 2025-06-03 10:43:07,813] Trial 18 finished with value: 1.4963778575828068 and parameters: {'learning_rate': 1.595048152275773e-05, 'num_train_epochs': 4, 'seed': 17, 'per_device_train_batch_size': 4}. Best is trial 5 with value: 1.5187164025880433.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
train/grad_norm,▁▄▁▂▂▂▃▃▃▂▂▂▂▃▄▃▄▃▄▃▂▄▅▃▃▇█▅▃▂▂▃▄█▅▆▅▅▄▄
train/learning_rate,██▇▇▇▇▇▆▆▆▆▅▅▅▅▅▅▅▄▄▄▄▄▄▄▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁

0,1
eval/accuracy,0.87218
eval/f1,0.62419
eval/loss,0.31198
eval/runtime,3.1334
eval/samples_per_second,136.914
eval/steps_per_second,8.617
total_flos,451514786291712.0
train/epoch,4.0
train/global_step,1716.0
train/grad_norm,4.14306


Step,Training Loss
10,0.647
20,0.5717
30,0.5371
40,0.4886
50,0.4634
60,0.4881
70,0.4694
80,0.4803
90,0.467
100,0.4527


[I 2025-06-03 10:44:40,352] Trial 19 finished with value: 0.8211744825045267 and parameters: {'learning_rate': 6.110880845117523e-06, 'num_train_epochs': 2, 'seed': 13, 'per_device_train_batch_size': 16}. Best is trial 5 with value: 1.5187164025880433.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇███
train/global_step,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇███
train/grad_norm,█▂▁▂▁▁▁▁▁▂▁▁▂▄▁▁▂▁▁▁▁
train/learning_rate,██▇▇▇▆▆▆▅▅▄▄▄▃▃▃▂▂▂▁▁

0,1
eval/accuracy,0.81857
eval/f1,0.0026
eval/loss,0.43891
eval/runtime,3.0732
eval/samples_per_second,139.595
eval/steps_per_second,8.786
total_flos,225757393145856.0
train/epoch,2.0
train/global_step,216.0
train/grad_norm,0.78939


Step,Training Loss
10,0.6827
20,0.6566
30,0.6327
40,0.6114
50,0.5803
60,0.5603
70,0.5597
80,0.5356
90,0.5373
100,0.5273


[I 2025-06-03 10:47:13,576] Trial 20 finished with value: 0.8185703158378601 and parameters: {'learning_rate': 1.949953490759122e-06, 'num_train_epochs': 3, 'seed': 5, 'per_device_train_batch_size': 8}. Best is trial 5 with value: 1.5187164025880433.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▁▁▂▂▂▂▂▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
train/global_step,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,▁▁▁▁▁▁▁▁▁▁▁▃▁▂▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁
train/learning_rate,████▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁

0,1
eval/accuracy,0.81857
eval/f1,0.0
eval/loss,0.4489
eval/runtime,3.1446
eval/samples_per_second,136.424
eval/steps_per_second,8.586
total_flos,338636089718784.0
train/epoch,3.0
train/global_step,645.0
train/grad_norm,1.6777


Step,Training Loss
10,0.5635
20,0.4949
30,0.4436
40,0.481
50,0.49
60,0.4517
70,0.4561
80,0.481
90,0.4338
100,0.4695


[I 2025-06-03 10:50:33,539] Trial 21 finished with value: 1.4812132981062152 and parameters: {'learning_rate': 2.7574078508721433e-05, 'num_train_epochs': 4, 'seed': 15, 'per_device_train_batch_size': 8}. Best is trial 5 with value: 1.5187164025880433.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇██
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇██
train/grad_norm,▇▁▂▁▁▁▂▂▅▄▂▅▂▂▃▂▂▂▂▂▂▂▁▂▃▅▂▃▁█▁▂▁▂▁▃▁▂▂▂
train/learning_rate,████▇▇▇▇▆▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁

0,1
eval/accuracy,0.87218
eval/f1,0.60903
eval/loss,0.315
eval/runtime,3.1208
eval/samples_per_second,137.467
eval/steps_per_second,8.652
total_flos,451514786291712.0
train/epoch,4.0
train/global_step,860.0
train/grad_norm,2.37182


Step,Training Loss
10,0.5538
20,0.4631
30,0.4519
40,0.4606
50,0.4941
60,0.4652
70,0.4507
80,0.4469
90,0.3977
100,0.3918


[I 2025-06-03 10:53:54,838] Trial 22 finished with value: 1.512016756293562 and parameters: {'learning_rate': 3.0174670916113646e-05, 'num_train_epochs': 4, 'seed': 21, 'per_device_train_batch_size': 8}. Best is trial 5 with value: 1.5187164025880433.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/grad_norm,▁▁▂▂▂▃▂▂▄▃▂▂▂▃▃▂▃▃▄█▃▄▇▂▃▅▃▃▅▄▄▄▂▄▄▅▁▂▅▃
train/learning_rate,█████▇▇▇▇▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁

0,1
eval/accuracy,0.87685
eval/f1,0.63517
eval/loss,0.31274
eval/runtime,3.1503
eval/samples_per_second,136.179
eval/steps_per_second,8.571
total_flos,451514786291712.0
train/epoch,4.0
train/global_step,860.0
train/grad_norm,0.94427


Step,Training Loss
10,0.5148
20,0.4643
30,0.4577
40,0.456
50,0.4935
60,0.4434
70,0.4625
80,0.4725
90,0.4128
100,0.4036


[I 2025-06-03 10:58:02,597] Trial 23 finished with value: 1.5139115192658783 and parameters: {'learning_rate': 6.205966913904076e-05, 'num_train_epochs': 5, 'seed': 21, 'per_device_train_batch_size': 8}. Best is trial 5 with value: 1.5187164025880433.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▅▅▆▆▇▇▇▇▇▇████
train/grad_norm,▁▁▂▂▁▂▃▃▂▂▃▂▃▃▄▂▄▂▂▃▃█▄▄▃▃▃▂▂▃▁▃▂▂▂▃▃▂▂▁
train/learning_rate,███▇▇▇▇▇▇▇▇▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▄▄▃▃▂▂▂▂▂▂▂▁▁▁

0,1
eval/accuracy,0.8749
eval/f1,0.63901
eval/loss,0.35173
eval/runtime,3.1067
eval/samples_per_second,138.089
eval/steps_per_second,8.691
total_flos,564393482864640.0
train/epoch,5.0
train/global_step,1075.0
train/grad_norm,1.77204


Step,Training Loss
10,0.5187
20,0.4864
30,0.4873
40,0.465
50,0.4515
60,0.4807
70,0.4222
80,0.4148
90,0.3897
100,0.3716


[I 2025-06-03 11:02:09,719] Trial 24 finished with value: 1.5106158267844902 and parameters: {'learning_rate': 6.398537699181085e-05, 'num_train_epochs': 5, 'seed': 24, 'per_device_train_batch_size': 8}. Best is trial 5 with value: 1.5187164025880433.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇███
train/global_step,▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇███
train/grad_norm,▂▁▂▂▃▄▄▃▃▂▄▄▃▄▄█▃▄▄▅▄▅▂▃▅▅▃▅▅▅▄█▁▅▂▆▃▁▆▃
train/learning_rate,█████▇▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▁▁▁

0,1
eval/accuracy,0.87335
eval/f1,0.63727
eval/loss,0.36648
eval/runtime,3.1345
eval/samples_per_second,136.863
eval/steps_per_second,8.614
total_flos,564393482864640.0
train/epoch,5.0
train/global_step,1075.0
train/grad_norm,2.24826


Step,Training Loss
10,0.524
20,0.4712
30,0.4534
40,0.4517
50,0.4146
60,0.3966
70,0.3532
80,0.3485
90,0.3114
100,0.3031


[I 2025-06-03 11:05:09,228] Trial 25 finished with value: 1.4898555929862456 and parameters: {'learning_rate': 7.263462430285637e-05, 'num_train_epochs': 5, 'seed': 28, 'per_device_train_batch_size': 64}. Best is trial 5 with value: 1.5187164025880433.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▂▂▃▃▄▄▅▅▆▇▇███
train/global_step,▁▂▂▃▃▄▄▅▅▆▇▇███
train/grad_norm,▁▁█▂▁▁▁▁▂▁▁▁▁
train/learning_rate,█▇▇▆▆▅▄▄▃▃▂▂▁

0,1
eval/accuracy,0.87102
eval/f1,0.61884
eval/loss,0.31332
eval/runtime,3.0975
eval/samples_per_second,138.499
eval/steps_per_second,8.717
total_flos,564393482864640.0
train/epoch,5.0
train/global_step,135.0
train/grad_norm,1.3281


Step,Training Loss
10,0.549
20,0.5032
30,0.4573
40,0.4584
50,0.4618
60,0.4764
70,0.4782
80,0.448
90,0.4355
100,0.4258


[I 2025-06-03 11:10:27,030] Trial 26 finished with value: 1.5678550150950614 and parameters: {'learning_rate': 3.5021916782646125e-05, 'num_train_epochs': 5, 'seed': 17, 'per_device_train_batch_size': 4}. Best is trial 26 with value: 1.5678550150950614.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▅▅▆▆▆▆▇▇▇█████
train/global_step,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇█████
train/grad_norm,▁▂▁▁▁▂▂▂▁▁▄▃▂▄▂▃▂▃▃▂▃▂▂▄▅█▄▂▂▂▂▂▄▂▂▁▁▁▁▄
train/learning_rate,█████▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▄▄▄▄▄▄▃▃▃▃▂▂▂▂▂▂▂▁▁▁

0,1
eval/accuracy,0.88811
eval/f1,0.67974
eval/loss,0.34018
eval/runtime,3.1205
eval/samples_per_second,137.479
eval/steps_per_second,8.653
total_flos,564393482864640.0
train/epoch,5.0
train/global_step,2145.0
train/grad_norm,5.24679


Step,Training Loss
10,0.5722
20,0.495
30,0.4884
40,0.5041
50,0.5167
60,0.4708
70,0.4684
80,0.462
90,0.4931
100,0.4034


[I 2025-06-03 11:15:44,568] Trial 27 finished with value: 1.5202695585938395 and parameters: {'learning_rate': 3.9047439818747774e-05, 'num_train_epochs': 5, 'seed': 12, 'per_device_train_batch_size': 4}. Best is trial 26 with value: 1.5678550150950614.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▆▆▆▇▇▇▇▇▇▇▇▇█████
train/grad_norm,▃▁▂▂▃▃▁▅▃▁▄▂▆▄▃▃▂▄▆█▃▅▅▇▃▅▃▂▁▇▃▁▄▂▅▂▅▁▃▄
train/learning_rate,████▇▇▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▁▁▁

0,1
eval/accuracy,0.87451
eval/f1,0.64576
eval/loss,0.38192
eval/runtime,3.1493
eval/samples_per_second,136.223
eval/steps_per_second,8.573
total_flos,564393482864640.0
train/epoch,5.0
train/global_step,2145.0
train/grad_norm,3.67023


Step,Training Loss
10,0.5663
20,0.5101
30,0.4557
40,0.4541
50,0.47
60,0.4758
70,0.451
80,0.4321
90,0.4727
100,0.4303


[I 2025-06-03 11:20:01,474] Trial 28 finished with value: 1.5353990970019056 and parameters: {'learning_rate': 3.9452710311199766e-05, 'num_train_epochs': 4, 'seed': 10, 'per_device_train_batch_size': 4}. Best is trial 26 with value: 1.5678550150950614.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▁▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇███
train/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇███
train/grad_norm,▂▂▅▂▂▃▃▂▂▂▂▄▃▂▂▄▃▄█▅▂▄▃▄▃▅▂▂▂▁▅▂▅▂▂▂▃▄▅▃
train/learning_rate,████▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▄▄▄▄▄▄▄▃▃▃▃▃▃▃▂▂▁▁▁▁

0,1
eval/accuracy,0.88073
eval/f1,0.65467
eval/loss,0.34318
eval/runtime,3.1644
eval/samples_per_second,135.571
eval/steps_per_second,8.532
total_flos,451514786291712.0
train/epoch,4.0
train/global_step,1716.0
train/grad_norm,1.08254


Step,Training Loss
10,0.5728
20,0.4784
30,0.4703
40,0.4528
50,0.489
60,0.4562
70,0.4403
80,0.4355
90,0.4085
100,0.4321


[I 2025-06-03 11:25:19,025] Trial 29 finished with value: 1.5184428131078924 and parameters: {'learning_rate': 3.791832627911556e-05, 'num_train_epochs': 5, 'seed': 11, 'per_device_train_batch_size': 4}. Best is trial 26 with value: 1.5678550150950614.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇█████
train/grad_norm,▁▂▃▂▂▄▂▂▂▃▂▄▃▃▇▃▃▄▂▁▃▃▁▁▇▁▂▄▇▂▄▁▄▁▄▁▄▂█▄
train/learning_rate,█████▇▇▇▇▆▆▆▆▆▆▆▆▆▅▅▅▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▁▁▁

0,1
eval/accuracy,0.87762
eval/f1,0.64082
eval/loss,0.37262
eval/runtime,3.1236
eval/samples_per_second,137.34
eval/steps_per_second,8.644
total_flos,564393482864640.0
train/epoch,5.0
train/global_step,2145.0
train/grad_norm,5.49075


Step,Training Loss
10,0.5625
20,0.4948
30,0.4817
40,0.4956
50,0.4297
60,0.4615
70,0.4562
80,0.4679
90,0.3551
100,0.4164


[I 2025-06-03 11:30:36,449] Trial 30 finished with value: 1.5106682163638778 and parameters: {'learning_rate': 3.7298113013766674e-05, 'num_train_epochs': 5, 'seed': 4, 'per_device_train_batch_size': 4}. Best is trial 26 with value: 1.5678550150950614.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇▇▇█
train/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇████
train/grad_norm,▂▂▂▂█▅▂▂▄▂▄▃▂▄▅▂▂▅▃▂▂▄▃▄▄▁▁▁▅▃▄▁▂▄▂▂▂▁▁▂
train/learning_rate,████▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▁

0,1
eval/accuracy,0.87685
eval/f1,0.63382
eval/loss,0.3594
eval/runtime,3.1275
eval/samples_per_second,137.169
eval/steps_per_second,8.633
total_flos,564393482864640.0
train/epoch,5.0
train/global_step,2145.0
train/grad_norm,0.62717


Step,Training Loss
10,0.5946
20,0.5188
30,0.4567
40,0.4555
50,0.4623
60,0.4758
70,0.4856
80,0.4583
90,0.4521
100,0.4597


[I 2025-06-03 11:34:53,350] Trial 31 finished with value: 1.5056400373941936 and parameters: {'learning_rate': 1.8750115103879646e-05, 'num_train_epochs': 4, 'seed': 17, 'per_device_train_batch_size': 4}. Best is trial 26 with value: 1.5678550150950614.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇██
train/global_step,▁▁▁▁▁▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇███
train/grad_norm,▁▁▁▂▁▂▃▃▁▃▃▃▃▅▄▃▄▂▂▃▄▃▂█▅▃▆▄▆▅▂▅▄▄▆▂▄▁▁▇
train/learning_rate,█████▇▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▁▁▁

0,1
eval/accuracy,0.87141
eval/f1,0.63423
eval/loss,0.31195
eval/runtime,3.1294
eval/samples_per_second,137.086
eval/steps_per_second,8.628
total_flos,451514786291712.0
train/epoch,4.0
train/global_step,1716.0
train/grad_norm,4.37003


Step,Training Loss
10,0.5351
20,0.5202
30,0.4633
40,0.4717
50,0.4912
60,0.5066
70,0.4909
80,0.4705
90,0.4959
100,0.4802


[I 2025-06-03 11:38:08,931] Trial 32 finished with value: 1.2724380413645149 and parameters: {'learning_rate': 7.575570675546112e-05, 'num_train_epochs': 3, 'seed': 10, 'per_device_train_batch_size': 4}. Best is trial 26 with value: 1.5678550150950614.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇███
train/grad_norm,▁▁▁▁▁▁▁▁▁▂▁▂▂▁▃▁▁▃▁▂▁▂▁▂▃▁▂▂▂▁▁█▂▂▁▅▃▆▄▃
train/learning_rate,███▇▇▇▇▇▇▇▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁

0,1
eval/accuracy,0.85664
eval/f1,0.41579
eval/loss,0.35169
eval/runtime,3.1088
eval/samples_per_second,137.996
eval/steps_per_second,8.685
total_flos,338636089718784.0
train/epoch,3.0
train/global_step,1287.0
train/grad_norm,2.89909


Step,Training Loss
10,0.5677
20,0.4746
30,0.4427
40,0.4406
50,0.4683
60,0.4649
70,0.4048
80,0.3859
90,0.4554
100,0.4183


[I 2025-06-03 11:42:25,700] Trial 33 finished with value: 1.4920143043588983 and parameters: {'learning_rate': 3.936314644109776e-05, 'num_train_epochs': 4, 'seed': 7, 'per_device_train_batch_size': 4}. Best is trial 26 with value: 1.5678550150950614.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇███
train/global_step,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▇▇▇▇▇███
train/grad_norm,▂▂▄▃▂▃▂▃▂▃▃▄▂▂▂▁▃▂▇▃▄▂█▃▁▁▄▄▅▁▄▁▂▂▂▂▄▂▁▂
train/learning_rate,██████▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▅▅▄▄▄▄▃▃▃▃▃▃▂▂▂▁▁▁▁

0,1
eval/accuracy,0.86985
eval/f1,0.62216
eval/loss,0.35487
eval/runtime,3.1393
eval/samples_per_second,136.656
eval/steps_per_second,8.601
total_flos,451514786291712.0
train/epoch,4.0
train/global_step,1716.0
train/grad_norm,1.73392


Step,Training Loss
10,0.5837
20,0.5108
30,0.4558
40,0.4559
50,0.4626
60,0.4749
70,0.4903
80,0.4574
90,0.4511
100,0.4561


[I 2025-06-03 11:45:41,469] Trial 34 finished with value: 1.4844831788404251 and parameters: {'learning_rate': 2.1526212116224945e-05, 'num_train_epochs': 3, 'seed': 17, 'per_device_train_batch_size': 4}. Best is trial 26 with value: 1.5678550150950614.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇███
train/global_step,▁▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
train/grad_norm,▁▁▁▁▂▂▂▂▂▂▂▂▃▁▃▄▂▃▄▃▁▃▄▅▇▂▁▄▁▃▄▄▅▅█▆▇▆▁▆
train/learning_rate,█████▇▇▇▇▇▇▇▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▄▃▃▃▂▂▂▂▂▂▁▁▁

0,1
eval/accuracy,0.87063
eval/f1,0.61385
eval/loss,0.31424
eval/runtime,3.1201
eval/samples_per_second,137.497
eval/steps_per_second,8.654
total_flos,338636089718784.0
train/epoch,3.0
train/global_step,1287.0
train/grad_norm,3.15329


Step,Training Loss
10,0.524
20,0.4875
30,0.483
40,0.4332
50,0.4415
60,0.4116
70,0.3923
80,0.3912
90,0.3746
100,0.398


[I 2025-06-03 11:48:34,749] Trial 35 finished with value: 1.519496868903383 and parameters: {'learning_rate': 9.541379617842349e-05, 'num_train_epochs': 4, 'seed': 12, 'per_device_train_batch_size': 16}. Best is trial 26 with value: 1.5678550150950614.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,▁▁▂▅▂▂▃▆▄▅▂▄▃▄▅▅▄▇▇▃▅▂▆▄▆▄▄▃▆▆▄▂▆█▄▅▅▃▃▂
train/learning_rate,███▇▇▇▇▇▇▇▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▁▁▁

0,1
eval/accuracy,0.87646
eval/f1,0.64304
eval/loss,0.31513
eval/runtime,3.1537
eval/samples_per_second,136.029
eval/steps_per_second,8.561
total_flos,451514786291712.0
train/epoch,4.0
train/global_step,432.0
train/grad_norm,0.98061


Step,Training Loss
10,0.5231
20,0.4823
30,0.4507
40,0.3987
50,0.4169
60,0.3704
70,0.3731
80,0.3599
90,0.3402
100,0.3881


[I 2025-06-03 11:52:07,816] Trial 36 finished with value: 1.499630428537532 and parameters: {'learning_rate': 9.983400876867083e-05, 'num_train_epochs': 5, 'seed': 12, 'per_device_train_batch_size': 16}. Best is trial 26 with value: 1.5678550150950614.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
train/grad_norm,▁▁▁▂▂▃▃▂▄▄▃▅▃▃▂▂▃▂▅▃█▄▄▃▃▃▄▄▂▂▂▄▃▂▂▂▂▁▂▃
train/learning_rate,████▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁

0,1
eval/accuracy,0.87335
eval/f1,0.62628
eval/loss,0.35139
eval/runtime,3.1155
eval/samples_per_second,137.701
eval/steps_per_second,8.666
total_flos,564393482864640.0
train/epoch,5.0
train/global_step,540.0
train/grad_norm,2.23703


Step,Training Loss
10,0.5981
20,0.4928
30,0.4746
40,0.4719
50,0.4562
60,0.4532
70,0.4304
80,0.4572
90,0.4231
100,0.434


[I 2025-06-03 11:55:00,706] Trial 37 finished with value: 1.3688209053914293 and parameters: {'learning_rate': 1.4333314864146205e-05, 'num_train_epochs': 4, 'seed': 15, 'per_device_train_batch_size': 16}. Best is trial 26 with value: 1.5678550150950614.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,▂▆▁▂▂▂▂▃▂▃▂▂▃▄▃▃▃▃▃▄▂▂▅▃▃▅▃▃▄▃█▃▄▃▅▅▄▃▄▃
train/learning_rate,████▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▁▁

0,1
eval/accuracy,0.86208
eval/f1,0.50674
eval/loss,0.33402
eval/runtime,3.1257
eval/samples_per_second,137.249
eval/steps_per_second,8.638
total_flos,451514786291712.0
train/epoch,4.0
train/global_step,432.0
train/grad_norm,1.90518


Step,Training Loss
10,0.5171
20,0.4635
30,0.4669
40,0.4379
50,0.4054
60,0.3978
70,0.3754
80,0.3761
90,0.3785
100,0.388


[I 2025-06-03 11:57:53,448] Trial 38 finished with value: 1.527005322604039 and parameters: {'learning_rate': 5.695681361470775e-05, 'num_train_epochs': 4, 'seed': 6, 'per_device_train_batch_size': 16}. Best is trial 26 with value: 1.5678550150950614.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,▄▁▁▃▃▅▂▅▅▄▄▃▄▄▃▅▄▆▅▄▆▄▅▄▅▆▄█▆▆▆▄▅▆▄▅▅▄▆█
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁

0,1
eval/accuracy,0.87646
eval/f1,0.65055
eval/loss,0.31334
eval/runtime,3.1242
eval/samples_per_second,137.314
eval/steps_per_second,8.642
total_flos,451514786291712.0
train/epoch,4.0
train/global_step,432.0
train/grad_norm,3.18984


Step,Training Loss
10,0.5468
20,0.4728
30,0.4391
40,0.4512
50,0.4288
60,0.4306
70,0.4076
80,0.4015
90,0.3957
100,0.4064


[I 2025-06-03 12:01:26,622] Trial 39 finished with value: 1.5014818466942472 and parameters: {'learning_rate': 3.298201014409134e-05, 'num_train_epochs': 5, 'seed': 4, 'per_device_train_batch_size': 16}. Best is trial 26 with value: 1.5678550150950614.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,▁▁▁▁▂▁▁▁▁▁▁▂▁▂▁█▁▃▁▁▂▂▂▁▁▁▁▂▁▂▂▁▂▁▁▂▂▁▁▂
train/learning_rate,████▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▁▁

0,1
eval/accuracy,0.87179
eval/f1,0.62969
eval/loss,0.32513
eval/runtime,3.1259
eval/samples_per_second,137.242
eval/steps_per_second,8.638
total_flos,564393482864640.0
train/epoch,5.0
train/global_step,540.0
train/grad_norm,2.45258


Step,Training Loss
10,0.5832
20,0.4728
30,0.4591
40,0.4575
50,0.4365
60,0.424
70,0.4012
80,0.4192
90,0.3877
100,0.3884


[I 2025-06-03 12:03:24,621] Trial 40 finished with value: 1.2751717063175845 and parameters: {'learning_rate': 2.2386199116924946e-05, 'num_train_epochs': 3, 'seed': 1, 'per_device_train_batch_size': 32}. Best is trial 26 with value: 1.5678550150950614.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▂▂▃▃▄▄▅▅▆▆▇▇▇███
train/global_step,▁▁▂▂▃▃▄▄▅▅▆▆▇▇▇███
train/grad_norm,▂▃▃▁▃▃▃▂█▃▃▃▄▄▅▄
train/learning_rate,██▇▇▆▆▅▅▄▄▃▃▂▂▁▁

0,1
eval/accuracy,0.85897
eval/f1,0.4162
eval/loss,0.35885
eval/runtime,3.1083
eval/samples_per_second,138.016
eval/steps_per_second,8.686
total_flos,338636089718784.0
train/epoch,3.0
train/global_step,162.0
train/grad_norm,1.12343


Step,Training Loss
10,0.5134
20,0.4609
30,0.461
40,0.4359
50,0.4025
60,0.3849
70,0.3726
80,0.3765
90,0.3716
100,0.3868


[I 2025-06-03 12:06:17,827] Trial 41 finished with value: 1.523909246766559 and parameters: {'learning_rate': 5.7135431091557214e-05, 'num_train_epochs': 4, 'seed': 6, 'per_device_train_batch_size': 16}. Best is trial 26 with value: 1.5678550150950614.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,▁▁▂▃▃▄▃▆▅▃▃▃▄▅▃▅▅▆▅▃▅▅▅▄▆▆▅▇▆▅▄▃▃▅▆▄▆▃▅█
train/learning_rate,███▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▁▁

0,1
eval/accuracy,0.87801
eval/f1,0.6459
eval/loss,0.3136
eval/runtime,3.1282
eval/samples_per_second,137.141
eval/steps_per_second,8.631
total_flos,451514786291712.0
train/epoch,4.0
train/global_step,432.0
train/grad_norm,3.35113


Step,Training Loss
10,0.5136
20,0.4615
30,0.4682
40,0.4616
50,0.4324
60,0.418
70,0.3947
80,0.4036
90,0.4005
100,0.4017


[I 2025-06-03 12:09:10,716] Trial 42 finished with value: 1.5419393485038584 and parameters: {'learning_rate': 5.630144377390453e-05, 'num_train_epochs': 4, 'seed': 6, 'per_device_train_batch_size': 16}. Best is trial 26 with value: 1.5678550150950614.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,▁▁▁▂▂▁▁▂█▁▁▁▁▂▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▂▁▂▁▁▁▁
train/learning_rate,███▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▁▁

0,1
eval/accuracy,0.88073
eval/f1,0.66121
eval/loss,0.30556
eval/runtime,3.1199
eval/samples_per_second,137.507
eval/steps_per_second,8.654
total_flos,451514786291712.0
train/epoch,4.0
train/global_step,432.0
train/grad_norm,2.13836


Step,Training Loss
10,0.5089
20,0.4648
30,0.4803
40,0.4567
50,0.424
60,0.4173
70,0.3934
80,0.3979
90,0.4083
100,0.4106


Step,Training Loss
10,0.5089
20,0.4648
30,0.4803
40,0.4567
50,0.424
60,0.4173
70,0.3934
80,0.3979
90,0.4083
100,0.4106


[I 2025-06-03 12:12:03,873] Trial 43 finished with value: 1.5362486714002346 and parameters: {'learning_rate': 6.170639222418016e-05, 'num_train_epochs': 4, 'seed': 6, 'per_device_train_batch_size': 16}. Best is trial 26 with value: 1.5678550150950614.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-hausa and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,▂▁▁▅▂▃▁▃▂▁▂▂▂▂▃▂▂▃▁▁▂▂▂▂▃▃▂▂▂█▂▂▂▃▁▂▂▁▂▂
train/learning_rate,███▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁

0,1
eval/accuracy,0.8819
eval/f1,0.65435
eval/loss,0.29945
eval/runtime,3.1464
eval/samples_per_second,136.347
eval/steps_per_second,8.581
total_flos,451514786291712.0
train/epoch,4.0
train/global_step,432.0
train/grad_norm,2.48868


Step,Training Loss
10,0.5235
20,0.4565
30,0.4388
40,0.4658
50,0.4436
60,0.4935
70,0.466
80,0.4473
90,0.4302
100,0.4347
