<a href="https://colab.research.google.com/github/u21598012/COS-760-EA/blob/BERT-fine-tuned/run.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install optuna

In [1]:
import pandas as pd
import torch
import pandas as pd
import numpy as np
from datasets import load_dataset
import optuna
from transformers import (
    AutoTokenizer, AutoModelForMaskedLM,
    AutoModel,
    AutoConfig,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
    AutoModelForSequenceClassification, get_scheduler
)
from peft import LoraConfig, get_peft_model
import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, multilabel_confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List, Tuple
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Read the dataset CSV from the current working directory and preview its first rows.
df = pd.read_csv('processed_data_it3.csv')
df.head()

ModuleNotFoundError: No module named 'optuna'

In [None]:
# Names of the per-emotion target columns in `df`.
emotion_labels = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']

text = df['stemmed_tokens']

# Cast the target columns to int *before* extracting them, so that the
# `labels` list and the DataFrame columns hold the same integer values.
df[emotion_labels] = df[emotion_labels].astype(int)
labels = df[emotion_labels].values.tolist()


In [None]:
# Checkpoint identifier shared by the tokenizer and the masked-LM model.
model_name = "Davlan/bert-base-multilingual-cased-finetuned-hausa"

# Use CUDA when PyTorch reports it as available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the masked-LM weights, then move the module onto the selected device.
model = AutoModelForMaskedLM.from_pretrained(model_name)
model = model.to(device)

In [None]:
# Inputs are the `text` column; targets are the emotion columns as an array.
all_texts = df['text'].tolist()
all_targets = df[emotion_labels].values

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts,
    all_targets,
    test_size=0.2,
    random_state=42,
)

tokenizer = AutoTokenizer.from_pretrained(
    "Davlan/bert-base-multilingual-cased-finetuned-hausa"
)


In [None]:
class EmotionDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and attaches its labels.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of per-text label vectors, aligned with
            ``texts`` by position.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=...,
            padding=..., max_length=..., return_tensors='pt')`` that returns a
            mapping of tensors with a leading batch dimension of size 1.
        max_length: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        """Return the encoded tensors for item ``idx`` plus a float ``labels`` tensor."""
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        # Drop only the leading batch dimension. A bare ``squeeze()`` would also
        # collapse the sequence dimension whenever it has size 1, producing
        # 0-d tensors that cannot be stacked into a batch.
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        # Float targets are what BCE-style multi-label losses expect.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)


In [None]:
# Wrap each split in an EmotionDataset; both share the same tokenizer and the
# class's default max_length.
train_dataset = EmotionDataset(
    texts=train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
)
val_dataset = EmotionDataset(
    texts=val_texts,
    labels=val_labels,
    tokenizer=tokenizer,
)

In [None]:
def model_init(trial):
    """Build a fresh multi-label sequence-classification model.

    Args:
        trial: Trial object handed in by the caller; it is not consulted here,
            so every call builds the model from the same checkpoint and settings.

    Returns:
        The model returned by ``AutoModelForSequenceClassification.from_pretrained``
        with one output per entry in ``emotion_labels``.
    """
    checkpoint = "Davlan/bert-base-multilingual-cased-finetuned-hausa"
    head_options = {
        "num_labels": len(emotion_labels),
        "problem_type": "multi_label_classification",
    }
    return AutoModelForSequenceClassification.from_pretrained(checkpoint, **head_options)
# model = AutoModelForSequenceClassification.from_pretrained(
#     "Davlan/bert-base-multilingual-cased-finetuned-hausa",
#     num_labels=len(emotion_labels),
#     problem_type="multi_label_classification"
# )

In [None]:
# training_args = TrainingArguments(
#     output_dir="./results",
#     num_train_epochs=2,
#     per_device_train_batch_size=24,
#     per_device_eval_batch_size=24,
#     save_strategy="no",
#     logging_dir='./logs',
#     logging_steps=10,
#     load_best_model_at_end=True,
#     optim="adamw_torch"
# )

# training_args = TrainingArguments(
#     output_dir="./optuna_results", # Use a different output directory for tuning
#     per_device_train_batch_size=4, # You might tune this
#     per_device_eval_batch_size=16,  # You might tune this
#     save_strategy="no",
#     logging_dir='./optuna_logs',    # Use a different logging directory
#     logging_steps=10,
#     load_best_model_at_end=True,
#     metric_for_best_model="f1", # Specify the metric to optimize
#     greater_is_better=True, # Specify if a higher metric is better
#     learning_rate=1.6788168340923137e-05,
#     num_train_epochs=4,
#     seed=3
# )

# Base training configuration used for the hyperparameter search.
training_args = TrainingArguments(
    output_dir="./optuna_results",  # Separate output directory for tuning runs
    per_device_train_batch_size=16,  # Candidate for tuning
    per_device_eval_batch_size=16,   # Candidate for tuning
    save_strategy="no",
    logging_dir='./optuna_logs',     # Separate logging directory for tuning runs
    logging_steps=10,
    # NOTE(review): with save_strategy="no" no checkpoints are written, so
    # there is presumably nothing to reload at the end of training — confirm
    # whether checkpointing was meant to be enabled.
    load_best_model_at_end=True,
    # Must name a key returned by compute_metrics; that function reports
    # f1_macro / f1_micro / f1_samples / accuracy and has no plain "f1".
    metric_for_best_model="f1_macro",
    greater_is_better=True,  # Higher F1 is better
)



In [None]:
def compute_metrics(pred):
    """Compute multi-label F1 scores and element-wise accuracy.

    Args:
        pred: A pair that unpacks to ``(logits, labels)``; both must be
            convertible with ``torch.tensor``.

    Returns:
        A dict with the keys ``f1_macro``, ``f1_micro``, ``f1_samples`` and
        ``accuracy``. ``accuracy`` is the fraction of individual label entries
        predicted correctly, not the exact-match rate per sample.
    """
    logits, labels = pred
    # Independent per-label decision: sigmoid probability above 0.5 counts as positive.
    preds = (torch.sigmoid(torch.tensor(logits)) > 0.5).int()
    labels = torch.tensor(labels).int()
    # zero_division=0 makes the score for labels/samples with no positives an
    # explicit 0 instead of relying on the warn-and-return-0 default.
    return {
        'f1_macro': f1_score(labels, preds, average='macro', zero_division=0),
        'f1_micro': f1_score(labels, preds, average='micro', zero_division=0),
        'f1_samples': f1_score(labels, preds, average='samples', zero_division=0),
        'accuracy': (preds == labels).float().mean().item(),
    }

def custom_loss(outputs, labels):
    """Return the binary cross-entropy-with-logits loss for a model output.

    Args:
        outputs: Object exposing the raw scores as ``outputs.logits``.
        labels: Target tensor passed to the loss alongside the logits.

    Returns:
        The loss tensor produced by a default-configured ``BCEWithLogitsLoss``.
    """
    criterion = BCEWithLogitsLoss()
    raw_scores = outputs.logits
    return criterion(raw_scores, labels)

In [None]:
# The Trainer receives `model_init` rather than a model instance.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Run 20 Optuna trials and keep the one with the highest objective value.
search_options = {
    "backend": "optuna",
    "direction": "maximize",
    "n_trials": 20,
}
best_trial = trainer.hyperparameter_search(**search_options)

print("Best trial:", best_trial, sep="\n")

# trainer.train()

# metrics = trainer.evaluate()
# print("Evaluation metrics:", metrics)

In [None]:
def _count_positive(column):
    """Return how many rows of ``df[column]`` hold the value 1 (0 when none do)."""
    return df[column].value_counts().get(1, 0)


print("Number of records : ", len(df))

# One count per emotion column, reported in the same order as before.
count_anger = _count_positive('anger')
print("Occurrences of 'anger':", count_anger)

count_disgust = _count_positive('disgust')
print("Occurrences of 'disgust':", count_disgust)

count_fear = _count_positive('fear')
print("Occurrences of 'fear':", count_fear)

count_joy = _count_positive('joy')
print("Occurrences of 'joy':", count_joy)

count_sadness = _count_positive('sadness')
print("Occurrences of 'sadness':", count_sadness)

count_surprise = _count_positive('surprise')
print("Occurrences of 'surprise':", count_surprise)
