In [1]:
# ==== Libraries ====
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import os

# ==== Check device ====
# Use the GPU whenever CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)

Device: cuda


In [2]:
# ==== Load grouped GoEmotions splits ====
# Both CSVs are read with pandas defaults; paths are relative to the notebook's
# working directory.
train_path = "../GoemotionsDataset/goemotions_train_grouped.csv"
val_path = "../GoemotionsDataset/goemotions_val_grouped.csv"

train_df, val_df = (pd.read_csv(csv_path) for csv_path in (train_path, val_path))

print(f"Training shape: {train_df.shape}")
print(f"Validation shape: {val_df.shape}")
# Last expression of the cell: rendered as a preview of the first rows.
train_df.head()

Training shape: (48836, 2)
Validation shape: (5427, 2)


Unnamed: 0,text,labels
0,My favourite food is anything I didn't have to...,neutral
1,"Now if he does off himself, everyone will thin...",neutral
2,WHY THE FUCK IS BAYLESS ISOING,anger
3,To make her feel threatened,fear
4,Dirty Southern Wankers,anger


In [3]:
# ==== Check label distribution ====
# Relative frequency of every label in each split, rounded to three decimals.
train_share = train_df["labels"].value_counts(normalize=True).round(3)
val_share = val_df["labels"].value_counts(normalize=True).round(3)

print("Training labels distribution:", train_share, sep="\n")
print("\nValidation labels distribution:", val_share, sep="\n")

Training labels distribution:
labels
joy         0.378
neutral     0.327
anger       0.116
surprise    0.096
sadness     0.057
fear        0.014
disgust     0.013
Name: proportion, dtype: float64

Validation labels distribution:
labels
joy         0.365
neutral     0.329
anger       0.122
surprise    0.097
sadness     0.056
fear        0.016
disgust     0.015
Name: proportion, dtype: float64


In [4]:
# ==== Tokenizer ====
# Checkpoint identifier used to load the tokenizer.
MODEL_NAME = "mental/mental-bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=MODEL_NAME)
print("Tokenizer loaded successfully!")

Tokenizer loaded successfully!


In [5]:
# ==== Encode string labels into numeric IDs ====
# Class ids follow the sorted order of the label values found in the training split.
label_names = sorted(train_df["labels"].unique())
label_to_id = {label: idx for idx, label in enumerate(label_names)}
id_to_label = {v: k for k, v in label_to_id.items()}

print("Label mapping:", label_to_id)

# Map into temporaries first so a failed check leaves both frames untouched.
train_label_ids = train_df["labels"].map(label_to_id)
val_label_ids = val_df["labels"].map(label_to_id)

# Series.map yields NaN for any value that is not a key of the mapping (for
# example a label that only occurs in the validation split). Left unchecked,
# that NaN only surfaces later as a ValueError from int() inside the dataset's
# __getitem__, so fail here with the offending values instead.
for split_name, original_labels, mapped_ids in (
    ("training", train_df["labels"], train_label_ids),
    ("validation", val_df["labels"], val_label_ids),
):
    unmapped = original_labels[mapped_ids.isna()].unique().tolist()
    if unmapped:
        raise ValueError(
            f"{split_name} split contains labels with no class id: {unmapped}"
        )

# Apply mapping to both train and validation data (explicit integer dtype).
train_df["labels"] = train_label_ids.astype("int64")
val_df["labels"] = val_label_ids.astype("int64")

# ==== PyTorch Dataset class ====
class MentalHealthDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Args:
        df: Frame with a ``text`` column and a ``labels`` column; both are
            copied into Python lists at construction time.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=..., truncation=..., max_length=..., return_tensors="pt")``
            whose result exposes ``.items()`` yielding tensors with a leading
            axis of size one.
        max_len: Length every sequence is padded / truncated to.
    """

    def __init__(self, df, tokenizer, max_len=128):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.texts = df["text"].tolist()
        self.labels = df["labels"].tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for row ``idx`` plus a ``labels`` tensor."""
        raw_text = str(self.texts[idx])
        label_id = int(self.labels[idx])

        encoded = self.tokenizer(
            raw_text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )

        # Drop the leading batch axis added by return_tensors="pt".
        sample = {}
        for field, tensor in encoded.items():
            sample[field] = tensor.squeeze(0)
        sample["labels"] = torch.tensor(label_id, dtype=torch.long)
        return sample

# ==== Create dataset objects ====
# One dataset per split, both sharing the tokenizer and the default max_len.
train_dataset, val_dataset = (
    MentalHealthDataset(split_df, tokenizer) for split_df in (train_df, val_df)
)
print("Datasets created successfully!")


Label mapping: {'anger': 0, 'disgust': 1, 'fear': 2, 'joy': 3, 'neutral': 4, 'sadness': 5, 'surprise': 6}
Datasets created successfully!


In [6]:
# ==== Metrics ====
def compute_metrics(pred):
    """Compute accuracy and support-weighted F1 / precision / recall.

    Args:
        pred: Object exposing ``predictions`` (per-class scores, one row per
            example) and ``label_ids`` (true class ids).

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.

    Note:
        A function of the same name is defined again in the training-setup
        cell; whichever definition ran last is the one handed to the Trainer.
    """
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")
    # zero_division=0 matches the later definition: an undefined per-class
    # score counts as 0 without emitting UndefinedMetricWarning.
    prec = precision_score(labels, preds, average="weighted", zero_division=0)
    rec = recall_score(labels, preds, average="weighted", zero_division=0)
    return {"accuracy": acc, "f1": f1, "precision": prec, "recall": rec}

In [7]:
# ==== Model training setup for 7-class classification ====
from transformers import (
    Trainer, TrainingArguments, AutoModelForSequenceClassification, EarlyStoppingCallback
)
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import numpy as np, math, torch, os

# ==== Parameters ====
# Hyperparameters and output location read by build_args() and by the
# model-saving step further down.
best_lr = 5e-5  # learning_rate passed to TrainingArguments
best_wd = 0.01  # weight_decay passed to TrainingArguments
epochs = 10  # num_train_epochs; an early-stopping callback is attached to the Trainer below
per_device_bs = 16  # used for both the train and the eval per-device batch size
# NOTE(review): the directory name says "6class" while the model below is built
# with num_labels=7 — presumably "6 emotions + neutral"; confirm before renaming.
output_dir = "../MentalBert/mentalbert_fine_tuned_6class_learningrate_5e-5_weightdecay_0.01"
os.makedirs(output_dir, exist_ok=True)  # no error if the directory already exists

# ==== Metrics ====
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each row
            is the argmax of ``logits`` along axis 1.

    Returns:
        dict with ``f1``, ``accuracy``, ``precision`` and ``recall``. All but
        accuracy are support-weighted averages over the classes.
    """
    logits, labels = eval_pred
    y_true = np.array(labels)
    y_pred = np.argmax(logits, axis=1)
    weighted = {"average": "weighted"}

    return dict(
        f1=f1_score(y_true, y_pred, **weighted),
        accuracy=accuracy_score(y_true, y_pred),
        precision=precision_score(y_true, y_pred, zero_division=0, **weighted),
        recall=recall_score(y_true, y_pred, zero_division=0, **weighted),
    )

# ==== Model ====
# Take the checkpoint name and the head size from the tokenizer / label-encoding
# cells instead of repeating literals, so the classifier can never disagree with
# the label mapping. Passing id2label / label2id stores the class names in the
# model config, so a saved checkpoint reports them instead of generic LABEL_n
# names. Keys and values are coerced to plain int / str so the config stays
# JSON-serialisable.
model_final = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_to_id),
    id2label={int(idx): str(name) for name, idx in label_to_id.items()},
    label2id={str(name): int(idx) for name, idx in label_to_id.items()},
    problem_type="single_label_classification",
).to(device)


# ==== TrainingArguments (with version compatibility) ====
def build_args():
    """Build ``TrainingArguments`` for whichever transformers version is installed.

    The per-epoch evaluation keyword has had different names across releases.
    ``eval_strategy`` (the current name) is tried first, then
    ``evaluation_strategy`` (its predecessor). Both attempts receive the same
    remaining settings, so the configuration no longer depends on which name
    is accepted. If both are rejected, a step-based legacy configuration is
    used.

    Reads the module-level ``output_dir``, ``best_lr``, ``best_wd``,
    ``per_device_bs``, ``epochs`` and (legacy path only) ``train_dataset``.

    Returns:
        TrainingArguments: the first configuration the installed version accepts.

    Raises:
        TypeError: if the legacy configuration is rejected as well.
    """
    # Settings common to every variant.
    shared = dict(
        output_dir=output_dir,
        learning_rate=best_lr,
        weight_decay=best_wd,
        per_device_train_batch_size=per_device_bs,
        per_device_eval_batch_size=per_device_bs,
        num_train_epochs=epochs,
        save_total_limit=1,
        logging_dir=f"{output_dir}/logs",
        logging_steps=100,
    )
    # Evaluate and checkpoint once per epoch, keeping the checkpoint with the
    # best weighted F1. report_to="none" disables external experiment trackers.
    epoch_based = dict(
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        logging_strategy="steps",
        report_to="none",
    )

    for strategy_kwarg in ("eval_strategy", "evaluation_strategy"):
        try:
            return TrainingArguments(
                **shared, **epoch_based, **{strategy_kwarg: "epoch"}
            )
        except TypeError:
            # An unknown keyword raises TypeError; try the next spelling.
            continue

    # Legacy fallback: no strategy keywords, so approximate "once per epoch"
    # with explicit step counts.
    # NOTE(review): this path sets neither load_best_model_at_end nor
    # metric_for_best_model — confirm the early-stopping callback is usable
    # with such a configuration before relying on it.
    steps_per_epoch = math.ceil(len(train_dataset) / per_device_bs)
    return TrainingArguments(
        **shared,
        do_eval=True,
        evaluate_during_training=True,
        eval_steps=steps_per_epoch,
        save_steps=steps_per_epoch,
    )

import inspect

args_final = build_args()

# ==== Early stopping ====
# Stop once the monitored metric has failed to improve for 2 consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# ==== Trainer ====
# Newer transformers releases deprecate Trainer's `tokenizer` keyword in favour
# of `processing_class`; older releases only know `tokenizer`. Pick the name the
# installed Trainer actually declares, so no deprecation warning is emitted and
# the call keeps working once the old name is removed.
trainer_params = inspect.signature(Trainer.__init__).parameters
tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

trainer_final = Trainer(
    model=model_final,
    args=args_final,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
    **{tokenizer_kwarg: tokenizer},
)

print("Trainer initialised successfully!")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainer initialised successfully!


  trainer_final = Trainer(


In [8]:
# ==== Train the model ====
print(" Starting fine-tuning for 6-class dataset...\n")
trainer_final.train()

# ==== Save the model held by the trainer ====
# NOTE(review): this is the best checkpoint only if the active TrainingArguments
# were built with load_best_model_at_end=True — confirm which build_args branch ran.
save_path = output_dir + "/best_model"
trainer_final.save_model(save_path)
print(f"\n Model saved successfully to: {save_path}")

# ==== Final evaluation on validation set ====
print("\n Evaluating best model on validation data...\n")
final_metrics = trainer_final.evaluate()
print("Final validation metrics:")
for metric_name, metric_value in final_metrics.items():
    # Floats get a fixed six-decimal format; anything else is printed as-is.
    if isinstance(metric_value, float):
        print(f"{metric_name}: {metric_value:.6f}")
    else:
        print(f"{metric_name}: {metric_value}")

 Starting fine-tuning for 6-class dataset...



Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.8602,0.824537,0.673449,0.683803,0.673722,0.683803
2,0.6568,0.907484,0.659541,0.667957,0.672668,0.667957
3,0.4445,1.054325,0.673523,0.675511,0.672609,0.675511
4,0.2905,1.441023,0.648092,0.649714,0.652284,0.649714
5,0.201,1.986628,0.650487,0.650451,0.655651,0.650451



 Model saved successfully to: ../MentalBert/mentalbert_fine_tuned_6class_learningrate_5e-5_weightdecay_0.01/best_model

 Evaluating best model on validation data...



Final validation metrics:
eval_loss: 1.054325
eval_f1: 0.673523
eval_accuracy: 0.675511
eval_precision: 0.672609
eval_recall: 0.675511
eval_runtime: 7.306800
eval_samples_per_second: 742.736000
eval_steps_per_second: 46.532000
epoch: 5.000000


In [22]:
# ==== Sanity check for label encoding ====
n_classes = len(label_to_id)
print("Unique labels in training set:", sorted(train_df["labels"].unique()))
print("Unique labels in validation set:", sorted(val_df["labels"].unique()))
print("Expected number of classes:", n_classes)

# Keep only rows whose label is one of the ids 0 .. n_classes - 1.
# Note: train_dataset / val_dataset copied their texts and labels into lists
# when they were constructed, so filtering the frames here does not change
# datasets that were already built.
valid_labels = set(range(n_classes))
train_keep = train_df["labels"].isin(valid_labels)
val_keep = val_df["labels"].isin(valid_labels)
train_df = train_df.loc[train_keep]
val_df = val_df.loc[val_keep]

print("\nAfter cleanup:")
print("Train labels:", sorted(train_df["labels"].unique()))
print("Val labels:", sorted(val_df["labels"].unique()))

Unique labels in training set: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6)]
Unique labels in validation set: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6)]
Expected number of classes: 7

After cleanup:
Train labels: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6)]
Val labels: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6)]
