In [2]:
# Cell 1: Setup and Imports

# Check GPU
!nvidia-smi

# Install dependencies (only if not preinstalled on RunPod)
!pip install -q torch torchvision torchaudio transformers datasets evaluate scikit-learn accelerate

# Imports
import os
import pandas as pd
import numpy as np
import torch
from sklearn.metrics import f1_score, precision_score, recall_score
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)
import evaluate

# Ensure GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"✅ Using device: {device}")

Sun Oct 26 10:28:15 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.124.06             Driver Version: 570.124.06     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA L40S                    On  |   00000000:03:00.0 Off |                    0 |
| N/A   22C    P8             33W /  350W |       1MiB /  46068MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
# Cell 2: Read the dataset

# Update these paths if your files are in a subfolder
train_path = "../GoemotionsDataset/goemotions_train_28.csv"
val_path   = "../GoemotionsDataset/goemotions_val_28.csv"

# Read the data
train_df = pd.read_csv(train_path)
val_df   = pd.read_csv(val_path)

print("✅ Train and validation datasets loaded successfully!\n")
print(f"Train shape: {train_df.shape}")
print(f"Validation shape: {val_df.shape}")

# Show first few rows
display(train_df.head())

# Check for key columns
print("\nColumns available:", train_df.columns.tolist())


✅ Train and validation datasets loaded successfully!

Train shape: (48836, 3)
Validation shape: (5427, 3)


Unnamed: 0,text,labels,id
0,My favourite food is anything I didn't have to...,[27],eebbqej
1,"Now if he does off himself, everyone will thin...",[27],ed00q6i
2,WHY THE FUCK IS BAYLESS ISOING,[2],eezlygj
3,To make her feel threatened,[14],ed7ypvh
4,Dirty Southern Wankers,[3],ed0bdzj



Columns available: ['text', 'labels', 'id']


In [4]:
# Cell 3: Prepare the data for model input

import ast  # to safely parse stringified lists

NUM_LABELS = 28

def process_labels(label_col):
    """
    Converts stringified label lists into 28-length multi-hot vectors.
    """
    all_labels = []
    for entry in label_col:
        label_list = ast.literal_eval(entry)  # e.g. "[2, 15]" → [2, 15]
        vec = [0] * NUM_LABELS
        for idx in label_list:
            if idx < NUM_LABELS:
                vec[idx] = 1
        all_labels.append(vec)
    return all_labels

train_df["multi_labels"] = process_labels(train_df["labels"])
val_df["multi_labels"]   = process_labels(val_df["labels"])

# Convert to Hugging Face Dataset
train_ds = Dataset.from_pandas(train_df[["text", "multi_labels"]])
val_ds   = Dataset.from_pandas(val_df[["text", "multi_labels"]])

print(train_ds)
print(val_ds)

Dataset({
    features: ['text', 'multi_labels'],
    num_rows: 48836
})
Dataset({
    features: ['text', 'multi_labels'],
    num_rows: 5427
})


In [None]:
from huggingface_hub import login
login("yourhuggingfacehubtoken")  # Replace with your actual token

In [6]:
!pip install -q hf_transfer

In [7]:
# Cell 4: Tokenisation using MentalBERT tokenizer

model_name = "mental/mental-bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

train_tokenized = train_ds.map(tokenize_function, batched=True)
val_tokenized   = val_ds.map(tokenize_function, batched=True)

# Rename column for Trainer compatibility
train_tokenized = train_tokenized.rename_column("multi_labels", "labels")
val_tokenized   = val_tokenized.rename_column("multi_labels", "labels")

# Set format for PyTorch
train_tokenized.set_format("torch")
val_tokenized.set_format("torch")

print("✅ Tokenisation complete!")
print(train_tokenized[0])

Map:   0%|          | 0/48836 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

✅ Tokenisation complete!
{'text': "My favourite food is anything I didn't have to cook myself.", 'labels': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1]), 'input_ids': tensor([ 101, 2026, 8837, 2833, 2003, 2505, 1045, 2134, 1005, 1056, 2031, 2000,
        5660, 2870, 1012,  102,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,   

### Model Definition

In [8]:
# Cell 5: Define the MentalBERT model for multi-label classification

from transformers import AutoModelForSequenceClassification

NUM_LABELS = 28
model_name = "mental/mental-bert-base-uncased"

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=NUM_LABELS,
    problem_type="multi_label_classification"
)

# Move model to GPU
model.to(device)
print("✅ Model loaded and moved to GPU!")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Model loaded and moved to GPU!


In [9]:
import torch

# Convert labels -> list of float32 values
def force_float_labels(ds):
    new_labels = []
    for example in ds["labels"]:
        new_labels.append([float(x) for x in example])  # plain Python float list
    ds = ds.remove_columns(["labels"]).add_column("labels", new_labels)
    return ds

train_tokenized = force_float_labels(train_tokenized)
val_tokenized   = force_float_labels(val_tokenized)

# Restore proper torch formatting
train_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

print("Label dtype (train):", train_tokenized[0]["labels"].dtype)
print("Label dtype (val):", val_tokenized[0]["labels"].dtype)

Label dtype (train): torch.float32
Label dtype (val): torch.float32


In [12]:
from copy import deepcopy
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification
from sklearn.metrics import f1_score
import numpy as np
import torch

# Ensure labels are float32 for BCEWithLogitsLoss
train_tokenized = train_tokenized.map(lambda e: {"labels": e["labels"].float()})
val_tokenized   = val_tokenized.map(lambda e: {"labels": e["labels"].float()})

# Candidate learning rates
learning_rates = [5e-6, 1e-5, 2e-5, 3e-5, 5e-5]

def make_args(out_dir, epochs=2, lr=2e-5):
    try:
        return TrainingArguments(
            output_dir=out_dir,
            learning_rate=lr,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=epochs,
            save_strategy="no",
            logging_dir=f"{out_dir}/logs",
            logging_strategy="steps",
            logging_steps=100,
            report_to="none",
            do_eval=True,
        )
    except TypeError:
        return TrainingArguments(
            output_dir=out_dir,
            learning_rate=lr,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=epochs,
            logging_dir=f"{out_dir}/logs",
            logging_steps=100,
            do_eval=True,
        )

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = torch.sigmoid(torch.tensor(logits))
    preds = (probs > 0.5).int().numpy()
    return {"f1": f1_score(labels, preds, average="macro")}

results = {}
print("Starting manual learning-rate sweep...\n")

for lr in learning_rates:
    out_dir = f"/workspace/mentalbert_lr_{lr}"
    print(f"Training with learning rate = {lr}")

    model_temp = AutoModelForSequenceClassification.from_pretrained(
        "mental/mental-bert-base-uncased",
        num_labels=28,
        problem_type="multi_label_classification"
    ).to(device)

    args = make_args(out_dir, epochs=2, lr=lr)

    trainer = Trainer(
        model=model_temp,
        args=args,
        train_dataset=train_tokenized,
        eval_dataset=val_tokenized,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    eval_metrics = trainer.evaluate()
    results[lr] = float(eval_metrics.get("eval_f1", np.nan))
    print(f"Learning rate {lr} -> F1 = {results[lr]:.4f}\n")

print("LR sweep complete.")
print(results)

best_lr = max(results, key=results.get)
print(f"Best learning rate: {best_lr} with F1 = {results[best_lr]:.4f}")

Map:   0%|          | 0/48836 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

Starting manual learning-rate sweep...

Training with learning rate = 5e-06


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
100,0.4919
200,0.2911
300,0.227
400,0.1947
500,0.178
600,0.1678
700,0.1622
800,0.1583
900,0.1585
1000,0.1522


Learning rate 5e-06 -> F1 = 0.1369

Training with learning rate = 1e-05


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
100,0.4085
200,0.2188
300,0.175
400,0.1594
500,0.1551
600,0.1525
700,0.1505
800,0.1453
900,0.1447
1000,0.1353


Learning rate 1e-05 -> F1 = 0.2611

Training with learning rate = 2e-05


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
100,0.3189
200,0.1697
300,0.154
400,0.1485
500,0.1427
600,0.1348
700,0.1283
800,0.1207
900,0.1186
1000,0.1107


Learning rate 2e-05 -> F1 = 0.4001

Training with learning rate = 3e-05


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
100,0.2733
200,0.1582
300,0.15
400,0.1389
500,0.1292
600,0.1216
700,0.1157
800,0.109
900,0.1087
1000,0.1018


Learning rate 3e-05 -> F1 = 0.4390

Training with learning rate = 5e-05


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
100,0.2287
200,0.1534
300,0.1418
400,0.1269
500,0.1177
600,0.113
700,0.107
800,0.1025
900,0.1022
1000,0.0966


Learning rate 5e-05 -> F1 = 0.4565

LR sweep complete.
{5e-06: 0.13686793210182027, 1e-05: 0.26114374236324733, 2e-05: 0.4000854115105317, 3e-05: 0.4389896109974307, 5e-05: 0.45650435254434224}
Best learning rate: 5e-05 with F1 = 0.4565


## This cell gave me information for the best learning rate on 2 epochs. 

### Exploring the combination including weight decay as parameter now

In [13]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification
from sklearn.metrics import f1_score
import numpy as np
import torch

# Ensure labels are float32 for BCEWithLogitsLoss
train_tokenized = train_tokenized.map(lambda e: {"labels": e["labels"].to(torch.float32)})
val_tokenized   = val_tokenized.map(lambda e: {"labels": e["labels"].to(torch.float32)})

# Candidate learning rates and weight decays
learning_rates = [2e-5, 3e-5, 5e-5]
weight_decays = [0.0, 0.01, 0.05]

def make_args(out_dir, epochs=2, lr=2e-5, wd=0.0):
    try:
        return TrainingArguments(
            output_dir=out_dir,
            learning_rate=lr,
            weight_decay=wd,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=epochs,
            save_strategy="no",
            logging_dir=f"{out_dir}/logs",
            logging_strategy="steps",
            logging_steps=100,
            report_to="none",
            do_eval=True,
        )
    except TypeError:
        return TrainingArguments(
            output_dir=out_dir,
            learning_rate=lr,
            weight_decay=wd,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=epochs,
            logging_dir=f"{out_dir}/logs",
            logging_steps=100,
            do_eval=True,
        )

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = torch.sigmoid(torch.tensor(logits))
    preds = (probs > 0.5).int().numpy()
    return {"f1": f1_score(labels, preds, average="macro")}

results = {}
print("Starting grid search over learning rate and weight decay...\n")

for lr in learning_rates:
    for wd in weight_decays:
        out_dir = f"/workspace/mentalbert_lr{lr}_wd{wd}"
        print(f"Training with learning rate = {lr}, weight decay = {wd}")

        model_temp = AutoModelForSequenceClassification.from_pretrained(
            "mental/mental-bert-base-uncased",
            num_labels=28,
            problem_type="multi_label_classification"
        ).to(device)

        args = make_args(out_dir, epochs=2, lr=lr, wd=wd)

        trainer = Trainer(
            model=model_temp,
            args=args,
            train_dataset=train_tokenized,
            eval_dataset=val_tokenized,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
        )

        trainer.train()
        eval_metrics = trainer.evaluate()
        f1 = float(eval_metrics.get("eval_f1", np.nan))
        results[(lr, wd)] = f1
        print(f"LR={lr}, WD={wd} -> F1 = {f1:.4f}\n")

print("Grid search complete.\n")
for (lr, wd), f1 in results.items():
    print(f"LR={lr:<8}  WD={wd:<5}  F1={f1:.4f}")

best_pair = max(results, key=results.get)
print(f"\nBest combination: LR={best_pair[0]}, WD={best_pair[1]}  with F1={results[best_pair]:.4f}")


Map:   0%|          | 0/48836 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

Starting grid search over learning rate and weight decay...

Training with learning rate = 2e-05, weight decay = 0.0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
100,0.3189
200,0.1697
300,0.154
400,0.1485
500,0.1427
600,0.1348
700,0.1283
800,0.1207
900,0.1186
1000,0.1107


LR=2e-05, WD=0.0 -> F1 = 0.4001

Training with learning rate = 2e-05, weight decay = 0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
100,0.3189
200,0.1697
300,0.1539
400,0.1484
500,0.1424
600,0.1344
700,0.1279
800,0.1202
900,0.1183
1000,0.1104


LR=2e-05, WD=0.01 -> F1 = 0.4020

Training with learning rate = 2e-05, weight decay = 0.05


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
100,0.3189
200,0.1697
300,0.154
400,0.1486
500,0.1428
600,0.1351
700,0.1287
800,0.121
900,0.1189
1000,0.1109


LR=2e-05, WD=0.05 -> F1 = 0.3998

Training with learning rate = 3e-05, weight decay = 0.0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
100,0.2733
200,0.1582
300,0.15
400,0.1389
500,0.1292
600,0.1216
700,0.1157
800,0.109
900,0.1087
1000,0.1018


LR=3e-05, WD=0.0 -> F1 = 0.4390

Training with learning rate = 3e-05, weight decay = 0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
100,0.2733
200,0.1582
300,0.15
400,0.1387
500,0.1291
600,0.1216
700,0.1156
800,0.1092
900,0.1086
1000,0.1019


LR=3e-05, WD=0.01 -> F1 = 0.4359

Training with learning rate = 3e-05, weight decay = 0.05


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
100,0.2733
200,0.1582
300,0.15
400,0.1387
500,0.129
600,0.1215
700,0.1156
800,0.1089
900,0.1085
1000,0.1018


LR=3e-05, WD=0.05 -> F1 = 0.4374

Training with learning rate = 5e-05, weight decay = 0.0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
100,0.2287
200,0.1534
300,0.1418
400,0.1269
500,0.1177
600,0.113
700,0.107
800,0.1025
900,0.1022
1000,0.0966


LR=5e-05, WD=0.0 -> F1 = 0.4565

Training with learning rate = 5e-05, weight decay = 0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
100,0.2287
200,0.1534
300,0.1414
400,0.1263
500,0.1175
600,0.1127
700,0.1069
800,0.103
900,0.1023
1000,0.0963


LR=5e-05, WD=0.01 -> F1 = 0.4662

Training with learning rate = 5e-05, weight decay = 0.05


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
100,0.2287
200,0.1534
300,0.1417
400,0.1264
500,0.1176
600,0.1127
700,0.1074
800,0.1023
900,0.1019
1000,0.0967


LR=5e-05, WD=0.05 -> F1 = 0.4649

Grid search complete.

LR=2e-05     WD=0.0    F1=0.4001
LR=2e-05     WD=0.01   F1=0.4020
LR=2e-05     WD=0.05   F1=0.3998
LR=3e-05     WD=0.0    F1=0.4390
LR=3e-05     WD=0.01   F1=0.4359
LR=3e-05     WD=0.05   F1=0.4374
LR=5e-05     WD=0.0    F1=0.4565
LR=5e-05     WD=0.01   F1=0.4662
LR=5e-05     WD=0.05   F1=0.4649

Best combination: LR=5e-05, WD=0.01  with F1=0.4662


### Trying the best combination for 10 epochs now

In [15]:
from transformers import (
    Trainer, TrainingArguments, AutoModelForSequenceClassification, EarlyStoppingCallback
)
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import numpy as np, math, torch, os

# ==== config ====
best_lr = 5e-5
best_wd = 0.01
epochs = 10
per_device_bs = 16
output_dir = "../MentalBert/mentalbert_fine_tuned_learningrate_5e-5_weightdecay_0.01"
os.makedirs(output_dir, exist_ok=True)

# ==== metrics ====
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = torch.sigmoid(torch.tensor(logits))
    preds = (probs > 0.5).int().numpy()
    labels = np.array(labels)

    return {
        "f1": f1_score(labels, preds, average="macro"),
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds, average="macro", zero_division=0),
        "recall": recall_score(labels, preds, average="macro", zero_division=0),
    }

# ==== model ====
model_final = AutoModelForSequenceClassification.from_pretrained(
    "mental/mental-bert-base-uncased",
    num_labels=28,
    problem_type="multi_label_classification"
).to(device)

# ==== robust TrainingArguments (handles older/newer transformers) ====
def build_args():
    # steps per epoch for legacy fallback
    steps_per_epoch = math.ceil(len(train_tokenized) / per_device_bs)

    # Try newest API first
    try:
        return TrainingArguments(
            output_dir=output_dir,
            learning_rate=best_lr,
            weight_decay=best_wd,
            per_device_train_batch_size=per_device_bs,
            per_device_eval_batch_size=per_device_bs,
            num_train_epochs=epochs,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            greater_is_better=True,
            save_total_limit=1,
            logging_dir=f"{output_dir}/logs",
            logging_strategy="steps",
            logging_steps=100,
            report_to="none",
        )
    except TypeError:
        # Some older versions accept eval_strategy/save_strategy instead
        try:
            return TrainingArguments(
                output_dir=output_dir,
                learning_rate=best_lr,
                weight_decay=best_wd,
                per_device_train_batch_size=per_device_bs,
                per_device_eval_batch_size=per_device_bs,
                num_train_epochs=epochs,
                eval_strategy="epoch",            # older kw
                save_strategy="epoch",
                load_best_model_at_end=True,
                metric_for_best_model="f1",
                greater_is_better=True,
                save_total_limit=1,
                logging_dir=f"{output_dir}/logs",
                logging_steps=100,
            )
        except TypeError:
            # Legacy fallback: evaluate once per epoch using eval_steps/save_steps
            return TrainingArguments(
                output_dir=output_dir,
                learning_rate=best_lr,
                weight_decay=best_wd,
                per_device_train_batch_size=per_device_bs,
                per_device_eval_batch_size=per_device_bs,
                num_train_epochs=epochs,
                do_eval=True,
                evaluate_during_training=True,     # legacy flag
                eval_steps=steps_per_epoch,        # approx "per epoch"
                save_steps=steps_per_epoch,        # align save with eval
                save_total_limit=1,
                logging_dir=f"{output_dir}/logs",
                logging_steps=100,
            )

args_final = build_args()

# ==== trainer + early stopping (patience=2) ====
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

trainer_final = Trainer(
    model=model_final,
    args=args_final,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# ==== train ====
trainer_final.train()

# Save the best/last model
save_path = f"{output_dir}/best_model"
trainer_final.save_model(save_path)
print(f"Saved model to: {save_path}")

# Final evaluation on validation set
final_metrics = trainer_final.evaluate()
print("Final validation metrics:", final_metrics)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_final = Trainer(


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.0873,0.083837,0.38315,0.416805,0.533069,0.333216
2,0.073,0.083423,0.456238,0.430993,0.592771,0.404333
3,0.0598,0.087243,0.482227,0.488668,0.613904,0.433255
4,0.0447,0.09948,0.479938,0.469689,0.574182,0.436715
5,0.0297,0.112192,0.482693,0.436705,0.505259,0.469757
6,0.0217,0.123694,0.496651,0.438732,0.50063,0.501222
7,0.0161,0.13146,0.489521,0.434126,0.504285,0.490424
8,0.0111,0.139568,0.485506,0.440391,0.493939,0.4858


Saved model to: ../MentalBert/mentalbert_fine_tuned_learningrate_5e-5_weightdecay_0.01/best_model


Final validation metrics: {'eval_loss': 0.12369436025619507, 'eval_f1': 0.49665139076366077, 'eval_accuracy': 0.43873226460291137, 'eval_precision': 0.5006301984334333, 'eval_recall': 0.5012220421772154, 'eval_runtime': 6.3985, 'eval_samples_per_second': 848.166, 'eval_steps_per_second': 53.137, 'epoch': 8.0}


### Trying again

In [10]:
from transformers import (
    AutoConfig, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, EarlyStoppingCallback
)
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import numpy as np, torch, os

# Optional: confirm version
import transformers as _tf
print("Transformers version:", _tf.__version__)

# ==== paths & params ====
output_dir = "../MentalBert/mentalbert_fine_tuned_dropout_0.3"
os.makedirs(output_dir, exist_ok=True)

best_lr = 5e-5
best_wd = 0.01
epochs = 10
per_device_bs = 16

# ==== config with higher dropout ====
config = AutoConfig.from_pretrained("mental/mental-bert-base-uncased")
config.num_labels = 28
config.problem_type = "multi_label_classification"
config.hidden_dropout_prob = 0.3
config.attention_probs_dropout_prob = 0.3

# ==== model ====
model_dropout = AutoModelForSequenceClassification.from_pretrained(
    "mental/mental-bert-base-uncased",
    config=config
).to(device)

# ==== metrics ====
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = torch.sigmoid(torch.tensor(logits))
    preds = (probs > 0.5).int().numpy()
    labels = np.array(labels)
    return {
        "f1": f1_score(labels, preds, average="macro"),
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds, average="macro", zero_division=0),
        "recall": recall_score(labels, preds, average="macro", zero_division=0),
    }

# Steps per epoch for step-based eval/save
steps_per_epoch = max(1, len(train_tokenized) // per_device_bs)

# ==== TrainingArguments using legacy-compatible names ====
args_dropout = TrainingArguments(
    output_dir=output_dir,
    learning_rate=best_lr,
    weight_decay=best_wd,
    per_device_train_batch_size=per_device_bs,
    per_device_eval_batch_size=per_device_bs,
    num_train_epochs=epochs,
    logging_dir=f"{output_dir}/logs",
    logging_steps=100,
    do_eval=True,
    eval_steps=steps_per_epoch,   # evaluate once per epoch
    save_steps=steps_per_epoch,   # save once per epoch
    eval_strategy="steps",        # << use older kwarg name
    save_strategy="steps",        # must match eval_strategy
    load_best_model_at_end=True,
    save_total_limit=1,
)

# ==== early stopping ====
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# ==== trainer ====
trainer_dropout = Trainer(
    model=model_dropout,
    args=args_dropout,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# ==== train & save ====
trainer_dropout.train()
save_path = f"{output_dir}/best_model"
trainer_dropout.save_model(save_path)
print(f"\nModel with dropout=0.3 saved at: {save_path}")

# ==== evaluate ====
final_metrics = trainer_dropout.evaluate()
print("\nFinal validation metrics:")
for k, v in final_metrics.items():
    try:
        print(f"{k}: {v:.4f}")
    except Exception:
        print(k, v)

Transformers version: 4.57.1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_dropout = Trainer(


Step,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
3052,0.0933,0.088392,0.387431,0.3993,0.48997,0.355438
6104,0.0814,0.08853,0.44721,0.39672,0.512415,0.423665
9156,0.0769,0.083184,0.47246,0.455132,0.557358,0.439644
12208,0.0742,0.088338,0.474888,0.437442,0.545025,0.456057
15260,0.0655,0.091846,0.498234,0.434863,0.552163,0.502481



Model with dropout=0.3 saved at: ../MentalBert/mentalbert_fine_tuned_dropout_0.3/best_model



Final validation metrics:
eval_loss: 0.0832
eval_f1: 0.4725
eval_accuracy: 0.4551
eval_precision: 0.5574
eval_recall: 0.4396
eval_runtime: 6.3776
eval_samples_per_second: 850.9420
eval_steps_per_second: 53.3110
epoch: 4.9984


In [13]:
from transformers import (
    AutoConfig, AutoModelForSequenceClassification,
    TrainingArguments, Trainer
)
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import torch, numpy as np, os

# ==== Paths & hyperparameters ====
output_dir = "../MentalBert/mentalbert_fine_tuned_dropout_0.3_full10"
os.makedirs(output_dir, exist_ok=True)

best_lr = 5e-5
best_wd = 0.01
epochs = 10
per_device_bs = 16

# ==== Load config with dropout ====
config = AutoConfig.from_pretrained("mental/mental-bert-base-uncased")
config.num_labels = 28
config.problem_type = "multi_label_classification"
config.hidden_dropout_prob = 0.3
config.attention_probs_dropout_prob = 0.3

# ==== Model ====
model_dropout = AutoModelForSequenceClassification.from_pretrained(
    "mental/mental-bert-base-uncased",
    config=config
).to(device)

# ==== Metrics ====
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = torch.sigmoid(torch.tensor(logits))
    preds = (probs > 0.5).int().numpy()
    labels = np.array(labels)
    return {
        "f1": f1_score(labels, preds, average="macro"),
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds, average="macro", zero_division=0),
        "recall": recall_score(labels, preds, average="macro", zero_division=0),
    }

# ==== Training arguments ====
args_full10 = TrainingArguments(
    output_dir=output_dir,
    learning_rate=best_lr,
    weight_decay=best_wd,
    per_device_train_batch_size=per_device_bs,
    per_device_eval_batch_size=per_device_bs,
    num_train_epochs=epochs,
    logging_dir=f"{output_dir}/logs",
    logging_steps=100,
    do_eval=True,
    eval_strategy="epoch",    # ✅ evaluate once per epoch
    save_strategy="epoch",    # ✅ save once per epoch
    load_best_model_at_end=True,
    save_total_limit=1,
    report_to="none"
)

# ==== Trainer (no early stopping) ====
trainer_full10 = Trainer(
    model=model_dropout,
    args=args_full10,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# ==== Train & save ====
trainer_full10.train()
save_path = f"{output_dir}/best_model"
trainer_full10.save_model(save_path)
print(f"\n✅ Model trained for full {epochs} epochs and saved at: {save_path}")

# ==== Evaluate final model ====
final_metrics = trainer_full10.evaluate()
print("\nFinal validation metrics:")
for k, v in final_metrics.items():
    try:
        print(f"{k}: {v:.4f}")
    except Exception:
        print(k, v)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_full10 = Trainer(


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.0932,0.089558,0.39649,0.405565,0.469182,0.373103
2,0.081,0.087473,0.454269,0.399116,0.538379,0.426403
3,0.076,0.083184,0.469684,0.462318,0.547345,0.441843
4,0.0739,0.087994,0.483594,0.441128,0.549783,0.465418
5,0.0647,0.092106,0.48787,0.430809,0.519083,0.49085
6,0.0606,0.095393,0.497767,0.423807,0.534489,0.503611
7,0.0565,0.098408,0.487237,0.420122,0.528231,0.49175
8,0.0508,0.103045,0.509935,0.414962,0.530176,0.520155
9,0.0479,0.104349,0.511039,0.425465,0.531113,0.51455
10,0.0457,0.107588,0.513433,0.42049,0.520393,0.5279



✅ Model trained for full 10 epochs and saved at: ../MentalBert/mentalbert_fine_tuned_dropout_0.3_full10/best_model



Final validation metrics:
eval_loss: 0.0832
eval_f1: 0.4697
eval_accuracy: 0.4623
eval_precision: 0.5473
eval_recall: 0.4418
eval_runtime: 6.3062
eval_samples_per_second: 860.5820
eval_steps_per_second: 53.9150
epoch: 10.0000
