In [1]:
# needed to fix a bug with unsloth
%env UNSLOTH_COMPILE_DISABLE = 1

env: UNSLOTH_COMPILE_DISABLE=1


In [2]:
from unsloth import FastLanguageModel, FastModel
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import os
import torch
from torch import tensor
import torch.nn.functional as F
from transformers import TrainingArguments, Trainer, ModernBertModel, AutoModelForSequenceClassification, training_args
from datasets import load_dataset, Dataset
from tqdm import tqdm

model_name = 'answerdotai/ModernBERT-large'

NUM_CLASSES = 3
DATA_DIR = "data/"

model, tokenizer = FastModel.from_pretrained(
    model_name = model_name,load_in_4bit = False,
    max_seq_length = 2048,
    dtype = None,
    auto_model = AutoModelForSequenceClassification,
    num_labels = NUM_CLASSES,
)
print("model parameters:" + str(sum(p.numel() for p in model.parameters())))

# make all parameters trainable
for param in model.parameters():
    param.requires_grad = True

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


  GPU_BUFFERS = tuple([torch.empty(2*256*2048, dtype = dtype, device = f"cuda:{i}") for i in range(n_gpus)])


==((====))==  Unsloth 2025.4.5: Fast Modernbert patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce RTX 3090. Num GPUs = 1. Max memory: 23.999 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model parameters:395834371


The dataset can be found [here](https://github.com/timothelaborie/text_classification_scripts/blob/main/data/finance_sentiment_multiclass.csv)

In [3]:
data = pd.read_csv(DATA_DIR + "finance_sentiment_multiclass.csv")

labels = data["label"].tolist()
labels = [x-1 for x in labels]
# convert labels to one hot vectors
labels = np.eye(3)[labels]

train_data,val_data, train_labels, val_labels = train_test_split(data["text"], labels, test_size=0.1, random_state=42)
dataset = Dataset.from_list([{'text': text, 'labels': label} for text, label in zip(train_data, train_labels)])
val_dataset = Dataset.from_list([{'text': text, 'labels': label} for text, label in zip(val_data, val_labels)])

def tokenize_function(examples):
    return tokenizer(examples['text'])

dataset = dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
dataset

Map:   0%|          | 0/3893 [00:00<?, ? examples/s]

Map:   0%|          | 0/433 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 3893
})

In [4]:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    eval_dataset=val_dataset,
    args=TrainingArguments(
        per_device_train_batch_size=32,
        gradient_accumulation_steps=1,
        warmup_steps=10,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        optim=training_args.OptimizerNames.ADAMW_TORCH,
        learning_rate=5e-5,
        weight_decay=0.001,
        lr_scheduler_type="cosine",
        seed=3407,
        num_train_epochs=3, # bert-style models usually need more than 1 epoch
        save_strategy="epoch",

        # report_to="wandb",
        report_to="none",

        group_by_length=True,

        # eval_strategy="no",
        eval_strategy="steps",
        eval_steps=0.25,
        logging_strategy="steps",
        logging_steps=0.25,
        
    ),
    compute_metrics=lambda eval_pred: { "accuracy": accuracy_score(eval_pred[1].argmax(axis=-1), eval_pred[0].argmax(axis=-1)) }
)
trainer_stats = trainer.train()

  trainer = Trainer(
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3,893 | Num Epochs = 3 | Total steps = 366
O^O/ \_/ \    Batch size per device = 32 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (32 x 1 x 1) = 32
 "-____-"     Trainable parameters = 395,834,371/395,834,371 (100.00% trained)


Step,Training Loss,Validation Loss,Accuracy
92,0.5649,0.430882,0.713626
184,0.3465,0.308327,0.812933
276,0.2475,0.28841,0.822171


Unsloth: Will smartly offload gradients to save VRAM!


In [5]:
model = model.cuda()
model = model.eval()
FastLanguageModel.for_inference(model)
print()




In [6]:
batch_size = 32
correct = 0
results = []

# If your val_labels are one-hot, convert to class indices
if isinstance(val_labels, np.ndarray) and val_labels.ndim == 2:
    val_true_labels = np.argmax(val_labels, axis=1)
else:
    val_true_labels = val_labels

val_texts = list(val_data)
val_true_labels = list(val_true_labels)

with torch.no_grad():
    for i in tqdm(range(0, len(val_texts), batch_size), desc="Evaluating"):
        batch_texts = val_texts[i:i+batch_size]
        batch_labels = val_true_labels[i:i+batch_size]
        # Tokenize
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=2048)
        inputs = {k: v.cuda() for k, v in inputs.items()}
        # Forward pass
        outputs = model(**inputs)
        logits = outputs.logits
        probs = F.softmax(logits, dim=-1)
        preds = torch.argmax(probs, dim=-1).cpu().numpy()
        # Count correct
        correct += np.sum(preds == batch_labels)
        # Store results for display
        for j in range(len(batch_texts)):
            results.append({
                "text": batch_texts[j][:200],
                "true": batch_labels[j],
                "pred": preds[j],
                "probs": probs[j].detach().float().cpu().numpy(),
                "ok": preds[j] == batch_labels[j]
        })

accuracy = 100 * correct / len(val_texts)
print(f"\nValidation accuracy: {accuracy:.2f}% ({correct}/{len(val_texts)})")

# Show a few random samples
import random
display = 20
print("\n--- Random samples ---")
for s in random.sample(results, min(display, len(results))):
    print(f"\nText: {s['text']}")
    print(f"True: {s['true']}  Pred: {s['pred']} {'✅' if s['ok'] else '❌'}")
    print("Probs:", ", ".join([f"{k}: {v:.3f}" for k, v in enumerate(s['probs'])]))

Evaluating: 100%|██████████| 14/14 [00:01<00:00, 11.78it/s]


Validation accuracy: 82.45% (357/433)

--- Random samples ---

Text: Turkey Stiffens Manipulation Penalties in Banking Overhaul
True: 0  Pred: 0 ✅
Probs: 0: 0.846, 1: 0.114, 2: 0.040

Text: The Manitowoc Company, Inc. Just Reported Earnings, And Analysts Cut Their Target Price
True: 2  Pred: 2 ✅
Probs: 0: 0.065, 1: 0.029, 2: 0.906

Text: $BLMN $EAT $SBUX - Restaurants stocks break higher, analysts reel in near-term expectations https://t.co/fOjVVJdfF0
True: 1  Pred: 1 ✅
Probs: 0: 0.004, 1: 0.974, 2: 0.021

Text: $CMCSA $LHX - Comcast sues L3Harris in patent dispute https://t.co/kWReshGbvz
True: 2  Pred: 2 ✅
Probs: 0: 0.023, 1: 0.035, 2: 0.942

Text: Libyan economic experts will study the distribution of crucial oil revenue as efforts continue to solve the war-rav… https://t.co/S9lmpnDTqJ
True: 0  Pred: 0 ✅
Probs: 0: 0.888, 1: 0.013, 2: 0.099

Text: Stocks Suffer 'Shocking' Down Week As Fed Balance Sheet Unexpectedly Shrinks https://t.co/bspsRi3Wow
True: 2  Pred: 2 ✅
Probs: 0: 0.005, 1




In [7]:
# stop running all cells
1/0

ZeroDivisionError: division by zero

# to load the model again (run every cell above the one where the trainer is called)

In [None]:
from transformers.trainer_utils import get_last_checkpoint

output_dir = "trainer_output"
last_checkpoint = get_last_checkpoint(output_dir)
print("Last checkpoint:", last_checkpoint)

model, tokenizer = FastModel.from_pretrained(
    model_name = last_checkpoint,load_in_4bit = False,
    max_seq_length = 2048,
    dtype = None,
    auto_model = AutoModelForSequenceClassification,
    num_labels = NUM_CLASSES,
)

Last checkpoint: trainer_output\checkpoint-244
==((====))==  Unsloth 2025.4.5: Fast Modernbert patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce RTX 3090. Num GPUs = 1. Max memory: 23.999 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


In [None]:
from torch import tensor
print(model(input_ids=tensor([[1,2,3,4,5]]).cuda(), attention_mask=tensor([[1,1,1,1,1]]).cuda()))

SequenceClassifierOutput(loss=None, logits=tensor([[-0.0579, -0.5859, -1.1719]], device='cuda:0', dtype=torch.bfloat16,
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
