In [1]:
from google.colab import drive

# Attach Google Drive inside the Colab VM at the mount point below.
# NOTE(review): no path used later in this notebook points under the mount
# point — confirm whether the data/checkpoint paths are meant to live on Drive.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
!pip install -q transformers datasets accelerate evaluate


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import inspect
import os
from dataclasses import dataclass
from typing import Dict, Any

import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    set_seed,
)
import evaluate


In [None]:
# Single place for everything a run might tune: checkpoint, data files,
# label count, optimisation hyperparameters and the output location.
# NOTE(review): every path below is relative, so it resolves against the
# current working directory — confirm that is where the files actually live.
config = dict(
    # Checkpoint to fine-tune from (described in the original note as the MLM model).
    model_name_or_path="mlm_bert_goemotions_finance",
    # Pre-split financial data: one file each for train / validation / test.
    train_file="data/processed/fin_train.jsonl",
    val_file="data/processed/fin_val.jsonl",
    test_file="data/processed/fin_test.jsonl",
    # Size of the sentiment label set: 3 = [neg, neu, pos].
    num_labels=3,
    # Optimisation hyperparameters.
    learning_rate=2e-5,
    batch_size=16,
    num_epochs=5,
    weight_decay=0.01,
    warmup_ratio=0.06,
    seed=42,
    # Directory that receives checkpoints and the final model.
    output_dir="sft_financial_full",
)

# Fix the random seed before any model or data work happens.
set_seed(config["seed"])


In [None]:
# Map each split name to the file configured for it.
data_files = dict(
    train=config["train_file"],
    validation=config["val_file"],
    test=config["test_file"],
)

# Load all three splits in one call using the "json" builder.
raw_datasets = load_dataset("json", data_files=data_files)

# Quick sanity check: the split overview, then the first training record.
print(raw_datasets)
print(raw_datasets["train"][0])


In [None]:
tokenizer = AutoTokenizer.from_pretrained(config["model_name_or_path"], use_fast=True)

# Truncation length in tokens. Read from the config when a "max_length" entry
# is present; otherwise fall back to the previous hard-coded value of 256.
MAX_SEQ_LENGTH = config.get("max_length", 256)


def preprocess_function(examples):
    """Tokenize the ``text`` column of a batch of examples.

    Args:
        examples: Batch mapping handed over by ``map(..., batched=True)``;
            it must provide a ``"text"`` entry.

    Returns:
        The tokenizer output for the batch, truncated to ``MAX_SEQ_LENGTH``
        and left unpadded (padding is applied per batch by the data collator).
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        padding=False,  # dynamic padding happens later, in the data collator
        max_length=MAX_SEQ_LENGTH,
    )

# Columns that survive tokenization; anything else in the train split is dropped.
columns_to_keep = ("text", "label")
columns_to_drop = [
    name
    for name in raw_datasets["train"].column_names
    if name not in columns_to_keep
]

# Tokenize every split in batches, discarding the unneeded columns as we go.
tokenized_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=columns_to_drop,
)
print(tokenized_datasets)


In [None]:
# Settings for the classification head: one output per sentiment label,
# trained as a single-label problem.
classifier_kwargs = {
    "num_labels": config["num_labels"],
    "problem_type": "single_label_classification",
}

# Load the configured checkpoint as a sequence-classification model.
model = AutoModelForSequenceClassification.from_pretrained(
    config["model_name_or_path"],
    **classifier_kwargs,
)


In [None]:
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        Dict with ``accuracy``, ``macro_f1`` and ``weighted_f1``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    accuracy = accuracy_metric.compute(predictions=predicted, references=labels)
    scores: Dict[str, Any] = {"accuracy": accuracy["accuracy"]}

    # Same F1 metric, two averaging modes; insertion order matches the key order
    # accuracy -> macro_f1 -> weighted_f1.
    for key, averaging in (("macro_f1", "macro"), ("weighted_f1", "weighted")):
        f1_result = f1_metric.compute(
            predictions=predicted,
            references=labels,
            average=averaging,
        )
        scores[key] = f1_result["f1"]

    return scores


In [None]:
# Pad each batch to the length of its longest sequence at collation time
# (the tokenization step deliberately left the examples unpadded).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# transformers renamed `evaluation_strategy` to `eval_strategy`, and newer
# releases reject the old keyword outright. The install cell does not pin a
# version, so pick whichever name the installed TrainingArguments accepts.
_training_arg_names = inspect.signature(TrainingArguments).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=config["output_dir"],
    learning_rate=config["learning_rate"],
    per_device_train_batch_size=config["batch_size"],
    per_device_eval_batch_size=config["batch_size"],
    num_train_epochs=config["num_epochs"],
    weight_decay=config["weight_decay"],
    warmup_ratio=config["warmup_ratio"],
    # Evaluate and checkpoint on the same schedule (once per epoch) so the
    # best checkpoint can be restored at the end of training.
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,
    load_best_model_at_end=True,
    # Must match a key returned by compute_metrics; the Trainer adds the
    # "eval_" prefix itself.
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    report_to="none",  # set to "wandb" or "tensorboard" if you want logging
    **{_eval_strategy_key: "epoch"},
)

# Likewise, Trainer's `tokenizer` argument was superseded by `processing_class`;
# use the new name when the installed Trainer offers it.
_trainer_arg_names = inspect.signature(Trainer).parameters
_tokenizer_key = (
    "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)


In [None]:
# Run fine-tuning, then persist the model and tokenizer side by side in the
# configured output directory.
train_result = trainer.train()
final_model_dir = config["output_dir"]
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

print("Training done.")
print("Final train metrics:", train_result.metrics)

# Score the trained model on the validation split.
validation_split = tokenized_datasets["validation"]
val_metrics = trainer.evaluate(eval_dataset=validation_split)
print("Validation metrics:", val_metrics)


In [None]:
# Final evaluation on the held-out test split, reported to four decimals.
test_split = tokenized_datasets["test"]
test_metrics = trainer.evaluate(eval_dataset=test_split)

print("Test metrics (financial):")
for metric_name, metric_value in test_metrics.items():
    print(f"{metric_name}: {metric_value:.4f}")
