In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the model, data and output paths
# configured later in this notebook all live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -q transformers datasets accelerate evaluate


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip install -q transformers datasets scikit-learn


In [None]:
import numpy as np
from datasets import load_dataset, DatasetDict
from sklearn.metrics import accuracy_score, f1_score

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    default_data_collator,
    set_seed,
)

# Single source of truth for the run: checkpoint, data splits, hyperparameters.
config = dict(
    # Checkpoint the tokenizer and classifier are loaded from (the MLM model).
    model_name_or_path="/content/drive/Shareddrives/cs685/mlm_bert_goemotions_finance",

    # Financial JSONL splits; each record carries text, label and domain.
    train_file="/content/drive/Shareddrives/cs685/final_data_SFT/label_financial_3_train.jsonl",
    val_file="/content/drive/Shareddrives/cs685/final_data_SFT/label_financial_3_val.jsonl",
    test_file="/content/drive/Shareddrives/cs685/final_data_SFT/label_financial_3_test.jsonl",

    # Number of target classes, e.g. 0=neg, 1=neu, 2=pos.
    num_labels=3,

    # Optimisation hyperparameters.
    learning_rate=3e-5,
    batch_size=16,
    num_epochs=3,
    weight_decay=0.01,

    # Reproducibility and where the fine-tuned model is written.
    seed=42,
    output_dir="/content/drive/MyDrive/models/sft3_financial",
)

# Apply the configured seed up front, before any data or model work, so that
# repeated runs of the notebook start from the same random state.
set_seed(config["seed"])


In [None]:
# Pair every split name with the config key that holds its JSONL path.
data_files = {
    split: config[path_key]
    for split, path_key in (
        ("train", "train_file"),
        ("validation", "val_file"),
        ("test", "test_file"),
    )
}

# Load all three splits with the generic JSON loader, then show the resulting
# DatasetDict followed by the first training record as a sanity check.
raw_datasets = load_dataset("json", data_files=data_files)
print(raw_datasets, raw_datasets["train"][0], sep="\n")


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'domain'],
        num_rows: 12000
    })
    validation: Dataset({
        features: ['text', 'label', 'domain'],
        num_rows: 1500
    })
    test: Dataset({
        features: ['text', 'label', 'domain'],
        num_rows: 1500
    })
})
{'text': 'ECB bought record debt volumes last week in crisis fight #economy #MarketScreener https://t.co/08TD4PqNna https://t.co/5ct4SutY3Q', 'label': 2, 'domain': 'FIN'}


In [None]:
from transformers import Trainer
import torch

class WeightedTrainer(Trainer):
    """Trainer that applies a class-weighted cross-entropy loss.

    Args:
        class_weights: Optional per-class weights of shape ``[num_labels]``.
            May be a tensor or anything ``torch.as_tensor`` accepts (list,
            NumPy array). ``None`` gives plain, unweighted cross-entropy.
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Normalise once to a float32 tensor so compute_loss can rely on
        # tensor methods even when a list / ndarray was supplied.
        if class_weights is not None:
            class_weights = torch.as_tensor(class_weights, dtype=torch.float)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run a forward pass and return the (optionally weighted) CE loss.

        Args:
            model: Model to call; its output must expose ``.logits``.
            inputs: Batch dict. ``"labels"`` is popped so the model does not
                compute its own (unweighted) loss.
            return_outputs: If true, return ``(loss, outputs)`` instead of
                only ``loss``.
            **kwargs: Accepted and ignored so extra arguments passed by newer
                Trainer versions (e.g. ``num_items_in_batch``) do not raise.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        weight = None
        if self.class_weights is not None:
            weight = self.class_weights.to(device=logits.device, dtype=torch.float)

        # Compute the loss in float32: the weight tensor is float32, and
        # cross-entropy requires matching dtypes, which would otherwise fail
        # for half/bfloat16 logits produced outside autocast. For float32
        # logits `.float()` is a no-op.
        loss = torch.nn.functional.cross_entropy(
            logits.float().reshape(-1, logits.size(-1)),
            labels.reshape(-1),
            weight=weight,
        )
        return (loss, outputs) if return_outputs else loss


In [None]:
# Every example is truncated/padded to this many tokens; adjust as needed.
max_length = 256

# Load the tokenizer from the same checkpoint path the model is loaded from.
tokenizer = AutoTokenizer.from_pretrained(config["model_name_or_path"])

def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch into fixed-length inputs.

    Args:
        examples: Batch mapping that contains a ``"text"`` entry.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text, with
        truncation enabled and padding to ``max_length`` tokens.
    """
    # Fixed-length padding keeps every example the same size, so no dynamic
    # padding collator is needed later.
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": max_length,
    }
    return tokenizer(examples["text"], **tokenizer_options)

# Only text and label survive tokenization; every other raw column
# (here: domain) is dropped while mapping.
kept_columns = ("text", "label")
cols_to_remove = list(
    filter(lambda col: col not in kept_columns, raw_datasets["train"].column_names)
)

# Tokenize all splits in batches, removing the unwanted columns on the way.
tokenized_datasets = raw_datasets.map(
    preprocess_function, batched=True, remove_columns=cols_to_remove
)

print(tokenized_datasets)


Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 12000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1500
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1500
    })
})


In [None]:
import numpy as np

# ---- inverse-frequency class weights from the training labels ----
num_labels = int(config["num_labels"])
train_labels = np.array(tokenized_datasets["train"]["label"])

# Examples per class; classes with no examples are clamped to a count of 1
# so the reciprocal below never divides by zero.
class_counts = np.maximum(np.bincount(train_labels, minlength=num_labels), 1)

# Reciprocal of the counts, rescaled so the weights sum to num_labels.
inv_freq = 1.0 / class_counts
class_weights = torch.tensor(
    inv_freq / inv_freq.sum() * num_labels,
    dtype=torch.float,
)

print("Class counts:", class_counts)
print("Class weights:", class_weights)


Class counts: [1695 3759 6546]
Class weights: tensor([1.7545, 0.7912, 0.4543])


In [None]:
# Build the sequence classifier from the configured checkpoint. As the load
# warning shows, the pooler and classifier weights are not in the checkpoint
# and are freshly initialised, so the model must be fine-tuned before use.
checkpoint_path = config["model_name_or_path"]
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint_path, num_labels=config["num_labels"]
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/drive/Shareddrives/cs685/mlm_bert_goemotions_finance and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_metrics(pred):
    """Score predictions with accuracy and macro/weighted F1.

    Args:
        pred: Object exposing ``predictions`` (per-class scores; the argmax
            over the last axis is taken as the predicted class) and
            ``label_ids`` (the reference labels).

    Returns:
        Dict with the keys ``"Accuracy"``, ``"Macro F1"`` and ``"Weighted F1"``.
    """
    predicted = np.argmax(pred.predictions, axis=-1)
    references = pred.label_ids

    metrics = {"Accuracy": accuracy_score(references, predicted)}
    # Both F1 variants differ only in the averaging mode.
    for metric_name, averaging in (("Macro F1", "macro"), ("Weighted F1", "weighted")):
        metrics[metric_name] = f1_score(references, predicted, average=averaging)
    return metrics


In [None]:
import inspect

# Training configuration; every hyperparameter comes from `config`.
training_args = TrainingArguments(
    output_dir=config["output_dir"],
    do_train=True,
    do_eval=True,

    num_train_epochs=config["num_epochs"],
    per_device_train_batch_size=config["batch_size"],
    per_device_eval_batch_size=config["batch_size"],
    learning_rate=config["learning_rate"],
    weight_decay=config["weight_decay"],
    logging_steps=500,
    save_steps=500,          # simple step-based saving
    logging_dir=config["output_dir"] + "/logs",
    # Without this, trainer.train() stops on an interactive Weights & Biases
    # prompt, which blocks "Run all". "none" disables every external logger
    # (including TensorBoard); use "tensorboard" to write event files to
    # logging_dir instead.
    report_to="none",
)

# Trainer's `tokenizer` argument is deprecated in favour of `processing_class`
# in recent transformers releases; pick whichever the installed version
# accepts so the cell runs without the deprecation warning on new versions
# and still works on old ones.
tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = WeightedTrainer(
    class_weights=class_weights,          # per-class loss weights
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=default_data_collator,  # inputs are already padded to a fixed length
    compute_metrics=compute_metrics,
    **{tokenizer_kwarg: tokenizer},
)



  super().__init__(*args, **kwargs)


In [None]:
# Fine-tune the classifier.
train_result = trainer.train()

# Persist the final model and its tokenizer side by side in the output dir.
final_model_dir = config["output_dir"]
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

print("Training done.")
print("Train metrics:", train_result.metrics)

# Score the fine-tuned model on the validation split.
val_metrics = trainer.evaluate(eval_dataset=tokenized_datasets["validation"])
print("Validation metrics:", val_metrics)


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


Step,Training Loss
500,0.9603
1000,0.6925
1500,0.5721
2000,0.3809


Training done.
Train metrics: {'train_runtime': 831.3445, 'train_samples_per_second': 43.303, 'train_steps_per_second': 2.706, 'total_flos': 4736041519104000.0, 'train_loss': 0.6184331732855903, 'epoch': 3.0}


Validation metrics: {'eval_loss': 0.8517059087753296, 'eval_Accuracy': 0.72, 'eval_Macro F1': 0.6857875457875459, 'eval_Weighted F1': 0.7254255433455434, 'eval_runtime': 9.5547, 'eval_samples_per_second': 156.992, 'eval_steps_per_second': 9.838, 'epoch': 3.0}


In [None]:
import json
import numpy as np

# ---- 1. Evaluate and get predictions ----
pred_output = trainer.predict(tokenized_datasets["test"])
test_metrics = pred_output.metrics

print("Test metrics (financial):")
for k, v in test_metrics.items():
    try:
        print(f"{k}: {v:.4f}")
    except (TypeError, ValueError):
        # Non-numeric metric values: None raises TypeError, while strings
        # raise ValueError for the 'f' format code.
        print(k, v)

# predictions and true labels
logits = pred_output.predictions
all_preds = np.argmax(logits, axis=-1)
labels = pred_output.label_ids

# ---- 2. Misclassified indices ----
mis_idx = np.where(labels != all_preds)[0]
print(f"Total misclassified examples: {len(mis_idx)}")

# Fixed seed so the same examples are sampled on every run.
np.random.seed(42)
sample_size = min(100, len(mis_idx))
sample_idx = np.random.choice(mis_idx, size=sample_size, replace=False)

print(f"Sampling {sample_size} misclassified examples for manual error analysis.")

# ---- 3. Build error samples ----
error_samples = []
test_split = tokenized_datasets["test"]
# The tokenized split had its "domain" column removed during preprocessing,
# so reading it from there always produced an empty string. map() keeps the
# row order, so the raw split can be indexed with the same positions.
raw_test_split = raw_datasets["test"]
assert len(raw_test_split) == len(test_split), "raw and tokenized test splits are misaligned"

for idx in sample_idx:
    idx = int(idx)
    ex = raw_test_split[idx]

    item = {
        "dataset_index": idx,
        "text": ex["text"],
        "true_label": int(ex["label"]),
        "pred_label": int(all_preds[idx]),
        "domain": ex.get("domain", ""),
        # The fields below are left blank on purpose; they are filled in by
        # hand during manual annotation.
        "length_category": "",
        "has_negation": "",
        "sarcastic_or_ironic": "",
        "contains_numbers": "",
        "comment": "",
    }
    error_samples.append(item)

# ---- 4. Save JSON ----
out_file = "error_analysis_sft3.json"
with open(out_file, "w", encoding="utf-8") as f:
    json.dump(error_samples, f, ensure_ascii=False, indent=2)

print(f"Saved sampled misclassified examples to {out_file}")


Test metrics (financial):
test_loss: 0.8430
test_Accuracy: 0.7260
test_Macro F1: 0.6857
test_Weighted F1: 0.7325
test_runtime: 9.7312
test_samples_per_second: 154.1430
test_steps_per_second: 9.6600
Total misclassified examples: 411
Sampling 100 misclassified examples for manual error analysis.
Saved sampled misclassified examples to error_analysis_sft3.json


In [None]:
# Names of all parameters that will receive gradient updates.
trainable = [
    param_name
    for param_name, param in model.named_parameters()
    if param.requires_grad
]
print("Num trainable params:", len(trainable))
# Show only the first 15 names to keep the output short.
print("Example trainable params:", trainable[:15])

Num trainable params: 201
Example trainable params: ['bert.embeddings.word_embeddings.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias']


In [None]:
# Collect (element count, requires_grad) once per parameter tensor, then
# total the scalars overall and for the trainable subset.
param_sizes = [(param.numel(), param.requires_grad) for param in model.parameters()]
total_trainable = sum(size for size, needs_grad in param_sizes if needs_grad)
total_all = sum(size for size, _ in param_sizes)
print("Trainable scalars:", total_trainable)
print("Total scalars:", total_all)

Trainable scalars: 109484547
Total scalars: 109484547
