In [None]:
!pip install -q transformers datasets safetensors scikit-learn pyyaml peft

In [None]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
config = {
    "model_name_or_path": "/content/drive/Shareddrives/cs685/mlm_bert_goemotions_biomed",
    "output_dir": "/content/drive/MyDrive/models/bio_lora_sft",
    "num_labels": 3,

    "train_files": [
        "/content/drive/Shareddrives/cs685/final_data_SFT/label_bio_3_train.jsonl",
    ],
    "eval_files": [
        "/content/drive/Shareddrives/cs685/final_data_SFT/label_bio_3_val.jsonl",
    ],
    "test_files": [
        "/content/drive/Shareddrives/cs685/final_data_SFT/label_bio_3_test.jsonl",
    ],

    "learning_rate": 1e-4,
    "batch_size": 16,
    "epochs": 10,

    "use_lora": True,
    "use_class_weights": True,

    # LoRA config (you can tweak these)
    "lora_r": 64,
    "lora_alpha": 128,
    "lora_dropout": 0.1,
}


In [None]:
import os
from collections import Counter
from pathlib import Path

os.environ["WANDB_DISABLED"] = "true"  # disable wandb prompt

import torch
from datasets import load_dataset, concatenate_datasets
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)
from sklearn.metrics import accuracy_score, f1_score
from peft import LoraConfig, get_peft_model, TaskType


In [None]:
def load_jsonl_datasets(files):
    """Load one or more JSONL files into a single HF Dataset."""
    if isinstance(files, str):
        files = [files]
    dsets = [load_dataset("json", data_files=f, split="train") for f in files]
    return dsets[0] if len(dsets) == 1 else concatenate_datasets(dsets)


In [None]:
class WeightedTrainer(Trainer):
    """Trainer that applies a class-weighted cross-entropy loss."""

    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights  # tensor [num_labels]

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        if self.class_weights is not None:
            weight = self.class_weights.to(logits.device)
            loss_fct = torch.nn.CrossEntropyLoss(weight=weight)
        else:
            loss_fct = torch.nn.CrossEntropyLoss()

        loss = loss_fct(
            logits.view(-1, self.model.config.num_labels),
            labels.view(-1),
        )
        return (loss, outputs) if return_outputs else loss


In [None]:
# -------- read config values --------
model_path = config["model_name_or_path"]
output_dir = config["output_dir"]
num_labels = int(config["num_labels"])
use_lora = bool(config.get("use_lora", False))
use_class_weights = bool(config.get("use_class_weights", True))

# tokenizer (no special domain tokens needed now)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# base model
base_model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=num_labels,
    ignore_mismatched_sizes=True,
)

# wrap with LoRA if enabled
if use_lora:
    lora_cfg = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=int(config.get("lora_r", 64)),
        lora_alpha=int(config.get("lora_alpha", 128)),
        lora_dropout=float(config.get("lora_dropout", 0.1)),
        bias="none",
        target_modules=["query", "key", "value"],  # attention linears for BERT-like models
    )
    model = get_peft_model(base_model, lora_cfg)
    model.print_trainable_parameters()
else:
    model = base_model


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/drive/Shareddrives/cs685/mlm_bert_goemotions_biomed and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 3,541,251 || all params: 113,025,798 || trainable%: 3.1331


In [None]:
# -------- load data --------
train_files = config["train_files"]
eval_files = config["eval_files"]

train_ds = load_jsonl_datasets(train_files)
eval_ds = load_jsonl_datasets(eval_files)

def encode(ex):
    # financial-only, ignore 'domain'
    text = ex["text"]
    tok = tokenizer(text, truncation=True, max_length=256)
    tok["labels"] = int(ex["label"])  # works if label is "0"/"1"/"2" or 0/1/2
    return tok

train_ds = train_ds.map(encode, batched=False, remove_columns=train_ds.column_names)
eval_ds = eval_ds.map(encode, batched=False, remove_columns=eval_ds.column_names)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [None]:
# -------- class weights --------
class_weights_tensor = None
if use_class_weights:
    counts = Counter(train_ds["labels"])
    print("Label counts in train:", counts)

    total = sum(counts.values())
    n_classes = num_labels
    weights = []
    for i in range(n_classes):
        c = counts.get(i, 1)
        w = total / (n_classes * c)
        weights.append(w)
    class_weights_tensor = torch.tensor(weights, dtype=torch.float)
    print("Class weights:", class_weights_tensor.tolist())


Label counts in train: Counter({0: 4000, 1: 4000, 2: 4000})
Class weights: [1.0, 1.0, 1.0]


In [None]:
# -------- metrics --------
def compute_metrics(eval_pred):
    preds, labels = eval_pred
    preds = preds.argmax(axis=-1)
    acc = (preds == labels).mean()
    return {"accuracy": acc}


In [None]:
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=float(config.get("learning_rate", 1e-4)),
    per_device_train_batch_size=int(config.get("batch_size", 16)),
    num_train_epochs=float(config.get("epochs", 10)),
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
trainer = WeightedTrainer(
    class_weights=class_weights_tensor,
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


  super().__init__(*args, **kwargs)


Step,Training Loss
500,1.0569
1000,0.9908
1500,0.9527
2000,0.9171
2500,0.8772
3000,0.8511
3500,0.803
4000,0.7746
4500,0.7567
5000,0.7014


TrainOutput(global_step=7500, training_loss=0.7955062662760417, metrics={'train_runtime': 1560.077, 'train_samples_per_second': 76.919, 'train_steps_per_second': 4.807, 'total_flos': 1.30946152686048e+16, 'train_loss': 0.7955062662760417, 'epoch': 10.0})

In [None]:
# merge LoRA into base and save a normal HF model if possible
try:
    merged_model = trainer.model.merge_and_unload()
except AttributeError:
    merged_model = trainer.model  # fallback if peft is older

merged_model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print("✅ Finished LoRA SFT (financial-only), saved to", output_dir)


✅ Finished LoRA SFT (financial-only), saved to /content/drive/MyDrive/models/bio_lora_sft


In [None]:
# load test set
test_files = config["test_files"]
test_ds = load_jsonl_datasets(test_files)
len(test_ds), test_ds[0]


Generating train split: 0 examples [00:00, ? examples/s]

(1500,
 {'text': 'This drug is amazing. Lost 2lbs first week. No cravings.',
  'label': 2,
  'domain': 'BIO'})

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(output_dir)
test_model = AutoModelForSequenceClassification.from_pretrained(output_dir).to(device)
test_model.eval()


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
texts = [ex["text"] for ex in test_ds]
labels = [int(ex["label"]) for ex in test_ds]

batch_size = 32
all_preds = []

for i in range(0, len(texts), batch_size):
    batch_texts = texts[i:i+batch_size]
    inputs = tokenizer(
        batch_texts,
        return_tensors="pt",
        truncation=True,
        max_length=256,
        padding=True,
    ).to(device)

    with torch.no_grad():
        logits = test_model(**inputs).logits
        preds = logits.argmax(dim=-1).cpu().tolist()

    all_preds.extend(preds)

import numpy as np
labels = np.array(labels)
all_preds = np.array(all_preds)

acc = accuracy_score(labels, all_preds)
f1_macro = f1_score(labels, all_preds, average="macro")
f1_weighted = f1_score(labels, all_preds, average="weighted")

print(f"Accuracy:    {acc:.4f}")
print(f"Macro F1:    {f1_macro:.4f}")
print(f"Weighted F1: {f1_weighted:.4f}")


Accuracy:    0.5340
Macro F1:    0.5339
Weighted F1: 0.5339


collect error data for analysis

In [None]:
import json
import numpy as np

mis_idx = np.where(labels != all_preds)[0]
print(f"Total misclassified examples: {len(mis_idx)}")

np.random.seed(42)
sample_size = min(100, len(mis_idx))
sample_idx = np.random.choice(mis_idx, size=sample_size, replace=False)

print(f"Sampling {sample_size} misclassified examples for manual error analysis.")

# build list of dicts for JSON
error_samples = []

for idx in sample_idx:
    idx = int(idx)
    ex = test_ds[idx]

    item = {
        "dataset_index": idx,
        "text": ex["text"],
        "true_label": int(ex["label"]),
        "pred_label": int(all_preds[idx]),
        # optional:
        "domain": ex.get("domain", ""),
        # fields you can fill in manually later
        "length_category": "",      # short / medium / long
        "has_negation": "",         # yes / no
        "sarcastic_or_ironic": "",  # yes / no
        "contains_numbers": "",     # yes / no
        "comment": "",              # free-form notes
    }
    error_samples.append(item)

# save to JSON (pretty-printed)
with open("error_analysis_sft5.json", "w", encoding="utf-8") as f:
    json.dump(error_samples, f, ensure_ascii=False, indent=2)

print("Saved sampled misclassified examples to error_analysis_sft6.json")


Total misclassified examples: 699
Sampling 100 misclassified examples for manual error analysis.
Saved sampled misclassified examples to error_analysis_sft6.json


In [None]:
from collections import Counter

labels = [int(ex["label"]) for ex in test_ds]
counts = Counter(labels)
maj_label, maj_count = counts.most_common(1)[0]

maj_acc = maj_count / len(labels)
print("Majority baseline accuracy (bio):", maj_acc)

Majority baseline accuracy (bio): 0.3333333333333333
