In [None]:
# Mount Google Drive at /content/drive; every data/model path used further down
# in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install -q transformers safetensors torch

In [None]:
# Checkpoint directory that the inspection cells below load with
# AutoConfig / AutoTokenizer / AutoModelForSequenceClassification.
MODEL_DIR = "/content/drive/Shareddrives/cs685/mlm_bert_goemotions_biomed_finance"

In [None]:
!pip install -q --upgrade "transformers>=4.30.0" datasets safetensors pyyaml accelerate


In [None]:
# Show which transformers release the pip cells above actually installed.
import transformers
print(transformers.__version__)

4.57.3


In [None]:
from transformers import AutoConfig

# Read only the checkpoint's config to see which architecture it was saved as.
config = AutoConfig.from_pretrained(MODEL_DIR)
print(
    f"model_type: {config.model_type}",
    f"architectures: {config.architectures}",
    sep="\n",
)

model_type: bert
architectures: ['BertForMaskedLM']


In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,  # change if config says something else
)
import torch

# Run on the GPU when the runtime has one, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the tokenizer and wrap the checkpoint in a sequence-classification model.
# The config printed above reports a BertForMaskedLM checkpoint, so the loader
# warns that the pooler and classifier weights are newly initialised here.
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR).to(device)
model.eval()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/drive/Shareddrives/cs685/mlm_bert_goemotions_biomed_finance and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
!pip install -q transformers safetensors torch


In [None]:
# Class index -> human-readable sentiment name (index order matters).
id2label = dict(enumerate(("negative", "neutral", "positive")))

In [None]:
!pip install -q transformers datasets safetensors pyyaml accelerate


In [None]:
from collections import Counter
from pathlib import Path

import yaml
import torch
from datasets import load_dataset, concatenate_datasets
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    set_seed,
)



In [None]:
def load_config(path: str):
    """Read a YAML configuration file and return its parsed content.

    Args:
        path: Location of the YAML file.

    Returns:
        Whatever ``yaml.safe_load`` produces for the document.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)


def load_jsonl_datasets(files):
    """Load one or more JSONL files into a single HF Dataset.

    Args:
        files: A single path (``str`` or ``os.PathLike``) or an iterable of
            such paths.

    Returns:
        The dataset loaded from the only file, or the concatenation of the
        per-file datasets in the order the files were given.

    Raises:
        ValueError: If ``files`` contains no entries.
    """
    import os

    # A lone path (string or Path-like) is treated as a one-element list.
    if isinstance(files, (str, os.PathLike)):
        files = [files]

    # Path-like objects are converted to plain strings before being handed to
    # the loader; anything else is passed through untouched.
    paths = [os.fspath(f) if isinstance(f, os.PathLike) else f for f in files]
    if not paths:
        raise ValueError("load_jsonl_datasets() needs at least one JSONL file, got none")

    dsets = [load_dataset("json", data_files=p, split="train") for p in paths]
    return dsets[0] if len(dsets) == 1 else concatenate_datasets(dsets)


from transformers import Trainer
import torch

class WeightedTrainer(Trainer):
    """Trainer that applies a class-weighted cross-entropy loss.

    Args:
        class_weights: Optional 1-D tensor holding one weight per label
            (shape ``[num_labels]``). ``None`` gives plain cross-entropy.
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # tensor shape: [num_labels]
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the (optionally class-weighted) cross-entropy for one batch.

        Args:
            model: Called as ``model(**inputs)``; its output must expose
                ``.logits``.
            inputs: Mapping of batch tensors. The ``"labels"`` entry is
                removed from it in place.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            **kwargs: Accepted and ignored, so extra arguments passed by
                newer Trainer versions (e.g. ``num_items_in_batch``) do not
                raise.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs``
            is true.
        """
        # Pull the labels out so they are not forwarded to the model call.
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Compute the loss in float32 regardless of the model's dtype, and
        # bring the weights to the same device/dtype. A float32 weight tensor
        # combined with half-precision logits would otherwise be a dtype
        # mismatch inside the loss.
        flat_logits = logits.float().reshape(-1, logits.size(-1))
        weight = None
        if self.class_weights is not None:
            weight = self.class_weights.to(
                device=flat_logits.device, dtype=flat_logits.dtype
            )

        loss = torch.nn.functional.cross_entropy(
            flat_logits,
            labels.view(-1),
            weight=weight,
        )
        return (loss, outputs) if return_outputs else loss



In [None]:
# Fine-tuning settings, consumed by the training cell below.
# NOTE(review): MODEL_DIR near the top of the notebook ends in
# "_biomed_finance" while model_name_or_path here ends in "_biomed";
# confirm which checkpoint is meant to be fine-tuned.
config = dict(
    model_name_or_path="/content/drive/Shareddrives/cs685/mlm_bert_goemotions_biomed",
    output_dir="/content/drive/MyDrive/models/result",
    num_labels=3,
    train_files=[
        "/content/drive/Shareddrives/cs685/final_data_SFT/label_mixed_3_train.jsonl",
    ],
    eval_files=[
        "/content/drive/Shareddrives/cs685/final_data_SFT/label_mixed_3_val.jsonl",
    ],
    learning_rate=5e-5,
    batch_size=8,
    epochs=3,
    use_lora=False,
    use_class_weights=True,
)

In [None]:
# -------- read config values --------
model_path = config["model_name_or_path"]
output_dir = config["output_dir"]
num_labels = int(config["num_labels"])
use_lora = bool(config.get("use_lora", False))  # currently unused
use_class_weights = bool(config.get("use_class_weights", True))
seed = int(config.get("seed", 42))  # optional config key; defaults to 42

# Seed the RNGs *before* the model is built. The loader reports the classifier
# head as newly initialised and the embeddings for the added tokens as randomly
# drawn, so without a seed two runs of this cell start from different weights.
set_seed(seed)

tokenizer = AutoTokenizer.from_pretrained(model_path)

# 1) Add domain tags as special tokens
special_tokens_dict = {"additional_special_tokens": ["[FIN]", "[BIO]"]}
num_added = tokenizer.add_special_tokens(special_tokens_dict)
print("Added tokens:", num_added, tokenizer.additional_special_tokens)

base_model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=num_labels,
    ignore_mismatched_sizes=True,
)

# 2) Resize embeddings so the model can use the new tokens
if num_added > 0:
    base_model.resize_token_embeddings(len(tokenizer))

model = base_model

# -------- load data --------
train_files = config["train_files"]
eval_files = config["eval_files"]

train_ds = load_jsonl_datasets(train_files)
eval_ds = load_jsonl_datasets(eval_files)

def encode(ex):
    """Tokenize one example, prepending its domain tag when it has one.

    Reads ``ex["text"]``, ``ex["label"]`` and the optional ``ex["domain"]``.
    A domain of "FIN" or "BIO" puts "[FIN] " / "[BIO] " in front of the text;
    any other (or missing) domain leaves the text as is.

    Returns:
        The tokenizer output (truncated to 256 tokens) with an added
        integer ``"labels"`` entry.
    """
    domain = ex.get("domain", None)
    text = ex["text"]

    # Known domains and the special token that marks them.
    for tag, prefix in (("FIN", "[FIN]"), ("BIO", "[BIO]")):
        if domain == tag:
            text = f"{prefix} {text}"
            break

    tok = tokenizer(text, truncation=True, max_length=256)
    tok["labels"] = int(ex["label"])  # assumes label is already 0/1/2
    return tok

# Encode both splits one example at a time, dropping the raw columns so only
# what encode() returns is kept.
train_ds, eval_ds = (
    split.map(encode, batched=False, remove_columns=split.column_names)
    for split in (train_ds, eval_ds)
)

# Collator built from the tokenizer; batches are padded when they are collated.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# -------- class weights --------
# Inverse-frequency weights: w_i = total / (n_classes * count_i).
# A class with no training examples is counted as 1 to avoid dividing by zero.
class_weights_tensor = None
if use_class_weights:
    counts = Counter(train_ds["labels"])
    print("Label counts in train:", counts)

    total = sum(counts.values())
    n_classes = num_labels
    weights = [total / (n_classes * counts.get(i, 1)) for i in range(n_classes)]

    class_weights_tensor = torch.tensor(weights, dtype=torch.float)
    print("Class weights:", class_weights_tensor.tolist())

# -------- metrics --------
def compute_metrics(eval_pred):
    """Return plain accuracy for a ``(logits, labels)`` pair.

    The predicted class is the arg-max over the last axis of the logits.
    """
    logits, gold = eval_pred
    predicted = logits.argmax(axis=-1)
    return {"accuracy": (predicted == gold).mean()}

# -------- training args --------
# Only the optimisation hyper-parameters are set explicitly; evaluation and
# checkpointing options are left at the TrainingArguments defaults.
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=float(config.get("learning_rate", 5e-5)),
    per_device_train_batch_size=int(config.get("batch_size", 8)),
    num_train_epochs=float(config.get("epochs", 3)),
)

trainer = WeightedTrainer(
    class_weights=class_weights_tensor,
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=data_collator,
    # `tokenizer=` is deprecated on recent transformers releases (the notebook
    # installs the latest one); `processing_class=` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

# Persist the fine-tuned weights and the tokenizer (with the added domain tags).
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print("✅ Finished SFT, saved to", output_dir)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/drive/Shareddrives/cs685/mlm_bert_goemotions_biomed and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Added tokens: 2 ['[FIN]', '[BIO]']


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/11072 [00:00<?, ? examples/s]

Map:   0%|          | 0/1384 [00:00<?, ? examples/s]

Label counts in train: Counter({1: 5894, 0: 4099, 2: 1079})
Class weights: [0.9003822207450867, 0.6261734962463379, 3.4204509258270264]


  super().__init__(*args, **kwargs)


Step,Training Loss
500,0.9391
1000,0.8688
1500,0.7735
2000,0.6527
2500,0.5983
3000,0.4924
3500,0.4257
4000,0.3838


✅ Finished SFT, saved to /content/drive/MyDrive/models/result


In [None]:
!pip install -q transformers datasets safetensors scikit-learn


In [None]:
from datasets import load_dataset

# Held-out test file; point this at a FIN-only or BIO-only file to score one domain.
TEST_PATH = "/content/drive/Shareddrives/cs685/final_data_SFT/label_mixed_3_test.jsonl"  # or FIN/BIO test

# The json loader puts a single file into a split named "train".
test_ds = load_dataset("json", split="train", data_files=TEST_PATH)

# Quick sanity check: number of rows and the first record.
len(test_ds), test_ds[0]


Generating train split: 0 examples [00:00, ? examples/s]

(1384,
 {'text': "We are honored to be acknowledged for our commitment to the industry , especially in Asia Pacific . ''",
  'label': 2,
  'domain': 'FIN'})

In [None]:
def add_domain_prefix(example):
    """Return the example's text, preceded by its domain tag when known.

    "FIN" maps to "[FIN] <text>" and "BIO" to "[BIO] <text>"; any other or
    missing ``"domain"`` value returns the text unchanged.
    """
    domain = example.get("domain", None)

    # Known domains and the special token that marks them.
    for tag, prefix in (("FIN", "[FIN]"), ("BIO", "[BIO]")):
        if domain == tag:
            return f"{prefix} {example['text']}"

    return example["text"]


In [None]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# Build the inputs the same way as during fine-tuning: domain tag + text.
texts = [add_domain_prefix(ex) for ex in test_ds]
labels = [int(ex["label"]) for ex in test_ds]

batch_size = 32
all_preds = []

# trainer.train() leaves the network in training mode. Switch to eval mode so
# dropout is disabled and the predictions are deterministic.
model.eval()

for i in range(0, len(texts), batch_size):
    batch_texts = texts[i:i+batch_size]
    # Send the batch to wherever the model currently lives instead of relying
    # on a `device` variable defined in an earlier cell.
    inputs = tokenizer(
        batch_texts,
        return_tensors="pt",
        truncation=True,
        max_length=256,
        padding=True,
    ).to(model.device)

    with torch.no_grad():
        logits = model(**inputs).logits
        preds = logits.argmax(dim=-1).cpu().tolist()

    all_preds.extend(preds)

# Arrays (not lists) so later cells can index them by position.
labels = np.array(labels)
all_preds = np.array(all_preds)

acc = accuracy_score(labels, all_preds)
f1_macro = f1_score(labels, all_preds, average="macro")
f1_weighted = f1_score(labels, all_preds, average="weighted")

print(f"Accuracy:      {acc:.4f}")
print(f"Macro F1:      {f1_macro:.4f}")
print(f"Weighted F1:   {f1_weighted:.4f}")


Accuracy:      0.7211
Macro F1:      0.6979
Weighted F1:   0.7208


In [None]:
import collections

# Domain of every test example, in the same order as `labels` / `all_preds`.
domains = [ex.get("domain", "UNK") for ex in test_ds]

# Report accuracy and macro-F1 separately for each known domain; a domain with
# no test examples is skipped.
for dom in ("FIN", "BIO"):
    idx = [pos for pos, name in enumerate(domains) if name == dom]
    if idx:
        dom_labels, dom_preds = labels[idx], all_preds[idx]
        print(f"\nDomain: {dom}")
        print("  Accuracy:", accuracy_score(dom_labels, dom_preds))
        print("  Macro F1:", f1_score(dom_labels, dom_preds, average="macro"))



Domain: FIN
  Accuracy: 0.7169421487603306
  Macro F1: 0.659546448309158

Domain: BIO
  Accuracy: 0.7233333333333334
  Macro F1: 0.723004700638659
