In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install -q transformers datasets accelerate evaluate


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
!pip install -q transformers datasets scikit-learn


In [4]:
import numpy as np
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    set_seed,
)

config = {
    # ✅ start from SFT financial model (sequential FIN → BIO)
    "model_name_or_path": "/content/drive/MyDrive/models/sft3_financial",

    # ✅ BIO-only train/val data (text, label, domain="BIO")
    "bio_train_file": "/content/drive/Shareddrives/cs685/final_data_SFT/label_bio_3_train.jsonl",
    "bio_val_file":   "/content/drive/Shareddrives/cs685/final_data_SFT/label_bio_3_val.jsonl",

    # ✅ MIX test data (FIN + BIO, with `domain` column)
    "mixed_test_file": "/content/drive/Shareddrives/cs685/final_data_SFT/label_mixed_3_test.jsonl",

    "num_labels": 3,
    "learning_rate": 3e-5,
    "batch_size": 16,
    "num_epochs": 3,
    "weight_decay": 0.01,
    "seed": 42,
    "output_dir": "/content/drive/MyDrive/models/sft8_financial_to_bio",
}

set_seed(config["seed"])


In [5]:
bio_files = {
    "train": config["bio_train_file"],
    "validation": config["bio_val_file"],
}

bio_raw = load_dataset("json", data_files=bio_files)
print(bio_raw)
print("BIO example:", bio_raw["train"][0])


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'domain'],
        num_rows: 12000
    })
    validation: Dataset({
        features: ['text', 'label', 'domain'],
        num_rows: 1500
    })
})
BIO example: {'text': "After the first week and a half of severe itching - so bad it disrupted sleep - and being able to feel the ring rubbing my insides I began getting cysts all over. I finally discontinued use but I'm afraid the scarring isn't improving.", 'label': 0, 'domain': 'BIO'}


In [6]:
test_raw = load_dataset(
    "json",
    data_files={"test": config["mixed_test_file"]}
)["test"]

print("MIX example:", test_raw[0])


Generating test split: 0 examples [00:00, ? examples/s]

MIX example: {'text': 'I have had the Implanon since August 2013 and had it removed yesterday at my GP. I have put on a ridiculous amount of weight use to be a size 6 since having it inserted I am now a 12. I couldnt cope with the irregular periods and they would last for two weeks. And then the Pre Menstrual Symptoms I would have them for about three weeks before my period which was horrible. I would cry all the time or be extremely moody', 'label': 1, 'domain': 'BIO'}


In [7]:
tokenizer = AutoTokenizer.from_pretrained(config["model_name_or_path"])

max_length = 256

def preprocess_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",  # fixed padding → simpler for old versions
        max_length=max_length,
    )

# BIO train/val
cols_to_remove_bio = [
    col for col in bio_raw["train"].column_names
    if col not in ("text", "label")
]

bio_tokenized = bio_raw.map(
    preprocess_function,
    batched=True,
    remove_columns=cols_to_remove_bio,
)

# MIX test
cols_to_remove_test = [
    col for col in test_raw.column_names
    if col not in ("text", "label", "domain")
]

test_tokenized = test_raw.map(
    preprocess_function,
    batched=True,
    remove_columns=cols_to_remove_test,
)

print(bio_tokenized)
print(test_tokenized)


Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 12000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1500
    })
})
Dataset({
    features: ['text', 'label', 'domain', 'input_ids', 'attention_mask'],
    num_rows: 3000
})


In [8]:
model = AutoModelForSequenceClassification.from_pretrained(
    config["model_name_or_path"],
    num_labels=config["num_labels"],
)


In [9]:
def compute_metrics(pred):
    logits = pred.predictions
    labels = pred.label_ids
    preds = np.argmax(logits, axis=-1)

    return {
        "accuracy": accuracy_score(labels, preds),
        "macro_f1": f1_score(labels, preds, average="macro"),
        "weighted_f1": f1_score(labels, preds, average="weighted"),
    }


In [10]:
training_args = TrainingArguments(
    output_dir=config["output_dir"],
    num_train_epochs=config["num_epochs"],
    per_device_train_batch_size=config["batch_size"],
    per_device_eval_batch_size=config["batch_size"],
    learning_rate=config["learning_rate"],
    weight_decay=config["weight_decay"],
    logging_steps=100,
    # ✅ no evaluation_strategy, save_strategy, etc.
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=bio_tokenized["train"],
    eval_dataset=bio_tokenized["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [11]:
train_result = trainer.train()

trainer.save_model(config["output_dir"])
tokenizer.save_pretrained(config["output_dir"])

print("Training done.")
print("Train metrics:", train_result.metrics)

val_metrics = trainer.evaluate(eval_dataset=bio_tokenized["validation"])
print("BIO validation metrics:")
for k, v in val_metrics.items():
    try:
        print(f"{k}: {v:.4f}")
    except TypeError:
        print(k, v)


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


Step,Training Loss
100,1.1297
200,1.0938
300,1.0358
400,1.0509
500,0.9974
600,0.9981
700,0.9947
800,0.9402
900,0.9217
1000,0.9199


Training done.
Train metrics: {'train_runtime': 860.0816, 'train_samples_per_second': 41.856, 'train_steps_per_second': 2.616, 'total_flos': 4736041519104000.0, 'train_loss': 0.9037851986355252, 'epoch': 3.0}


BIO validation metrics:
eval_loss: 1.0237
eval_accuracy: 0.5240
eval_macro_f1: 0.5241
eval_weighted_f1: 0.5241
eval_runtime: 9.7776
eval_samples_per_second: 153.4130
eval_steps_per_second: 9.6140
epoch: 3.0000


In [12]:
# 1) Get logits on MIX test set
test_predictions = trainer.predict(test_tokenized)
logits = test_predictions.predictions
all_preds = np.argmax(logits, axis=-1)
labels = test_predictions.label_ids

# 2) Overall metrics on MIX
print("\n=== Overall on MIX (FIN + BIO) ===")
print("Accuracy:", accuracy_score(labels, all_preds))
print("Macro F1:", f1_score(labels, all_preds, average="macro"))
print("Weighted F1:", f1_score(labels, all_preds, average="weighted"))

# 3) Per-domain metrics using your snippet
domains = [ex.get("domain", "UNK") for ex in test_raw]

import collections
domain_counts = collections.Counter(domains)
print("\nDomain counts in MIX test:", domain_counts)

for dom in ["FIN", "BIO"]:
    idx = [i for i, d in enumerate(domains) if d == dom]
    if not idx:
        continue
    dom_labels = labels[idx]
    dom_preds  = all_preds[idx]
    print(f"\nDomain: {dom}")
    print("  Accuracy:", accuracy_score(dom_labels, dom_preds))
    print("  Macro F1:", f1_score(dom_labels, dom_preds, average="macro"))



=== Overall on MIX (FIN + BIO) ===
Accuracy: 0.47933333333333333
Macro F1: 0.47840044302692025
Weighted F1: 0.48158838816950494

Domain counts in MIX test: Counter({'BIO': 1500, 'FIN': 1500})

Domain: FIN
  Accuracy: 0.3993333333333333
  Macro F1: 0.38011305147212165

Domain: BIO
  Accuracy: 0.5593333333333333
  Macro F1: 0.5599098666494581
