In [1]:
!pip install -q transformers datasets scikit-learn peft


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import numpy as np
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    set_seed,
)

from peft import LoraConfig, get_peft_model, TaskType
import collections


In [7]:
config = {
    # âœ… start from your FIN-SFT model
    "model_name_or_path": "/content/drive/Shareddrives/cs685/SFT3_fin_lora",

    # âœ… BIO-only train/val data
    "bio_train_file": "/content/drive/Shareddrives/cs685/final_data_SFT/label_bio_3_train.jsonl",
    "bio_val_file":   "/content/drive/Shareddrives/cs685/final_data_SFT/label_bio_3_val.jsonl",

    # âœ… MIX test data (FIN + BIO with `domain`)
    "mixed_test_file": "/content/drive/Shareddrives/cs685/final_data_SFT/label_mixed_3_test.jsonl",

    # labels: 0 = neg, 1 = neu, 2 = pos
    "num_labels": 3,

    # training
    "learning_rate": 1e-4,   # can be a bit higher for LoRA
    "batch_size": 16,
    "num_epochs": 10,
    "weight_decay": 0.01,
    "seed": 42,
    "output_dir": "/content/drive/MyDrive/models/sft_lora_fin_to_bio",

    # LoRA config (tweak if you like)
    "lora_r": 64,
    "lora_alpha": 128,
    "lora_dropout": 0.1,
}

set_seed(config["seed"])


In [8]:
# BIO train/val
bio_files = {
    "train": config["bio_train_file"],
    "validation": config["bio_val_file"],
}
bio_raw = load_dataset("json", data_files=bio_files)

# MIX test (FIN + BIO)
test_raw = load_dataset(
    "json",
    data_files={"test": config["mixed_test_file"]}
)["test"]

print("BIO example:", bio_raw["train"][0])
print("MIX example:", test_raw[0])


BIO example: {'text': "After the first week and a half of severe itching - so bad it disrupted sleep - and being able to feel the ring rubbing my insides I began getting cysts all over. I finally discontinued use but I'm afraid the scarring isn't improving.", 'label': 0, 'domain': 'BIO'}
MIX example: {'text': 'I have had the Implanon since August 2013 and had it removed yesterday at my GP. I have put on a ridiculous amount of weight use to be a size 6 since having it inserted I am now a 12. I couldnt cope with the irregular periods and they would last for two weeks. And then the Pre Menstrual Symptoms I would have them for about three weeks before my period which was horrible. I would cry all the time or be extremely moody', 'label': 1, 'domain': 'BIO'}


In [9]:
tokenizer = AutoTokenizer.from_pretrained(config["model_name_or_path"])
max_length = 256

def preprocess_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

# BIO train/val â€“ keep text+label
cols_to_remove_bio = [
    col for col in bio_raw["train"].column_names
    if col not in ("text", "label")
]

bio_tokenized = bio_raw.map(
    preprocess_function,
    batched=True,
    remove_columns=cols_to_remove_bio,
)

# MIX test â€“ keep text+label, preserve domains in test_raw
cols_to_remove_test = [
    col for col in test_raw.column_names
    if col not in ("text", "label", "domain")
]

test_tokenized = test_raw.map(
    preprocess_function,
    batched=True,
    remove_columns=cols_to_remove_test,
)

print(bio_tokenized)
print(test_tokenized)


Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 12000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1500
    })
})
Dataset({
    features: ['text', 'label', 'domain', 'input_ids', 'attention_mask'],
    num_rows: 3000
})


In [10]:
# Load FIN-SFT model
base_model = AutoModelForSequenceClassification.from_pretrained(
    config["model_name_or_path"],
    num_labels=config["num_labels"],
)

# ðŸ”’ Freeze ALL existing parameters, including classifier head
for name, param in base_model.named_parameters():
    param.requires_grad = False

# LoRA config (SEQ_CLS)
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=config["lora_r"],
    lora_alpha=config["lora_alpha"],
    lora_dropout=config["lora_dropout"],
    target_modules=["query", "key", "value"],  # attention submodules
)

# Wrap with LoRA â€“ only LoRA weights will be trainable
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()  # sanity: only a small % trainable


trainable params: 3,541,251 || all params: 113,025,798 || trainable%: 3.1331


In [11]:
def compute_metrics(pred):
    logits = pred.predictions
    labels = pred.label_ids
    preds = np.argmax(logits, axis=-1)

    return {
        "accuracy": accuracy_score(labels, preds),
        "macro_f1": f1_score(labels, preds, average="macro"),
        "weighted_f1": f1_score(labels, preds, average="weighted"),
    }


In [12]:
training_args = TrainingArguments(
    output_dir=config["output_dir"],
    num_train_epochs=config["num_epochs"],
    per_device_train_batch_size=config["batch_size"],
    per_device_eval_batch_size=config["batch_size"],
    learning_rate=config["learning_rate"],
    weight_decay=config["weight_decay"],
    logging_steps=100,
    # no evaluation_strategy / save_strategy / load_best_model_at_end â†’ works with older 4.x
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=bio_tokenized["train"],
    eval_dataset=bio_tokenized["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [13]:
train_result = trainer.train()

# Save LoRA-adapted model
trainer.save_model(config["output_dir"])
tokenizer.save_pretrained(config["output_dir"])

print("Training done.")
print("Train metrics:", train_result.metrics)

val_metrics = trainer.evaluate(eval_dataset=bio_tokenized["validation"])
print("BIO validation metrics:")
for k, v in val_metrics.items():
    try:
        print(f"{k}: {v:.4f}")
    except TypeError:
        print(k, v)


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


Step,Training Loss
100,1.2094
200,1.0965
300,1.0771
400,1.0718
500,1.0339
600,1.038
700,1.0265
800,0.9901
900,0.9927
1000,0.9979


Training done.
Train metrics: {'train_runtime': 970.8862, 'train_samples_per_second': 123.598, 'train_steps_per_second': 7.725, 'total_flos': 1.6439528448e+16, 'train_loss': 0.8214111048380534, 'epoch': 10.0}


BIO validation metrics:
eval_loss: 1.1544
eval_accuracy: 0.5220
eval_macro_f1: 0.5228
eval_weighted_f1: 0.5228
eval_runtime: 5.6167
eval_samples_per_second: 267.0630
eval_steps_per_second: 16.7360
epoch: 10.0000


In [14]:
# Predict on MIX test set
test_predictions = trainer.predict(test_tokenized)
logits = test_predictions.predictions
all_preds = np.argmax(logits, axis=-1)
labels = test_predictions.label_ids

print("\n=== Overall on MIX (FIN + BIO) ===")
print("Accuracy:", accuracy_score(labels, all_preds))
print("Macro F1:", f1_score(labels, all_preds, average="macro"))
print("Weighted F1:", f1_score(labels, all_preds, average="weighted"))

# Domain-sliced metrics
domains = [ex.get("domain", "UNK") for ex in test_raw]
domain_counts = collections.Counter(domains)
print("\nDomain counts in MIX test:", domain_counts)

for dom in ["FIN", "BIO"]:
    idx = [i for i, d in enumerate(domains) if d == dom]
    if not idx:
        continue
    dom_labels = labels[idx]
    dom_preds  = all_preds[idx]
    print(f"\nDomain: {dom}")
    print("  Accuracy:", accuracy_score(dom_labels, dom_preds))
    print("  Macro F1:", f1_score(dom_labels, dom_preds, average="macro"))



=== Overall on MIX (FIN + BIO) ===
Accuracy: 0.5006666666666667
Macro F1: 0.4935827821065424
Weighted F1: 0.5046806333138368

Domain counts in MIX test: Counter({'BIO': 1500, 'FIN': 1500})

Domain: FIN
  Accuracy: 0.4613333333333333
  Macro F1: 0.4181604147979206

Domain: BIO
  Accuracy: 0.54
  Macro F1: 0.5419475136999292
