In [None]:
from google.colab import drive

# Attach Google Drive so the checkpoint and JSONL paths under /content/drive
# used by later cells can be resolved.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Colab setup: install the Hugging Face stack. `%pip` (rather than `!pip`)
# guarantees the packages land in the environment of the running kernel.
# NOTE(review): versions are unpinned; pin them once the working set is known.
%pip install -q transformers datasets accelerate evaluate


In [None]:
# transformers and datasets are already installed by the previous cell, so
# only scikit-learn (used for accuracy / F1 metrics) needs installing here.
%pip install -q scikit-learn


In [None]:
import numpy as np
from datasets import load_dataset, DatasetDict
from sklearn.metrics import accuracy_score, f1_score

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    default_data_collator,
    set_seed,
)

# Central run configuration: every path and hyperparameter that later cells
# read lives in this one dict.
config = {
    "model_name_or_path": "/content/drive/Shareddrives/cs685/mlm_bert_goemotions_biomed",  # MLM-adapted BERT checkpoint directory; also used to load the tokenizer

    # Bio-domain JSONL splits; each record has `text`, `label` and `domain` fields
    "train_file": "/content/drive/Shareddrives/cs685/final_data_SFT/label_bio_3_train.jsonl",
    "val_file":   "/content/drive/Shareddrives/cs685/final_data_SFT/label_bio_3_val.jsonl",
    "test_file":  "/content/drive/Shareddrives/cs685/final_data_SFT/label_bio_3_test.jsonl",

    "num_labels": 3,          # e.g. 0=neg,1=neu,2=pos
    "learning_rate": 3e-5,
    "batch_size": 16,         # used for both the per-device train and eval batch size
    "num_epochs": 3,
    "weight_decay": 0.01,
    "seed": 42,
    "output_dir": "/content/drive/MyDrive/models/sft2_bio",  # checkpoints, final model, tokenizer and logs/ go here
}

# Seed the RNGs up front so that model-head initialisation is repeatable.
set_seed(config["seed"])


In [None]:
# Pair each split name with the `config` key that holds its JSONL path.
split_to_config_key = (
    ("train", "train_file"),
    ("validation", "val_file"),
    ("test", "test_file"),
)
data_files = {split: config[key] for split, key in split_to_config_key}

# The generic "json" loader handles JSON Lines files; one split per entry.
raw_datasets = load_dataset("json", data_files=data_files)

# Quick sanity check: split sizes / columns, then the first training record.
print(raw_datasets)
print(raw_datasets["train"][0])


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'domain'],
        num_rows: 12000
    })
    validation: Dataset({
        features: ['text', 'label', 'domain'],
        num_rows: 1500
    })
    test: Dataset({
        features: ['text', 'label', 'domain'],
        num_rows: 1500
    })
})
{'text': "After the first week and a half of severe itching - so bad it disrupted sleep - and being able to feel the ring rubbing my insides I began getting cysts all over. I finally discontinued use but I'm afraid the scarring isn't improving.", 'label': 0, 'domain': 'BIO'}


In [None]:
tokenizer = AutoTokenizer.from_pretrained(config["model_name_or_path"])

max_length = 256  # token budget per example; longer texts are truncated


def preprocess_function(examples):
    """Tokenize a batch of examples.

    Args:
        examples: A batch from `Dataset.map(batched=True)`; its "text" entry
            is passed straight to the tokenizer.

    Returns:
        The tokenizer output for the batch, truncated and padded to exactly
        `max_length` tokens so the default collator can stack the rows.
    """
    encoded = tokenizer(
        examples["text"],
        max_length=max_length,
        padding="max_length",  # fixed-length padding, simple for old versions
        truncation=True,
    )
    return encoded


# Keep only text + label from the raw columns; everything else (i.e. domain)
# is dropped from the tokenized copy.
kept_columns = ("text", "label")
cols_to_remove = [
    name
    for name in raw_datasets["train"].column_names
    if name not in kept_columns
]

tokenized_datasets = raw_datasets.map(
    preprocess_function,
    remove_columns=cols_to_remove,
    batched=True,
)

print(tokenized_datasets)


Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 12000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1500
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1500
    })
})


In [None]:
# Build a sequence classifier from the checkpoint named in `config`.
# When this cell was run, the loader warned that the classifier head and the
# pooler were newly initialised, so the model must be fine-tuned before use.
checkpoint_path = config["model_name_or_path"]
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint_path,
    num_labels=config["num_labels"],
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/drive/Shareddrives/cs685/mlm_bert_goemotions_biomed and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_metrics(pred):
    """Compute classification metrics for a `Trainer` evaluation pass.

    Args:
        pred: Object exposing `predictions` (logits, or a tuple whose first
            element is the logits) and `label_ids` (gold class indices).

    Returns:
        dict with `accuracy`, `macro_f1` and `weighted_f1`. Keys are plain
        snake_case identifiers, so the `eval_` / `test_` prefixed names that
        `Trainer` derives from them (e.g. `eval_macro_f1`) are clean and can
        be used directly as `metric_for_best_model`.
    """
    logits = pred.predictions
    # Some models return extra outputs alongside the logits; the logits are
    # always the first element in that case.
    if isinstance(logits, tuple):
        logits = logits[0]
    labels = pred.label_ids

    preds = np.argmax(logits, axis=-1)

    return {
        "accuracy": accuracy_score(labels, preds),
        "macro_f1": f1_score(labels, preds, average="macro"),
        "weighted_f1": f1_score(labels, preds, average="weighted"),
    }


In [None]:
# Training configuration. All tunable values come from `config`; logging and
# checkpointing are step-based (a log line every 100 steps, a checkpoint
# every 500 steps under `output_dir`).
training_args = TrainingArguments(
    output_dir=config["output_dir"],
    do_train=True,
    do_eval=True,

    num_train_epochs=config["num_epochs"],
    per_device_train_batch_size=config["batch_size"],
    per_device_eval_batch_size=config["batch_size"],
    learning_rate=config["learning_rate"],
    weight_decay=config["weight_decay"],
    logging_steps=100,
    save_steps=500,          # simple step-based saving
    logging_dir=config["output_dir"] + "/logs",
    # Trainer re-seeds from `args.seed` when it is constructed; without this
    # it would silently fall back to its own default and ignore a changed
    # `config["seed"]` (data shuffling, dropout).
    seed=config["seed"],
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # NOTE(review): newer transformers releases appear to warn about this
    # argument (see the warning residue under this cell); it is kept as-is
    # for compatibility with older versions.
    tokenizer=tokenizer,
    # Inputs are already padded to a fixed length, so plain stacking suffices.
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [None]:
# Fine-tune the classifier; the returned object carries the training metrics.
train_result = trainer.train()

# Persist the final weights and the tokenizer side by side so the directory
# can be reloaded later with `from_pretrained`.
final_model_dir = config["output_dir"]
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

print("Training done.")
print("Train metrics:", train_result.metrics)

# Score the fine-tuned model on the validation split.
validation_split = tokenized_datasets["validation"]
val_metrics = trainer.evaluate(eval_dataset=validation_split)
print("Validation metrics:", val_metrics)


Step,Training Loss
100,1.1172
200,1.0824
300,1.0579
400,1.0386
500,0.9866
600,1.0021
700,0.9876
800,0.9396
900,0.9426
1000,0.9066


Training done.
Train metrics: {'train_runtime': 687.0281, 'train_samples_per_second': 52.4, 'train_steps_per_second': 3.275, 'total_flos': 4736041519104000.0, 'train_loss': 0.8993934427897136, 'epoch': 3.0}


Validation metrics: {'eval_loss': 0.989325761795044, 'eval_Accuracy: ': 0.5373333333333333, 'eval_Macro F1: ': 0.5364189132939272, 'eval_Weighted F1: ': 0.5364189132939272, 'eval_runtime': 10.1881, 'eval_samples_per_second': 147.231, 'eval_steps_per_second': 9.226, 'epoch': 3.0}


In [None]:
# Held-out evaluation on the test split, printed one metric per line.
test_metrics = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
print("Test metrics (Bio):")
for k, v in test_metrics.items():
    try:
        print(f"{k}: {v:.4f}")
    except (TypeError, ValueError):
        # ".4f" raises TypeError for None / sequences and ValueError for
        # strings; fall back to the raw value in both cases.
        print(k, v)


Test metrics (Bio):
eval_loss: 0.9573
eval_Accuracy: : 0.5513
eval_Macro F1: : 0.5521
eval_Weighted F1: : 0.5521
eval_runtime: 10.3748
eval_samples_per_second: 144.5810
eval_steps_per_second: 9.0600
epoch: 3.0000


In [None]:
import json
import numpy as np

# ---- 1. Evaluate and get predictions ----
pred_output = trainer.predict(tokenized_datasets["test"])
test_metrics = pred_output.metrics

print("Test metrics (bio):")
for k, v in test_metrics.items():
    try:
        print(f"{k}: {v:.4f}")
    except (TypeError, ValueError):
        # ".4f" raises TypeError for None / sequences and ValueError for
        # strings; fall back to the raw value in both cases.
        print(k, v)

# predictions and true labels
logits = pred_output.predictions
all_preds = np.argmax(logits, axis=-1)
labels = pred_output.label_ids

# ---- 2. Misclassified indices ----
mis_idx = np.where(labels != all_preds)[0]
print(f"Total misclassified examples: {len(mis_idx)}")

# A local generator keeps the sample reproducible without reseeding NumPy's
# global RNG; with seed 42 it draws the same sample as
# `np.random.seed(42); np.random.choice(...)`.
rng = np.random.RandomState(config["seed"])
sample_size = min(100, len(mis_idx))
sample_idx = rng.choice(mis_idx, size=sample_size, replace=False)

print(f"Sampling {sample_size} misclassified examples for manual error analysis.")

# ---- 3. Build error samples from raw_datasets["test"] ----
# The tokenized split had its `domain` column removed during preprocessing,
# so reading from it always produced an empty domain. The raw split still
# has text, label and domain, and `map` preserves row order, so the same
# index addresses the same example.
error_samples = []
test_split = raw_datasets["test"]
assert len(test_split) == len(all_preds), "predictions are not aligned with the raw test split"

for idx in sample_idx:
    idx = int(idx)  # numpy integer -> plain int for indexing and JSON
    ex = test_split[idx]

    item = {
        "dataset_index": idx,
        "text": ex["text"],
        "true_label": int(ex["label"]),
        "pred_label": int(all_preds[idx]),
        "domain": ex.get("domain", ""),
        # The fields below are left blank for manual annotation.
        "length_category": "",
        "has_negation": "",
        "sarcastic_or_ironic": "",
        "contains_numbers": "",
        "comment": "",
    }
    error_samples.append(item)

# ---- 4. Save JSON ----
out_file = "error_analysis_sft2.json"
with open(out_file, "w", encoding="utf-8") as f:
    json.dump(error_samples, f, ensure_ascii=False, indent=2)

print(f"Saved sampled misclassified examples to {out_file}")

Test metrics (bio):
test_loss: 0.9573
test_Accuracy: : 0.5513
test_Macro F1: : 0.5521
test_Weighted F1: : 0.5521
test_runtime: 10.5708
test_samples_per_second: 141.9000
test_steps_per_second: 8.8920
Total misclassified examples: 673
Sampling 100 misclassified examples for manual error analysis.
Saved sampled misclassified examples to error_analysis_sft2.json
