In [None]:
from google.colab import drive

# Mount Google Drive so the checkpoint, JSONL splits and output directory
# referenced by the notebook's /content/drive/... paths are reachable.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
!pip install -q transformers datasets accelerate evaluate


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip install -q transformers datasets scikit-learn


In [None]:
import numpy as np
from datasets import load_dataset, DatasetDict
from sklearn.metrics import accuracy_score, f1_score

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    default_data_collator,
    set_seed,
)

# Where the starting checkpoint and the labelled JSONL splits live.
# Each JSONL record carries `text`, `label` and `domain` fields.
_input_paths = {
    "model_name_or_path": "/content/drive/Shareddrives/cs685/mlm_bert_goemotions_finance",
    "train_file": "/content/drive/Shareddrives/cs685/final_data_SFT/label_financial_3_train.jsonl",
    "val_file": "/content/drive/Shareddrives/cs685/final_data_SFT/label_financial_3_val.jsonl",
    "test_file": "/content/drive/Shareddrives/cs685/final_data_SFT/label_financial_3_test.jsonl",
}

# Fine-tuning hyperparameters.
# NOTE(review): the recorded training output reports 'epoch': 5.0, so the saved
# outputs were presumably produced with a different `num_epochs` — confirm.
_hyperparameters = {
    "num_labels": 3,  # size of the classification head (e.g. 0=neg, 1=neu, 2=pos)
    "learning_rate": 1e-4,
    "batch_size": 16,
    "num_epochs": 10,
    "weight_decay": 0.01,
    "seed": 42,
}

# Single settings mapping read by every later cell; key order matches the
# grouping above, with the output directory last.
config = {
    **_input_paths,
    **_hyperparameters,
    "output_dir": "/content/drive/MyDrive/models/sft3_financial",
}

# Seed the RNGs before any model or data work happens.
set_seed(config["seed"])


In [None]:
# Map each dataset split name to the config key that holds its JSONL path.
_split_to_config_key = (
    ("train", "train_file"),
    ("validation", "val_file"),
    ("test", "test_file"),
)
data_files = {split: config[key] for split, key in _split_to_config_key}

# Load all three splits with the JSON loader, then show the resulting
# DatasetDict and the first training record as a quick sanity check.
raw_datasets = load_dataset("json", data_files=data_files)
print(raw_datasets)
print(raw_datasets["train"][0])


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'domain'],
        num_rows: 3872
    })
    validation: Dataset({
        features: ['text', 'label', 'domain'],
        num_rows: 484
    })
    test: Dataset({
        features: ['text', 'label', 'domain'],
        num_rows: 484
    })
})
{'text': 'Finnair was able to operate most of its leisure flights despite the strike .', 'label': 2, 'domain': 'FIN'}


In [None]:
# Token budget per example (tunable); it is passed to the tokenizer as both the
# truncation limit and the fixed padding length.
max_length = 256

# Load the tokenizer from the same path the model checkpoint is read from.
tokenizer = AutoTokenizer.from_pretrained(config["model_name_or_path"])

def preprocess_function(examples):
    """Tokenize a batch of examples for sequence classification.

    Args:
        examples: A batch mapping whose ``"text"`` entry holds the inputs to
            tokenize.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch, with
        truncation enabled and every sequence padded to ``max_length``.
    """
    # Fixed-length padding keeps every example the same size, so a plain
    # stacking collator is enough downstream.
    tokenizer_options = dict(
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )
    return tokenizer(examples["text"], **tokenizer_options)

# Only the raw text and the label survive tokenization; every other column
# (here: `domain`) is dropped while mapping.
_columns_to_keep = ("text", "label")
cols_to_remove = list(
    filter(
        lambda name: name not in _columns_to_keep,
        raw_datasets["train"].column_names,
    )
)

# Tokenize every split in batches.
tokenized_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=cols_to_remove,
)

print(tokenized_datasets)


Map:   0%|          | 0/3872 [00:00<?, ? examples/s]

Map:   0%|          | 0/484 [00:00<?, ? examples/s]

Map:   0%|          | 0/484 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 3872
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 484
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 484
    })
})


In [None]:
# Load the checkpoint with a classification head sized to `num_labels`.
# The recorded output for this cell lists the pooler and classifier weights as
# newly initialized, i.e. they are not restored from the checkpoint.
_checkpoint_path = config["model_name_or_path"]
_num_classes = config["num_labels"]
model = AutoModelForSequenceClassification.from_pretrained(
    _checkpoint_path,
    num_labels=_num_classes,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/drive/Shareddrives/cs685/mlm_bert_goemotions_finance and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_metrics(pred):
    """Score a batch of predictions with accuracy and F1.

    Args:
        pred: Object exposing ``predictions`` (per-class scores; the argmax is
            taken over the last axis) and ``label_ids`` (reference labels).

    Returns:
        Dict with the keys ``accuracy``, ``macro_f1`` and ``weighted_f1``.
    """
    y_true = pred.label_ids
    # Highest-scoring class per example.
    y_pred = np.argmax(pred.predictions, axis=-1)

    # Metric name -> `average` mode passed to f1_score.
    f1_modes = {
        "macro_f1": "macro",
        "weighted_f1": "weighted",
    }

    metrics = {"accuracy": accuracy_score(y_true, y_pred)}
    metrics.update(
        (name, f1_score(y_true, y_pred, average=mode))
        for name, mode in f1_modes.items()
    )
    return metrics


In [None]:
# Training configuration; every tunable value comes from `config`.
training_args = TrainingArguments(
    output_dir=config["output_dir"],
    do_train=True,
    do_eval=True,

    num_train_epochs=config["num_epochs"],
    per_device_train_batch_size=config["batch_size"],
    per_device_eval_batch_size=config["batch_size"],
    learning_rate=config["learning_rate"],
    weight_decay=config["weight_decay"],
    logging_steps=100,       # training loss is logged every 100 optimizer steps
    save_steps=500,          # simple step-based saving
    logging_dir=config["output_dir"] + "/logs",

    # TrainingArguments carries its own `seed`; pass the configured one so that
    # changing config["seed"] actually changes the training run instead of
    # silently falling back to the library default.
    seed=config["seed"],

    # The recorded run stalled on an interactive wandb prompt that was answered
    # with "Don't visualize my results". Disabling reporters up front keeps
    # Restart & Run All non-interactive. Set this to e.g. "tensorboard" if logs
    # under `logging_dir` are wanted.
    report_to="none",
)

# Trainer wiring: the validation split is the default evaluation set, and
# `default_data_collator` is used because the inputs were already padded to a
# fixed length during tokenization.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [None]:
# Run fine-tuning; the returned object exposes the aggregate training metrics.
train_result = trainer.train()

# Persist the fine-tuned model and its tokenizer side by side so the output
# directory can be loaded on its own later.
_save_dir = config["output_dir"]
trainer.save_model(_save_dir)
tokenizer.save_pretrained(_save_dir)

print("Training done.")
print("Train metrics:", train_result.metrics)

# Score the final model on the validation split.
_validation_split = tokenized_datasets["validation"]
val_metrics = trainer.evaluate(eval_dataset=_validation_split)
print("Validation metrics:", val_metrics)


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


Step,Training Loss
100,0.8745
200,0.8169
300,0.7375
400,0.6258
500,0.5934
600,0.4291
700,0.4627
800,0.3185
900,0.3145
1000,0.2778


Training done.
Train metrics: {'train_runtime': 456.1288, 'train_samples_per_second': 42.444, 'train_steps_per_second': 2.653, 'total_flos': 2546937883607040.0, 'train_loss': 0.48631761665186607, 'epoch': 5.0}


Validation metrics: {'eval_loss': 0.8264682292938232, 'eval_accuracy': 0.7293388429752066, 'eval_macro_f1': 0.6507629690353943, 'eval_weighted_f1': 0.7325672679918858, 'eval_runtime': 3.2274, 'eval_samples_per_second': 149.967, 'eval_steps_per_second': 9.605, 'epoch': 5.0}


In [None]:
# Final held-out evaluation on the test split.
test_metrics = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
print("Test metrics (financial):")
for metric_name in test_metrics:
    metric_value = test_metrics[metric_name]
    try:
        # Numeric metrics are shown with four decimals.
        formatted = f"{metric_name}: {metric_value:.4f}"
    except TypeError:
        # Values that do not support float formatting are printed as-is.
        print(metric_name, metric_value)
    else:
        print(formatted)


Test metrics (financial):
eval_loss: 0.9319
eval_accuracy: 0.7169
eval_macro_f1: 0.6702
eval_weighted_f1: 0.7154
eval_runtime: 3.2399
eval_samples_per_second: 149.3890
eval_steps_per_second: 9.5680
epoch: 5.0000
