### Model Fine-Tuning
BERT base uncased

In [2]:
!pip install transformers datasets peft evaluate accelerate bitsandbytes pandas -q
!pip install optuna scikit-learn -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import os
os.environ["WANDB_DISABLED"] = "true"

from google.colab import drive
import os
import shutil

# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


#### Basic fine-tuning script

In [11]:
import os
import argparse
from typing import Dict, Any, List

import torch
from datasets import load_dataset, Dataset, ClassLabel, Features, Value
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
from peft import LoraConfig, get_peft_model, TaskType
import evaluate
import numpy as np
import pandas as pd
import json

# -------------------------
# Configuration
# -------------------------
class SimpleArgs:
    csv = "/content/drive/MyDrive/ANLP Assignment 1/mt_bench_training.csv"
    output_dir = "./model_adapter"
    model_name = "bert-base-uncased"  # <-- switched to encoder-only
    batch_size = 8
    epochs = 6
    lr = 2e-5
    seed = 42
    max_input_length = 512
    lora_r = 16
    lora_alpha = 32
    lora_dropout = 0.05
    save_total_limit = 2
    eval_steps = 10
    logging_steps = 10
    seed_data_split = 42
    test_size = 0.1
    validation_size = 0.1

args = SimpleArgs()

# -------------------------
# Utilities
# -------------------------
def build_input_texts_from_columns(examples: Dict[str, List], tokenizer) -> List[str]:
    text_inputs = []
    sep_token = tokenizer.sep_token if tokenizer.sep_token is not None else " "

    for i in range(len(examples["turn"])):
        turn = int(examples["turn"][i])
        q1 = str(examples.get("turn_1_query", [""])[i]).strip()

        if turn == 1:
            text = f"Query: {q1}"
        elif turn == 2:
            ans = str(examples.get("turn_1_answer", [""])[i]).strip()
            q2 = str(examples.get("turn_2_query", [""])[i]).strip()
            text = f"Query: {q1}{sep_token}Answer: {ans}{sep_token}Follow-up Query: {q2}"
        else:
            text = f"Query: {q1}"
        text_inputs.append(text)
    return text_inputs


def preprocess_function(examples, tokenizer, args):
    text_inputs = build_input_texts_from_columns(examples, tokenizer)

    model_inputs = tokenizer(
        text_inputs,
        max_length=args.max_input_length,
        truncation=True,
        padding=False,
    )
    model_inputs["labels"] = examples["label"]
    return model_inputs


# -------------------------
# Compute Metrics
# -------------------------
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        logits = preds[0]
    else:
        logits = preds

    pred_ids = np.argmax(logits, axis=1)
    acc = accuracy_metric.compute(predictions=pred_ids, references=labels)
    return {"accuracy": acc["accuracy"]}

# -------------------------
# Main Logic
# -------------------------
def main():
    torch.manual_seed(args.seed)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"--- Using device: {device.upper()} ---")

    if not os.path.exists(args.csv):
        raise FileNotFoundError(f"CSV file not found: {args.csv}")

    df = pd.read_csv(args.csv)

    unique_winners = sorted(df["winner"].unique().tolist())
    label2id = {label: i for i, label in enumerate(unique_winners)}
    id2label = {i: label for i, label in enumerate(unique_winners)}
    num_labels = len(unique_winners)
    print(f"Found {num_labels} unique labels: {unique_winners}")
    print(f"Label mapping: {label2id}")

    # Save the label mappings alongside the model adapter for inference
    mappings_dir = "/content/drive/MyDrive/ANLP Assignment 1"
    mappings_path = os.path.join(mappings_dir, "label_mappings.json")
    os.makedirs(mappings_dir, exist_ok=True) # Create the directory if it doesn't exist
    with open(mappings_path, "w") as f:
        json.dump({"id2label": id2label, "label2id": label2id}, f)
    print(f"Label mappings saved to {mappings_path}")
    # --------------------
    df['label'] = df['winner'].map(label2id)

    features = Features({
        'question_id': Value('int64'),
        'turn': Value('int64'),
        'turn_1_query': Value('string'),
        'turn_1_answer': Value('string'),
        'turn_2_query': Value('string'),
        'winner': Value('string'),
        'label': ClassLabel(names=unique_winners)
    })

    raw_all = Dataset.from_pandas(df, features=features)

    train_val_split = raw_all.train_test_split(
        test_size=args.test_size,
        seed=args.seed_data_split,
        stratify_by_column="label"
    )
    test_ds = train_val_split["test"]
    train_val_ds = train_val_split["train"]

    train_split = train_val_ds.train_test_split(
        test_size=args.validation_size,
        seed=args.seed_data_split,
        stratify_by_column="label"
    )
    train_ds = train_split["train"]
    val_ds = train_split["test"]

    print(f"Dataset splits created: train={len(train_ds)}, validation={len(val_ds)}, test={len(test_ds)}")

    tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
    )

    # For BERT, we usually target query/key/value/projection layers for LoRA
    target_modules = ["query", "key", "value", "dense"]

    peft_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=args.lora_r,
        lora_alpha=args.lora_alpha,
        lora_dropout=args.lora_dropout,
        target_modules=target_modules,
    )

    model = get_peft_model(model, peft_config)
    print("Wrapped model with LoRA. Trainable parameters:")
    model.print_trainable_parameters()

    tokenized_train = train_ds.map(lambda examples: preprocess_function(examples, tokenizer, args), batched=True)
    tokenized_val = val_ds.map(lambda examples: preprocess_function(examples, tokenizer, args), batched=True)
    tokenized_test = test_ds.map(lambda examples: preprocess_function(examples, tokenizer, args), batched=True)

    columns_to_remove = ['question_id', 'turn', 'turn_1_query', 'turn_1_answer', 'turn_2_query', 'winner', 'label']
    tokenized_train = tokenized_train.remove_columns([col for col in columns_to_remove if col in tokenized_train.column_names])
    tokenized_val = tokenized_val.remove_columns([col for col in columns_to_remove if col in tokenized_val.column_names])
    tokenized_test = tokenized_test.remove_columns([col for col in columns_to_remove if col in tokenized_test.column_names])

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    training_args = TrainingArguments(
        output_dir=args.output_dir,
        eval_strategy="steps",
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        num_train_epochs=args.epochs,
        learning_rate=args.lr,
        save_total_limit=args.save_total_limit,
        fp16=torch.cuda.is_available(),
        logging_steps=args.logging_steps,
        eval_steps=args.eval_steps,
        save_strategy="steps",
        save_steps=args.eval_steps,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        seed=args.seed,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    print("--- Starting Training ---")
    trainer.train()
    print("--- Training Finished ---")

    print("\n--- Evaluating on the held-out Test Set ---")
    test_results = trainer.evaluate(eval_dataset=tokenized_test)
    print("Test Set Metrics:")
    print(test_results)

    print("\nSaving final PEFT adapter to:", args.output_dir)
    model.save_pretrained(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)
    print("Done.")

# Run the main function
main()

--- Using device: CUDA ---
Found 6 unique labels: ['alpaca-13b', 'claude-v1', 'gpt-3.5-turbo', 'gpt-4', 'llama-13b', 'vicuna-13b-v1.2']
Label mapping: {'alpaca-13b': 0, 'claude-v1': 1, 'gpt-3.5-turbo': 2, 'gpt-4': 3, 'llama-13b': 4, 'vicuna-13b-v1.2': 5}
Label mappings saved to /content/drive/MyDrive/ANLP Assignment 1/label_mappings.json
Dataset splits created: train=3348, validation=373, test=414


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Wrapped model with LoRA. Trainable parameters:
trainable params: 2,683,398 || all params: 112,170,252 || trainable%: 2.3923


Map:   0%|          | 0/3348 [00:00<?, ? examples/s]

Map:   0%|          | 0/373 [00:00<?, ? examples/s]

Map:   0%|          | 0/414 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


--- Starting Training ---


Step,Training Loss,Validation Loss,Accuracy
10,No log,1.881034,0.203753
20,No log,1.84991,0.203753
30,No log,1.822829,0.203753
40,No log,1.797495,0.206434
50,1.880700,1.78142,0.209115


KeyboardInterrupt: 

#### Better fine-tuning strategy

In [21]:
import os
import json
from typing import Dict, Any, List

import torch
import torch.nn as nn
from datasets import Dataset, Features, Value, ClassLabel # <-- Imports are correct
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
from peft import LoraConfig, get_peft_model, TaskType
import evaluate
import numpy as np
import pandas as pd
import optuna
from sklearn.utils.class_weight import compute_class_weight

# (Configuration and utility functions remain the same)
class SimpleArgs:
    csv = "/content/drive/MyDrive/ANLP Assignment 1/mt_bench_training.csv"
    output_dir = "./hpo_results"
    model_name = "bert-base-uncased"
    batch_size = 16
    epochs = 8
    lr = 2e-5
    seed = 42
    max_input_length = 512
    lora_r = 16
    lora_alpha = 32
    lora_dropout = 0.1
    save_total_limit = 2
    eval_steps = 50
    logging_steps = 50
    seed_data_split = 42
    test_size = 0.1
    validation_size = 0.1
    n_hpo_trials = 20

args = SimpleArgs()

class WeightedLossTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = nn.CrossEntropyLoss(weight=self.class_weights.to(model.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

def build_input_texts_from_columns(examples: Dict[str, List], tokenizer) -> List[str]:
    text_inputs = []
    sep_token = tokenizer.sep_token if tokenizer.sep_token is not None else " "
    for i in range(len(examples["turn"])):
        turn, q1 = int(examples["turn"][i]), str(examples.get("turn_1_query", [""])[i]).strip()
        if turn == 2:
            ans, q2 = str(examples.get("turn_1_answer", [""])[i]).strip(), str(examples.get("turn_2_query", [""])[i]).strip()
            text = f"Query: {q1}{sep_token}Answer: {ans}{sep_token}Follow-up Query: {q2}"
        else:
            text = f"Query: {q1}"
        text_inputs.append(text)
    return text_inputs

def preprocess_function(examples, tokenizer, args):
    text_inputs = build_input_texts_from_columns(examples, tokenizer)
    model_inputs = tokenizer(text_inputs, max_length=args.max_input_length, truncation=True, padding=False)
    model_inputs["labels"] = examples["label"]
    return model_inputs

accuracy_metric = evaluate.load("accuracy")
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    logits = preds[0] if isinstance(preds, tuple) else preds
    pred_ids = np.argmax(logits, axis=1)
    acc = accuracy_metric.compute(predictions=pred_ids, references=labels)
    return {"accuracy": acc["accuracy"]}


# -------------------------
# Main Logic
# -------------------------
def main():
    torch.manual_seed(args.seed)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"--- Using device: {device.upper()} ---")

    df = pd.read_csv(args.csv)
    # Fill NaN values in text columns to prevent errors
    text_cols = ['turn_1_query', 'turn_1_answer', 'turn_2_query']
    for col in text_cols:
        if col in df.columns:
            df[col] = df[col].fillna('')

    unique_winners = sorted(df["winner"].unique().tolist())
    label2id = {label: i for i, label in enumerate(unique_winners)}
    id2label = {i: label for i, label in enumerate(unique_winners)}
    num_labels = len(unique_winners)
    df['label'] = df['winner'].map(label2id)

    # --- THIS IS THE FIX ---
    # Define the features of the dataset, marking 'label' as a ClassLabel
    # This is necessary for stratified splitting.
    features = Features({
        'turn': Value('int64'),
        'turn_1_query': Value('string'),
        'turn_1_answer': Value('string'),
        'turn_2_query': Value('string'),
        'winner': Value('string'),
        'label': ClassLabel(names=unique_winners)
    })
    # Remove any columns from the DataFrame that are not in the Features
    df_columns = [col for col in features.keys() if col in df.columns]
    df = df[df_columns]

    # Create the dataset using the defined features
    raw_all = Dataset.from_pandas(df, features=features)
    # --- END FIX ---

    # Data Splitting
    train_val_split = raw_all.train_test_split(test_size=args.test_size, seed=args.seed_data_split, stratify_by_column="label")
    test_ds = train_val_split["test"]
    train_val_ds = train_val_split["train"]
    train_split = train_val_ds.train_test_split(test_size=args.validation_size, seed=args.seed_data_split, stratify_by_column="label")
    train_ds = train_split["train"]
    val_ds = train_split["test"]

    tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_fast=True)

    # Calculate Class Weights
    class_weights = compute_class_weight(
        class_weight='balanced',
        classes=np.unique(train_ds['label']),
        y=np.array(train_ds['label'])
    )
    class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)
    print(f"Calculated class weights for weighted loss: {class_weights}")

    def model_init(trial):
        return AutoModelForSequenceClassification.from_pretrained(
            args.model_name, num_labels=num_labels, id2label=id2label, label2id=label2id
        )

    def optuna_hp_space(trial: optuna.Trial) -> Dict[str, Any]:
        return {
            "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
            "num_train_epochs": trial.suggest_int("num_train_epochs", 2, 5),
            "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [8, 16]),
            "lora_r": trial.suggest_categorical("lora_r", [8, 16, 32]),
            "lora_alpha": trial.suggest_categorical("lora_alpha", [16, 32, 64]),
        }

    tokenized_train = train_ds.map(lambda ex: preprocess_function(ex, tokenizer, args), batched=True)
    tokenized_val = val_ds.map(lambda ex: preprocess_function(ex, tokenizer, args), batched=True)
    tokenized_test = test_ds.map(lambda ex: preprocess_function(ex, tokenizer, args), batched=True)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    training_args = TrainingArguments(
        output_dir=os.path.join(args.output_dir, "base_run"),
        eval_strategy="steps",
        per_device_eval_batch_size=args.batch_size,
        save_total_limit=args.save_total_limit,
        fp16=torch.cuda.is_available(),
        logging_steps=args.logging_steps,
        eval_steps=args.eval_steps,
        save_strategy="steps",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        seed=args.seed,
    )

    trainer = WeightedLossTrainer(
        model=None, args=training_args,
        train_dataset=tokenized_train, eval_dataset=tokenized_val,
        tokenizer=tokenizer, data_collator=data_collator,
        compute_metrics=compute_metrics, model_init=model_init,
        class_weights=class_weights_tensor,
    )

    print("--- Starting Hyperparameter Search ---")
    best_run = trainer.hyperparameter_search(
        direction="maximize", backend="optuna",
        hp_space=optuna_hp_space, n_trials=args.n_hpo_trials,
    )
    print("--- Hyperparameter Search Finished ---")
    print("Best Run Found:", best_run)

    print("\n--- Training Final Model with Best Hyperparameters ---")
    best_params = best_run.hyperparameters

    final_model = model_init(None)
    peft_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=best_params.get("lora_r", args.lora_r),
        lora_alpha=best_params.get("lora_alpha", args.lora_alpha),
        lora_dropout=args.lora_dropout,
        target_modules=["query", "value", "key", "dense"],
    )
    final_model = get_peft_model(final_model, peft_config)

    final_training_args = TrainingArguments(
        output_dir=os.path.join(args.output_dir, "final_model"),
        learning_rate=best_params.get("learning_rate"),
        num_train_epochs=best_params.get("num_train_epochs"),
        per_device_train_batch_size=best_params.get("per_device_train_batch_size"),
        eval_strategy="epoch", save_strategy="epoch",
        per_device_eval_batch_size=args.batch_size,
        save_total_limit=args.save_total_limit,
        fp16=torch.cuda.is_available(),
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        seed=args.seed,
    )

    final_trainer = WeightedLossTrainer(
        model=final_model, args=final_training_args,
        train_dataset=tokenized_train, eval_dataset=tokenized_val,
        tokenizer=tokenizer, data_collator=data_collator,
        compute_metrics=compute_metrics, class_weights=class_weights_tensor,
    )

    final_trainer.train()
    print("--- Final Training Finished ---")

    print("\n--- Evaluating Final Model on Test Set ---")
    test_results = final_trainer.evaluate(eval_dataset=tokenized_test)
    print("Final Test Set Metrics:", test_results)

    final_model_dir = os.path.join(args.output_dir, "final_model_adapter")
    final_trainer.save_model(final_model_dir)
    tokenizer.save_pretrained(final_model_dir)
    with open(os.path.join(final_model_dir, "label_mappings.json"), "w") as f:
        json.dump({"id2label": {str(k): v for k, v in id2label.items()}, "label2id": label2id}, f)
    print(f"Final optimized model and artifacts saved to {final_model_dir}")

if __name__ == "__main__":
    main()

--- Using device: CUDA ---
Calculated class weights for weighted loss: [1.69604863 0.9        0.59298618 0.79714286 2.66985646 1.01639344]


Map:   0%|          | 0/3348 [00:00<?, ? examples/s]

Map:   0%|          | 0/373 [00:00<?, ? examples/s]

Map:   0%|          | 0/414 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  super().__init__(*args, **kwargs)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-09-16 10:13:30,701] A new study created in memory with name: no-name-7b55618a-06c5-4dee-a2c8-7bec23cb5443
Trying to set lora_r in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.


--- Starting Hyperparameter Search ---


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
50,1.8406,1.800538,0.176944
100,1.8015,1.781737,0.168901
150,1.7815,1.785635,0.211796
200,1.7937,1.762242,0.211796
250,1.7555,1.746212,0.219839
300,1.7285,1.727181,0.243968
350,1.7235,1.715261,0.238606
400,1.73,1.709815,0.268097


[I 2025-09-16 10:16:42,020] Trial 0 finished with value: 0.2680965147453083 and parameters: {'learning_rate': 7.066548209121037e-06, 'num_train_epochs': 2, 'per_device_train_batch_size': 16, 'lora_r': 8, 'lora_alpha': 16}. Best is trial 0 with value: 0.2680965147453083.
Trying to set lora_r in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
50,1.8885,1.822236,0.155496
100,1.831,1.831628,0.179625
150,1.8202,1.821807,0.19571
200,1.7919,1.813845,0.163539
250,1.8025,1.79977,0.201072
300,1.804,1.796727,0.227882
350,1.7936,1.791545,0.227882
400,1.7937,1.78581,0.209115
450,1.7805,1.786169,0.203753
500,1.7549,1.786156,0.217158


[I 2025-09-16 10:25:53,824] Trial 1 finished with value: 0.28418230563002683 and parameters: {'learning_rate': 2.389824895660591e-06, 'num_train_epochs': 4, 'per_device_train_batch_size': 8, 'lora_r': 16, 'lora_alpha': 16}. Best is trial 1 with value: 0.28418230563002683.
Trying to set lora_r in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
50,1.8362,1.809866,0.190349
100,1.7956,1.787645,0.182306
150,1.777,1.777447,0.201072
200,1.7771,1.736279,0.203753
250,1.7241,1.709871,0.246649
300,1.684,1.676807,0.284182
350,1.6663,1.629059,0.319035
400,1.6618,1.629358,0.297587
450,1.6221,1.614842,0.308311
500,1.5831,1.595239,0.324397


[I 2025-09-16 10:33:51,152] Trial 2 finished with value: 0.41823056300268097 and parameters: {'learning_rate': 1.0443953667030775e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 16, 'lora_r': 32, 'lora_alpha': 32}. Best is trial 2 with value: 0.41823056300268097.
Trying to set lora_r in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
50,1.8203,1.796969,0.241287
100,1.8026,1.781462,0.163539
150,1.773,1.76966,0.203753
200,1.7649,1.718655,0.19571
250,1.7029,1.684192,0.217158
300,1.6699,1.651633,0.302949
350,1.6498,1.626947,0.313673
400,1.6487,1.615313,0.319035


[I 2025-09-16 10:37:09,132] Trial 3 finished with value: 0.3190348525469169 and parameters: {'learning_rate': 1.555727499461943e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 16, 'lora_r': 32, 'lora_alpha': 64}. Best is trial 2 with value: 0.41823056300268097.
Trying to set lora_r in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
50,1.8194,1.795597,0.19571
100,1.8003,1.779178,0.182306
150,1.7698,1.758648,0.230563
200,1.7505,1.70878,0.214477
250,1.6875,1.685668,0.227882
300,1.6665,1.651199,0.284182
350,1.6403,1.629416,0.33244
400,1.63,1.622655,0.316354


[I 2025-09-16 10:40:14,258] Trial 4 finished with value: 0.3163538873994638 and parameters: {'learning_rate': 1.586123552717211e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 16, 'lora_r': 8, 'lora_alpha': 16}. Best is trial 2 with value: 0.41823056300268097.
Trying to set lora_r in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
50,1.8295,1.812467,0.107239


[I 2025-09-16 10:40:26,843] Trial 5 pruned. 
Trying to set lora_r in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
50,1.8372,1.835231,0.203753
100,1.8059,1.789819,0.190349
150,1.7863,1.771676,0.273458
200,1.7845,1.764023,0.203753
250,1.7494,1.748645,0.235925
300,1.7204,1.734098,0.257373


[I 2025-09-16 10:42:33,328] Trial 6 pruned. 
Trying to set lora_r in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
50,1.8558,1.813046,0.136729


[I 2025-09-16 10:42:55,255] Trial 7 pruned. 
Trying to set lora_r in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
50,1.8498,1.810873,0.142091


[I 2025-09-16 10:43:17,088] Trial 8 pruned. 
Trying to set lora_r in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
50,1.8893,1.832169,0.158177


[I 2025-09-16 10:43:29,947] Trial 9 pruned. 
Trying to set lora_r in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
50,1.9162,1.849798,0.16622


[I 2025-09-16 10:43:42,774] Trial 10 pruned. 
Trying to set lora_r in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
50,1.809,1.785417,0.203753
100,1.7878,1.739632,0.201072
150,1.7578,1.742606,0.19571
200,1.7115,1.671697,0.230563
250,1.651,1.659904,0.281501
300,1.6301,1.618452,0.33244
350,1.5451,1.553498,0.380697
400,1.5173,1.498687,0.369973
450,1.4448,1.490046,0.378016
500,1.4126,1.441501,0.453083


[I 2025-09-16 10:50:28,827] Trial 11 finished with value: 0.4772117962466488 and parameters: {'learning_rate': 2.3969166892094607e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 16, 'lora_r': 32, 'lora_alpha': 64}. Best is trial 11 with value: 0.4772117962466488.
Trying to set lora_r in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
50,1.8033,1.790237,0.184987


[I 2025-09-16 10:50:49,670] Trial 12 pruned. 
Trying to set lora_r in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
50,1.8008,1.792889,0.187668


[I 2025-09-16 10:51:10,805] Trial 13 pruned. 
Trying to set lora_r in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
50,1.815,1.793939,0.262735
100,1.7953,1.78021,0.219839
150,1.7617,1.737085,0.203753
200,1.7077,1.672217,0.252011
250,1.674,1.656232,0.241287
300,1.6304,1.573056,0.351206
350,1.5683,1.521008,0.394102
400,1.5069,1.471907,0.404826
450,1.4595,1.468948,0.380697
500,1.4169,1.417042,0.447721


[I 2025-09-16 10:57:18,879] Trial 14 finished with value: 0.46112600536193027 and parameters: {'learning_rate': 2.0005690957659685e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 16, 'lora_r': 32, 'lora_alpha': 64}. Best is trial 11 with value: 0.4772117962466488.
Trying to set lora_r in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
50,1.8168,1.793599,0.16622


[I 2025-09-16 10:57:40,278] Trial 15 pruned. 
Trying to set lora_r in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
50,1.8057,1.770373,0.206434
100,1.808,1.741559,0.235925
150,1.7945,1.79386,0.27882
200,1.7859,1.755303,0.147453
250,1.8243,1.799083,0.099196
300,1.8298,1.793901,0.281501


[I 2025-09-16 10:59:43,668] Trial 16 pruned. 
Trying to set lora_r in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
50,1.8081,1.789146,0.265416
100,1.7855,1.750445,0.217158
150,1.7453,1.726036,0.241287
200,1.7014,1.71275,0.19571
250,1.6462,1.662694,0.289544
300,1.6414,1.63133,0.327078
350,1.5853,1.581586,0.364611
400,1.5606,1.548905,0.335121
450,1.4977,1.529877,0.364611
500,1.4714,1.483822,0.431635


[I 2025-09-16 11:04:45,573] Trial 17 finished with value: 0.4155495978552279 and parameters: {'learning_rate': 2.155119350237964e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 16, 'lora_r': 32, 'lora_alpha': 64}. Best is trial 11 with value: 0.4772117962466488.
Trying to set lora_r in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
50,1.8272,1.807511,0.217158
100,1.7906,1.798767,0.241287
150,1.8063,1.827629,0.209115
200,1.7929,1.77638,0.187668
250,1.7834,1.77503,0.211796
300,1.7771,1.757058,0.24933


[I 2025-09-16 11:05:57,428] Trial 18 pruned. 
Trying to set lora_r in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Trying to set lora_alpha in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
50,1.8174,1.788532,0.246649
100,1.7945,1.768523,0.187668
150,1.7662,1.764292,0.219839
200,1.7477,1.720557,0.187668
250,1.7014,1.711909,0.176944
300,1.6772,1.683756,0.27882


[I 2025-09-16 11:08:01,643] Trial 19 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


--- Hyperparameter Search Finished ---
Best Run Found: BestRun(run_id='11', objective=0.4772117962466488, hyperparameters={'learning_rate': 2.3969166892094607e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 16, 'lora_r': 32, 'lora_alpha': 64}, run_summary=None)

--- Training Final Model with Best Hyperparameters ---


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.776529,0.179625
2,No log,1.740171,0.184987
3,1.770600,1.707468,0.190349
4,1.770600,1.696927,0.198391


--- Final Training Finished ---

--- Evaluating Final Model on Test Set ---


Final Test Set Metrics: {'eval_loss': 1.7319389581680298, 'eval_accuracy': 0.21497584541062803, 'eval_runtime': 4.4217, 'eval_samples_per_second': 93.63, 'eval_steps_per_second': 5.88, 'epoch': 4.0}
Final optimized model and artifacts saved to ./hpo_results/final_model_adapter


#### Third attempt


In [5]:
import os
import json
from typing import Dict, Any, List

import torch
import torch.nn as nn
from datasets import Dataset, Features, Value, ClassLabel
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
from peft import LoraConfig, get_peft_model, TaskType
import evaluate
import numpy as np
import pandas as pd
import optuna
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import WeightedRandomSampler

# -------------------------
# Config
# -------------------------
class SimpleArgs:
    csv = "/content/drive/MyDrive/ANLP Assignment 1/mt_bench_training.csv"
    output_dir = "./hpo_results"
    model_name = "bert-base-uncased"
    batch_size = 16
    epochs = 10
    lr = 3e-5
    seed = 42
    max_input_length = 256
    lora_r = 32
    lora_alpha = 64
    lora_dropout = 0.2
    save_total_limit = 2
    eval_steps = 100
    logging_steps = 50
    seed_data_split = 42
    test_size = 0.1
    validation_size = 0.1
    n_hpo_trials = 15

args = SimpleArgs()

# -------------------------
# Trainer with weighted loss
# -------------------------
class WeightedLossTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = nn.CrossEntropyLoss(weight=self.class_weights.to(model.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# -------------------------
# Preprocessing
# -------------------------
def build_input_texts_from_columns(examples: Dict[str, List], tokenizer) -> List[str]:
    text_inputs = []
    sep_token = tokenizer.sep_token if tokenizer.sep_token is not None else " "
    for i in range(len(examples["turn"])):
        turn, q1 = int(examples["turn"][i]), str(examples.get("turn_1_query", [""])[i]).strip()
        if turn == 2:
            ans, q2 = str(examples.get("turn_1_answer", [""])[i]).strip(), str(examples.get("turn_2_query", [""])[i]).strip()
            text = f"[Q1] {q1} {sep_token} [A1] {ans} {sep_token} [Q2] {q2}"
        else:
            text = f"[Q1] {q1}"
        text_inputs.append(text)
    return text_inputs

def preprocess_function(examples, tokenizer, args):
    text_inputs = build_input_texts_from_columns(examples, tokenizer)
    model_inputs = tokenizer(text_inputs, max_length=args.max_input_length, truncation=True, padding=False)
    model_inputs["labels"] = examples["label"]
    return model_inputs

# -------------------------
# Metrics
# -------------------------
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    logits = preds[0] if isinstance(preds, tuple) else preds
    pred_ids = np.argmax(logits, axis=1)
    acc = accuracy_metric.compute(predictions=pred_ids, references=labels)
    f1 = f1_metric.compute(predictions=pred_ids, references=labels, average="macro")
    return {"accuracy": acc["accuracy"], "macro_f1": f1["f1"]}

# -------------------------
# Main
# -------------------------
def main():
    torch.manual_seed(args.seed)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"--- Using device: {device.upper()} ---")

    df = pd.read_csv(args.csv)
    df.fillna("", inplace=True)

    unique_winners = sorted(df["winner"].unique().tolist())
    label2id = {label: i for i, label in enumerate(unique_winners)}
    id2label = {i: label for i, label in enumerate(unique_winners)}
    num_labels = len(unique_winners)
    df['label'] = df['winner'].map(label2id)

    features = Features({
        'turn': Value('int64'),
        'turn_1_query': Value('string'),
        'turn_1_answer': Value('string'),
        'turn_2_query': Value('string'),
        'winner': Value('string'),
        'label': ClassLabel(names=unique_winners)
    })
    df = df[[col for col in features.keys() if col in df.columns]]
    raw_all = Dataset.from_pandas(df, features=features)

    # Splitting
    train_val_split = raw_all.train_test_split(test_size=args.test_size, seed=args.seed_data_split, stratify_by_column="label")
    test_ds = train_val_split["test"]
    train_val_ds = train_val_split["train"]
    train_split = train_val_ds.train_test_split(test_size=args.validation_size, seed=args.seed_data_split, stratify_by_column="label")
    train_ds, val_ds = train_split["train"], train_split["test"]

    tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_fast=True)

    # Class weights
    class_weights = compute_class_weight('balanced', classes=np.unique(train_ds['label']), y=np.array(train_ds['label']))
    class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)

    # Tokenization
    tokenized_train = train_ds.map(lambda ex: preprocess_function(ex, tokenizer, args), batched=True)
    tokenized_val = val_ds.map(lambda ex: preprocess_function(ex, tokenizer, args), batched=True)
    tokenized_test = test_ds.map(lambda ex: preprocess_function(ex, tokenizer, args), batched=True)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Weighted sampler (helps with imbalance!)
    class_sample_counts = np.bincount(np.array(train_ds["label"]))
    weights = 1.0 / class_sample_counts
    sample_weights = [weights[label] for label in train_ds["label"]]
    sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(sample_weights), replacement=True)

    # Model
    base_model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name, num_labels=num_labels, id2label=id2label, label2id=label2id
    )
    peft_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=args.lora_r,
        lora_alpha=args.lora_alpha,
        lora_dropout=args.lora_dropout,
        target_modules=["query", "key", "value", "dense", "output.dense", "intermediate.dense"]
    )
    model = get_peft_model(base_model, peft_config)

    training_args = TrainingArguments(
        output_dir=os.path.join(args.output_dir, "final_model"),
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=args.lr,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        num_train_epochs=args.epochs,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=args.logging_steps,
        save_total_limit=args.save_total_limit,
        load_best_model_at_end=True,
        metric_for_best_model="macro_f1",
        greater_is_better=True,
        fp16=torch.cuda.is_available(),
        seed=args.seed,
    )

    trainer = WeightedLossTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        class_weights=class_weights_tensor,
    )

    trainer.train()
    print("--- Training Finished ---")

    print("\n--- Evaluating on Test Set ---")
    test_results = trainer.evaluate(eval_dataset=tokenized_test)
    print("Final Test Set Metrics:", test_results)

    trainer.save_model(os.path.join(args.output_dir, "final_model"))
    tokenizer.save_pretrained(os.path.join(args.output_dir, "final_model"))
    with open(os.path.join(args.output_dir, "label_mappings.json"), "w") as f:
        json.dump({"id2label": {str(k): v for k, v in id2label.items()}, "label2id": label2id}, f)

if __name__ == "__main__":
    main()


Downloading builder script: 0.00B [00:00, ?B/s]

--- Using device: CUDA ---


Map:   0%|          | 0/3348 [00:00<?, ? examples/s]

Map:   0%|          | 0/373 [00:00<?, ? examples/s]

Map:   0%|          | 0/414 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,1.8005,1.769526,0.190349,0.157152
2,1.7624,1.730669,0.238606,0.218082
3,1.7171,1.698591,0.238606,0.230447
4,1.6864,1.665323,0.262735,0.214197
5,1.6659,1.633288,0.252011,0.245033
6,1.6567,1.605794,0.284182,0.275553
7,1.5881,1.61,0.302949,0.292661
8,1.5978,1.587612,0.292225,0.273432
9,1.6002,1.585137,0.265416,0.25961
10,1.5487,1.586426,0.300268,0.291104


--- Training Finished ---

--- Evaluating on Test Set ---


Final Test Set Metrics: {'eval_loss': 1.6440895795822144, 'eval_accuracy': 0.26570048309178745, 'eval_macro_f1': 0.25283914100351107, 'eval_runtime': 2.4529, 'eval_samples_per_second': 168.779, 'eval_steps_per_second': 10.6, 'epoch': 10.0}


#### Model Saving to google drive functionality

In [6]:


# Define the path in Google Drive where you want to save the model
# You can change 'my_finetuned_model' to a different folder name if you prefer
GOOGLE_DRIVE_SAVE_PATH = '/content/drive/MyDrive/ANLP Assignment 1/Model File'

# Create the directory in Google Drive if it doesn't exist
os.makedirs(GOOGLE_DRIVE_SAVE_PATH, exist_ok=True)

print(f"Google Drive mounted at /content/drive")
print(f"Model will be saved to {GOOGLE_DRIVE_SAVE_PATH}")

Google Drive mounted at /content/drive
Model will be saved to /content/drive/MyDrive/ANLP Assignment 1/Model File


In [8]:
# Define the directory where the model adapter is saved locally after training
LOCAL_MODEL_DIR = "./hpo_results/final_model"

# Define the destination path in Google Drive
GOOGLE_DRIVE_DEST_DIR = GOOGLE_DRIVE_SAVE_PATH

# Copy the entire model adapter directory to Google Drive
if os.path.exists(LOCAL_MODEL_DIR):
    # Remove the destination directory in Google Drive if it already exists to avoid errors during copy
    if os.path.exists(GOOGLE_DRIVE_DEST_DIR):
        print(f"Removing existing directory in Google Drive: {GOOGLE_DRIVE_DEST_DIR}")
        shutil.rmtree(GOOGLE_DRIVE_DEST_DIR)

    print(f"Copying model adapter from {LOCAL_MODEL_DIR} to {GOOGLE_DRIVE_DEST_DIR}")
    shutil.copytree(LOCAL_MODEL_DIR, GOOGLE_DRIVE_DEST_DIR)
    print("Model adapter successfully saved to Google Drive.")
else:
    print(f"Local model directory not found: {LOCAL_MODEL_DIR}. Please run the training code first.")

Removing existing directory in Google Drive: /content/drive/MyDrive/ANLP Assignment 1/Model File
Copying model adapter from ./hpo_results/final_model to /content/drive/MyDrive/ANLP Assignment 1/Model File
Model adapter successfully saved to Google Drive.


### Model Inference (loads the fine-tuned model previously saved in google drive)

In [11]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel, PeftConfig
import json
import os
from google.colab import drive

# Mount Google Drive if it's not already mounted
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

# Define the path in Google Drive where the model is saved
GOOGLE_DRIVE_SAVED_MODEL_PATH = '/content/drive/MyDrive/ANLP Assignment 1/Model File' # Make sure this matches the save path

def predict(query: str, model_dir: str):
    """
    Loads a PEFT model and tokenizer from a directory and performs inference.

    Args:
        query (str): The input text (user query) to classify.
        model_dir (str): The directory containing the PEFT adapter and artifacts (in Google Drive).

    Returns:
        str: The predicted class label (the best model name).
    """
    # --- 1. Load All Artifacts from the Directory ---

    # Load the PEFT config to get the base model name
    config = PeftConfig.from_pretrained(model_dir)
    base_model_name = config.base_model_name_or_path

    # Load the label mappings
    mappings_path = os.path.join('/content/drive/MyDrive/ANLP Assignment 1/', "label_mappings.json")
    with open(mappings_path, "r") as f:
        label_mappings = json.load(f)
        # The keys in the JSON file are strings, convert them back to integers
        id2label = {int(k): v for k, v in label_mappings["id2label"].items()}
        label2id = label_mappings["label2id"]

    num_labels = len(id2label)

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_dir)

    # --- 2. Build the Model ---

    # Load the base model with the correct classification head
    base_model = AutoModelForSequenceClassification.from_pretrained(
        base_model_name,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
    )

    # Apply the LoRA adapter
    model = PeftModel.from_pretrained(base_model, model_dir)
    model.eval()

    # --- 3. Perform Inference ---
    inputs = tokenizer(query, return_tensors="pt", truncation=True, padding=True)

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    predicted_id = torch.argmax(logits, dim=-1).item()
    predicted_label = model.config.id2label[predicted_id]

    return predicted_label




In [25]:
if __name__ == '__main__':
    # Use the Google Drive path for inference
    ADAPTER_DIRECTORY = GOOGLE_DRIVE_SAVED_MODEL_PATH

    # Example query for inference
    test_query = "Imagine you are writing a blog post comparing two popular smartphone models. Develop an outline for the blog post, including key points and subheadings to effectively compare and contrast the features, performance, and user experience of the two models. Please answer in fewer than 200 words."

    # Get the prediction
    best_model = predict(test_query, ADAPTER_DIRECTORY)

    print(f"Input Query:\n'{test_query}'")
    print("---")
    print(f"Predicted Best Model: {best_model}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Input Query:
'Imagine you are writing a blog post comparing two popular smartphone models. Develop an outline for the blog post, including key points and subheadings to effectively compare and contrast the features, performance, and user experience of the two models. Please answer in fewer than 200 words.'
---
Predicted Best Model: claude-v1
