# Notebook cell In [1]:
#!/usr/bin/env python3
"""
fine_tune_bert_lora_classification.py

Fine-tune a BERT model (e.g., bert-base-uncased) for sequence classification
using LoRA (PEFT). The script uses a three-way split: train, validation, and test.

This script is adapted for an encoder-only architecture. The task is framed as a
multi-class classification problem where the model predicts the ID of the best
model for a given query.

The input CSV contains conversational turns, which are formatted into a single
input sequence for BERT.
"""

import os
import argparse
from typing import Dict, Any

import torch
from datasets import load_dataset, Dataset, ClassLabel
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
from peft import LoraConfig, get_peft_model, TaskType
import evaluate
import numpy as np
import pandas as pd

# -------------------------
# Argument parsing
# -------------------------
def parse_args():
    """
    Parse the command-line options that configure the fine-tuning run.

    Options are registered from a single table of (flag, type, default, help)
    entries, in the order they should appear in ``--help``.

    Returns:
        argparse.Namespace with one attribute per flag (leading dashes removed).
    """
    parser = argparse.ArgumentParser(
        description="Fine-tune a BERT model with LoRA for sequence classification."
    )

    # (flag, type, default, help) -- help is None where no help text is provided.
    option_table = [
        ("--csv", str, "mt_bench_training.csv", "Input CSV file path"),
        ("--output_dir", str, "./bert_peft_adapter", "Where to save the LoRA adapter"),
        ("--model_name", str, "bert-base-uncased", "Base encoder-only model"),
        ("--batch_size", int, 8, None),
        ("--epochs", int, 3, None),
        ("--lr", float, 5e-5, None),
        ("--seed", int, 42, None),
        ("--max_input_length", int, 512, None),  # BERT's max length
        ("--lora_r", int, 8, None),
        ("--lora_alpha", int, 16, None),
        ("--lora_dropout", float, 0.1, None),
        ("--save_total_limit", int, 2, None),
        ("--eval_steps", int, 200, None),
        ("--logging_steps", int, 50, None),
        ("--seed_data_split", int, 42, None),
        ("--test_size", float, 0.1, "Fraction for the final test set."),
        ("--validation_size", float, 0.1, "Fraction of non-test data for validation."),
    ]
    for flag, value_type, default_value, help_text in option_table:
        parser.add_argument(flag, type=value_type, default=default_value, help=help_text)

    return parser.parse_args()

# -------------------------
# Utilities
# -------------------------
def _clean_field(value: Any) -> str:
    """Return *value* as a stripped string, mapping missing values to ""."""
    if value is None:
        return ""
    # NaN is the only float that is not equal to itself; pandas uses it for
    # empty CSV cells, and str(nan) would otherwise leak "nan" into the text.
    if isinstance(value, float) and value != value:
        return ""
    return str(value).strip()


def build_input_text_from_row(row: Dict[str, Any]) -> str:
    """
    Build the single input text string fed to the encoder for one CSV row.

    Args:
        row: Mapping with a required "turn" key (anything ``int()`` accepts)
            and optional "turn_1_query", "turn_1_answer" and "turn_2_query"
            keys. Missing keys, ``None`` and NaN values are treated as empty
            strings rather than being rendered as "None" / "nan".

    Returns:
        For turn 2, the first query, its answer and the follow-up query joined
        with literal "[SEP]" markers. For any other turn value, only the first
        query.

    Raises:
        KeyError: if "turn" is absent.
        ValueError / TypeError: if "turn" cannot be converted with ``int()``.
    """
    turn = int(row["turn"])
    first_query = _clean_field(row.get("turn_1_query", ""))

    # Turn 1 and every unrecognised turn value share the same single-query form.
    if turn != 2:
        return f"Query: {first_query}"

    # Turn 2: concatenate the conversation history.
    answer = _clean_field(row.get("turn_1_answer", ""))
    follow_up = _clean_field(row.get("turn_2_query", ""))
    return f"Query: {first_query} [SEP] Answer: {answer} [SEP] Follow-up Query: {follow_up}"

def preprocess_dataset(records: list, tokenizer, args):
    """
    Turn row dicts into a tokenized Dataset (labels are not handled here).

    Args:
        records: List of dicts, one per example, in the shape expected by
            build_input_text_from_row.
        tokenizer: Callable tokenizer applied to the list of built texts.
        args: Object exposing ``max_input_length``, used as the truncation limit.

    Returns:
        A Dataset built from the tokenizer output via ``Dataset.from_dict``.
    """
    # One encoder input string per record.
    texts = list(map(build_input_text_from_row, records))

    # Truncate but do not pad here; padding is left to the data collator.
    encoded = tokenizer(
        texts,
        truncation=True,
        padding=False,
        max_length=args.max_input_length,
    )
    return Dataset.from_dict(encoded)


# -------------------------
# Compute Metrics
# -------------------------
# Loaded once at import time; compute_metrics reuses this object on every call.
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_preds):
    """
    Score a batch of evaluation predictions with the accuracy metric.

    Args:
        eval_preds: A pair that unpacks into (logits, true labels).

    Returns:
        Dict with a single "accuracy" entry.
    """
    logits, references = eval_preds

    # Highest-scoring column along axis 1 is the predicted class ID.
    predicted_ids = np.argmax(logits, axis=1)

    result = accuracy_metric.compute(predictions=predicted_ids, references=references)
    return {"accuracy": result["accuracy"]}

# -------------------------
# Main
# -------------------------
def main():
    """
    Run the full pipeline: load the CSV, split it, fine-tune with LoRA,
    evaluate on the held-out test split, and save the adapter and tokenizer.

    Steps:
        1. Parse CLI arguments and seed torch.
        2. Read the CSV, derive an integer ``label`` column from ``winner``.
        3. Make stratified train / validation / test splits.
        4. Wrap the sequence-classification model with a LoRA adapter.
        5. Tokenize each split and attach a ``labels`` column.
        6. Train with periodic evaluation, then evaluate on the test split.
        7. Save the adapter and tokenizer to ``args.output_dir``.

    Raises:
        FileNotFoundError: if ``args.csv`` does not exist.
    """
    args = parse_args()
    torch.manual_seed(args.seed)

    # Informational only; the device value is not passed to the Trainer.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"--- Using device: {device.upper()} ---")

    # Load CSV as a pandas DataFrame to easily create the label mapping
    if not os.path.exists(args.csv):
        raise FileNotFoundError(f"CSV file not found: {args.csv}")

    df = pd.read_csv(args.csv)
    # NOTE: Assuming 'winner' column has no ties, as per the user request.

    # --- Create Label Mappings ---
    unique_winners = sorted(df["winner"].unique().tolist())  # Sort for consistency
    label2id = {label: i for i, label in enumerate(unique_winners)}
    id2label = {i: label for i, label in enumerate(unique_winners)}
    num_labels = len(unique_winners)
    print(f"Found {num_labels} unique labels: {unique_winners}")

    # Add the integer 'label' column to the DataFrame
    df['label'] = df['winner'].map(label2id)

    # Load the DataFrame as a Hugging Face Dataset
    raw_all = Dataset.from_pandas(df)

    # Stratified splitting in `datasets` is only supported for ClassLabel
    # columns; a plain integer column makes train_test_split raise. The class
    # names follow the same order as label2id, so integer IDs are unchanged.
    raw_all = raw_all.cast_column(
        "label", ClassLabel(names=[str(winner) for winner in unique_winners])
    )

    # --- Create Train, Validation, and Test Splits ---
    train_val_split = raw_all.train_test_split(
        test_size=args.test_size,
        seed=args.seed_data_split,
        stratify_by_column="label",
    )
    train_val_ds = train_val_split["train"]
    test_ds = train_val_split["test"]

    # validation_size is a fraction of what remains after removing the test set.
    train_split = train_val_ds.train_test_split(
        test_size=args.validation_size,
        seed=args.seed_data_split,
        stratify_by_column="label",
    )
    train_ds = train_split["train"]
    val_ds = train_split["test"]

    print(f"Dataset splits created: train={len(train_ds)}, validation={len(val_ds)}, test={len(test_ds)}")

    # Load tokenizer and model for Sequence Classification
    tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
    )

    # LoRA (PEFT) configuration for Sequence Classification
    peft_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=args.lora_r,
        lora_alpha=args.lora_alpha,
        lora_dropout=args.lora_dropout,
        target_modules=["query", "value"],
    )

    model = get_peft_model(model, peft_config)
    print("Wrapped model with LoRA for Sequence Classification. Trainable parameters:")
    model.print_trainable_parameters()

    def tokenize_split(split):
        """Tokenize one split and attach its integer labels as 'labels'."""
        # Dataset.to_list() yields one dict per row, which is what
        # preprocess_dataset iterates over. Dataset.to_dict() would instead
        # return a column-name -> values mapping (the 'records' orientation
        # is a pandas feature, not a datasets one).
        tokenized = preprocess_dataset(split.to_list(), tokenizer, args)
        return tokenized.add_column("labels", list(split["label"]))

    # Tokenize datasets and add the labels back
    tokenized_train = tokenize_split(train_ds)
    tokenized_val = tokenize_split(val_ds)
    tokenized_test = tokenize_split(test_ds)

    # Data collator for padding
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Standard TrainingArguments (not Seq2Seq)
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` -- confirm against the installed version.
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        evaluation_strategy="steps",
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        num_train_epochs=args.epochs,
        learning_rate=args.lr,
        save_total_limit=args.save_total_limit,
        fp16=torch.cuda.is_available(),  # Mixed-precision training, only when a GPU is present
        logging_steps=args.logging_steps,
        eval_steps=args.eval_steps,
        save_strategy="steps",
        save_steps=args.eval_steps,  # Checkpoints are written on the same step cadence as evaluation
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        seed=args.seed,
    )

    # Standard Trainer (not Seq2SeqTrainer)
    # NOTE(review): newer transformers releases prefer `processing_class=`
    # over `tokenizer=` -- confirm against the installed version.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Train the model
    print("--- Starting Training ---")
    trainer.train()
    print("--- Training Finished ---")

    # Final Evaluation on the held-out Test Set
    print("\n--- Evaluating on the held-out Test Set ---")
    test_results = trainer.evaluate(eval_dataset=tokenized_test)
    print("Test Set Metrics:")
    print(test_results)

    # Save the final LoRA adapter and tokenizer
    print("\nSaving final PEFT adapter to:", args.output_dir)
    model.save_pretrained(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)
    print("Done.")

# Run the pipeline only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

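# Notebook output: ModuleNotFoundError: No module named 'datasets'
# (the Hugging Face `datasets` package must be installed before running, e.g. `pip install datasets`)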