In [27]:
# Importing PyTorch, the foundational deep learning framework used for model training and inference.
import torch

# Importing components from Hugging Face's 'transformers' library:
from transformers import (
    # Tokenizer for the RoBERTa model — used to convert text into tokens for the model.
    RobertaTokenizer,
    # Pretrained RoBERTa model specifically configured for sequence classification tasks.
    RobertaForSequenceClassification,
    # Automatically pads tokenized inputs to the maximum sequence length in a batch.
    DataCollatorWithPadding,
    # Holds the training configuration settings like learning rate, batch size, number of epochs, etc.
    TrainingArguments,
    # High-level class to manage the training and evaluation loop.
    Trainer,
    # Allows early stopping of training if the model performance stops improving.
    EarlyStoppingCallback,
)

# Importing components from 'peft' — Parameter-Efficient Fine-Tuning library.
# This allows fine-tuning of large models efficiently by training fewer parameters.
from peft import get_peft_model, LoraConfig, TaskType

# Hugging Face's 'datasets' library:
# - load_dataset is used to load datasets from Hugging Face Hub or custom datasets.
# - Dataset class allows manual creation or manipulation of datasets.
from datasets import load_dataset, Dataset

# Accuracy metric from scikit-learn to evaluate classification performance.
from sklearn.metrics import accuracy_score

# NumPy is a fundamental package for numerical operations in Python.
import numpy as np


This block brings in all the essential libraries for the deep learning pipeline you're about to construct:

- **torch**: Core deep learning operations like tensor computations and backpropagation are handled here.

- **transformers**: This library gives you easy access to pretrained models like RoBERTa and handles all NLP-related tokenization, training utilities, and evaluation mechanisms.

- **peft (Parameter-Efficient Fine-Tuning)**: Enables a lightweight and efficient way to fine-tune large transformer models. This is crucial when computational resources are limited.

- **datasets**: A streamlined way to load and preprocess datasets.

- **sklearn.metrics**: Used here to compute accuracy, which is a common metric for classification tasks.

- **numpy**: Helps in mathematical operations, especially useful for metrics computation and data manipulation.


In [41]:
# Load tokenizer and model
# Define the pretrained model to use — "roberta-base" is a popular transformer model trained on a large corpus.
model_name = "roberta-base"
# Load the tokenizer associated with RoBERTa — responsible for converting raw text into token IDs.
tokenizer = RobertaTokenizer.from_pretrained(model_name)
# Load the RoBERTa model specifically configured for classification with 4 output labels (i.e., 4 classes).
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=4)


# Freeze base model parameters (we'll only train LoRA + classifier)
# Freezing all parameters of the base RoBERTa model to prevent them from being updated during training.
# This is done because we want to use LoRA for efficient fine-tuning.
for param in model.roberta.parameters():
    param.requires_grad = False

# Define which modules inside the transformer will be fine-tuned using LoRA — usually attention components.
target_modules = ["query", "key", "value"]

# Create a LoRA configuration:
lora_config=LoraConfig(
    task_type=TaskType.SEQ_CLS,  # Apply LoRA to a sequence classification task
    r=2,                         # Rank of the low‑rank decomposition (controls parameter count)
    lora_alpha=4,                # Scaling factor for the LoRA weight updates
    lora_dropout=0.1,            # Dropout applied within the LoRA layers for regularization
    bias="none",                 # Don’t fine‑tune any bias terms
    target_modules=target_modules  # List of module names (e.g., ["query","key","value"]) to inject LoRA into
)

# Inject LoRA into the model based on the configuration.
model = get_peft_model(model, lora_config)

# Optional: check trainable params
model.print_trainable_parameters()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 704,260 || all params: 125,352,968 || trainable%: 0.5618


### Model and Tokenizer Initialization

- Loads the `roberta-base` tokenizer and model for sequence classification with 4 labels (for the AG News dataset).

### Freezing Base Model

- Freezes all parameters of the base RoBERTa model to ensure only the LoRA layers and classifier are trained.

### LoRA Configuration

Configures LoRA (Low-Rank Adaptation) with the following settings:

- `task_type=TaskType.SEQ_CLS`: Specifies you're fine‑tuning for a sequence classification problem.
- `r=2`: Chooses a very small rank (2) for the additional LoRA matrices, ensuring minimal extra parameters.
- `lora_alpha=4`: Scales the LoRA updates by 4, which can help stabilize training and improve convergence.
- `lora_dropout=0.1`: Adds 10% dropout inside LoRA layers to prevent overfitting on small datasets.
- `bias="none"`: Keeps all original bias parameters frozen; only the LoRA weights (and model head) are trainable.
- `target_modules=target_modules`: You typically pass `["query", "key", "value"]` so LoRA only modifies the attention mechanism, preserving the rest of the pretrained model.

### Trainable Parameters

- Prints the number of trainable parameters (0.56% of total parameters), showing the efficiency of LoRA.

> Some weights of `RobertaForSequenceClassification` were not initialized...

- `trainable params: 704,260 || all params: 125,352,968 || trainable%: 0.5618`

A warning about newly initialized classifier layers is expected since the model is being fine-tuned.

Only 0.56% of the total parameters are trainable, demonstrating LoRA's parameter efficiency.


In [42]:
print(model)

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): Mod

In [43]:
# Tokenization function
def preprocess_function(example):
    return tokenizer(example["text"], truncation=True, padding=True, max_length=256)

# Load and preprocess dataset
raw_dataset = load_dataset("ag_news", split="train")
tokenized_dataset = raw_dataset.map(preprocess_function, batched=True, remove_columns=["text"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

# Split dataset into train/validation
splits = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = splits["train"]
eval_dataset = splits["test"]

# Data collator for dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

### Tokenization Function

- Tokenizes text using the tokenizer with:
  - Truncation
  - Padding to a maximum length of 256 tokens

### Dataset Loading

- Loads the **AG News** dataset using the `datasets` library.
- Applies the tokenization function to all examples.
- Renames the `label` column to `labels` for compatibility with Hugging Face's training APIs.

### Train-Validation Split

- Splits the dataset into:
  - 90% for training
  - 10% for validation

### Data Collator

- Uses a data collator that dynamically pads each batch to the longest sequence in that batch.
- Improves efficiency by avoiding padding all sequences to the max length globally.


In [49]:
# Define a custom metric function used during evaluation
def compute_metrics(eval_pred):
    # Extract logits (raw model outputs) and ground-truth labels
    logits, labels = eval_pred.predictions, eval_pred.label_ids
    # Convert logits to predicted class indices by selecting the one with the highest score
    preds = np.argmax(logits, axis=-1)
    # Compute and return accuracy between predictions and actual labels
    return {"accuracy": accuracy_score(labels, preds)}


# Training arguments
training_args = TrainingArguments(
    output_dir="./results_lora",  # Directory for model checkpoints and logs
    eval_strategy="epoch",  # Evaluate after each epoch
    save_strategy="epoch",  # Save model after each epoch
    learning_rate=4e-6,  # Low LR for fine-tuning
    per_device_train_batch_size=32,  # Training batch size
    per_device_eval_batch_size=64,  # Evaluation batch size
    num_train_epochs=3,  # Number of training epochs
    weight_decay=0.003,  # L2 regularization
    load_best_model_at_end=True,  # Load best model at end
    metric_for_best_model="eval_accuracy",  # Use accuracy for model selection
    greater_is_better=True,  # Higher accuracy is better
    logging_steps=100,  # Log every 100 steps
    optim="adamw_torch",  # Optimizer
    push_to_hub=False,  # Don't push to HF Hub
    fp16=True,  # Mixed precision training
    report_to=None  # Disable external logging
)

### Metrics Function

- Defines a function to compute **accuracy** during evaluation using `sklearn.metrics`.

### Training Arguments

Configures the training setup with the following:

- **Learning rate**: `4e-6`
- **Batch sizes**:
  - Training batch size: `32`
  - Evaluation batch size: `64`
- **Epochs**: `3`
- **Weight decay**: Applied for regularization to reduce overfitting
- **Model checkpointing**: Saves the **best model** based on validation accuracy
- **Mixed precision (fp16)**: Enabled for faster training on compatible GPUs


In [50]:
trainer = Trainer(
    model=model,  # The model to train/evaluate
    args=training_args,  # The training arguments (configured above)
    train_dataset=train_dataset,  # The training dataset
    eval_dataset=eval_dataset,  # The evaluation dataset
    data_collator=data_collator,  # Function to collate data batches
    compute_metrics=compute_metrics,  # Metrics function for evaluation
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # Stop training early if no improvement
)

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


### Trainer Setup

The `Trainer` is initialized with the following components:

- **model**: The LoRA-adapted model you've prepared for training.
- **args**: The training configuration defined using `TrainingArguments`.
- **train_dataset** & **eval_dataset**: Datasets for training and validation, respectively.
- **data_collator**: Handles dynamic padding within each batch, improving memory efficiency when dealing with sequences of varying lengths.
- **compute_metrics**: A custom function that computes **accuracy** during evaluation.
- **callbacks**: Includes `EarlyStoppingCallback` to stop training early if validation accuracy doesn't improve for 3 consecutive evaluations (`early_stopping_patience=3`).


In [51]:
# Train
print("Starting training...")
trainer.train()

Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3206,0.304649,0.896667
2,0.3303,0.29102,0.900417
3,0.3121,0.287946,0.901333


TrainOutput(global_step=10125, training_loss=0.3537936629307123, metrics={'train_runtime': 819.6001, 'train_samples_per_second': 395.315, 'train_steps_per_second': 12.354, 'total_flos': 4.2975241224192e+16, 'train_loss': 0.3537936629307123, 'epoch': 3.0})

In [52]:
# Evaluate
eval_results = trainer.evaluate()
print(f"Validation accuracy: {eval_results['eval_accuracy']:.4f}")

Validation accuracy: 0.9013


In [None]:
import pandas as pd
import os

# Load unlabeled test data
unlabelled_dataset = pd.read_pickle("test_unlabelled.pkl")
# If it's a pandas DataFrame, convert to Hugging Face Dataset for easy batch processing
if isinstance(unlabelled_dataset, pd.DataFrame):
    test_dataset = Dataset.from_pandas(unlabelled_dataset)
else:
    test_dataset = unlabelled_dataset  # already a Dataset

# Preprocess the test dataset (tokenize)
test_dataset = test_dataset.map(preprocess_function, batched=True, remove_columns=["text"])

# Run inference in batches
predictions = trainer.predict(test_dataset)
pred_labels = predictions.predictions.argmax(axis=-1)

# Prepare submission dataframe
submission = pd.DataFrame({
    "ID": range(len(pred_labels)),
    "Label": pred_labels
})
# Save to CSV (no index, just two columns)
submission.to_csv("Fsub_proj2_12.csv", index=False)
print(f"Predictions saved to local system")
