# Starter Notebook

Install and import required libraries

In [1]:
import os
import pandas as pd
import torch
from transformers import RobertaModel, RobertaTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, RobertaForSequenceClassification
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset, Dataset, ClassLabel
import pickle

  from .autonotebook import tqdm as notebook_tqdm


## Load Tokenizer and Preprocess Data

In [3]:
base_model = "roberta-base"
dataset = load_dataset('ag_news', split='train')
tokenizer = RobertaTokenizer.from_pretrained(base_model)
id2label = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}
label2id = {v: k for k, v in id2label.items()}

def format_agnews_headline_body(example):
    text = example["text"]
    if "." in text:
        headline, body = text.split(".", 1)
        return {"text": f"Headline: {headline.strip()} Body: {body.strip()}"}
    return {"text": f"Headline: {text.strip()}"}

def preprocess(examples):
    tokenized = tokenizer(examples['text'], truncation=True, padding=True)
    return tokenized

#tokenized_dataset = dataset.map(preprocess, batched=True,  remove_columns=["text"])
#tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset = dataset.map(preprocess, batched=True, remove_columns=["text"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

In [4]:
# Extract the number of classess and their names
num_labels = dataset.features['label'].num_classes
class_names = dataset.features["label"].names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Create an id2label mapping
# We will need this for our classifier.
id2label = {i: label for i, label in enumerate(class_names)}

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")


number of labels: 4
the labels: ['World', 'Sports', 'Business', 'Sci/Tech']


## Load Pre-trained Model
Set up config for pretrained model and download it from hugging face

In [5]:
model = RobertaForSequenceClassification.from_pretrained(
    base_model,
    num_labels=4,
    id2label=id2label,
    label2id=label2id)
model

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

## Anything from here on can be modified

In [15]:

# Split the original training set
split_datasets = tokenized_dataset.train_test_split(test_size=640, seed=42)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']
'''
from datasets import load_dataset
dataset = load_dataset("ag_news")
train_dataset = dataset["train"]
eval_dataset = dataset["test"]
'''

'\nfrom datasets import load_dataset\ndataset = load_dataset("ag_news")\ntrain_dataset = dataset["train"]\neval_dataset = dataset["test"]\n'

In [40]:
def format_agnews_prompt(row):
    """
    Converts AG News sample into a prompt-like structure with separated headline/body.
    """
    text = row["text"]
    if "." in text:
        headline, body = text.split(".", 1)
        formatted = (
            f"Classify the following news article:\n\n"
            f"Headline: {headline.strip()}\n"
            f"Body: {body.strip()}"
        )
    else:
        formatted = f"Classify the following news article:\n\n{text.strip()}"
    return {"text": formatted}

In [42]:
print(train_dataset.column_names)
train_dataset = train_dataset.map(format_agnews_prompt)
eval_dataset = eval_dataset.map(format_agnews_prompt)

['text', 'label']


Map: 100%|██████████| 120000/120000 [00:06<00:00, 17791.57 examples/s]
Map: 100%|██████████| 7600/7600 [00:00<00:00, 16224.38 examples/s]


In [44]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=128)

In [45]:
tokenized_train = train_dataset.map(tokenize, batched=True)
tokenized_eval = eval_dataset.map(tokenize, batched=True)

Map: 100%|██████████| 120000/120000 [00:12<00:00, 9247.54 examples/s] 
Map: 100%|██████████| 7600/7600 [00:00<00:00, 9304.30 examples/s] 


## Setup LoRA Config
Setup PEFT config and get peft model for finetuning

In [6]:
# PEFT Config
peft_config = LoraConfig(
    r=4,
    lora_alpha=32,
    lora_dropout=0.1,
    bias = 'none',
    #target_modules = ['query'],
    target_modules=['query', 'value'],
    task_type="SEQ_CLS",
)

In [7]:
peft_model = get_peft_model(model, peft_config)
peft_model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): Mod

In [8]:
print("Trainable parameters:")
for name, param in peft_model.named_parameters():
    if param.requires_grad:
         print(name)

Trainable parameters:
base_model.model.roberta.encoder.layer.0.attention.self.query.lora_A.default.weight
base_model.model.roberta.encoder.layer.0.attention.self.query.lora_B.default.weight
base_model.model.roberta.encoder.layer.0.attention.self.value.lora_A.default.weight
base_model.model.roberta.encoder.layer.0.attention.self.value.lora_B.default.weight
base_model.model.roberta.encoder.layer.1.attention.self.query.lora_A.default.weight
base_model.model.roberta.encoder.layer.1.attention.self.query.lora_B.default.weight
base_model.model.roberta.encoder.layer.1.attention.self.value.lora_A.default.weight
base_model.model.roberta.encoder.layer.1.attention.self.value.lora_B.default.weight
base_model.model.roberta.encoder.layer.2.attention.self.query.lora_A.default.weight
base_model.model.roberta.encoder.layer.2.attention.self.query.lora_B.default.weight
base_model.model.roberta.encoder.layer.2.attention.self.value.lora_A.default.weight
base_model.model.roberta.encoder.layer.2.attention.sel

In [9]:
print('PEFT Model')
peft_model.print_trainable_parameters()

PEFT Model
trainable params: 741,124 || all params: 125,389,832 || trainable%: 0.5911


## Training Setup

In [10]:
# To track evaluation accuracy during training
# !pip install scikit-learn

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # Calculate accuracy
    accuracy = accuracy_score(labels, preds)
    return {
        'accuracy': accuracy
    }

In [11]:
from transformers import Trainer
from torch.nn import CrossEntropyLoss

class SmoothingTrainer(Trainer):
    def __init__(self, *args, label_smoothing=0.1, **kwargs):
        super().__init__(*args, **kwargs)
        self.label_smoothing = label_smoothing

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Apply label smoothing
        loss_fct = CrossEntropyLoss(label_smoothing=self.label_smoothing)
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss

In [13]:
# Setup Training args
output_dir = "results"
training_args = TrainingArguments(
    output_dir=output_dir,
    report_to=None,
    eval_strategy='steps',
    logging_steps=100,
    learning_rate=3e-4,
    num_train_epochs=3,
    #max_steps=1200,
    max_steps=-1,
    use_cpu=False,
    dataloader_num_workers=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    optim="adamw_torch",
    lr_scheduler_type="linear", 
    #lr_scheduler_type="cosine",
    warmup_ratio=0.1, 
    gradient_checkpointing=False,
    gradient_checkpointing_kwargs={'use_reentrant':True}
)

def get_trainer(model):
      return SmoothingTrainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        #train_dataset=tokenized_train,
        #eval_dataset=tokenized_eval,
        data_collator=data_collator,
        label_smoothing=0.1 
    )

### Start Training

In [16]:
peft_lora_finetuning_trainer = get_trainer(peft_model)

result = peft_lora_finetuning_trainer.train()

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Accuracy
100,1.3935,1.384547,0.328125
200,1.3732,1.346738,0.625
300,0.9544,0.608263,0.871875
400,0.5794,0.594868,0.884375
500,0.5801,0.575876,0.9
600,0.5695,0.552674,0.903125
700,0.5709,0.61509,0.8875
800,0.5831,0.58453,0.90625
900,0.5525,0.577834,0.89375
1000,0.5624,0.564093,0.898438


## Evaluate Finetuned Model


### Performing Inference on Custom Input
Uncomment following functions for running inference on custom inputs

In [17]:
def classify(model, tokenizer, text):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt").to(device)
    output = model(**inputs)

    prediction = output.logits.argmax(dim=-1).item()

    print(f'\n Class: {prediction}, Label: {id2label[prediction]}, Text: {text}')
    return id2label[prediction]

In [18]:
classify( peft_model, tokenizer, "Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his ...")
classify( peft_model, tokenizer, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.")


 Class: 1, Label: Sports, Text: Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his ...

 Class: 2, Label: Business, Text: Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindlinand of ultra-cynics, are seeing green again.


'Business'

### Run Inference on eval_dataset

In [19]:
from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm

def evaluate_model(inference_model, dataset, labelled=True, batch_size=8, data_collator=None):
    """
    Evaluate a PEFT model on a dataset.

    Args:
        inference_model: The model to evaluate.
        dataset: The dataset (Hugging Face Dataset) to run inference on.
        labelled (bool): If True, the dataset includes labels and metrics will be computed.
                         If False, only predictions will be returned.
        batch_size (int): Batch size for inference.
        data_collator: Function to collate batches. If None, the default collate_fn is used.

    Returns:
        If labelled is True, returns a tuple (metrics, predictions)
        If labelled is False, returns the predictions.
    """
    # Create the DataLoader
    eval_dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inference_model.to(device)
    inference_model.eval()

    all_predictions = []
    if labelled:
        metric = evaluate.load('accuracy')

    # Loop over the DataLoader
    for batch in tqdm(eval_dataloader):
        # Move each tensor in the batch to the device
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = inference_model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        all_predictions.append(predictions.cpu())

        if labelled:
            # Expecting that labels are provided under the "labels" key.
            references = batch["labels"]
            metric.add_batch(
                predictions=predictions.cpu().numpy(),
                references=references.cpu().numpy()
            )

    # Concatenate predictions from all batches
    all_predictions = torch.cat(all_predictions, dim=0)

    if labelled:
        eval_metric = metric.compute()
        print("Evaluation Metric:", eval_metric)
        return eval_metric, all_predictions
    else:
        return all_predictions

In [20]:
# Check evaluation accuracy
_, _ = evaluate_model(peft_model, eval_dataset, True, 8, data_collator)

100%|██████████| 80/80 [00:05<00:00, 15.22it/s]

Evaluation Metric: {'accuracy': 0.946875}





In [18]:
!pip install --user kaggle



In [19]:
import os
os.environ["PATH"] += os.pathsep + os.path.expanduser("~/.local/bin")

In [20]:
!which kaggle

/home/tc4104/.local/bin/kaggle


In [21]:
import kaggle

In [22]:
!kaggle --version

Kaggle API 1.7.4.2


In [23]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [24]:
!kaggle competitions download -c deep-learning-spring-2025-project-2

deep-learning-spring-2025-project-2.zip: Skipping, found more recently modified local copy (use --force to force download)


In [25]:
print(os.listdir())

['.bash_history', '.triton', '.singularity', 'deep-learning-spring-2025-project-2.zip', '.cache', 'miniconda.sh', 'test_unlabelled.pkl', 'results', 'train.log', 'DLSP25-Project1', '.viminfo', '.nv', '.jupyter', '.local', '.lesshst', '.ipython', '.conda', '.ipynb_checkpoints', '.config', 'cifar_test_nolabel.pkl', 'Starter_Notebook.ipynb', 'cifar-10-python', 'data', '.ssh', '.kaggle', '.bashrc', 'deep-learning-spring-2025-project-1.zip', 'ondemand']


In [26]:
import zipfile

with zipfile.ZipFile("deep-learning-spring-2025-project-2.zip", 'r') as zip_ref:
    zip_ref.extractall(".")

### Run Inference on unlabelled dataset

In [21]:
#Load your unlabelled data
unlabelled_dataset = pd.read_pickle("test_unlabelled.pkl")
test_dataset = unlabelled_dataset.map(preprocess, batched=True, remove_columns=["text"])
unlabelled_dataset

Map: 100%|██████████| 8000/8000 [00:06<00:00, 1317.49 examples/s]


Dataset({
    features: ['text'],
    num_rows: 8000
})

In [22]:
# Run inference and save predictions
preds = evaluate_model(peft_model, test_dataset, False, 8, data_collator)
df_output = pd.DataFrame({
    'ID': range(len(preds)),
    'Label': preds.numpy()  # or preds.tolist()
})
df_output.to_csv(os.path.join(output_dir,"inference_output.csv"), index=False)
print("Inference complete. Predictions saved to inference_output.csv")

100%|██████████| 1000/1000 [00:48<00:00, 20.45it/s]


Inference complete. Predictions saved to inference_output.csv


In [25]:
preds_output = peft_lora_finetuning_trainer.predict(eval_dataset)

In [26]:
import numpy as np

pred_logits = preds_output.predictions
true_labels = preds_output.label_ids
pred_labels = np.argmax(pred_logits, axis=1)

In [27]:
wrong_idxs = np.where(pred_labels != true_labels)[0]

In [29]:
# Load original (unformatted) dataset for readable text
original_eval = load_dataset("ag_news", split="test")

# Show 5 misclassified samples
for idx in wrong_idxs[:5]:
    i = int(idx)  # Cast numpy.int64 → Python int
    print(f"Text: {original_eval[i]['text']}")
    print(f"True Label: {true_labels[i]} | Predicted: {pred_labels[i]}")
    print("—" * 80)

Text: Rocking the Cradle of Life When did life begin? One evidential clue stems from the fossil records in Western Australia, although whether these layered sediments are biological or chemical has spawned a spirited debate. Oxford researcher, Nicola McLoughlin, describes some of the issues in contention.
True Label: 2 | Predicted: 3
————————————————————————————————————————————————————————————————————————————————
Text: Afghan Army Dispatched to Calm Violence KABUL, Afghanistan - Government troops intervened in Afghanistan's latest outbreak of deadly fighting between warlords, flying from the capital to the far west on U.S. and NATO airplanes to retake an air base contested in the violence, officials said Sunday...
True Label: 2 | Predicted: 3
————————————————————————————————————————————————————————————————————————————————
Text: Drew Out of Braves' Lineup After Injury (AP) AP - Outfielder J.D. Drew missed the Atlanta Braves' game against the St. Louis Cardinals on Sunday night with a so

In [30]:
import pandas as pd

wrong_preds = [
    {
        "text": original_eval[int(idx)]["text"],  # convert to Python int
        "true_label": int(true_labels[int(idx)]),
        "predicted_label": int(pred_labels[int(idx)]),
    }
    for idx in wrong_idxs
]

import pandas as pd
df = pd.DataFrame(wrong_preds)
df.to_csv("misclassified_eval_examples.csv", index=False)

TypeError: Wrong key type: '18' of type '<class 'numpy.int64'>'. Expected one of int, slice, range, str or Iterable.