In [1]:
import os
import pandas as pd
import torch
from transformers import RobertaModel, RobertaTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, RobertaForSequenceClassification
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset, Dataset, ClassLabel
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
base_model = 'roberta-large'

dataset = load_dataset('ag_news', split='train')
tokenizer = RobertaTokenizer.from_pretrained(base_model)

def preprocess(examples):
    tokenized = tokenizer(examples['text'], truncation=True, padding=True)
    return tokenized

tokenized_dataset = dataset.map(preprocess, batched=True,  remove_columns=["text"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
# Extract the number of classess and their names
num_labels = dataset.features['label'].num_classes
class_names = dataset.features["label"].names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Create an id2label mapping
# We will need this for our classifier.
id2label = {i: label for i, label in enumerate(class_names)}

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")


number of labels: 4
the labels: ['World', 'Sports', 'Business', 'Sci/Tech']


In [3]:
model = RobertaForSequenceClassification.from_pretrained(
    base_model,
    id2label=id2label)
# Split the original training set
split_datasets = tokenized_dataset.train_test_split(test_size=10000, seed=42)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
layer_indices = range(8, 24)
target_modules_mid_upper = [f"roberta.encoder.layer.{i}.attention.self.query" for i in layer_indices]

peft_config = LoraConfig(
    r=32, # Good rank for this number of layers
    lora_alpha=64,
    lora_dropout=0.2,
    bias='none',
    target_modules=["query", "value"],
    use_dora=True,
    task_type="SEQ_CLS"
)

# peft_model.unload()
peft_model = get_peft_model(model, peft_config)
print('PEFT Model')
peft_model.print_trainable_parameters()

PEFT Model
trainable params: 4,248,580 || all params: 359,612,424 || trainable%: 1.1814


In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # Calculate accuracy
    accuracy = accuracy_score(labels, preds)
    return {
        'accuracy': accuracy
    }
# Setup Training args
output_dir = "teacher_qv_r32_ep10"
training_args = TrainingArguments(
        bf16=True,
        output_dir=output_dir,
        report_to=None,
        eval_strategy='steps',
        eval_steps=1000,  #  More frequent eval
        logging_steps=1000,  #  More frequent logging
        learning_rate=5e-5,  # Increased learning rate
        warmup_ratio=0.1,  #  Warmup added
        weight_decay=0.01,
        num_train_epochs=10,  #  Reduced from 20 to 10
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        optim="adamw_torch",  #  Switched from SGD to AdamW
        lr_scheduler_type="cosine",  #  Added cosine scheduler
        dataloader_num_workers=8,
        gradient_checkpointing=False,  #  Left disabled, matching current setup
        gradient_checkpointing_kwargs={'use_reentrant':True},
        save_strategy= "steps",      # Explicitly set strategy to steps
        save_steps= 5000             # Save a checkpoint every 500 steps
)
    

from transformers import TrainerCallback

class SavePeftModelCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, model=None, **kwargs):
        # Save only the LoRA adapter weights
        epoch = int(state.epoch)
        save_path = f"lora-ag_news_r22_ep{epoch}"
        model.save_pretrained(save_path)
        print(f"Saved LoRA weights at {save_path}")

from transformers import TrainerCallback

class SimpleLoggerCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        with open("training_log_teacher.txt", "a") as f:
            f.write(str(logs) + "\n")


def get_trainer(model):
    return Trainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        callbacks=[SimpleLoggerCallback()]
        )


In [8]:
peft_lora_finetuning_trainer = get_trainer(peft_model)

result = peft_lora_finetuning_trainer.train()
peft_model.save_pretrained('qv_r32_rslora_ep10')

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss,Accuracy
1000,0.9549,0.30467,0.903
2000,0.2853,0.289089,0.9101
3000,0.2756,0.262544,0.9172
4000,0.2548,0.255051,0.9199
5000,0.2457,0.272679,0.9212
6000,0.2193,0.231146,0.9309
7000,0.2177,0.21203,0.9383
8000,0.2056,0.218871,0.9333
9000,0.1987,0.215307,0.94
10000,0.2011,0.190865,0.9394


In [9]:
base_model = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(base_model)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")
smodel = RobertaForSequenceClassification.from_pretrained(
    base_model,
    num_labels=num_labels,
    id2label=id2label
)
peft_config = LoraConfig(
    r=10,
    lora_alpha=20,
    lora_dropout=0.2,
    bias = 'none',
    use_dora=True,
    target_modules = ['query','value'],
    task_type="SEQ_CLS",
    use_rslora=True
)
s_model = get_peft_model(model, peft_config)
print("PEFT Model loaded.")
s_model.print_trainable_parameters()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PEFT Model loaded.
trainable params: 2,085,892 || all params: 357,449,736 || trainable%: 0.5835




In [11]:
peft_model.eval()
class DistillationTrainer(Trainer):
    """
    Custom Trainer that adds a distillation loss on top of the student’s CE loss.
    """
    def __init__(
        self, 
        teacher_model,
        alpha_distillation=0.7,
        temperature=2.0,
        *args, 
        **kwargs
    ):
        super().__init__(*args, **kwargs)
        self.teacher_model = teacher_model
        self.alpha_distillation = alpha_distillation
        self.temperature = temperature

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        
        # Forward pass on student (LoRA) model
        outputs_student = model(**inputs)
        student_logits = outputs_student.logits
        
        # Forward pass on teacher (no grad)
        with torch.no_grad():
            outputs_teacher = self.teacher_model(**inputs)
            teacher_logits = outputs_teacher.logits

        # 1) Hard-label cross-entropy
        loss_ce = F.cross_entropy(
            student_logits.view(-1, self.model.config.num_labels),
            labels.view(-1)
        )

        # 2) Distillation (KL-Div between teacher & student)
        T = self.temperature
        student_logits_T = student_logits / T
        teacher_logits_T = teacher_logits / T
        
        loss_kl = F.kl_div(
            F.log_softmax(student_logits_T, dim=-1),
            F.softmax(teacher_logits_T, dim=-1),
            reduction="batchmean",
        ) * (T * T)

        # Combine them
        loss = (self.alpha_distillation * loss_kl) + ((1 - self.alpha_distillation) * loss_ce)

        if return_outputs:
            return (loss, outputs_student)
        return loss
output_dir = "qv_r10_rslora_ep10"
from transformers import TrainerCallback

class SimpleLoggerCallbackStudent(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        with open(output_dir+"training_log_student.txt", "a") as f:
            f.write(str(logs) + "\n")
import torch.nn.functional as F
training_args = TrainingArguments(
    bf16=True,
    output_dir=output_dir,
    report_to=None,
    eval_strategy='steps',
    eval_steps=1000,
    logging_steps=1000,
    learning_rate=1e-4,
    warmup_ratio=0.1,
    weight_decay=0.01,
    num_train_epochs=10,               # example
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    optim="adamw_torch",
    lr_scheduler_type="cosine",
    dataloader_num_workers=8,
    gradient_checkpointing=False,
    save_strategy= "steps",      # Explicitly set strategy to steps
    save_steps= 5000             # Save a checkpoint every 500 steps
    
        
    
)

distill_trainer = DistillationTrainer(
    teacher_model=peft_model,
    alpha_distillation=0.7,       # how much distillation matters
    temperature=2.0,             # softening factor
    model=s_model,            # LoRA student
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[SimpleLoggerCallbackStudent()]
)

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
distill_trainer.train()

Step,Training Loss,Validation Loss,Accuracy
1000,0.2681,0.081525,0.9098
2000,0.1251,0.078742,0.9133
3000,0.103,0.069608,0.9188
4000,0.0957,0.06666,0.9238
5000,0.0894,0.063769,0.929
6000,0.0834,0.071349,0.9175
7000,0.0798,0.059445,0.9349
8000,0.0761,0.058512,0.9354
9000,0.0738,0.056932,0.9376
10000,0.0732,0.056246,0.9366


In [None]:
def classify(model, tokenizer, text):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt").to(device)
    output = model(**inputs)

    prediction = output.logits.argmax(dim=-1).item()

    print(f'\n Class: {prediction}, Label: {id2label[prediction]}, Text: {text}')
    return id2label[prediction]

classify( s_model, tokenizer, "Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his ...")
classify( s_model, tokenizer, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.")

In [None]:
from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm

def evaluate_model(inference_model, dataset, labelled=True, batch_size=8, data_collator=None):
    """
    Evaluate a PEFT model on a dataset.

    Args:
        inference_model: The model to evaluate.
        dataset: The dataset (Hugging Face Dataset) to run inference on.
        labelled (bool): If True, the dataset includes labels and metrics will be computed.
                         If False, only predictions will be returned.
        batch_size (int): Batch size for inference.
        data_collator: Function to collate batches. If None, the default collate_fn is used.

    Returns:
        If labelled is True, returns a tuple (metrics, predictions)
        If labelled is False, returns the predictions.
    """
    # Create the DataLoader
    eval_dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inference_model.to(device)
    inference_model.eval()

    all_predictions = []
    if labelled:
        metric = evaluate.load('accuracy')

    # Loop over the DataLoader
    for batch in tqdm(eval_dataloader):
        # Move each tensor in the batch to the device
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = inference_model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        all_predictions.append(predictions.cpu())

        if labelled:
            # Expecting that labels are provided under the "labels" key.
            references = batch["labels"]
            metric.add_batch(
                predictions=predictions.cpu().numpy(),
                references=references.cpu().numpy()
            )

    # Concatenate predictions from all batches
    all_predictions = torch.cat(all_predictions, dim=0)

    if labelled:
        eval_metric = metric.compute()
        print("Evaluation Metric:", eval_metric)
        return eval_metric, all_predictions
    else:
        return all_predictions

In [None]:
# Check evaluation accuracy
_, _ = evaluate_model(s_model, eval_dataset, True, 8, data_collator)

In [None]:
#Load your unlabelled data
unlabelled_dataset = pd.read_pickle("test_unlabelled.pkl")
test_dataset = unlabelled_dataset.map(preprocess, batched=True, remove_columns=["text"])
unlabelled_dataset

# Run inference and save predictions
preds = evaluate_model(peft_model, test_dataset, False, 8, data_collator)
df_output = pd.DataFrame({
    'ID': range(len(preds)),
    'Label': preds.numpy()  # or preds.tolist()
})
df_output.to_csv(os.path.join(output_dir,"inference_output_distill_alpha_7_from_qv_r32_rslora_ep10_qv_r10_loraout_2_rslora_ep10.csv"), index=False)
print("Inference complete. Predictions saved to inference_output.csv")