In [28]:
!pip install transformers datasets accelerate peft bitsandbytes evaluate rouge_score



In [29]:
import os
import torch
from datasets import load_dataset
from transformers import (
    T5ForConditionalGeneration,
    T5TokenizerFast,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)
from peft import AdaLoraConfig, get_peft_model, LoraConfig, TaskType
import evaluate

In [30]:
dataset = load_dataset("billsum", split="ca_test")
print(dataset)

Dataset({
    features: ['text', 'summary', 'title'],
    num_rows: 1237
})


In [31]:
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

In [32]:
model_name = "t5-small"
tokenizer = T5TokenizerFast.from_pretrained(model_name)

max_input_length = 512
max_target_length = 128

def preprocess(examples):
    inputs = ["summarize: " + doc for doc in examples["text"]]
    model_inputs = tokenizer(
        inputs, max_length=max_input_length, truncation=True, padding=False
    )
    labels = tokenizer(
        text_target=examples["summary"], max_length=max_target_length, truncation=True, padding=False
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


processed_train_dataset = train_dataset.map(preprocess, batched=True, remove_columns=train_dataset.column_names)
processed_eval_dataset = eval_dataset.map(preprocess, batched=True, remove_columns=eval_dataset.column_names)


Map:   0%|          | 0/989 [00:00<?, ? examples/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

In [33]:

model = T5ForConditionalGeneration.from_pretrained(model_name, device_map="auto")
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,  # or peft_model if it's required
    label_pad_token_id=tokenizer.pad_token_id,
    pad_to_multiple_of=None,
    padding="longest"
)


In [34]:
ada_lora_config = AdaLoraConfig(
    r=8,
    target_r=4,
    init_r=12,
    beta1=0.9,
    beta2=0.999,
    tinit=200,
    tfinal=1000,
    deltaT=10,
    lora_alpha=32,
    lora_dropout=0.1,
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=[
        "SelfAttention.q",  # Query projection in self-attention
        "SelfAttention.v",  # Value projection in self-attention
        "SelfAttention.k",  # Output projection in self-attention
        "EncDecAttention.q",  # First linear layer in feed-forward network
        "EncDecAttention.k",
        "EncDecAttention.v"# Second linear layer in feed-forward network
    ],
    modules_to_save=["lm_head"]
)

peft_model = get_peft_model(model, ada_lora_config)
peft_model.print_trainable_parameters()


trainable params: 17,113,736 || all params: 77,620,414 || trainable%: 22.0480


In [35]:
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # Convert to lists if they're tensors
    if isinstance(predictions, torch.Tensor):
        predictions = predictions.cpu().tolist()
    if isinstance(labels, torch.Tensor):
        labels = labels.cpu().tolist()

    # Replace any -100 or out-of-range tokens with pad_token_id
    # This ensures no invalid token IDs slip through
    vocab_size = tokenizer.vocab_size
    pad_id = tokenizer.pad_token_id
    predictions = [[p if 0 <= p < vocab_size else pad_id for p in pred] for pred in predictions]
    labels = [[l if 0 <= l < vocab_size else pad_id for l in lab] for lab in labels]

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Handle empty strings
    decoded_preds = [pred if pred.strip() != "" else " " for pred in decoded_preds]
    decoded_labels = [lbl if lbl.strip() != "" else " " for lbl in decoded_labels]

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True
    )
    return {
        "rouge1": result["rouge1"],
        "rouge2": result["rouge2"],
        "rougeL": result["rougeL"]
    }



In [36]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset

In [37]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5-ada-lora-billsum",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=1e-4,
    num_train_epochs=5,  # increase epochs for a better demonstration
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    predict_with_generate=True,
    fp16=True,  # Mixed precision
    push_to_hub=False,
    report_to="none",
    load_best_model_at_end=True,  # Loads the best model for evaluation
    metric_for_best_model="rouge2",  # Track ROUGE-2 for the best model

    logging_strategy="steps",  # Log by steps for continuous logs
)




In [38]:
trainer = Seq2SeqTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Seq2SeqTrainer(


In [39]:
trainer.train()


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel
1,6.1484,6.080212,0.145193,0.054783,0.121141
2,5.2274,5.129618,0.143834,0.052489,0.120148
3,4.3373,4.250475,0.143576,0.051134,0.119597
4,4.3037,3.970296,0.144545,0.051988,0.121057
5,4.1936,3.912946,0.14477,0.052852,0.121646




TrainOutput(global_step=620, training_loss=5.051656519982123, metrics={'train_runtime': 235.8839, 'train_samples_per_second': 20.964, 'train_steps_per_second': 2.628, 'total_flos': 929241476720640.0, 'train_loss': 5.051656519982123, 'epoch': 5.0})

In [40]:
trainer.evaluate()



{'eval_loss': 6.080212116241455,
 'eval_rouge1': 0.14519307460126976,
 'eval_rouge2': 0.05478330177466391,
 'eval_rougeL': 0.12114137134423202,
 'eval_runtime': 15.7355,
 'eval_samples_per_second': 15.76,
 'eval_steps_per_second': 1.97,
 'epoch': 5.0}