In [9]:
!pip install transformers datasets accelerate peft bitsandbytes evaluate rouge_score --quiet


  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [2]:
import os
import torch
from datasets import load_dataset
from transformers import (
    T5ForConditionalGeneration,
    T5TokenizerFast,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)
from peft import IA3Config, get_peft_model, TaskType
import evaluate


In [3]:
# Load dataset
dataset = load_dataset("billsum", split="ca_test")

# Split ca_test into train/eval for demonstration
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

# Filter out empty summaries
train_dataset = train_dataset.filter(lambda x: x["summary"] is not None and x["summary"].strip() != "")
eval_dataset = eval_dataset.filter(lambda x: x["summary"] is not None and x["summary"].strip() != "")

model_name = "t5-small"
tokenizer = T5TokenizerFast.from_pretrained(model_name)

max_input_length = 512
max_target_length = 128

def preprocess(examples):
    inputs = ["summarize: " + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding=False)
    labels = tokenizer(text_target=examples["summary"], max_length=max_target_length, truncation=True, padding=False)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

processed_train_dataset = train_dataset.map(preprocess, batched=True, remove_columns=train_dataset.column_names)
processed_eval_dataset = eval_dataset.map(preprocess, batched=True, remove_columns=eval_dataset.column_names)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.27k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/91.8M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/15.8M [00:00<?, ?B/s]

ca_test-00000-of-00001.parquet:   0%|          | 0.00/6.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18949 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3269 [00:00<?, ? examples/s]

Generating ca_test split:   0%|          | 0/1237 [00:00<?, ? examples/s]

Filter:   0%|          | 0/989 [00:00<?, ? examples/s]

Filter:   0%|          | 0/248 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/989 [00:00<?, ? examples/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

In [4]:
# Use DataCollatorForSeq2Seq which will handle padding
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=None,
    padding="longest",
    label_pad_token_id=tokenizer.pad_token_id
)
model = T5ForConditionalGeneration.from_pretrained(model_name, device_map="auto")


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [7]:
ia3_config = IA3Config(
    target_modules = [
      "SelfAttention.q", "SelfAttention.k", "SelfAttention.v",
      "EncDecAttention.q", "EncDecAttention.k", "EncDecAttention.v",
      "DenseReluDense.wi", "DenseReluDense.wo"
    ],
    feedforward_modules = [
      "DenseReluDense.wi",
      "DenseReluDense.wo"
    ],
    task_type=TaskType.SEQ_2_SEQ_LM  # Correct task type for summarization
)


peft_model = get_peft_model(model, ia3_config)
peft_model.print_trainable_parameters()


trainable params: 58,368 || all params: 60,564,992 || trainable%: 0.0964


In [10]:
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    if isinstance(predictions, torch.Tensor):
        predictions = predictions.cpu().tolist()
    if isinstance(labels, torch.Tensor):
        labels = labels.cpu().tolist()

    # Replace out-of-range tokens and ensure valid decoding
    vocab_size = tokenizer.vocab_size
    pad_id = tokenizer.pad_token_id

    predictions = [[p if 0 <= p < vocab_size else pad_id for p in pred] for pred in predictions]
    labels = [[l if 0 <= l < vocab_size else pad_id for l in lab] for lab in labels]

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Handle empty strings to avoid errors
    decoded_preds = [pred if pred.strip() != "" else " " for pred in decoded_preds]
    decoded_labels = [lbl if lbl.strip() != "" else " " for lbl in decoded_labels]

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True
    )

    return {
        "rouge1": result["rouge1"],
        "rouge2": result["rouge2"],
        "rougeL": result["rougeL"]
    }


In [13]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5-ia3-billsum",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    learning_rate=1e-4,
    num_train_epochs=5,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="no",
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    report_to="none",
)




In [14]:
trainer = Seq2SeqTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Seq2SeqTrainer(


In [15]:
trainer.train()


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel
1,3.7723,3.737714,0.145269,0.054892,0.122488
2,3.6995,3.477828,0.140021,0.049829,0.118205
3,3.4263,3.368963,0.137665,0.048833,0.116223
4,3.6129,3.313488,0.134746,0.045965,0.113108
5,3.2626,3.29632,0.135404,0.046645,0.113844




TrainOutput(global_step=2475, training_loss=3.7549790415137707, metrics={'train_runtime': 413.6288, 'train_samples_per_second': 11.955, 'train_steps_per_second': 5.984, 'total_flos': 670151878901760.0, 'train_loss': 3.7549790415137707, 'epoch': 5.0})

In [16]:
trainer.evaluate()

{'eval_loss': 3.2963201999664307,
 'eval_rouge1': 0.13540433719095518,
 'eval_rouge2': 0.04664545132068253,
 'eval_rougeL': 0.11384374357149593,
 'eval_runtime': 39.1456,
 'eval_samples_per_second': 6.335,
 'eval_steps_per_second': 3.168,
 'epoch': 5.0}