In [1]:
!pip install transformers datasets accelerate peft bitsandbytes evaluate rouge_score

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ

In [2]:
import os
import torch
from datasets import load_dataset
from transformers import (
    T5ForConditionalGeneration,
    T5TokenizerFast,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)
from peft import AdaLoraConfig, get_peft_model, LoraConfig, TaskType
import evaluate

In [3]:
dataset = load_dataset("billsum", split="ca_test")
print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.27k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/91.8M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/15.8M [00:00<?, ?B/s]

ca_test-00000-of-00001.parquet:   0%|          | 0.00/6.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18949 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3269 [00:00<?, ? examples/s]

Generating ca_test split:   0%|          | 0/1237 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'summary', 'title'],
    num_rows: 1237
})


In [4]:
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

In [5]:
model_name = "t5-small"
tokenizer = T5TokenizerFast.from_pretrained(model_name)

max_input_length = 512
max_target_length = 128

def preprocess(examples):
    inputs = ["summarize: " + doc for doc in examples["text"]]
    model_inputs = tokenizer(
        inputs, max_length=max_input_length, truncation=True, padding=False
    )
    labels = tokenizer(
        text_target=examples["summary"], max_length=max_target_length, truncation=True, padding=False
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


processed_train_dataset = train_dataset.map(preprocess, batched=True, remove_columns=train_dataset.column_names)
processed_eval_dataset = eval_dataset.map(preprocess, batched=True, remove_columns=eval_dataset.column_names)


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/989 [00:00<?, ? examples/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

In [6]:

model = T5ForConditionalGeneration.from_pretrained(model_name, device_map="auto")
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,  # or peft_model if it's required
    label_pad_token_id=tokenizer.pad_token_id,
    pad_to_multiple_of=None,
    padding="longest"
)


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [9]:
ada_lora_config = AdaLoraConfig(
    # r=8,
    # target_r=4,
    # init_r=12,
    # beta1=0.9,
    # beta2=0.999,
    # tinit=200,
    # tfinal=1000,
    # deltaT=10,
    # lora_alpha=32,
    # lora_dropout=0.1,
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=[
        "SelfAttention.q",  # Query projection in self-attention
        "SelfAttention.v",  # Value projection in self-attention
        "SelfAttention.k",  # Output projection in self-attention
        "EncDecAttention.q",  # First linear layer in feed-forward network
        "EncDecAttention.k",
        "EncDecAttention.v"# Second linear layer in feed-forward network
    ],
    modules_to_save=["lm_head"],
    r=16,               # Start with a moderately higher rank
    target_r=4,          # Reduce to a quarter of the original rank by the end
    init_r=16,           # Begin with the same rank as r
    beta1=0.9,           # Adam defaults work well
    beta2=0.999,
    tinit=200,           # Begin adapting rank after a small warmup period (adjust based on total steps)
    tfinal=1000,         # Finish adapting rank near the end of training
    deltaT=10,           # Steps between rank updates (smooth adaptation)
    lora_alpha=16,       # Slightly lower alpha can help stabilize training
    lora_dropout=0.1,    # Some dropout for regularization
)

peft_model = get_peft_model(model, ada_lora_config)
peft_model.print_trainable_parameters()


trainable params: 17,335,136 || all params: 77,841,814 || trainable%: 22.2697


In [10]:
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # Convert to lists if they're tensors
    if isinstance(predictions, torch.Tensor):
        predictions = predictions.cpu().tolist()
    if isinstance(labels, torch.Tensor):
        labels = labels.cpu().tolist()

    # Replace any -100 or out-of-range tokens with pad_token_id
    # This ensures no invalid token IDs slip through
    vocab_size = tokenizer.vocab_size
    pad_id = tokenizer.pad_token_id
    predictions = [[p if 0 <= p < vocab_size else pad_id for p in pred] for pred in predictions]
    labels = [[l if 0 <= l < vocab_size else pad_id for l in lab] for lab in labels]

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Handle empty strings
    decoded_preds = [pred if pred.strip() != "" else " " for pred in decoded_preds]
    decoded_labels = [lbl if lbl.strip() != "" else " " for lbl in decoded_labels]

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True
    )
    return {
        "rouge1": result["rouge1"],
        "rouge2": result["rouge2"],
        "rougeL": result["rougeL"]
    }



Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [11]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset

In [12]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5-ada-lora-billsum",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=1e-4,
    num_train_epochs=5,  # increase epochs for a better demonstration
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    predict_with_generate=True,
    fp16=True,  # Mixed precision
    push_to_hub=False,
    report_to="none",
    load_best_model_at_end=True,  # Loads the best model for evaluation
    metric_for_best_model="rouge2",  # Track ROUGE-2 for the best model

    logging_strategy="steps",  # Log by steps for continuous logs
)




In [13]:
trainer = Seq2SeqTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Seq2SeqTrainer(


In [14]:
trainer.train()


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel
1,6.3102,6.240701,0.145649,0.054789,0.121462




Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel
1,6.3102,6.240701,0.145649,0.054789,0.121462
2,5.4751,5.454769,0.144025,0.053101,0.119986
3,4.8676,4.967031,0.144094,0.052312,0.119814
4,4.9975,4.766723,0.143713,0.051917,0.120296
5,4.8998,4.691061,0.143242,0.051513,0.1199




TrainOutput(global_step=620, training_loss=5.466980201967301, metrics={'train_runtime': 226.5844, 'train_samples_per_second': 21.824, 'train_steps_per_second': 2.736, 'total_flos': 932604772976640.0, 'train_loss': 5.466980201967301, 'epoch': 5.0})

In [15]:
trainer.evaluate()



{'eval_loss': 6.240701198577881,
 'eval_rouge1': 0.14564939906228788,
 'eval_rouge2': 0.05478906618685818,
 'eval_rougeL': 0.12146206958730695,
 'eval_runtime': 16.3626,
 'eval_samples_per_second': 15.157,
 'eval_steps_per_second': 1.895,
 'epoch': 5.0}