In [42]:
# 1. Load the base model and tokenizer

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("t5-small")
base_model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

In [76]:
from peft import LoraConfig, get_peft_model

# 2. Create a LoRA configuration and apply it to the base model
def create_lora_config(r=16, lora_alpha=32, lora_dropout=0.01):
    return LoraConfig(
        task_type="SEQ_2_SEQ_LM",
        r=r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
    )

lora_config = create_lora_config()
lora_model = get_peft_model(base_model, lora_config)

In [78]:
full_model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

In [77]:
lora_model.print_trainable_parameters()

trainable params: 1,637,376 || all params: 62,144,000 || trainable%: 2.6348


In [79]:
from datasets import load_dataset

billsum = load_dataset("billsum", split="ca_test")
billsum = billsum.train_test_split(test_size=0.2)

In [80]:
billsum

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 989
    })
    test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 248
    })
})

In [81]:
def preprocess_function(examples):
    # Prepends the string "summarize: " to each document in the 'text' field of the input examples.
    # This is done to instruct the T5 model on the task it needs to perform, which in this case is summarization.
    inputs = ["summarize: " + doc for doc in examples["text"]]

    # Tokenizes the prepended input texts to convert them into a format that can be fed into the T5 model.
    # Sets a maximum token length of 1024, and truncates any text longer than this limit.
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    # Tokenizes the 'summary' field of the input examples to prepare the target labels for the summarization task.
    # Sets a maximum token length of 128, and truncates any text longer than this limit.
    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    # Assigns the tokenized labels to the 'labels' field of model_inputs.
    # The 'labels' field is used during training to calculate the loss and guide model learning.
    model_inputs["labels"] = labels["input_ids"]

    # Returns the prepared inputs and labels as a single dictionary, ready for training.
    return model_inputs

tokenized_billsum = billsum.map(preprocess_function, batched=True)

Map: 100%|██████████| 989/989 [00:02<00:00, 371.50 examples/s]
Map: 100%|██████████| 248/248 [00:00<00:00, 372.00 examples/s]


In [82]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model="t5-small")

In [83]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

def train(model, name):
    model_id = f"../out/{name}"
    training_args = Seq2SeqTrainingArguments(
        output_dir=model_id,
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=5e-3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        fp16=True,
        num_train_epochs=1,
        load_best_model_at_end=True,
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_billsum["train"],
        eval_dataset=tokenized_billsum["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    trainer.train()
    trainer.save_model(model_id)

train(lora_model, "lora")
train(full_model, "full")

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,No log,2.152138


  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,No log,2.191865


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


In [84]:

lora_model.print_trainable_parameters()

trainable params: 1,637,376 || all params: 62,144,000 || trainable%: 2.6348


In [85]:
# 8. Load the saved LoRA adapter
# from peft import PeftModel

# loaded_lora_model = PeftModel.from_pretrained(base_model, "../out/lora")

# loaded_full_model = AutoModelForSeq2SeqLM.from_pretrained("../out/full")

In [86]:
# 9. Test the loaded model
passage = "There is a very big grand hall."
input_text = f"summarize: {passage}"
inputs = tokenizer(input_text, return_tensors="pt").input_ids

models = [
  ("base", base_model), 
  ("lora", lora_model.to('cpu')),
  # ("loaded lora", loaded_lora_model),
  ("full", full_model.to('cpu')),
  # ("loaded full", loaded_full_model)
]
for name, model in models:
  outputs = model.generate(input_ids=inputs, max_new_tokens=100, do_sample=False)
  print(name)
  print(tokenizer.decode(outputs[0], skip_special_tokens=True))

base
There is a very big grand hall.
lora
There is a very big grand hall.
full
Existing law, there is a very big grand hall. Existing law requires the state to make a very big grand hall. Existing law requires the state to make a very big grand hall. Existing law requires the state to make a very big grand hall. Existing law requires the state to make a very big grand hall. Existing law requires the state to make a very big grand hall. Existing law requires the state to make a very big


In [54]:
print(billsum["train"][100]["summary"])

Existing law governing common interest developments, the Davis-Stirling Common Interest Development Act, requires the association of a common interest development, which includes a condominium project, to prepare and distribute to all of its members certain documents, including an annual budget report that includes, among other items of information, a pro forma operating budget. The act requires a notice to be provided if an insurance policy described in the annual budget report lapses, is canceled, or is not immediately renewed, restored, or replaced, or if there is a significant change as to the policy.
This bill would, beginning July 1, 2016, require the annual budget report of a condominium project to also include a separate statement describing the status of the common interest development as a Federal Housing Administration (FHA)-approved condominium project and as a federal Department of Veterans Affairs (VA)-approved condominium project.
