In [1]:
!pip install -q transformers datasets sentencepiece evaluate nltk rouge_score accelerate

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [2]:
import os
import numpy as np
import nltk

from datasets import load_dataset
import evaluate

from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)

nltk.download("punkt")

# ---------------------
# CONFIG
# ---------------------
MODEL_NAME = "t5-small"
MAX_INPUT_LENGTH = 1024
MAX_TARGET_LENGTH = 256
BATCH_SIZE = 4
NUM_EPOCHS = 2
LEARNING_RATE = 3e-4

OUTPUT_DIR = "/content/t5-billsum"
os.makedirs(OUTPUT_DIR, exist_ok=True)

rouge = evaluate.load("rouge")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script: 0.00B [00:00, ?B/s]

In [3]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
billsum = load_dataset("billsum")

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/91.8M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/15.8M [00:00<?, ?B/s]

data/ca_test-00000-of-00001.parquet:   0%|          | 0.00/6.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18949 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3269 [00:00<?, ? examples/s]

Generating ca_test split:   0%|          | 0/1237 [00:00<?, ? examples/s]

In [5]:
from datasets import load_dataset

# Loading dataset
billsum = load_dataset("billsum")

# Converting each split into Pandas DataFrame
train_df = billsum["train"].to_pandas()
test_df = billsum["test"].to_pandas()

train_df.head()


Unnamed: 0,text,summary,title
0,SECTION 1. LIABILITY OF BUSINESS ENTITIES PROV...,Shields a business entity from civil liability...,A bill to limit the civil liability of busines...
1,SECTION 1. SHORT TITLE.\n\n This Act may be...,Human Rights Information Act - Requires certai...,Human Rights Information Act
2,SECTION 1. SHORT TITLE.\n\n This Act may be...,Jackie Robinson Commemorative Coin Act - Direc...,Jackie Robinson Commemorative Coin Act
3,SECTION 1. NONRECOGNITION OF GAIN WHERE ROLLOV...,Amends the Internal Revenue Code to provide (t...,To amend the Internal Revenue Code to provide ...
4,SECTION 1. SHORT TITLE.\n\n This Act may be...,Native American Energy Act - (Sec. 3) Amends t...,Native American Energy Act


In [6]:
test_df.head()

Unnamed: 0,text,summary,title
0,SECTION 1. ENVIRONMENTAL INFRASTRUCTURE.\n\n ...,Amends the Water Resources Development Act of ...,To make technical corrections to the Water Res...
1,That this Act may be cited as the ``Federal Fo...,Federal Forage Fee Act of 1993 - Subjects graz...,Federal Forage Fee Act of 1993
2,SECTION 1. SHORT TITLE.\n\n This Act may be...,. Merchant Marine of World War II Congression...,Merchant Marine of World War II Congressional ...
3,SECTION 1. SHORT TITLE.\n\n This Act may be...,Small Business Modernization Act of 2004 - Ame...,To amend the Internal Revenue Code of 1986 to ...
4,SECTION 1. SHORT TITLE.\n\n This Act may be...,Fair Access to Investment Research Act of 2016...,Fair Access to Investment Research Act of 2016


In [7]:
# Load full BillSum dataset
train_df = billsum["train"]      # ~18,949
test_df = billsum["test"]        # ~3,269
ca_test_df = billsum["ca_test"]  # ~1,237

print("Train size:", len(train_df))
print("Test size:", len(test_df))
print("CA Test size:", len(ca_test_df))

print(train_df[0])

Train size: 18949
Test size: 3269
CA Test size: 1237
{'text': "SECTION 1. LIABILITY OF BUSINESS ENTITIES PROVIDING USE OF FACILITIES \n              TO NONPROFIT ORGANIZATIONS.\n\n    (a) Definitions.--In this section:\n            (1) Business entity.--The term ``business entity'' means a \n        firm, corporation, association, partnership, consortium, joint \n        venture, or other form of enterprise.\n            (2) Facility.--The term ``facility'' means any real \n        property, including any building, improvement, or appurtenance.\n            (3) Gross negligence.--The term ``gross negligence'' means \n        voluntary and conscious conduct by a person with knowledge (at \n        the time of the conduct) that the conduct is likely to be \n        harmful to the health or well-being of another person.\n            (4) Intentional misconduct.--The term ``intentional \n        misconduct'' means conduct by a person with knowledge (at the \n        time of the conduct) tha

In [8]:
from nltk.tokenize import sent_tokenize

def count_sentences(text: str) -> int:
    return len(sent_tokenize(text))

train_sentence_counts = [count_sentences(x["text"]) for x in train_df]
test_sentence_counts = [count_sentences(x["text"]) for x in test_df]
ca_sentence_counts = [count_sentences(x["text"]) for x in ca_test_df]

total_train_sentences = sum(train_sentence_counts)
total_test_sentences = sum(test_sentence_counts)
total_ca_sentences = sum(ca_sentence_counts)

print("Total sentences (TRAIN):   ", total_train_sentences)
print("Total sentences (TEST):    ", total_test_sentences)
print("Total sentences (CA_TEST): ", total_ca_sentences)
print("TOTAL sentences (ALL):     ",
      total_train_sentences + total_test_sentences + total_ca_sentences)

print("\nAverage sentences per train doc:",
      float(np.mean(train_sentence_counts)))


Total sentences (TRAIN):    877056
Total sentences (TEST):     149829
Total sentences (CA_TEST):  64694
TOTAL sentences (ALL):      1091579

Average sentences per train doc: 46.285081006913295


In [9]:
# Creating training/validation split
# Using 5% of train as validation
train_valid = train_df.train_test_split(test_size=0.05, seed=42)

train_dataset = train_valid["train"]
valid_dataset = train_valid["test"]

print("Train dataset:", len(train_dataset))
print("Validation dataset:", len(valid_dataset))


Train dataset: 18001
Validation dataset: 948


In [10]:
# Tokenizer & preprocessing function
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["text"]]
    targets = examples["summary"]

    model_inputs = tokenizer(
        inputs,
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
    )

    labels = tokenizer(
        targets,
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Batched mapping for speed, remove original text/summary columns
tokenized_train = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=train_dataset.column_names,
)

tokenized_valid = valid_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=valid_dataset.column_names,
)

tokenized_test = test_df.map(
    preprocess_function,
    batched=True,
    remove_columns=test_df.column_names,
)

tokenized_train



tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/18001 [00:00<?, ? examples/s]

Map:   0%|          | 0/948 [00:00<?, ? examples/s]

Map:   0%|          | 0/3269 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 18001
})

In [11]:
# Data collator & ROUGE metric function
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=MODEL_NAME,
)

def postprocess_text(preds, labels):
    preds = [p.strip() for p in preds]
    labels = [l.strip() for l in labels]
    return preds, labels

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    if len(decoded_preds) == 0 or len(decoded_labels) == 0:
        return {k: 0.0 for k in ["rouge1", "rouge2", "rougeL", "rougeLsum"]}

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
    )
    # Converting to percentages
    result = {k: round(v * 100, 2) for k, v in result.items()}

    return result


In [12]:
# Loading model & define Trainer
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",          # running eval at end of each epoch
    save_strategy="epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=NUM_EPOCHS,
    predict_with_generate=True,
    fp16=True,
    logging_dir="./logs",
    logging_steps=100,
    report_to="none",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)



config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  trainer = Seq2SeqTrainer(


In [13]:
# Training
train_result = trainer.train()
train_result


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,1.8477,1.680659,19.07,15.48,18.38,18.39
2,1.6768,1.611461,19.05,15.47,18.36,18.36


TrainOutput(global_step=9002, training_loss=1.8522208654941439, metrics={'train_runtime': 2749.6632, 'train_samples_per_second': 13.093, 'train_steps_per_second': 3.274, 'total_flos': 9745151073189888.0, 'train_loss': 1.8522208654941439, 'epoch': 2.0})

In [14]:
print("Validation metrics")
val_metrics = trainer.evaluate(tokenized_valid, max_length=MAX_TARGET_LENGTH)
val_metrics


Validation metrics


{'eval_loss': 1.6114614009857178,
 'eval_rouge1': 50.69,
 'eval_rouge2': 30.72,
 'eval_rougeL': 38.59,
 'eval_rougeLsum': 38.55,
 'eval_runtime': 823.1125,
 'eval_samples_per_second': 1.152,
 'eval_steps_per_second': 0.288,
 'epoch': 2.0}

In [15]:
print("Test metrics")
test_metrics = trainer.evaluate(tokenized_test, max_length=MAX_TARGET_LENGTH)
test_metrics


Test metrics


{'eval_loss': 1.5673980712890625,
 'eval_rouge1': 51.51,
 'eval_rouge2': 31.79,
 'eval_rougeL': 39.3,
 'eval_rougeLsum': 39.31,
 'eval_runtime': 2819.1458,
 'eval_samples_per_second': 1.16,
 'eval_steps_per_second': 0.29,
 'epoch': 2.0}

In [16]:
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("Model saved to:", OUTPUT_DIR)
!ls -R "$OUTPUT_DIR"


Model saved to: /content/t5-billsum
/content/t5-billsum:
checkpoint-4501		model.safetensors	 tokenizer.json
checkpoint-9002		special_tokens_map.json  training_args.bin
config.json		spiece.model
generation_config.json	tokenizer_config.json

/content/t5-billsum/checkpoint-4501:
config.json		scaler.pt		 tokenizer.json
generation_config.json	scheduler.pt		 trainer_state.json
model.safetensors	special_tokens_map.json  training_args.bin
optimizer.pt		spiece.model
rng_state.pth		tokenizer_config.json

/content/t5-billsum/checkpoint-9002:
config.json		scaler.pt		 tokenizer.json
generation_config.json	scheduler.pt		 trainer_state.json
model.safetensors	special_tokens_map.json  training_args.bin
optimizer.pt		spiece.model
rng_state.pth		tokenizer_config.json


In [17]:
import shutil

shutil.make_archive("t5-billsum", "zip", "/content/t5-billsum")
from google.colab import files
files.download("t5-billsum.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## BART MODEL:

In [18]:
# BART config & tokenizer
from transformers import BartTokenizerFast, BartForConditionalGeneration

BART_MODEL_NAME = "facebook/bart-base"
BART_OUTPUT_DIR = "/content/bart-billsum"
os.makedirs(BART_OUTPUT_DIR, exist_ok=True)

BART_MAX_INPUT_LENGTH = 1024
BART_MAX_TARGET_LENGTH = 256
BART_BATCH_SIZE = 4
BART_NUM_EPOCHS = 2

bart_tokenizer = BartTokenizerFast.from_pretrained(BART_MODEL_NAME)

bart_tokenizer


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

BartTokenizerFast(name_or_path='facebook/bart-base', vocab_size=50265, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True, special=True),
}
)

In [19]:
# Preprocessing function for BART
def preprocess_bart_function(examples):
    inputs = examples["text"]
    targets = examples["summary"]

    model_inputs = bart_tokenizer(
        inputs,
        max_length=BART_MAX_INPUT_LENGTH,
        truncation=True,
    )

    with bart_tokenizer.as_target_tokenizer():
        labels = bart_tokenizer(
            targets,
            max_length=BART_MAX_TARGET_LENGTH,
            truncation=True,
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train_bart = train_dataset.map(
    preprocess_bart_function,
    batched=True,
    remove_columns=train_dataset.column_names,
)

tokenized_valid_bart = valid_dataset.map(
    preprocess_bart_function,
    batched=True,
    remove_columns=valid_dataset.column_names,
)

tokenized_test_bart = test_df.map(
    preprocess_bart_function,
    batched=True,
    remove_columns=test_df.column_names,
)

tokenized_train_bart


Map:   0%|          | 0/18001 [00:00<?, ? examples/s]



Map:   0%|          | 0/948 [00:00<?, ? examples/s]

Map:   0%|          | 0/3269 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 18001
})

In [20]:
# Data collator for BART
bart_data_collator = DataCollatorForSeq2Seq(
    tokenizer=bart_tokenizer,
    model=BART_MODEL_NAME,
)

def compute_metrics_bart(eval_pred):
    preds, labels = eval_pred

    preds = np.where(preds != -100, preds, bart_tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, bart_tokenizer.pad_token_id)

    decoded_preds = bart_tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = bart_tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [p.strip() for p in decoded_preds]
    decoded_labels = [l.strip() for l in decoded_labels]

    if len(decoded_preds) == 0 or len(decoded_labels) == 0:
        return {k: 0.0 for k in ["rouge1", "rouge2", "rougeL", "rougeLsum"]}

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
    )
    result = {k: round(v * 100, 2) for k, v in result.items()}
    return result



In [21]:
# Loading BART model & define Trainer
bart_model = BartForConditionalGeneration.from_pretrained(BART_MODEL_NAME)

bart_training_args = Seq2SeqTrainingArguments(
    output_dir=BART_OUTPUT_DIR,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BART_BATCH_SIZE,
    per_device_eval_batch_size=BART_BATCH_SIZE,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=BART_NUM_EPOCHS,
    predict_with_generate=True,
    fp16=True,
    logging_dir="./logs_bart",
    logging_steps=100,
    report_to="none",
)

bart_trainer = Seq2SeqTrainer(
    model=bart_model,
    args=bart_training_args,
    train_dataset=tokenized_train_bart,
    eval_dataset=tokenized_valid_bart,
    tokenizer=bart_tokenizer,
    data_collator=bart_data_collator,
    compute_metrics=compute_metrics_bart,
)


model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

  bart_trainer = Seq2SeqTrainer(


In [22]:
# Training BART model
bart_train_result = bart_trainer.train()
bart_train_result


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,2.092,1.955627,18.72,15.06,18.15,18.24
2,1.6585,1.708667,19.17,15.6,18.63,18.69




TrainOutput(global_step=9002, training_loss=2.060202572669381, metrics={'train_runtime': 2590.0809, 'train_samples_per_second': 13.9, 'train_steps_per_second': 3.476, 'total_flos': 2.195173196955648e+16, 'train_loss': 2.060202572669381, 'epoch': 2.0})

In [23]:
# Here Evaluating BART on test set + save model
print("BART Validation metrics")
bart_val_metrics = bart_trainer.evaluate(
    tokenized_valid_bart, max_length=BART_MAX_TARGET_LENGTH
)
bart_val_metrics



BART Validation metrics


{'eval_loss': 1.7086668014526367,
 'eval_rouge1': 49.12,
 'eval_rouge2': 29.12,
 'eval_rougeL': 37.01,
 'eval_rougeLsum': 40.42,
 'eval_runtime': 1069.7008,
 'eval_samples_per_second': 0.886,
 'eval_steps_per_second': 0.222,
 'epoch': 2.0}

In [24]:
print("BART Test metrics")
bart_test_metrics = bart_trainer.evaluate(
    tokenized_test_bart, max_length=BART_MAX_TARGET_LENGTH
)
bart_test_metrics


BART Test metrics


{'eval_loss': 1.7249075174331665,
 'eval_rouge1': 49.64,
 'eval_rouge2': 30.12,
 'eval_rougeL': 37.57,
 'eval_rougeLsum': 41.06,
 'eval_runtime': 3693.4305,
 'eval_samples_per_second': 0.885,
 'eval_steps_per_second': 0.221,
 'epoch': 2.0}

In [25]:
bart_trainer.save_model(BART_OUTPUT_DIR)
bart_tokenizer.save_pretrained(BART_OUTPUT_DIR)

print("BART model saved to:", BART_OUTPUT_DIR)
!ls -R "$BART_OUTPUT_DIR"


BART model saved to: /content/bart-billsum
/content/bart-billsum:
checkpoint-4501		merges.txt		 tokenizer.json
checkpoint-9002		model.safetensors	 training_args.bin
config.json		special_tokens_map.json  vocab.json
generation_config.json	tokenizer_config.json

/content/bart-billsum/checkpoint-4501:
config.json		rng_state.pth		 tokenizer.json
generation_config.json	scaler.pt		 trainer_state.json
merges.txt		scheduler.pt		 training_args.bin
model.safetensors	special_tokens_map.json  vocab.json
optimizer.pt		tokenizer_config.json

/content/bart-billsum/checkpoint-9002:
config.json		rng_state.pth		 tokenizer.json
generation_config.json	scaler.pt		 trainer_state.json
merges.txt		scheduler.pt		 training_args.bin
model.safetensors	special_tokens_map.json  vocab.json
optimizer.pt		tokenizer_config.json


In [26]:
shutil.make_archive("bart-billsum", "zip", "/content/bart-billsum")
files.download("bart-billsum.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>