In [1]:
! pip install transformers datasets -qqq
!pip install transformers datasets evaluate rouge_score -qqq
!pip install --upgrade -qqq wandb

In [2]:
from kaggle_secrets import UserSecretsClient

import wandb

user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("text-summarizer")
secret_value_1 = user_secrets.get_secret("wandb")

In [3]:
import huggingface_hub

wandb.login(key = secret_value_1)

huggingface_hub.login(token=secret_value_0)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
from datasets import load_dataset

billsum = load_dataset("billsum", split="ca_test")

Downloading readme:   0%|          | 0.00/7.27k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 91.8M/91.8M [00:00<00:00, 172MB/s] 
Downloading data: 100%|██████████| 15.8M/15.8M [00:00<00:00, 98.3MB/s]
Downloading data: 100%|██████████| 6.12M/6.12M [00:00<00:00, 46.5MB/s]


Generating train split:   0%|          | 0/18949 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3269 [00:00<?, ? examples/s]

Generating ca_test split:   0%|          | 0/1237 [00:00<?, ? examples/s]

In [5]:
billsum = billsum.train_test_split(test_size=0.2)

In [6]:
from transformers import AutoTokenizer

checkpoint = "Falconsai/text_summarization"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [7]:
prefix = "summarize: "


def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [8]:
tokenized_billsum = billsum.map(preprocess_function, batched=True)

Map:   0%|          | 0/989 [00:00<?, ? examples/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

In [9]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

2024-04-21 19:48:04.584774: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-21 19:48:04.584876: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-21 19:48:04.702824: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [10]:
import evaluate

rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [11]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [12]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/1.49k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [13]:
training_args = Seq2SeqTrainingArguments(
    output_dir="falcon-summ",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=30,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [14]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mn-suneel89[0m ([33mn-suneel-duke[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,2.65972,0.1336,0.0451,0.1109,0.1109,19.0
2,No log,2.512576,0.1519,0.0584,0.1258,0.1259,19.0
3,No log,2.43535,0.1702,0.0705,0.142,0.1422,19.0
4,No log,2.385477,0.1915,0.0922,0.1616,0.1618,19.0
5,No log,2.349905,0.1932,0.0939,0.1641,0.1644,19.0
6,No log,2.325103,0.1951,0.0959,0.1665,0.1667,19.0
7,No log,2.303489,0.1943,0.0963,0.1655,0.1656,19.0
8,No log,2.287869,0.1941,0.0953,0.1652,0.1653,19.0
9,2.636800,2.269467,0.1952,0.0958,0.1665,0.1667,19.0
10,2.636800,2.258846,0.194,0.0952,0.1651,0.1653,19.0




TrainOutput(global_step=1860, training_loss=2.3874246904926917, metrics={'train_runtime': 2124.7079, 'train_samples_per_second': 13.964, 'train_steps_per_second': 0.875, 'total_flos': 8031182499348480.0, 'train_loss': 2.3874246904926917, 'epoch': 30.0})

In [15]:
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

events.out.tfevents.1713728898.c9af55bc8b8e.34.0:   0%|          | 0.00/22.3k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/suneeln-duke/falcon-summ/commit/631c3d5975c4e4623b04eab82a8882eaf6fde185', commit_message='End of training', commit_description='', oid='631c3d5975c4e4623b04eab82a8882eaf6fde185', pr_url=None, pr_revision=None, pr_num=None)

In [16]:
from transformers import pipeline

def predict(text, model):
    
    return summarizer(text)

In [17]:
summarizer = pipeline("summarization", model="suneeln-duke/t5-summ")

config.json:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.7k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

In [18]:
text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."

In [19]:
summarizer(text)

Your max_length is set to 200, but your input_length is only 103. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=51)


[{'summary_text': "The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It will ask the ultra-wealthy and corporations to pay their fair share."}]

In [24]:
text = """
To the north of the Ganga was the great
kingdom Kosala, made fertile by the river
Sarayu. Its capital was Ayodhya, built by
Manu, the famous ruler of the Solar
dynasty. From Valmiki's description of
the capital Kosala, it is clear that ancient
Ayodhya was not inferior to our modern
cities. Even in ancient India city
civilisation had reached a high level.
King Dasaratha ruled the kingdom
from the capital city of Ayodhya. He had
fought on the side of the Devas, and his
fame spread in the three worlds. He was
the equal of Indra and Kubera. The people
of Kosala were happy, contented and
virtuous. The land was protected by a
mighty army, and no enemy could come
anywhere near
"""

In [21]:
%%time

summarizer(text)

Token indices sequence length is longer than the specified maximum sequence length for this model (799 > 512). Running this sequence through the model will result in indexing errors


CPU times: user 6.83 s, sys: 365 ms, total: 7.2 s
Wall time: 3.6 s


[{'summary_text': 'king Dasaratha ruled the kingdom from the capital city of Ayodhya, and his fame spread in the three worlds. The land contained forts with moats around them as well as many defensive intallations, and true to its name, the land was protected by a mighty army.'}]

In [22]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("summarization", model="Falconsai/text_summarization")

In [25]:
%%time

pipe(text)

CPU times: user 3.87 s, sys: 61.7 ms, total: 3.93 s
Wall time: 1.97 s


[{'summary_text': 'Kosala was fertile by the river Sarayu . Its capital was Ayodhya, built by Manu, the famous ruler of the Solar dynasty . In ancient India city civilisation had reached a high level .'}]