Experiment 1: https://medium.com/@lokaregns/text-summarization-with-hugging-face-transformers-a-beginners-guide-9e6c319bb5ed

In [None]:
!wget -nc https://www.dropbox.com/s/7hb8bwbtjmxovlc/bbc_text_cls.csv?dl=0
!pip install transformers

In [None]:
from transformers import pipeline
import textwrap
import numpy as np
import pandas as pd
from pprint import pprint

In [None]:
# Load the BBC CSV downloaded by the wget cell. The "?dl=0" suffix is part of the
# file name on disk (presumably kept from the URL's query string — verify if the
# download step ever changes).
df = pd.read_csv('bbc_text_cls.csv?dl=0')
df.head()

In [None]:
# Draw a single 'business' article; the fixed random_state keeps the pick repeatable.
doc = df.loc[df['labels'] == 'business', 'text'].sample(n=1, random_state=42)

In [None]:
# Helper for printing long articles in a readable column.
def wrap(x):
  """Return `x` re-flowed into lines of textwrap's default width.

  Whitespace characters already in `x` are left untouched
  (`replace_whitespace=False`) and sentence endings are normalised
  (`fix_sentence_endings=True`).
  """
  wrapper = textwrap.TextWrapper(replace_whitespace=False, fix_sentence_endings=True)
  return wrapper.fill(x)

# Show the sampled article, wrapped for readability.
print(wrap(doc.iloc[0]))

In [None]:
# Build a summarization pipeline. No model is named here, so the library falls
# back to whatever default it ships for this task.
summarizer = pipeline(task='summarization')

In [None]:
# Summarize everything after the article's first line break (the first line is skipped).
summarizer(doc.iloc[0].split('\n', maxsplit=1)[1])

In [None]:
# Repeat the experiment with a single 'entertainment' article (random_state fixed at 50),
# then show it wrapped for readability.
doc = df.loc[df['labels'] == 'entertainment', 'text'].sample(n=1, random_state=50)
print(wrap(doc.iloc[0]))

In [None]:
# Skip the article's first line and summarize the remainder.
summarizer(doc.iloc[0].split('\n', maxsplit=1)[1])

# Experiment 2

Based on the Hugging Face summarization task guide: https://huggingface.co/docs/transformers/tasks/summarization

In [None]:
!pip install transformers[torch] datasets evaluate rouge_score
from datasets import load_dataset

billsum = load_dataset("billsum", split="ca_test")

In [None]:
# Split into train/test (80/20). A fixed seed makes the partition — and therefore
# the metrics reported later — repeatable across runs.
# The hasattr guard keeps the cell re-runnable: once `billsum` has been replaced by
# the split result it no longer offers `train_test_split`, so running the cell a
# second time would otherwise fail.
if hasattr(billsum, "train_test_split"):
    billsum = billsum.train_test_split(test_size=0.2, seed=42)
billsum["train"][0]

In [None]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint; later cells reuse it when building the
# data collator and loading the model.
checkpoint = "t5-small"
# Tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Task prefix prepended to every input document before tokenization.
prefix = "summarize: "


def preprocess_function(examples):
    """Tokenize a batch of documents and their reference summaries.

    Args:
        examples: Mapping with a "text" sequence (documents) and a "summary"
            sequence (reference summaries).

    Returns:
        The tokenizer output for the prefixed documents (truncated to 1024
        tokens) with an added "labels" entry holding the input ids of the
        tokenized summaries (truncated to 128 tokens).
    """
    prompts = [prefix + document for document in examples["text"]]
    encoded = tokenizer(prompts, max_length=1024, truncation=True)

    # Summaries are tokenized as targets; only their ids are kept, as labels.
    targets = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)
    encoded["labels"] = targets["input_ids"]

    return encoded

In [None]:
# Run preprocess_function over the dataset, passing examples in batches (batched=True).
tokenized_billsum = billsum.map(preprocess_function, batched=True)

In [None]:
# Inspect the first tokenized training example.
print(tokenized_billsum['train'][0])

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator handed to the trainer to assemble batches from the tokenized examples.
# NOTE(review): `model` receives the checkpoint *name* (a str) here, not a loaded
# model object — confirm against the DataCollatorForSeq2Seq documentation whether
# that is intended.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [None]:
import evaluate

# Load the ROUGE metric; compute_metrics uses it to score generated summaries.
rouge = evaluate.load("rouge")

**ROUGE** (Recall-Oriented Understudy for Gisting Evaluation) is a set of metrics for evaluating automatic text summarization.

ROUGE metrics range between 0 and 1, with higher scores indicating higher similarity between the automatically produced summary and the reference.

Typical ROUGE metrics:
  * Overlap of n-grams (ROUGE-N, e.g. ROUGE-1 and ROUGE-2)
  * Longest Common Subsequence (ROUGE-L)
  * Weighted LCS (ROUGE-W)
  * Skip-bigrams (ROUGE-S)

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple, in which case its first item
            is used.

    Returns:
        dict mapping each metric name returned by ROUGE, plus ``"gen_len"``
        (mean number of non-padding tokens per prediction), to a value rounded
        to 4 decimal places.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a filler value, not a real token id, so it cannot be decoded.
    # Swap it for the pad token in BOTH arrays before decoding; doing it for the
    # predictions as well also keeps filler positions out of gen_len below.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Length of each prediction, counted as its number of non-padding tokens.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained sequence-to-sequence model for the chosen checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
import gc

import torch

# Release GPU memory left over from earlier cells before training starts.
# Resetting the whole CUDA device tears down the context PyTorch itself relies on
# (invalidating anything already placed on the GPU) and raises on machines
# without a GPU. Collecting unreachable objects and emptying PyTorch's caching
# allocator frees the memory without those side effects and needs no extra package.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:

# NOTE: The batch size can blow up the RAM the GPU uses.
# Use a smaller batch size to ensure the GPU is not overwhelmed.
import inspect


def _pick_kwarg(target, preferred, legacy):
    """Return `preferred` if `target` accepts it as a keyword, otherwise `legacy`."""
    return preferred if preferred in inspect.signature(target).parameters else legacy


# Transformers renamed two keyword arguments across releases
# (`evaluation_strategy` -> `eval_strategy`, `tokenizer` -> `processing_class`).
# Use whichever spelling the installed version accepts so the cell runs on both.
_eval_kwarg = _pick_kwarg(Seq2SeqTrainingArguments, "eval_strategy", "evaluation_strategy")
_tokenizer_kwarg = _pick_kwarg(Seq2SeqTrainer, "processing_class", "tokenizer")

training_args = Seq2SeqTrainingArguments(
    output_dir="my_awesome_billsum_model",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,  # evaluate on generated text so ROUGE can be computed
    logging_strategy='epoch',
    **{_eval_kwarg: "epoch"},  # run evaluation at the end of every epoch
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

trainer.train()

# Write the final model into output_dir itself. Periodic checkpoints (if any are
# produced during such a short run) land in `checkpoint-*` subfolders, so without
# this call nothing loadable exists at "./my_awesome_billsum_model".
trainer.save_model()

In [None]:
# Sample input for inference; it starts with the same "summarize: " prefix that
# preprocess_function prepends to the training documents.
text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."
text

In [None]:
from transformers import pipeline

# Build a summarization pipeline from the training output directory and run it on
# the sample text. This rebinds `summarizer`, replacing the default pipeline
# created earlier in the notebook.
# NOTE(review): this expects model files directly inside ./my_awesome_billsum_model —
# confirm the training cell writes the final model there (e.g. trainer.save_model()).
summarizer = pipeline("summarization", model="./my_awesome_billsum_model")
summarizer(text)