In [14]:
import torch

### 2. Text Summarization Pipelines 

In [71]:
# `load_dataset` is imported here so this cell also runs on a fresh kernel;
# the only other import of it in this notebook sits in a later cell.
from datasets import load_dataset

# Load the CNN/DailyMail corpus, requesting version 3.0.0.
dataset = load_dataset("cnn_dailymail", version="3.0.0")

Using custom data configuration default
Reusing dataset cnn_dailymail (/root/.cache/huggingface/datasets/cnn_dailymail/default/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
def three_sentence_summary(text):
    """Return the first three sentences of ``text``, one sentence per line.

    Sentence boundaries come from ``sent_tokenize``. When ``text`` has fewer
    than three sentences, all of them are returned.
    """
    leading_sentences = sent_tokenize(text)[:3]
    return "\n".join(leading_sentences)

##### Example 1

In [6]:
string = "Persistence is all you need. IT IS ALL. YOU NEED U.N"

In [7]:
import nltk

In [8]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
string

'Persistence is all you need. IT IS ALL. YOU NEED U.N'

Split the `string` into separate sentences

In [10]:
from nltk.tokenize import sent_tokenize

In [11]:
output = sent_tokenize(string)

In [12]:
output

['Persistence is all you need.', 'IT IS ALL.', 'YOU NEED U.N']

##### Example 2

In [72]:
sample_text = dataset["train"][1]["article"][:2000]

In [73]:
sample_text

'Editor\'s note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events. Here, Soledad O\'Brien takes users inside a jail where many of the inmates are mentally ill. An inmate housed on the "forgotten floor," where many mentally ill inmates are housed in Miami before trial. MIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial detention facility is dubbed the "forgotten floor." Here, inmates with the most severe mental illnesses are incarcerated until they\'re ready to appear in court. Most often, they face drug charges or charges of assaulting an officer --charges that Judge Steven Leifman says are usually "avoidable felonies." He says the arrests often result from confrontations with police. Mentally ill people often won\'t do what they\'re told when police arrive on the scene -- confrontation seems to exacerbate their illness and they become more paranoid, delusional, and less likely to foll

### 3. Measuring the Quality of Generated Text

In [2]:
from datasets import load_metric

Given:
- `predictions`: a list of generated sentences, each given as a list of tokens
- `references`: for each prediction, a list of target sentences, each given as a list of tokens

In [30]:
predictions = [["Persistence", "is", "all", "all", "you", "need"]]

In [31]:
references = [[
    ["Persistence", "is", "all", "you", "need"]
]]

Calculate the BLEU metric

In [32]:
bleu = load_metric("bleu")

In [33]:
output = bleu.compute(
    predictions=predictions, references=references
)

In [34]:
output

{'bleu': 0.0,
 'precisions': [0.8333333333333334, 0.8, 0.5, 0.0],
 'brevity_penalty': 1.0,
 'length_ratio': 1.2,
 'translation_length': 6,
 'reference_length': 5}

### 5. Evaluating PEGASUS on the CNN/DailyMail Dataset

In [1]:
#!pip install rouge_score

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
from datasets import load_dataset, load_metric
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the ROUGE metric object; it is passed to the evaluation calls
# further down in this notebook.
rouge_metric = load_metric("rouge", cache_dir=None)
# Names of the ROUGE variants that appear as keys in the computed score dict.
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

In [3]:
dataset = load_dataset("cnn_dailymail", version="3.0.0")

Using custom data configuration default
Reusing dataset cnn_dailymail (/root/.cache/huggingface/datasets/cnn_dailymail/default/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)


  0%|          | 0/3 [00:00<?, ?it/s]

`dataset` is a `DatasetDict` loaded with the `datasets` library; it holds the `train`, `validation`, and `test` splits

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

Return all the `article` entries in the training set

In [5]:
output = dataset["train"]["article"]

In [6]:
len(output)

287113

In [7]:
output[0]

'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details o

In [8]:
def evaluate_summaries_baseline(
    dataset, metric, column_text="article",
    column_summary="highlights"
):
    """Score the three-sentence baseline summaries against the references.

    Args:
        dataset: Object indexable by column name; ``dataset[column_text]``
            yields the source texts and ``dataset[column_summary]`` the
            reference summaries.
        metric: Object exposing ``add_batch(predictions=..., references=...)``
            and ``compute()``.
        column_text: Name of the column holding the texts to summarize.
        column_summary: Name of the column holding the reference summaries.

    Returns:
        The value returned by ``metric.compute()``.
    """
    # Baseline prediction: the leading three sentences of every text.
    baseline_summaries = list(map(three_sentence_summary, dataset[column_text]))
    reference_summaries = dataset[column_summary]

    metric.add_batch(
        predictions=baseline_summaries,
        references=reference_summaries,
    )
    return metric.compute()

In [15]:
# Shuffle the test split with a fixed seed, then keep the first 1000 rows
# as the evaluation sample.
test_sampled = dataset["test"].shuffle(seed=42).select(range(1000))

Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/cnn_dailymail/default/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de/cache-888acb9a2eb72e89.arrow


In [10]:
score = evaluate_summaries_baseline(test_sampled, rouge_metric)

NameError: name 'three_sentence_summary' is not defined

In [22]:
score

{'rouge1': AggregateScore(low=Score(precision=0.32914746779915643, recall=0.4947584970371514, fmeasure=0.3824220570629594), mid=Score(precision=0.336293334514692, recall=0.5052703512675426, fmeasure=0.38927772456105647), high=Score(precision=0.34361849568803526, recall=0.5155112241687534, fmeasure=0.3965981837552523)),
 'rouge2': AggregateScore(low=Score(precision=0.14105667752414594, recall=0.2146389182840996, fmeasure=0.16444985320449765), mid=Score(precision=0.14796578756051865, recall=0.22405709792553796, fmeasure=0.17161039311197138), high=Score(precision=0.15478370148524653, recall=0.23359267628324024, fmeasure=0.178950525203355)),
 'rougeL': AggregateScore(low=Score(precision=0.2051085061699641, recall=0.31052518755982206, fmeasure=0.23864283092409694), mid=Score(precision=0.2111941369602029, recall=0.3195954236932692, fmeasure=0.24489813914890696), high=Score(precision=0.21793493437488534, recall=0.3285681780256508, fmeasure=0.25151512471187004)),
 'rougeLsum': AggregateScore(l

##### Example 2

In [11]:
from tqdm.auto import tqdm

In [16]:
# Use the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [17]:
def chunks(list_of_elements, batch_size):
    """Yield consecutive slices of ``list_of_elements`` of length ``batch_size``.

    The final slice is shorter when the length of ``list_of_elements`` is not
    a multiple of ``batch_size``. An empty input yields nothing.
    """
    batch_starts = range(0, len(list_of_elements), batch_size)
    yield from (
        list_of_elements[start:start + batch_size] for start in batch_starts
    )

In [18]:
def evaluate_summaries_pegasus(dataset, metric, model, tokenizer,
                               batch_size=16,
                               device=device,
                               column_text="article",
                               column_summary="highlights"):
    """Generate summaries batch by batch and score them with ``metric``.

    Args:
        dataset: Object indexable by column name; ``dataset[column_text]``
            yields the source texts and ``dataset[column_summary]`` the
            reference summaries.
        metric: Object exposing ``add_batch(predictions=..., references=...)``
            and ``compute()``.
        model: Model exposing ``generate``. It must already be on ``device``:
            only the tokenized inputs are moved by this function, so a model
            on a different device fails inside ``generate`` with a
            "tensors on the same device" RuntimeError.
        tokenizer: Callable that encodes a batch of texts to PyTorch tensors
            and exposes ``decode``.
        batch_size: Number of texts summarized per ``generate`` call.
        device: Device the tokenized inputs are moved to.
        column_text: Name of the column holding the texts to summarize.
        column_summary: Name of the column holding the reference summaries.

    Returns:
        The value returned by ``metric.compute()`` after every batch has been
        added.
    """
    article_batches = list(chunks(dataset[column_text], batch_size))
    target_batches = list(chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
            zip(article_batches, target_batches), total=len(article_batches)):

        # Every article is truncated/padded to exactly 1024 tokens.
        inputs = tokenizer(article_batch, max_length=1024, truncation=True,
                           padding="max_length", return_tensors="pt")

        summaries = model.generate(
            input_ids=inputs["input_ids"].to(device),
            attention_mask=inputs["attention_mask"].to(device),
            length_penalty=0.8, num_beams=8, max_length=128)

        decoded_summaries = [
            tokenizer.decode(s, skip_special_tokens=True,
                             clean_up_tokenization_spaces=True)
            for s in summaries]
        # Replace the literal "<n>" marker left in the decoded text
        # (presumably the model's newline token -- confirm) with a space.
        # Replacing the empty string instead would insert a space between
        # every character and corrupt the text handed to the metric.
        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]
        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    return metric.compute()

In [19]:
# def evaluate_summaries_pegasus(
#     dataset, metric, model, tokenizer,
#     batch_size,
#     #device=device,
#     column_text="article",
#     column_summary="highlights"
# ):
#     article_batches = list(chunks(dataset[column_text], batch_size))
#     target_batches = list(chunks(dataset[column_summary], batch_size))
    
#     for article_batch, target_batch in tqdm(
#         zip(article_batches, target_batches),
#         total=len(article_batches)
#     ):
#         inputs = tokenizer(
#             article_batch,
#             max_length=1024, truncation=True,
#             padding="max_length", return_tensors="pt"
#         )
        
#         summaries = model.generate(
#             input_ids=inputs["input_ids"],
#             attention_mask = inputs["attention_mask"],
#             length_penalty=0.8, num_beams=8,
#             max_length=128
#         )
        
#         decoded_summaries = [tokenizer.decode(
#             s, skip_special_tokens=True,
#             clean_up_tokenization_spaces=True
#         ) for s in summaries]
        
#         decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]
        
#         metric.add_batch(
#             predictions=decoded_summaries,
#             references=target_batch
#         )
        
#         score = metric.compute()
        
#         return score

In [21]:
model_ckpt = "google/pegasus-cnn_dailymail"

In [22]:
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

In [None]:
# Rebind `test_sampled` to a much smaller fixed-seed sample (16 rows) of the
# test split; this replaces any earlier 1000-row sample of the same name.
test_sampled = dataset["test"].shuffle(seed=42).select(range(16))

In [None]:
test_sampled

In [28]:
rouge_metric

Metric(name: "rouge", features: {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')}, usage: """
Calculates average rouge scores for a list of hypotheses and references
Args:
    predictions: list of predictions to score. Each prediction
        should be a string with tokens separated by spaces.
    references: list of reference for each prediction. Each
        reference should be a string with tokens separated by spaces.
    rouge_types: A list of rouge types to calculate.
        Valid names:
        `"rouge{n}"` (e.g. `"rouge1"`, `"rouge2"`) where: {n} is the n-gram based scoring,
        `"rougeL"`: Longest common subsequence based scoring.
        `"rougeLSum"`: rougeLsum splits text using `"
"`.
        See details in https://github.com/huggingface/datasets/issues/617
    use_stemmer: Bool indicating whether Porter stemmer should be used to strip word suffixes.
    use_aggregator: Return aggregates if this is set to True
Retu

In [26]:
score = evaluate_summaries_pegasus(test_sampled, rouge_metric, model, tokenizer, batch_size=8)

  0%|          | 0/2 [00:00<?, ?it/s]

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper__index_select)