# Abstractive Summarizaiton - PEGASUS

## 1. Install Packages

In [1]:
!pip install rouge --quiet
!pip install torch==2.0.1

Collecting torch==2.0.1
  Downloading torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl (619.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch==2.0.1)
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl (21.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.0/21.0 MB[0m [31m70.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu11==11.7.99 (from torch==2.0.1)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl (849 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m849.3/849.3 kB[0m [31m68.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu11==11.7.101 (from torch==2.0.1)
  Downloading nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl (11.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11.8

In [2]:
# Import modules
import pandas as pd
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset
from rouge import Rouge

In [3]:
!pip install ohmeow-blurr -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.1/81.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.5/887.5 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m53.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━

## 2. Load Data from Huggingface

In [77]:
from datasets import load_dataset

# Load the dataset from Hugging Face
dataset = load_dataset('ccdv/pubmed-summarization')

# Dataframe for Dataset
train_data = dataset["train"].shuffle(seed=42).select(range(1000))
test_data = dataset["test"].shuffle(seed=42).select(range(125))
val_data = dataset["validation"].shuffle(seed=42).select(range(125))

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [35]:
test_data

Dataset({
    features: ['article', 'abstract'],
    num_rows: 125
})

## 3. Preprocessing

In [78]:
# Define dataset class
class SummarizationDataset(Dataset):
    def __init__(self, data, tokenizer, max_input_length=512, max_output_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        article = self.data[index]["article"]
        abstract = self.data[index]["abstract"]
        input_ids = self.tokenizer.encode(article, max_length=self.max_input_length, truncation=True, padding="max_length")
        output_ids = self.tokenizer.encode(abstract, max_length=self.max_output_length, truncation=True, padding="max_length")
        return {"input_ids": input_ids, "attention_mask": [int(token_id != 0) for token_id in input_ids], "decoder_input_ids": output_ids[:-1], "decoder_attention_mask": [1] * (len(output_ids) - 1), "labels": output_ids[1:]}

In [79]:
# Load the pegasus tokenizer and model
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")
model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-large")

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [80]:
# Data processing, use SummarizationDataset to process the data set and convert the data into the format required by the model
train_dataset = SummarizationDataset(train_data, tokenizer)
val_dataset = SummarizationDataset(val_data, tokenizer)

In [81]:
# Ensure that all samples in each batch have the same shape so that they can be input into the model for training at once
def collate_fn(batch):
    input_ids = [item["input_ids"] for item in batch]
    attention_mask = [item["attention_mask"] for item in batch]
    decoder_input_ids = [item["decoder_input_ids"] for item in batch]
    decoder_attention_mask = [item["decoder_attention_mask"] for item in batch]
    labels = [item["labels"] for item in batch]
    max_input_length = max(len(ids) for ids in input_ids)
    max_output_length = max(len(ids) for ids in decoder_input_ids)
    input_ids = [ids + [0] * (max_input_length - len(ids)) for ids in input_ids]
    attention_mask = [mask + [0] * (max_input_length - len(mask)) for mask in attention_mask]
    decoder_input_ids = [ids + [0] * (max_output_length - len(ids)) for ids in decoder_input_ids]
    decoder_attention_mask = [mask + [0] * (max_output_length - len(mask)) for mask in decoder_attention_mask]
    labels = [ids + [-100] * (max_output_length - len(ids)) for ids in labels]
    return {"input_ids": torch.tensor(input_ids), "attention_mask": torch.tensor(attention_mask), "decoder_input_ids": torch.tensor(decoder_input_ids), "decoder_attention_mask": torch.tensor(decoder_attention_mask), "labels": torch.tensor(labels)}

In [82]:
# Convert the processed training set and validation set into an iterable data loader
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=2,collate_fn=collate_fn)

## 4. Fine Tuning the Model

In [83]:
# Initialize the optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=500, num_training_steps=10000)



In [84]:
# Fine tuning the model
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()
for epoch in range(10):
    train_loss = 0
    for step,batch in tqdm(enumerate(train_loader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        loss = model(**batch).loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        if step%100==0 and step>0:
            print("Step-{},Train Loss-{}".format(step,loss.item()))
            break
        train_loss += loss.item()
    train_loss /= len(train_loader)
    val_loss = 0
    model.eval()
    with torch.no_grad():
        for step,batch in tqdm(enumerate(val_loader)):
            batch = {k: v.to(device) for k, v in batch.items()}
            loss = model(**batch).loss
            val_loss += loss.item()
            if step%100==0 and step>0:
                print("Step-{},Val Loss-{}".format(step,loss.item()))
                break
        val_loss /= len(val_loader)
    model.train()
    break
    print(f"Epoch {epoch+1} train loss: {train_loss:.4f} val loss: {val_loss:.4f}")

100it [00:46,  2.14it/s]


Step-100,Train Loss-8.063518524169922


63it [00:09,  6.80it/s]


In [85]:
# Save the fine tuned model
model.save_pretrained("fine_tuned_pegasus")
tokenizer.save_pretrained("fine_tuned_pegasus")

Non-default generation parameters: {'max_length': 256, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}


('fine_tuned_pegasus/tokenizer_config.json',
 'fine_tuned_pegasus/special_tokens_map.json',
 'fine_tuned_pegasus/spiece.model',
 'fine_tuned_pegasus/added_tokens.json')

## 5. Use Fine Tuned Model to Summarize

In [86]:
# Load the fine tuned model and tokenizer
model = PegasusForConditionalGeneration.from_pretrained("fine_tuned_pegasus")
tokenizer = PegasusTokenizer.from_pretrained("fine_tuned_pegasus")

In [87]:
# Define test dataset
test_dataset = SummarizationDataset(test_data, tokenizer)

In [88]:
# Define test dataloader
test_loader = DataLoader(test_dataset, batch_size=1,collate_fn=collate_fn)

In [89]:
# Summarization
model.to(device)
model.eval()
predictions = []
with torch.no_grad():
    for step, batch in tqdm(enumerate(test_loader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        output_ids = model.generate(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], max_length=128, decoder_start_token_id=tokenizer.pad_token_id)
        batch_predictions = [tokenizer.decode(ids, skip_special_tokens=True) for ids in output_ids]
        predictions.extend(batch_predictions)
        if step==124:
            break

124it [06:28,  3.13s/it]


In [90]:
test_data = test_data.to_pandas()

test_data["predictions"] = predictions

test_data.to_excel('results.xlsx', index = False)

In [91]:
test_data.head()

Unnamed: 0,article,abstract,predictions
0,the birth of a premature infant has long been ...,background : the purpose of this study was to ...,the premature birth of an infant and the follo...
1,"de ridder , vanneste , and focquaert address c...",our ( 2014 ) model for the regulation of cogni...,they raise a number of problems with the treat...
2,it is not easy to define a good health care ...,"aim : to date , the available information rega...",perceived quality is one of the principal dete...
3,the prosite database uses two kinds of signatu...,the prosite database consists of a large colle...,hence the selection procedure consists of runn...
4,although about two - thirds of patients with e...,"brivaracetam ( brv ) , a high - affinity synap...",although about two - thirds of patients with e...


## 6. Model Evaluation

In [92]:
# Installing packages to be used to calculate metrics
!pip install nltk rouge-score bert-score



In [93]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [94]:
from rouge_score import rouge_scorer
import bert_score

# A function which calculates all the metrics
def evaluate_summary(predictions, abstract):

    # ROUGE scores
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(abstract, predictions)
    rouge_1_precision = scores['rouge1'].precision
    rouge_1_recall = scores['rouge1'].recall
    rouge_1_f1 = scores['rouge1'].fmeasure

    rouge_2_precision = scores['rouge2'].precision
    rouge_2_recall = scores['rouge2'].recall
    rouge_2_f1 = scores['rouge2'].fmeasure

    rouge_l_precision = scores['rougeL'].precision
    rouge_l_recall = scores['rougeL'].recall
    rouge_l_f1 = scores['rougeL'].fmeasure

    # BERTScore
    bert_precision, bert_recall, bert_f1 = bert_score.score([predictions], [abstract], lang='en', model_type='bert-base-uncased')

    # BLEU score
    predictions_tokens = nltk.word_tokenize(predictions.lower())
    abstract_tokens = nltk.word_tokenize(abstract.lower())
    bleu_score = nltk.translate.bleu_score.sentence_bleu([abstract_tokens], predictions_tokens)

    return {
        'rouge-1_precision': rouge_1_precision,
        'rouge-1_recall': rouge_1_recall,
        'rouge-1_f1': rouge_1_f1,
        'rouge-2_precision': rouge_2_precision,
        'rouge-2_recall': rouge_2_recall,
        'rouge-2_f1': rouge_2_f1,
        'rouge-l_precision': rouge_l_precision,
        'rouge-l_recall': rouge_l_recall,
        'rouge-l_f1': rouge_l_f1,
        'bertscore-precision': bert_precision.item(),
        'bertscore-recall': bert_recall.item(),
        'bertscore-f1': bert_f1.item(),
        'bleu': bleu_score

    }

In [95]:
import numpy as np

# Function to call the evaluation
def get_eval_metrics(candidate_sum, reference_sum):
    rouge1_precision = []
    rouge1_recall = []
    rouge1_f1 = []
    rouge2_precision = []
    rouge2_recall = []
    rouge2_f1 = []
    rougeL_precision = []
    rougeL_recall = []
    rougeL_f1 = []
    bert_score_p = []
    bert_score_r = []
    bert_score_f1 = []
    bleu_scores = []

    for i in range(len(candidate_sum)):
       metrics = evaluate_summary(candidate_sum[i], reference_sum[i])

       rouge1_precision.append(metrics['rouge-1_precision'])
       rouge1_recall.append(metrics['rouge-1_recall'])
       rouge1_f1.append(metrics['rouge-1_f1'])

       rouge2_precision.append(metrics['rouge-2_precision'])
       rouge2_recall.append(metrics['rouge-2_recall'])
       rouge2_f1.append(metrics['rouge-2_f1'])

       rougeL_precision.append(metrics['rouge-l_precision'])
       rougeL_recall.append(metrics['rouge-l_recall'])
       rougeL_f1.append(metrics['rouge-l_f1'])

       bert_score_p.append(metrics['bertscore-precision'])
       bert_score_r.append(metrics['bertscore-recall'])
       bert_score_f1.append(metrics['bertscore-f1'])

       bleu_scores.append(metrics['bleu'])


    return {
        'rouge-1_precision': round(np.mean(rouge1_precision), 4),
        'rouge-1_recall': round(np.mean(rouge1_recall), 4),
        'rouge-1_f1': round(np.mean(rouge1_f1), 4),
        'rouge-2_precision': round(np.mean(rouge2_precision), 4),
        'rouge-2_recall': round(np.mean(rouge2_recall), 4),
        'rouge-2_f1': round(np.mean(rouge2_f1), 4),
        'rouge-l_precision': round(np.mean(rougeL_precision), 4),
        'rouge-l_recall': round(np.mean(rougeL_recall), 4),
        'rouge-l_f1': round(np.mean(rougeL_f1), 4),
        'bertscore-precision': round(np.mean(bert_score_p), 4),
        'bertscore-recall': round(np.mean(bert_score_r), 4),
        'bertscore-f1': round(np.mean(bert_score_f1), 4),
        'bleu': round(np.mean(bleu_scores), 4)

  }

In [96]:
predict_summary = test_data["predictions"]
abstract_summary = test_data["abstract"]
pegasus_scores = get_eval_metrics(predict_summary, abstract_summary)

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [97]:
# Score results
pegasus_scores

{'rouge-1_precision': 0.457,
 'rouge-1_recall': 0.2105,
 'rouge-1_f1': 0.2698,
 'rouge-2_precision': 0.1426,
 'rouge-2_recall': 0.0631,
 'rouge-2_f1': 0.0807,
 'rouge-l_precision': 0.2768,
 'rouge-l_recall': 0.125,
 'rouge-l_f1': 0.1603,
 'bertscore-precision': 0.5948,
 'bertscore-recall': 0.5221,
 'bertscore-f1': 0.554,
 'bleu': 0.0284}

In [98]:
from datasets import load_metric
rouge = load_metric("rouge")

rouge_scores = rouge.compute(predictions=test_data["predictions"], references=test_data["abstract"])
print(rouge_scores)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'rouge1': AggregateScore(low=Score(precision=0.4045533882605184, recall=0.18283150811056773, fmeasure=0.2389389486923416), mid=Score(precision=0.42970027012513445, recall=0.1984768832752028, fmeasure=0.2542120990521726), high=Score(precision=0.4581086107097112, recall=0.21621503579276377, fmeasure=0.270071146479943)), 'rouge2': AggregateScore(low=Score(precision=0.11154688421627947, recall=0.04859435288428194, fmeasure=0.06369366687595103), mid=Score(precision=0.13355291184225063, recall=0.059085685072790835, fmeasure=0.07560508151545095), high=Score(precision=0.16359496697063813, recall=0.07143546569676912, fmeasure=0.08992721911286349)), 'rougeL': AggregateScore(low=Score(precision=0.24435885898433699, recall=0.10804903846730468, fmeasure=0.1424887333460836), mid=Score(precision=0.26544110806649895, recall=0.11954878802821375, fmeasure=0.15345133862132787), high=Score(precision=0.29173328580384816, recall=0.13151396963524747, fmeasure=0.16447118030673138)), 'rougeLsum': AggregateSco

In [99]:
from typing import Dict, Any

def simplify_rouge_scores(rouge_scores: Dict[str, Any]) -> str:
    simplified_text = ""
    for key, value in rouge_scores.items():
        # Extract low, mid, and high scores for each ROUGE metric
        low, mid, high = value.low, value.mid, value.high
        simplified_text += f"{key}: Precision ranges from {low.precision:.2%} to {high.precision:.2%}, "
        simplified_text += f"Recall ranges from {low.recall:.2%} to {high.recall:.2%}, "
        simplified_text += f"F1 Score ranges from {low.fmeasure:.2%} to {high.fmeasure:.2%}.\n"

    return simplified_text

In [100]:
# Rouge scores
print(simplify_rouge_scores(rouge_scores))

rouge1: Precision ranges from 40.46% to 45.81%, Recall ranges from 18.28% to 21.62%, F1 Score ranges from 23.89% to 27.01%.
rouge2: Precision ranges from 11.15% to 16.36%, Recall ranges from 4.86% to 7.14%, F1 Score ranges from 6.37% to 8.99%.
rougeL: Precision ranges from 24.44% to 29.17%, Recall ranges from 10.80% to 13.15%, F1 Score ranges from 14.25% to 16.45%.
rougeLsum: Precision ranges from 33.75% to 38.84%, Recall ranges from 15.10% to 17.87%, F1 Score ranges from 19.83% to 22.55%.

