# Abstractive Summarizaiton - PEGASUS

## 1. Install Packages

In [1]:
!pip install rouge --quiet
!pip install torch==2.0.1

Collecting torch==2.0.1
  Downloading torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl (619.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch==2.0.1)
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl (21.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.0/21.0 MB[0m [31m72.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu11==11.7.99 (from torch==2.0.1)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl (849 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m849.3/849.3 kB[0m [31m64.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu11==11.7.101 (from torch==2.0.1)
  Downloading nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl (11.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11.8

In [2]:
# Import modules
import pandas as pd
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset
from rouge import Rouge

In [3]:
!pip install ohmeow-blurr -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.1/81.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.5/887.5 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m54.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━

## 2. Load Data from Huggingface

In [4]:
from datasets import load_dataset

# Load the dataset from Hugging Face
dataset = load_dataset('ccdv/pubmed-summarization')

# Dataframe for Dataset
train_data = pd.DataFrame(dataset['train'])
test_data = pd.DataFrame(dataset['test'])
val_data = pd.DataFrame(dataset['validation'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  # Means the user did not define a `HF_TOKEN` secret => warn
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/5.13k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/2.66k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/779M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/43.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/43.8M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

## 3. Preprocessing

In [5]:
# Define dataset class
class SummarizationDataset(Dataset):
    def __init__(self, data, tokenizer, max_input_length=512, max_output_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        article = self.data.iloc[index]["article"]
        abstract = self.data.iloc[index]["abstract"]
        input_ids = self.tokenizer.encode(article, max_length=self.max_input_length, truncation=True, padding="max_length")
        output_ids = self.tokenizer.encode(abstract, max_length=self.max_output_length, truncation=True, padding="max_length")
        return {"input_ids": input_ids, "attention_mask": [int(token_id != 0) for token_id in input_ids], "decoder_input_ids": output_ids[:-1], "decoder_attention_mask": [1] * (len(output_ids) - 1), "labels": output_ids[1:]}

In [6]:
# Fine-tuning on first 1000 records
train_data = train_data.head(1000).sample(frac=1, random_state=42)
val_data = val_data.head(125).sample(frac=1, random_state=42)

In [7]:
# Load the pegasus tokenizer and model
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")
model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-large")

tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/3.09k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

In [8]:
# Data processing, use SummarizationDataset to process the data set and convert the data into the format required by the model
train_dataset = SummarizationDataset(train_data, tokenizer)
val_dataset = SummarizationDataset(val_data, tokenizer)

In [9]:
# Ensure that all samples in each batch have the same shape so that they can be input into the model for training at once
def collate_fn(batch):
    input_ids = [item["input_ids"] for item in batch]
    attention_mask = [item["attention_mask"] for item in batch]
    decoder_input_ids = [item["decoder_input_ids"] for item in batch]
    decoder_attention_mask = [item["decoder_attention_mask"] for item in batch]
    labels = [item["labels"] for item in batch]
    max_input_length = max(len(ids) for ids in input_ids)
    max_output_length = max(len(ids) for ids in decoder_input_ids)
    input_ids = [ids + [0] * (max_input_length - len(ids)) for ids in input_ids]
    attention_mask = [mask + [0] * (max_input_length - len(mask)) for mask in attention_mask]
    decoder_input_ids = [ids + [0] * (max_output_length - len(ids)) for ids in decoder_input_ids]
    decoder_attention_mask = [mask + [0] * (max_output_length - len(mask)) for mask in decoder_attention_mask]
    labels = [ids + [-100] * (max_output_length - len(ids)) for ids in labels]
    return {"input_ids": torch.tensor(input_ids), "attention_mask": torch.tensor(attention_mask), "decoder_input_ids": torch.tensor(decoder_input_ids), "decoder_attention_mask": torch.tensor(decoder_attention_mask), "labels": torch.tensor(labels)}

In [10]:
# Convert the processed training set and validation set into an iterable data loader
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=2,collate_fn=collate_fn)

## 4. Fine Tuning the Model

In [11]:
# Initialize the optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=500, num_training_steps=10000)



In [13]:
# Fine tuning the model
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()
for epoch in range(10):
    train_loss = 0
    for step,batch in tqdm(enumerate(train_loader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        loss = model(**batch).loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        if step%100==0 and step>0:
            print("Step-{},Train Loss-{}".format(step,loss.item()))
            break
        train_loss += loss.item()
    train_loss /= len(train_loader)
    val_loss = 0
    model.eval()
    with torch.no_grad():
        for step,batch in tqdm(enumerate(val_loader)):
            batch = {k: v.to(device) for k, v in batch.items()}
            loss = model(**batch).loss
            val_loss += loss.item()
            if step%100==0 and step>0:
                print("Step-{},Val Loss-{}".format(step,loss.item()))
                break
        val_loss /= len(val_loader)
    model.train()
    break
    print(f"Epoch {epoch+1} train loss: {train_loss:.4f} val loss: {val_loss:.4f}")

100it [00:26,  3.78it/s]


Step-100,Train Loss-6.0546746253967285


63it [00:06, 10.17it/s]


In [14]:
# Save the fine tuned model
model.save_pretrained("fine_tuned_pegasus")
tokenizer.save_pretrained("fine_tuned_pegasus")

Non-default generation parameters: {'max_length': 256, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}


('fine_tuned_pegasus/tokenizer_config.json',
 'fine_tuned_pegasus/special_tokens_map.json',
 'fine_tuned_pegasus/spiece.model',
 'fine_tuned_pegasus/added_tokens.json')

## 5. Use Fine Tuned Model to Summarize

In [15]:
# Load the fine tuned model and tokenizer
model = PegasusForConditionalGeneration.from_pretrained("fine_tuned_pegasus")
tokenizer = PegasusTokenizer.from_pretrained("fine_tuned_pegasus")

In [16]:
# Choose 125 data
test_data = test_data.head(125).sample(frac=1, random_state=42)

In [17]:
# Define test dataset
test_dataset = SummarizationDataset(test_data, tokenizer)

In [18]:
# Define test dataloader
test_loader = DataLoader(test_dataset, batch_size=1,collate_fn=collate_fn)

In [19]:
# Summarization
model.to(device)
model.eval()
predictions = []
with torch.no_grad():
    for step, batch in tqdm(enumerate(test_loader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        output_ids = model.generate(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], max_length=128, decoder_start_token_id=tokenizer.pad_token_id)
        batch_predictions = [tokenizer.decode(ids, skip_special_tokens=True) for ids in output_ids]
        predictions.extend(batch_predictions)
        if step==124:
            break

124it [06:04,  2.94s/it]


In [20]:
test_data["predictions"] = predictions
test_data.to_csv("test_predictions.csv", index=False)

In [22]:
test_data.head()

Unnamed: 0,article,abstract,predictions
18,"in a recent issue of critical care , we presen...",bivalirudin has been proposed as the sole anti...,"conversely, under some circumstances, the card..."
42,thyroid cancer is the most common endocrine ne...,"medullary thyroid carcinoma ( mtc ) , which or...",hereditary mtc occurs as a familial mtc ( fmtc...
36,ribonucleotide reductase ( rr ) inhibitors hav...,"ribonucleotide reductase ( rr ) , the rate lim...",there are currently two known smaller subunits...
76,sepsis is the systemic inflammation caused by ...,backgroundpediatric sepsis has high morbidity ...,acute kidney injury ( aki ) is a severe conseq...
53,gastric cancer is the second most frequent neo...,the typical symptoms of advanced cancer of the...,"a female patient, age 67, was diagnosed at the..."


## 6. Model Evaluation

In [23]:
# Installing packages to be used to calculate metrics
!pip install nltk rouge-score bert-score

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: bert-score
Successfully installed bert-score-0.3.13


In [24]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [41]:
from rouge_score import rouge_scorer
import bert_score

# A function which calculates all the metrics
def evaluate_summary(predictions, abstract):

    # ROUGE scores
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(abstract, predictions)
    rouge_1_precision = scores['rouge1'].precision
    rouge_1_recall = scores['rouge1'].recall
    rouge_1_f1 = scores['rouge1'].fmeasure

    rouge_2_precision = scores['rouge2'].precision
    rouge_2_recall = scores['rouge2'].recall
    rouge_2_f1 = scores['rouge2'].fmeasure

    rouge_l_precision = scores['rougeL'].precision
    rouge_l_recall = scores['rougeL'].recall
    rouge_l_f1 = scores['rougeL'].fmeasure

    # BERTScore
    bert_precision, bert_recall, bert_f1 = bert_score.score([predictions], [abstract], lang='en', model_type='bert-base-uncased')

    # BLEU score
    predictions_tokens = nltk.word_tokenize(predictions.lower())
    abstract_tokens = nltk.word_tokenize(abstract.lower())
    bleu_score = nltk.translate.bleu_score.sentence_bleu([abstract_tokens], predictions_tokens)

    return {
        'rouge-1_precision': rouge_1_precision,
        'rouge-1_recall': rouge_1_recall,
        'rouge-1_f1': rouge_1_f1,
        'rouge-2_precision': rouge_2_precision,
        'rouge-2_recall': rouge_2_recall,
        'rouge-2_f1': rouge_2_f1,
        'rouge-l_precision': rouge_l_precision,
        'rouge-l_recall': rouge_l_recall,
        'rouge-l_f1': rouge_l_f1,
        'bertscore-precision': bert_precision.item(),
        'bertscore-recall': bert_recall.item(),
        'bertscore-f1': bert_f1.item(),
        'bleu': bleu_score

    }

In [47]:
import numpy as np

# Function to call the evaluation
def get_eval_metrics(candidate_sum, reference_sum):
    rouge1_precision = []
    rouge1_recall = []
    rouge1_f1 = []
    rouge2_precision = []
    rouge2_recall = []
    rouge2_f1 = []
    rougeL_precision = []
    rougeL_recall = []
    rougeL_f1 = []
    bert_score_p = []
    bert_score_r = []
    bert_score_f1 = []
    bleu_scores = []

    for i in range(len(candidate_sum)):
       metrics = evaluate_summary(candidate_sum[i], reference_sum[i])

       rouge1_precision.append(metrics['rouge-1_precision'])
       rouge1_recall.append(metrics['rouge-1_recall'])
       rouge1_f1.append(metrics['rouge-1_f1'])

       rouge2_precision.append(metrics['rouge-2_precision'])
       rouge2_recall.append(metrics['rouge-2_recall'])
       rouge2_f1.append(metrics['rouge-2_f1'])

       rougeL_precision.append(metrics['rouge-l_precision'])
       rougeL_recall.append(metrics['rouge-l_recall'])
       rougeL_f1.append(metrics['rouge-l_f1'])

       bert_score_p.append(metrics['bertscore-precision'])
       bert_score_r.append(metrics['bertscore-recall'])
       bert_score_f1.append(metrics['bertscore-f1'])

       bleu_scores.append(metrics['bleu'])


    return {
        'rouge-1_precision': round(np.mean(rouge1_precision), 4),
        'rouge-1_recall': round(np.mean(rouge1_recall), 4),
        'rouge-1_f1': round(np.mean(rouge1_f1), 4),
        'rouge-2_precision': round(np.mean(rouge2_precision), 4),
        'rouge-2_recall': round(np.mean(rouge2_recall), 4),
        'rouge-2_f1': round(np.mean(rouge2_f1), 4),
        'rouge-l_precision': round(np.mean(rougeL_precision), 4),
        'rouge-l_recall': round(np.mean(rougeL_recall), 4),
        'rouge-l_f1': round(np.mean(rougeL_f1), 4),
        'bertscore-precision': round(np.mean(bert_score_p), 4),
        'bertscore-recall': round(np.mean(bert_score_r), 4),
        'bertscore-f1': round(np.mean(bert_score_f1), 4),
        'bleu': round(np.mean(bleu_scores), 4)

  }

In [48]:
predict_summary = test_data["predictions"]
abstract_summary = test_data["abstract"]
pegasus_scores = get_eval_metrics(predict_summary, abstract_summary)

In [49]:
# Score results
pegasus_scores

{'rouge-1_precision': 0.4704,
 'rouge-1_recall': 0.2061,
 'rouge-1_f1': 0.2712,
 'rouge-2_precision': 0.1588,
 'rouge-2_recall': 0.068,
 'rouge-2_f1': 0.0903,
 'rouge-l_precision': 0.291,
 'rouge-l_recall': 0.1254,
 'rouge-l_f1': 0.1655,
 'bertscore-precision': 0.602,
 'bertscore-recall': 0.5294,
 'bertscore-f1': 0.5615,
 'bleu': 0.0324}

In [50]:
from datasets import load_metric
rouge = load_metric("rouge")

rouge_scores = rouge.compute(predictions=test_data["predictions"], references=test_data["abstract"])
print(rouge_scores)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'rouge1': AggregateScore(low=Score(precision=0.41952141348383554, recall=0.1793973741771589, fmeasure=0.24053455625582684), mid=Score(precision=0.44602517924475316, recall=0.19552453073423248, fmeasure=0.25727704926031714), high=Score(precision=0.47259351232642083, recall=0.21247439294172682, fmeasure=0.2752199744435756)), 'rouge2': AggregateScore(low=Score(precision=0.125453088114196, recall=0.05289093542906538, fmeasure=0.07106649326320018), mid=Score(precision=0.15122248050349935, recall=0.06499058374434663, fmeasure=0.08630073858658734), high=Score(precision=0.18241868488140106, recall=0.07867542529370407, fmeasure=0.10435520144014995)), 'rougeL': AggregateScore(low=Score(precision=0.25501378983382006, recall=0.10964407754750509, fmeasure=0.14646408229904542), mid=Score(precision=0.28002958635458997, recall=0.12154415912653407, fmeasure=0.15975277517026146), high=Score(precision=0.3085972801672277, recall=0.13474192407258062, fmeasure=0.17647850098043058)), 'rougeLsum': AggregateS

In [51]:
from typing import Dict, Any

def simplify_rouge_scores(rouge_scores: Dict[str, Any]) -> str:
    simplified_text = ""
    for key, value in rouge_scores.items():
        # Extract low, mid, and high scores for each ROUGE metric
        low, mid, high = value.low, value.mid, value.high
        simplified_text += f"{key}: Precision ranges from {low.precision:.2%} to {high.precision:.2%}, "
        simplified_text += f"Recall ranges from {low.recall:.2%} to {high.recall:.2%}, "
        simplified_text += f"F1 Score ranges from {low.fmeasure:.2%} to {high.fmeasure:.2%}.\n"

    return simplified_text

In [52]:
# Rouge scores
print(simplify_rouge_scores(rouge_scores))

rouge1: Precision ranges from 41.95% to 47.26%, Recall ranges from 17.94% to 21.25%, F1 Score ranges from 24.05% to 27.52%.
rouge2: Precision ranges from 12.55% to 18.24%, Recall ranges from 5.29% to 7.87%, F1 Score ranges from 7.11% to 10.44%.
rougeL: Precision ranges from 25.50% to 30.86%, Recall ranges from 10.96% to 13.47%, F1 Score ranges from 14.65% to 17.65%.
rougeLsum: Precision ranges from 35.35% to 40.87%, Recall ranges from 15.10% to 18.01%, F1 Score ranges from 20.15% to 23.50%.

