In [1]:
!pip install rouge --quiet
!pip install torch==2.0.1

Collecting torch==2.0.1
  Downloading torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl (619.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch==2.0.1)
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl (21.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.0/21.0 MB[0m [31m67.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu11==11.7.99 (from torch==2.0.1)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl (849 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m849.3/849.3 kB[0m [31m66.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu11==11.7.101 (from torch==2.0.1)
  Downloading nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl (11.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11.8

In [2]:
import pandas as pd
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset
from rouge import Rouge

In [3]:
class SummarizationDataset(Dataset):
    def __init__(self, data, tokenizer, max_input_length=512, max_output_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        article = self.data.iloc[index]["article"]
        abstract = self.data.iloc[index]["abstract"]
        input_ids = self.tokenizer.encode(article, max_length=self.max_input_length, truncation=True, padding="max_length")
        output_ids = self.tokenizer.encode(abstract, max_length=self.max_output_length, truncation=True, padding="max_length")
        return {"input_ids": input_ids, "attention_mask": [int(token_id != 0) for token_id in input_ids], "decoder_input_ids": output_ids[:-1], "decoder_attention_mask": [1] * (len(output_ids) - 1), "labels": output_ids[1:]}

In [4]:
!pip install ohmeow-blurr -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.1/81.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.5/887.5 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m50.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━

In [5]:
from datasets import load_dataset

# Load the dataset from Hugging Face
dataset = load_dataset('ccdv/pubmed-summarization')

# Dataframe for Train Dataset
train_data = pd.DataFrame(dataset['train'])
test_data = pd.DataFrame(dataset['test'])
val_data = pd.DataFrame(dataset['validation'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  # Means the user did not define a `HF_TOKEN` secret => warn
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/5.13k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/2.66k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/779M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/43.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/43.8M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [37]:
test_data = pd.DataFrame(dataset['test'])

In [6]:
# Fine-tuning on first 1000 records
train_data = train_data.head(1000).sample(frac=1, random_state=42)
val_data = val_data.head(125).sample(frac=1, random_state=42)

In [7]:
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")
model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-large")

tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/3.09k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

In [8]:
#data processing, Use SummarizationDataset to process the data set and convert the data into the format required by the model
train_dataset = SummarizationDataset(train_data, tokenizer)
val_dataset = SummarizationDataset(val_data, tokenizer)

In [9]:
#Ensure that all samples in each batch have the same shape so that they can be input into the model for training at once
def collate_fn(batch):
    input_ids = [item["input_ids"] for item in batch]
    attention_mask = [item["attention_mask"] for item in batch]
    decoder_input_ids = [item["decoder_input_ids"] for item in batch]
    decoder_attention_mask = [item["decoder_attention_mask"] for item in batch]
    labels = [item["labels"] for item in batch]
    max_input_length = max(len(ids) for ids in input_ids)
    max_output_length = max(len(ids) for ids in decoder_input_ids)
    input_ids = [ids + [0] * (max_input_length - len(ids)) for ids in input_ids]
    attention_mask = [mask + [0] * (max_input_length - len(mask)) for mask in attention_mask]
    decoder_input_ids = [ids + [0] * (max_output_length - len(ids)) for ids in decoder_input_ids]
    decoder_attention_mask = [mask + [0] * (max_output_length - len(mask)) for mask in decoder_attention_mask]
    labels = [ids + [-100] * (max_output_length - len(ids)) for ids in labels]
    return {"input_ids": torch.tensor(input_ids), "attention_mask": torch.tensor(attention_mask), "decoder_input_ids": torch.tensor(decoder_input_ids), "decoder_attention_mask": torch.tensor(decoder_attention_mask), "labels": torch.tensor(labels)}

In [10]:
#Convert the processed training set and validation set into an iterable data loader
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=2,collate_fn=collate_fn)

In [11]:
#Initialize the optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=500, num_training_steps=10000)



In [12]:
len(train_loader),len(val_loader)

(500, 63)

In [13]:
#training loop
from tqdm import tqdm

#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#model.to(device)
model.train()
for epoch in range(10):
    train_loss = 0
    for step,batch in tqdm(enumerate(train_loader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        loss = model(**batch).loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        if step%100==0 and step>0:
            print("Step-{},Train Loss-{}".format(step,loss.item()))
            break
        train_loss += loss.item()
    train_loss /= len(train_loader)
    val_loss = 0
    model.eval()
    with torch.no_grad():
        for step,batch in tqdm(enumerate(val_loader)):
            batch = {k: v.to(device) for k, v in batch.items()}
            loss = model(**batch).loss
            val_loss += loss.item()
            if step%100==0 and step>0:
                print("Step-{},Val Loss-{}".format(step,loss.item()))
                break
        val_loss /= len(val_loader)
    model.train()
    break
    print(f"Epoch {epoch+1} train loss: {train_loss:.4f} val loss: {val_loss:.4f}")

100it [00:27,  3.69it/s]


Step-100,Train Loss-6.61557674407959


63it [00:06,  9.77it/s]


In [14]:
model.save_pretrained("fine_tuned_pegasus")
tokenizer.save_pretrained("fine_tuned_pegasus")

Non-default generation parameters: {'max_length': 256, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}


('fine_tuned_pegasus/tokenizer_config.json',
 'fine_tuned_pegasus/special_tokens_map.json',
 'fine_tuned_pegasus/spiece.model',
 'fine_tuned_pegasus/added_tokens.json')

In [15]:
model = PegasusForConditionalGeneration.from_pretrained("fine_tuned_pegasus")
tokenizer = PegasusTokenizer.from_pretrained("fine_tuned_pegasus")

In [39]:
test_data = test_data.head(125).sample(frac=1, random_state=42)

In [41]:
test_dataset = SummarizationDataset(test_data, tokenizer)

In [43]:
test_loader = DataLoader(test_dataset, batch_size=1,collate_fn=collate_fn)

In [45]:
model.to(device)
model.eval()
predictions = []
with torch.no_grad():
    for step, batch in tqdm(enumerate(test_loader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        output_ids = model.generate(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], max_length=128, decoder_start_token_id=tokenizer.pad_token_id)
        batch_predictions = [tokenizer.decode(ids, skip_special_tokens=True) for ids in output_ids]
        predictions.extend(batch_predictions)
        if step==124:
            break

124it [06:28,  3.13s/it]


In [47]:
print(len(predictions))

125


In [46]:
test_data

Unnamed: 0,article,abstract
18,"in a recent issue of critical care , we presen...",bivalirudin has been proposed as the sole anti...
42,thyroid cancer is the most common endocrine ne...,"medullary thyroid carcinoma ( mtc ) , which or..."
36,ribonucleotide reductase ( rr ) inhibitors hav...,"ribonucleotide reductase ( rr ) , the rate lim..."
76,sepsis is the systemic inflammation caused by ...,backgroundpediatric sepsis has high morbidity ...
53,gastric cancer is the second most frequent neo...,the typical symptoms of advanced cancer of the...
...,...,...
106,while advances in free trade and globalization...,the mechanisms that facilitate success of an i...
14,this in vitro study was conducted in the chemi...,objectives : to investigate potential mechanis...
92,intramammary infection ( mastitis ) is the mos...,treatment of mastitis should be based on bacte...
51,older people have particular spiritual needs t...,background : spirituality is recognized as a p...


In [48]:
#test_data = test_data[:101]# for 100 predicitons only
print(len(test_data))
test_data["predictions"] = predictions
test_data.to_csv("test_predictions.csv", index=False)

125


In [52]:
! pip install -q evaluate rouge_score

!pip install bert-score

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: bert-score
Successfully installed bert-score-0.3.13


In [63]:
from bert_score import BERTScorer

# BERTScore calculation
scorer = BERTScorer(model_type='bert-base-uncased')
P, R, F1 = scorer.score(test_data['predictions'].tolist(), test_data['abstract'].tolist())
bert_precision = f"{P.mean():.4f}"
bert_recall = f"{R.mean():.4f}"
bert_f1 = f"{F1.mean():.4f}"

print(f"BERTScore Precision: {bert_precision}, Recall: {bert_recall}, F1: {bert_f1}")

BERTScore Precision: 0.6023, Recall: 0.5303, F1: 0.5621


In [None]:
import nltk
import sacrebleu

# BLEU score
    predicted_tokens = nltk.word_tokenize(predicted_summary.lower())
    ground_truth_tokens = nltk.word_tokenize(ground_truth_summary.lower())
    bleu_score = nltk.translate.bleu_score.sentence_bleu([ground_truth_tokens], predicted_tokens)

In [64]:
# Installing packages to be used to calculate metrics
!pip install nltk rouge-score bert-score sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/106.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-2.8.2 sacrebleu-2.4.2


In [65]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [74]:
import nltk
from rouge_score import rouge_scorer
import bert_score
import sacrebleu

# A function which calculates all the metrics
def evaluate_summary(predictions, abstract):
    # ROUGE scores
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(abstract, predictions)
    rouge_1 = scores['rouge1'].fmeasure
    rouge_2 = scores['rouge2'].fmeasure
    rouge_l = scores['rougeL'].fmeasure

    # BERTScore
    bert_precision, bert_recall, bert_f1 = bert_score.score([predictions], [abstract], lang='en', model_type='bert-base-uncased')

    # BLEU score
    predictions_tokens = nltk.word_tokenize(abstract.lower())
    abstract_tokens = nltk.word_tokenize(abstract.lower())
    bleu_score = nltk.translate.bleu_score.sentence_bleu([abstract_tokens], predictions_tokens)


    return {
        'rouge-1': rouge_1,
        'rouge-2': rouge_2,
        'rouge-l': rouge_l,
        'bertscore-precision': bert_precision.item(),
        'bertscore-recall': bert_recall.item(),
        'bertscore-f1': bert_f1.item(),
        'bleu': bleu_score

    }

In [75]:
import numpy as np

# A function which calls the evaluare summary function, for list of summaries and ground truth
def get_eval_metrics(candidate_sum, reference_sum):
  rouge1_scores = []
  rouge2_scores = []
  rougeL_scores = []
  bert_score_p = []
  bert_score_r = []
  bert_score_f1 = []
  bleu_scores = []
  sacrebleu_scores = []
  meteor_scores = []

  for i in range(len(candidate_sum)):
    metrics = evaluate_summary(candidate_sum[i], reference_sum[i])

    rouge1_scores.append(metrics['rouge-1'])
    rouge2_scores.append(metrics['rouge-2'])
    rougeL_scores.append(metrics['rouge-l'])

    bert_score_p.append(metrics['bertscore-precision'])
    bert_score_r.append(metrics['bertscore-recall'])
    bert_score_f1.append(metrics['bertscore-f1'])

    bleu_scores.append(metrics['bleu'])


  return {
      'rouge-1': np.mean(rouge1_scores),
      'rouge-2': np.mean(rouge2_scores),
      'rouge-l': np.mean(rougeL_scores),
      'bertscore-precision': np.mean(bert_score_p),
      'bertscore-recall': np.mean(bert_score_r),
      'bertscore-f1': np.mean(bert_score_f1),
      'bleu': np.mean(bleu_scores)

  }

In [76]:

predict_summary = test_data["predictions"]
abstract_summary = test_data["abstract"]
pegasus_scores = get_eval_metrics(predict_summary, abstract_summary)

In [77]:

# Results for Pegasus
pegasus_scores

{'rouge-1': 0.27315985694092926,
 'rouge-2': 0.09215933081169043,
 'rouge-l': 0.1675499159359455,
 'bertscore-precision': 0.6023351583480835,
 'bertscore-recall': 0.5303039712905884,
 'bertscore-f1': 0.5621451914310456,
 'bleu': 1.0}