# Pegasus

This notebook is adapted from HuggingFace's summarisation notebook, available at https://huggingface.co/transformers/notebooks.html

# 1. Import Modules
# 2. Load Data
# 3. Preprocess the Data
# 4. Finetune the Model
# 5. Make Predictions

## 1. Import Modules

In [None]:
!pip install datasets transformers rouge-score nltk



In [None]:
%pip install optuna



In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
%pip install sentencepiece
import sentencepiece
import torch
from datasets import load_dataset, load_metric

%pip install pickle5
import pickle5 as pickle



In [None]:
import datasets
import random
import pandas as pd

%pip install wandb
import wandb

from transformers import AutoTokenizer
from datasets import Dataset

In [None]:
model_checkpoint = "sshleifer/distill-pegasus-xsum-16-4"

## Loading the dataset

In [None]:
# Load the pickled train / validation / test DataFrames from DATA_PATH
DATA_PATH = "/content"


def _load_pickle(filename):
    """Unpickle and return the object stored in ``DATA_PATH/filename``.

    The file is opened inside a ``with`` block so the handle is closed as
    soon as loading finishes, even if unpickling raises.
    """
    # NOTE: unpickling can execute arbitrary code embedded in the file;
    # only load pickles that come from a trusted source.
    with open(DATA_PATH + '/' + filename, 'rb') as handle:
        return pickle.load(handle)


tac_train = _load_pickle('tac_train_dataset_nodups2.pickle')
tac_valid = _load_pickle('tac_valid_dataset_nodups2.pickle')
tac_test = _load_pickle('tac_test_dataset_nodups2.pickle')

# ROUGE metric object, used by compute_metrics to score generated summaries
metric = load_metric("rouge")



In [None]:
metric

Metric(name: "rouge", features: {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')}, usage: """
Calculates average rouge scores for a list of hypotheses and references
Args:
    predictions: list of predictions to score. Each predictions
        should be a string with tokens separated by spaces.
    references: list of reference for each prediction. Each
        reference should be a string with tokens separated by spaces.
    rouge_types: A list of rouge types to calculate.
        Valid names:
        `"rouge{n}"` (e.g. `"rouge1"`, `"rouge2"`) where: {n} is the n-gram based scoring,
        `"rougeL"`: Longest common subsequence based scoring.
        `"rougeLSum"`: rougeLsum splits text using `"
"`.
        See details in https://github.com/huggingface/datasets/issues/617
    use_stemmer: Bool indicating whether Porter stemmer should be used to strip word suffixes.
    use_agregator: Return aggregates if this is set to True
Retu

## Preprocessing the data

In [None]:
# Import tokenizer for Pegasus
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

loading configuration file https://huggingface.co/sshleifer/distill-pegasus-xsum-16-4/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/f4d75b72b5b1b972f59383c67541acaa3d1603e2983ad2c728efdbb91ce2587c.65887539bec5167d9468c15939399e0b235beaa472788081d130fbc3439fe723
Model config PegasusConfig {
  "activation_dropout": 0.1,
  "activation_function": "relu",
  "add_bias_logits": false,
  "add_final_layer_norm": true,
  "architectures": [
    "PegasusForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 4,
  "decoder_start_token_id": 0,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 16,
  "eos_token_id": 1,
  "extra_pos_embeddings": 1,
  "force_bos_token_to_be_generated": false,
  "

In [None]:
# Define task: the listed checkpoint gets the "summarize: " input prefix,
# every other checkpoint is fed the raw text unchanged.
# NOTE(review): instruction-style prefixes are usually associated with T5-type
# checkpoints rather than Pegasus — confirm this prefix is intended here.
prefix = "summarize: " if model_checkpoint in ["sshleifer/distill-pegasus-xsum-16-4"] else ""

In [None]:
max(tac_train.astype('str').applymap(lambda x: len(x)).max())

546

In [None]:
# Truncation limits passed to the tokenizer as `max_length`
max_input_length = 549
max_target_length = 200


def preprocess_function(examples):
    """Tokenize a batch of source texts and their reference summaries.

    Args:
        examples: batch mapping with an "Original Text" column (source
            documents) and a "Summary" column (reference summaries).

    Returns:
        The tokenizer output for the prefixed source texts, with an extra
        "labels" entry holding the token ids of the tokenized summaries.
    """
    # Prepend the task prefix to every source document
    source_texts = []
    for doc in examples["Original Text"]:
        source_texts.append(prefix + doc)
    encoded = tokenizer(source_texts, max_length=max_input_length, truncation=True)

    # Summaries are tokenized in the tokenizer's target mode
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(
            examples["Summary"], max_length=max_target_length, truncation=True
        )

    encoded["labels"] = target_encoding["input_ids"]
    return encoded

In [None]:
# Define the raw dataset: wrap each pandas DataFrame split in a `Dataset`
# and group the three splits under the keys "train", "validation" and "test".
# The resulting features also include an `__index_level_0__` column
# (see the DatasetDict printed in the next cell).

train_dataset = Dataset.from_pandas(tac_train)
val_dataset = Dataset.from_pandas(tac_valid)
test_dataset = Dataset.from_pandas(tac_test)

raw_datasets = datasets.DatasetDict({"train" : train_dataset, "validation" : val_dataset, "test" : test_dataset})

In [None]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['Summary', 'Original Text', '__index_level_0__'],
        num_rows: 607
    })
    validation: Dataset({
        features: ['Summary', 'Original Text', '__index_level_0__'],
        num_rows: 208
    })
    test: Dataset({
        features: ['Summary', 'Original Text', '__index_level_0__'],
        num_rows: 53
    })
})

In [None]:
# Tokenize every split of raw_datasets in batches with preprocess_function
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

# NOTE(review): `searchdatasets` is not defined in any cell shown in this
# notebook; on a fresh kernel this line would raise NameError unless it is
# created elsewhere — confirm where it comes from or remove this line.
searchdatasets_tokenized = searchdatasets.map(preprocess_function, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

## Fine-tuning the model

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

loading configuration file https://huggingface.co/sshleifer/distill-pegasus-xsum-16-4/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/f4d75b72b5b1b972f59383c67541acaa3d1603e2983ad2c728efdbb91ce2587c.65887539bec5167d9468c15939399e0b235beaa472788081d130fbc3439fe723
Model config PegasusConfig {
  "activation_dropout": 0.1,
  "activation_function": "relu",
  "add_bias_logits": false,
  "add_final_layer_norm": true,
  "architectures": [
    "PegasusForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 4,
  "decoder_start_token_id": 0,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 16,
  "eos_token_id": 1,
  "extra_pos_embeddings": 1,
  "force_bos_token_to_be_generated": false,
  "

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
import nltk
import numpy as np

def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for an evaluation batch.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays.

    Returns:
        dict mapping each ROUGE key to its mid F-measure scaled by 100, plus
        "gen_len" (mean count of non-pad tokens per prediction); every value
        is rounded to 4 decimal places.
    """
    predictions, labels = eval_pred
    pad_id = tokenizer.pad_token_id

    def _one_sentence_per_line(texts):
        # Rouge expects a newline after each sentence
        return ["\n".join(nltk.sent_tokenize(text.strip())) for text in texts]

    decoded_preds = _one_sentence_per_line(
        tokenizer.batch_decode(predictions, skip_special_tokens=True)
    )
    # -100 cannot be decoded, so swap it for the pad token id first
    labels = np.where(labels != -100, labels, pad_id)
    decoded_labels = _one_sentence_per_line(
        tokenizer.batch_decode(labels, skip_special_tokens=True)
    )

    rouge = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    scores = {}
    for name, aggregate in rouge.items():
        scores[name] = aggregate.mid.fmeasure * 100

    # Mean generated length, counting only non-pad tokens
    generated_lengths = [np.count_nonzero(pred != pad_id) for pred in predictions]
    scores["gen_len"] = np.mean(generated_lengths)

    return {name: round(value, 4) for name, value in scores.items()}

In [None]:
# Clear space 
torch.cuda.empty_cache()

In [None]:
# Start a W&B run
wandb.init(project='Pegasus', entity='belin')
run_name = wandb.run.name
wandb.config
wandb.log({'loss': 0.2, 'epoch': 1})
%env WANDB_LOG_MODEL=true



VBox(children=(Label(value=' 1414.21MB of 1414.21MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, ma…

0,1
loss,0.2
epoch,1.0
_runtime,1177.0
_timestamp,1627821163.0
_step,16.0
train/loss,1.8414
train/learning_rate,0.0
train/epoch,7.0
train/global_step,4249.0
eval/loss,2.77977


0,1
loss,▁
epoch,▁
_runtime,▁▂▂▂▃▃▄▄▅▅▆▆▇▇▇██
_timestamp,▁▂▂▂▃▃▄▄▅▅▆▆▇▇▇██
_step,▁▁▂▂▃▃▄▄▅▅▅▆▆▇▇██
train/loss,█▄▃▂▂▁▁▁
train/learning_rate,█▇▆▅▄▃▂▁
train/epoch,▁▁▂▂▃▃▄▅▅▆▆▇▇███
train/global_step,▁▁▂▂▃▃▄▅▅▆▆▇▇███
eval/loss,█▃▂▁▁▁▁


env: WANDB_LOG_MODEL=true


In [None]:
# Training configuration for the Seq2SeqTrainer.
# The same batch size is used for both training and evaluation.
batch_size = 1
args = Seq2SeqTrainingArguments(
    # Output directory: checkpoints are written under "test-summarization"
    "test-summarization",
    # Run evaluation once per epoch
    evaluation_strategy = "epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    # Keep at most 3 checkpoints on disk; older ones are deleted
    save_total_limit=3,
    num_train_epochs=7,
    # Generate summaries during evaluation so compute_metrics can score text
    predict_with_generate=True,
    # Mixed-precision training
    fp16=True,
    # Send training/evaluation logs to Weights & Biases
    report_to="wandb",
    warmup_steps=500,                # number of warmup steps for learning rate scheduler

)

PyTorch: setting up devices


Then we just need to pass all of this along with our datasets to the `Seq2SeqTrainer`:

In [None]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Using amp fp16 backend


In [None]:
# Set generation hyperparameters on the model config.
# Presumably these act as the defaults for summaries generated during
# evaluation and prediction (predict_with_generate=True) — confirm.
model.config.num_beams = 2
model.config.max_length = 30
model.config.min_length = 5
model.config.length_penalty = 2.0
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3

We can now finetune our model by calling the `train` method (the cell immediately below first saves a copy of the current model into the W&B run directory):

In [None]:
# "model.Pegasus" is saved in wandb.run.dir & will be uploaded at the end of training
# NOTE(review): as the notebook is ordered, this cell comes before the
# `trainer.train()` cell, so the files written here hold the model as it is
# before fine-tuning — confirm whether this save was meant to run after training.
import os
trainer.save_model(os.path.join(wandb.run.dir, "model.Pegasus"))

Saving model checkpoint to /content/wandb/run-20210801_123639-7ew9lieh/files/model.Pegasus
Configuration saved in /content/wandb/run-20210801_123639-7ew9lieh/files/model.Pegasus/config.json
Model weights saved in /content/wandb/run-20210801_123639-7ew9lieh/files/model.Pegasus/pytorch_model.bin
tokenizer config file saved in /content/wandb/run-20210801_123639-7ew9lieh/files/model.Pegasus/tokenizer_config.json
Special tokens file saved in /content/wandb/run-20210801_123639-7ew9lieh/files/model.Pegasus/special_tokens_map.json


In [None]:
# Train the Model
trainer.train()

The following columns in the training set  don't have a corresponding argument in `PegasusForConditionalGeneration.forward` and have been ignored: Original Text, Summary, __index_level_0__.
***** Running training *****
  Num examples = 607
  Num Epochs = 7
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 4249
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,4.6901,3.137081,34.7849,16.2505,30.872,30.8907,13.6442
2,3.2614,2.763392,38.113,21.8608,35.2878,35.4489,11.5817
3,2.6358,2.673263,36.672,20.8849,34.1618,34.2593,11.0385
4,2.5046,2.658817,37.0495,21.2924,34.2131,34.3541,11.6346
5,2.074,2.65587,39.0252,23.5275,36.3944,36.6114,11.7692
6,1.8979,2.665773,38.5552,22.7022,36.1108,36.2387,11.8365
7,1.8273,2.684297,38.6279,22.3331,35.9829,36.1676,11.8942


  args.max_grad_norm,
Saving model checkpoint to test-summarization/checkpoint-500
Configuration saved in test-summarization/checkpoint-500/config.json
Model weights saved in test-summarization/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test-summarization/checkpoint-500/tokenizer_config.json
Special tokens file saved in test-summarization/checkpoint-500/special_tokens_map.json
Deleting older checkpoint [test-summarization/checkpoint-3000] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `PegasusForConditionalGeneration.forward` and have been ignored: Original Text, Summary, __index_level_0__.
***** Running Evaluation *****
  Num examples = 208
  Batch size = 1
  args.max_grad_norm,
Saving model checkpoint to test-summarization/checkpoint-1000
Configuration saved in test-summarization/checkpoint-1000/config.json
Model weights saved in test-summarization/checkpoint-1000/pytorch_model.bin
tokenizer config

TrainOutput(global_step=4249, training_loss=2.5940016748541015, metrics={'train_runtime': 1168.9976, 'train_samples_per_second': 3.635, 'train_steps_per_second': 3.635, 'total_flos': 377561867354112.0, 'train_loss': 2.5940016748541015, 'epoch': 7.0})

In [None]:
wandb.run.save()



True

In [None]:
# Save the model
trainer.save_model()

Saving model checkpoint to test-summarization
Configuration saved in test-summarization/config.json
Model weights saved in test-summarization/pytorch_model.bin
tokenizer config file saved in test-summarization/tokenizer_config.json
Special tokens file saved in test-summarization/special_tokens_map.json


## Make Predictions

In [None]:
tokenized_datasets["test"]

Dataset({
    features: ['Original Text', 'Summary', '__index_level_0__', 'attention_mask', 'input_ids', 'labels'],
    num_rows: 53
})

In [None]:
# Make predictions
predictions = trainer.predict(tokenized_datasets["test"])

The following columns in the test set  don't have a corresponding argument in `PegasusForConditionalGeneration.forward` and have been ignored: Original Text, Summary, __index_level_0__.
***** Running Prediction *****
  Num examples = 53
  Batch size = 1


In [None]:
# Create a function to decode predictions
def translate_predictions(prediction):
  """Decode the first element of ``prediction`` into text.

  Only the first item yielded by ``prediction`` is passed to
  ``tokenizer.batch_decode`` (special tokens skipped); any later items are
  ignored. Returns ``None`` when ``prediction`` yields nothing.
  """
  items = iter(prediction)
  try:
    first_item = next(items)
  except StopIteration:
    # Nothing to decode
    return None
  return tokenizer.batch_decode(first_item, skip_special_tokens=True)

In [None]:
# Decode predictions
# The DataFrame gets its own name instead of rebinding `predictions`, so the
# Trainer output stays available and this cell gives the same result when
# it is re-run.
predictions_df = pd.DataFrame(predictions)
decoded_predictions = predictions_df.apply(translate_predictions)

  values = np.array([convert(v) for v in values])


In [None]:
# Maximise column width: None means "no truncation".
# (The older sentinel -1 is deprecated and rejected by newer pandas releases.)
pd.set_option('display.max_colwidth', None)

  


In [None]:
# Take a random sample of the dataset
import random
random.seed(12)
# Draw 0-based row positions for use with `.iloc`.
# `random.randint(1, n)` includes both ends, so it could return `n` (out of
# range for `.iloc`) and could never return row 0. `random.sample` over
# `range(n)` stays in bounds and avoids picking the same row twice.
n_test_rows = len(tokenized_datasets["test"])
randomlist = random.sample(range(n_test_rows), min(20, n_test_rows))
print(randomlist)

[31, 18, 43, 34, 43, 23, 10, 25, 1, 24, 31, 18, 42, 52, 30, 45, 39, 15, 36, 1]


In [None]:
# Pick the decoded predictions at the sampled row positions: each selected
# row is pulled out with `.iloc`, the rows are concatenated as columns and
# the result is transposed so there is one row per sampled position.
samples = pd.concat([decoded_predictions.iloc[i] for i in randomlist], axis = 1).T

In [None]:
# Find the originals: the rows of tac_test at the same sampled positions,
# in the same order as `randomlist`.
originals = pd.concat([tac_test.iloc[i] for i in randomlist], axis = 1).T

In [None]:
# Reset sample index
samples.reset_index(drop=True, inplace=True)

In [None]:
# Reset originals index
originals.reset_index(drop=True, inplace=True)

In [None]:
# Combine the datasets to make a comparisons dataframe
comparisons = pd.concat([samples, originals], axis = 1)

In [None]:
# Label and look at dataframe
# Assign the column labels directly: `set_axis(..., inplace=True)` is no
# longer accepted by pandas 2.x, while plain assignment works on all versions.
comparisons.columns = ['Pegasus Prediction', 'Summary', 'Original Text']
comparisons

Unnamed: 0,Pegasus Prediction,Summary,Original Text
0,the court of law governing the terms is in location russia,the court of law governing the terms is in location st petersburg russia,in these terms and other special documents the vk site administration hereinafter the site administration administration is understood as llc v kontakte a legal entity created under the laws of the russian federation and registered at prem 1 n bld 12 14 lit a khersonskaya st st petersburg russia 191024
1,the service can remove objectionable content without notice,your content can be deleted if you violate the terms,mewe reserves the right to remove objectionable content without notice mewe can remove any content or information you post at mewe if we believe that it violates our terms of service
2,this service does not allow you to use your personal data without telling you,users agree not to submit libelous harassing or threatening content,not use the brainly services do anything unlawful misleading malicious or discriminatory
3,this service uses third party cookies to target advertising,this service tracks you on other websites,we also use retargeting cookies to present you with patreon advertising on other websites
4,this service does not allow you to use your personal data without telling you,users agree not to submit libelous harassing or threatening content,not use the brainly services do anything unlawful misleading malicious or discriminatory
5,this service respects users’ privacy practices,this service respects your browsers do not track dnt headers,one concrete way we commit to user privacy is by honoring do not track “dnt” browser settings there’s no consensus on how best to do this but we have adopted an approach that we believe honors the fundamental pro privacy aims of the dnt standard
6,the service provides you with the correct terms and conditions,users are subject to googles privacy policy,by using or visiting the youtube website or any youtube products software data feeds and services provided to you on from or through the youtube website collectively the service you signify your agreement to 1 these terms and conditions the terms of service 2 googles privacy policy found at and incorporated herein by reference and 3 youtubes community guidelines found at and also incorporated herein by reference
7,this service does not promote or condone content,discogs does not condone any ideas contained in the items listed via the service,e we do not promote or condone any ideas or messages contained in the user generated content available through the service
8,any liability on behalf of the service is only limited to 1 000,any liability on behalf of the service is only limited to 1 000,and ii for any damages losses and or causes of action exceeding one thousand u s dollars us 1 000 in the aggregate
9,this service will not keep logs on any portion of its infrastructure,user logs are never stored at any component of infrastructure,we will never keep logs at any component of our infrastructure


In [None]:
# Convert to csv
decoded_predictions.to_csv("decoded_predictions_Pegasus_nodups.csv")

In [None]:
# Download predictions made
from google.colab import files
files.download("decoded_predictions_Pegasus_nodups.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Comparisons dataframe to csv
comparisons.to_csv("comparisons_Peg_nodups.csv")

In [None]:
# Download comparisons
files.download("comparisons_Peg_nodups.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>