## Prophetnet

This notebook is adapted from Hugging Face's summarisation example notebook, available at https://huggingface.co/transformers/notebooks.html.

## 1. Import Modules
# 2. Load Data
# 3. Preprocess the Data
# 4. Finetune the Model
# 5. Make Predictions

In [None]:
!pip install datasets transformers rouge-score nltk
import nltk
nltk.download('punkt')
%pip install optuna
from datasets import load_dataset, load_metric

%pip install pickle5
import pickle5 as pickle
import datasets
import random
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from transformers import AutoTokenizer
%pip install sentencepiece
import sentencepiece
from datasets import Dataset

%pip install wandb
import wandb
import nltk
import numpy as np



In [None]:
# Hugging Face Hub identifier of the pretrained checkpoint. The tokenizer and
# the seq2seq model are both loaded from this name later in the notebook.
model_checkpoint = "microsoft/xprophetnet-large-wiki100-cased-xglue-ntg"

## Loading the dataset

In [None]:
# Load the pickled train / validation / test splits from DATA_PATH.
# NOTE(review): `pickle.load` can execute arbitrary code while unpickling;
# only point DATA_PATH at files from a trusted source.
DATA_PATH = "/content"


def _load_pickle(path):
    """Return the object unpickled from `path`; the file is closed on exit."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


tac_train = _load_pickle(DATA_PATH+'/tac_train_dataset_nodups2.pickle')
tac_valid = _load_pickle(DATA_PATH+'/tac_valid_dataset_nodups2.pickle')
tac_test = _load_pickle(DATA_PATH+'/tac_test_dataset_nodups2.pickle')

# ROUGE metric object used by the evaluation function during training.
metric = load_metric("rouge")

In [None]:
# Display the loaded metric object; its repr lists the expected inputs and usage.
metric

Metric(name: "rouge", features: {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')}, usage: """
Calculates average rouge scores for a list of hypotheses and references
Args:
    predictions: list of predictions to score. Each predictions
        should be a string with tokens separated by spaces.
    references: list of reference for each prediction. Each
        reference should be a string with tokens separated by spaces.
    rouge_types: A list of rouge types to calculate.
        Valid names:
        `"rouge{n}"` (e.g. `"rouge1"`, `"rouge2"`) where: {n} is the n-gram based scoring,
        `"rougeL"`: Longest common subsequence based scoring.
        `"rougeLSum"`: rougeLsum splits text using `"
"`.
        See details in https://github.com/huggingface/datasets/issues/617
    use_stemmer: Bool indicating whether Porter stemmer should be used to strip word suffixes.
    use_agregator: Return aggregates if this is set to True
Retu

## Preprocessing the data

In [None]:
# Load the tokenizer published with the pretrained checkpoint named in
# `model_checkpoint`, so text is encoded the way the model expects.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/156 [00:00<?, ?B/s]

In [None]:
# Task prefix prepended to every input document before tokenization.
# Only the checkpoint named below receives "summarize: "; any other
# checkpoint is fed the raw document text (empty prefix).
prefix = (
    "summarize: "
    if model_checkpoint == "microsoft/xprophetnet-large-wiki100-cased-xglue-ntg"
    else ""
)

In [None]:
# Longest cell of the training frame, measured in characters after casting
# every value to str: per-column maxima first, then the maximum of those.
tac_train.astype('str').applymap(len).max().max()

546

In [None]:
# Limits handed to the tokenizer as `max_length` (with truncation enabled)
# for the source documents and for the reference summaries respectively.
max_input_length = 549
max_target_length = 200


def preprocess_function(examples):
    """Tokenize a batch of documents together with their reference summaries.

    Args:
        examples: batch mapping with an "Original Text" entry (iterable of
            document strings) and a "Summary" entry (the matching summaries).

    Returns:
        The tokenizer output for the prefixed documents, extended with a
        "labels" entry holding the `input_ids` of the tokenized summaries.
    """
    prefixed_docs = []
    for doc in examples["Original Text"]:
        prefixed_docs.append(prefix + doc)
    encoded = tokenizer(prefixed_docs, truncation=True, max_length=max_input_length)

    # Summaries are encoded inside the tokenizer's target-side context.
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(
            examples["Summary"], truncation=True, max_length=max_target_length
        )

    encoded["labels"] = target_encoding["input_ids"]
    return encoded

In [None]:
# Wrap each pandas split in a `datasets.Dataset`, then group the three
# splits under the "train" / "validation" / "test" keys.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (tac_train, tac_valid, tac_test)
)

raw_datasets = datasets.DatasetDict(
    {"train": train_dataset, "validation": val_dataset, "test": test_dataset}
)

In [None]:
# Show the three splits with their column names and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['Summary', 'Original Text', '__index_level_0__'],
        num_rows: 607
    })
    validation: Dataset({
        features: ['Summary', 'Original Text', '__index_level_0__'],
        num_rows: 208
    })
    test: Dataset({
        features: ['Summary', 'Original Text', '__index_level_0__'],
        num_rows: 53
    })
})

In [None]:
# Run `preprocess_function` over every split, one batch of rows at a time;
# the tokenized fields are added alongside the original text columns.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

## Fine-tuning the model

In [None]:
# Load the pretrained sequence-to-sequence model for `model_checkpoint`
# (the weights are downloaded on first use).

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

In [None]:
# Batch collator handed to the trainer; it is built from both the tokenizer
# and the model so batches are assembled consistently with them.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Evaluation callback passed to the trainer.

def compute_metrics(eval_pred):
    """Score generated summaries against the references with ROUGE.

    Args:
        eval_pred: pair `(predictions, labels)` of token-id arrays.

    Returns:
        dict mapping each ROUGE key to its mid F-measure scaled to 0-100,
        plus "gen_len" (mean count of non-pad tokens per prediction); every
        value is rounded to 4 decimal places.
    """
    generated_ids, label_ids = eval_pred
    pred_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    # -100 entries in the labels cannot be decoded, so they are swapped for
    # the pad token id before decoding.
    label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)
    ref_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    def _one_sentence_per_line(text):
        # ROUGE expects a newline after each sentence.
        return "\n".join(nltk.sent_tokenize(text.strip()))

    pred_texts = [_one_sentence_per_line(text) for text in pred_texts]
    ref_texts = [_one_sentence_per_line(text) for text in ref_texts]

    rouge = metric.compute(predictions=pred_texts, references=ref_texts, use_stemmer=True)

    # Keep only the mid F-measure of each aggregate, as a percentage.
    scores = {}
    for name, aggregate in rouge.items():
        scores[name] = aggregate.mid.fmeasure * 100

    # Mean generated length, counting every token that is not padding.
    lengths = [np.count_nonzero(ids != tokenizer.pad_token_id) for ids in generated_ids]
    scores["gen_len"] = np.mean(lengths)

    return {name: round(value, 4) for name, value in scores.items()}

In [None]:
# Free up GPU memory before training by asking PyTorch to empty its CUDA cache.
import torch
torch.cuda.empty_cache()

In [None]:
# Install wandb to log the training training and validation loss and save the best model

# 1. Start a W&B run
wandb.init(project='Prophetnet', entity='belin')
run_name = wandb.run.name
wandb.config
wandb.log({'loss': 0.2, 'epoch': 1})
%env WANDB_LOG_MODEL=true

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter: ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


env: WANDB_LOG_MODEL=true


In [None]:
# Per-device batch size, shared by training and evaluation.
batch_size = 1

# Fine-tuning configuration: 7 epochs with evaluation after each one,
# mixed-precision (fp16) training, generation-based evaluation so ROUGE can
# be computed, and metric reporting to Weights & Biases.
args = Seq2SeqTrainingArguments(
    output_dir="test-summarization",
    num_train_epochs=7,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=500,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_total_limit=3,
    predict_with_generate=True,
    fp16=True,
    report_to="wandb",
)

Then we just need to pass all of this along with our datasets to the `Seq2SeqTrainer`:

Once the trainer has been built, we can fine-tune the model by calling its `train` method (done in a later cell):

In [None]:
# Bundle the model, training configuration, tokenized train/validation
# splits, collator, tokenizer and metric callback into a single trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Using amp fp16 backend


In [None]:
# Write the trainer's current model into the W&B run directory under
# "model.Prophetnet"; the intent is for it to be uploaded with the run.
# NOTE(review): in notebook order this cell executes before `trainer.train()`,
# so the weights written here are the pretrained ones, not fine-tuned ones.
import os
trainer.save_model(output_dir=os.path.join(wandb.run.dir, "model.Prophetnet"))

Saving model checkpoint to /content/wandb/run-20210801_150000-ssmbn03t/files/model.Prophetnet
Configuration saved in /content/wandb/run-20210801_150000-ssmbn03t/files/model.Prophetnet/config.json
Model weights saved in /content/wandb/run-20210801_150000-ssmbn03t/files/model.Prophetnet/pytorch_model.bin
tokenizer config file saved in /content/wandb/run-20210801_150000-ssmbn03t/files/model.Prophetnet/tokenizer_config.json
Special tokens file saved in /content/wandb/run-20210801_150000-ssmbn03t/files/model.Prophetnet/special_tokens_map.json


In [None]:
# Empty PyTorch's CUDA cache again right before training starts.
torch.cuda.empty_cache()

## Explanation of the below results:

At the time the model was originally trained, Prophetnet's training/fine-tuning raised no warnings. This notebook was re-run on Sunday, 1 August 2021 to confirm that it still executes end to end; during that re-run the library emitted the "known issue with ProphetNet training/fine-tuning" warning that appears in the output below.

In [None]:
# Fine-tune the model; validation metrics are computed at the end of every
# epoch, as configured in the training arguments.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `XLMProphetNetForConditionalGeneration.forward` and have been ignored: __index_level_0__, Summary, Original Text.
***** Running training *****
  Num examples = 607
  Num Epochs = 7
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 4249
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
There is a known issue with ProphetNet training/fine-tuning that hasn't been fixed yet:https://github.com/huggingface/transformers/issues/9804. Please try to use an off-the-shelfcheckpoint from the model hub or fine-tune another architecture instead.
  args.max_grad_norm,


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,4.0521,3.473508,29.0576,16.3704,27.3606,27.3895,14.6875
2,3.1179,3.237804,26.8392,16.178,25.561,25.5505,13.1683
3,2.2164,3.045449,26.9736,16.285,25.637,25.5732,14.375
4,1.6312,3.195332,34.6237,20.0633,32.639,32.6599,13.5865
5,0.5981,3.27074,33.6695,20.7138,31.4913,31.4751,14.0048
6,0.3133,3.671238,35.045,21.4217,32.8603,32.9671,14.0
7,0.1395,3.850145,36.3839,21.8728,33.9002,33.9165,14.8125


There is a known issue with ProphetNet training/fine-tuning that hasn't been fixed yet:https://github.com/huggingface/transformers/issues/9804. Please try to use an off-the-shelfcheckpoint from the model hub or fine-tune another architecture instead.
There is a known issue with ProphetNet training/fine-tuning that hasn't been fixed yet:https://github.com/huggingface/transformers/issues/9804. Please try to use an off-the-shelfcheckpoint from the model hub or fine-tune another architecture instead.
There is a known issue with ProphetNet training/fine-tuning that hasn't been fixed yet:https://github.com/huggingface/transformers/issues/9804. Please try to use an off-the-shelfcheckpoint from the model hub or fine-tune another architecture instead.
There is a known issue with ProphetNet training/fine-tuning that hasn't been fixed yet:https://github.com/huggingface/transformers/issues/9804. Please try to use an off-the-shelfcheckpoint from the model hub or fine-tune another architecture inste

TrainOutput(global_step=4249, training_loss=1.5527961481372226, metrics={'train_runtime': 2558.3081, 'train_samples_per_second': 1.661, 'train_steps_per_second': 1.661, 'total_flos': 580152629145600.0, 'train_loss': 1.5527961481372226, 'epoch': 7.0})

In [None]:
# Saving the model: write the fine-tuned weights into the W&B run directory.
# The earlier `save_model` call ran before training, so without this step the
# copy stored under "model.Prophetnet" would still hold the pretrained weights.
import os
trainer.save_model(os.path.join(wandb.run.dir, "model.Prophetnet"))
wandb.run.save()



True

In [None]:
# Finish the W&B run (the output below shows the final upload and run summary).
wandb.finish()

VBox(children=(Label(value=' 2355.20MB of 2355.20MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, ma…

0,1
loss,0.2
epoch,1.0
_runtime,2588.0
_timestamp,1627832588.0
_step,16.0
train/loss,0.1395
train/learning_rate,0.0
train/epoch,7.0
train/global_step,4249.0
eval/loss,3.85014


0,1
loss,▁
epoch,▁
_runtime,▁▁▂▂▃▃▄▄▅▅▆▆▆▇▇██
_timestamp,▁▁▂▂▃▃▄▄▅▅▆▆▆▇▇██
_step,▁▁▂▂▃▃▄▄▅▅▅▆▆▇▇██
train/loss,█▆▅▄▃▂▁▁
train/learning_rate,█▇▆▅▄▃▂▁
train/epoch,▁▁▂▂▃▃▄▅▅▆▆▇▇███
train/global_step,▁▁▂▂▃▃▄▅▅▆▆▇▇███
eval/loss,▅▃▁▂▃▆█


In [None]:
# Inspect the tokenized test split: its columns and number of rows.
tokenized_datasets["test"]

Dataset({
    features: ['Original Text', 'Summary', '__index_level_0__', 'attention_mask', 'input_ids', 'labels'],
    num_rows: 53
})

In [None]:
# Run the fine-tuned model over the tokenized test split. The result is
# decoded back into text in the following cells.
predictions = trainer.predict(tokenized_datasets["test"])

The following columns in the test set  don't have a corresponding argument in `XLMProphetNetForConditionalGeneration.forward` and have been ignored: __index_level_0__, Summary, Original Text.
***** Running Prediction *****
  Num examples = 53
  Batch size = 1


In [None]:
# Helper that turns generated token ids back into text.
def translate_predictions(prediction):
    """Decode the first element obtained by iterating over `prediction`.

    Only the first element is decoded; any later elements are ignored.

    Args:
        prediction: iterable whose first element is a batch of token-id
            sequences accepted by `tokenizer.batch_decode`.

    Returns:
        The decoded strings (special tokens skipped), or None when
        `prediction` yields no elements.
    """
    nothing = object()
    first_batch = next(iter(prediction), nothing)
    if first_batch is nothing:
        return None
    return tokenizer.batch_decode(first_batch, skip_special_tokens=True)

In [None]:
# Decode predictions.
# `translate_predictions` decodes the first element of the prediction output
# (the generated token ids), giving one string per test example. The result
# goes into a one-column DataFrame. `predictions` itself is neither wrapped
# in a DataFrame nor rebound, which avoids building a ragged object array
# and keeps this cell safe to re-run.
decoded_predictions = pd.DataFrame(translate_predictions(predictions))

  values = np.array([convert(v) for v in values])


In [None]:
# Maximise column width: `None` removes the truncation limit so full texts
# are shown. (The older spelling `-1` is deprecated in pandas.)
pd.set_option('display.max_colwidth', None)

  


In [None]:
# Take a random sample of row positions from the test split.
# Positions are drawn from range(n_test), i.e. 0 .. n_test - 1, so every one
# is valid for `.iloc` and the first row can be selected too.
# `random.sample` draws without replacement, so no row appears twice.
import random
random.seed(12)  # fixed seed so the same rows are picked on every run
n_test = len(tokenized_datasets["test"])
randomlist = random.sample(range(n_test), min(20, n_test))
print(randomlist)

[31, 18, 43, 34, 43, 23, 10, 25, 1, 24, 31, 18, 42, 52, 30, 45, 39, 15, 36, 1]


In [None]:
# Select the decoded predictions at the sampled row positions. Passing the
# list straight to `.iloc` keeps row order (and any repeats) of `randomlist`
# and preserves column dtypes. `.copy()` makes the result independent of
# `decoded_predictions`.
samples = decoded_predictions.iloc[randomlist].copy()

In [None]:
# Select the matching rows of the original test frame (reference summary and
# source text) at the same sampled positions, in the same order.
originals = tac_test.iloc[randomlist].copy()

In [None]:
# Renumber the sampled predictions 0..n-1, discarding the old index.
samples = samples.reset_index(drop=True)

In [None]:
# Renumber the sampled originals 0..n-1, discarding the old index, so they
# line up row by row with the sampled predictions.
originals = originals.reset_index(drop=True)

In [None]:
# Place predictions and originals side by side (aligned on the row index)
# to form the comparison table.
comparisons = pd.concat((samples, originals), axis="columns")

In [None]:
# Label and look at dataframe.
# Assigning `.columns` renames the columns in place and works on every pandas
# version, whereas `set_axis(..., inplace=True)` is rejected by newer pandas.
comparisons.columns = ['Prophetnet Prediction', 'Summary', 'Original Text']
comparisons

Unnamed: 0,Prophetnet Prediction,Summary,Original Text
0,the court of law governing the terms is in the sate of russia,the court of law governing the terms is in location st petersburg russia,in these terms and other special documents the vk site administration hereinafter the site administration administration is understood as llc v kontakte a legal entity created under the laws of the russian federation and registered at prem 1 n bld 12 14 lit a khersonskaya st st petersburg russia 191024
1,this service reserves the right to remove any content you post,your content can be deleted if you violate the terms,mewe reserves the right to remove objectionable content without notice mewe can remove any content or information you post at mewe if we believe that it violates our terms of service
2,users agree not to use the service for illegal purposes,users agree not to submit libelous harassing or threatening content,not use the brainly services do anything unlawful misleading malicious or discriminatory
3,this service still retains your information even after you close their accounts,this service tracks you on other websites,we also use retargeting cookies to present you with patreon advertising on other websites
4,users agree not to use the service for illegal purposes,users agree not to submit libelous harassing or threatening content,not use the brainly services do anything unlawful misleading malicious or discriminatory
5,dnt headers are honored by the service to ensure quality,this service respects your browsers do not track dnt headers,one concrete way we commit to user privacy is by honoring do not track “dnt” browser settings there’s no consensus on how best to do this but we have adopted an approach that we believe honors the fundamental pro privacy aims of the dnt standard
6,you must provide your identifiable information,users are subject to googles privacy policy,by using or visiting the youtube website or any youtube products software data feeds and services provided to you on from or through the youtube website collectively the service you signify your agreement to 1 these terms and conditions the terms of service 2 googles privacy policy found at and incorporated herein by reference and 3 youtubes community guidelines found at and also incorporated herein by reference
7,discogs users shall not interfere with another persons enjoyment of the service,discogs does not condone any ideas contained in the items listed via the service,e we do not promote or condone any ideas or messages contained in the user generated content available through the service
8,any liability on behalf of the service is only limited to 1 000,any liability on behalf of the service is only limited to 1 000,and ii for any damages losses and or causes of action exceeding one thousand u s dollars us 1 000 in the aggregate
9,no logs are kept by the service to ensure quality,user logs are never stored at any component of infrastructure,we will never keep logs at any component of our infrastructure


In [None]:
# Write every decoded test prediction to a CSV file in the working directory.
decoded_predictions.to_csv("decoded_predictions_Prophetnet_nodups.csv")

In [None]:
# Download the predictions CSV to the local machine.
# NOTE: `google.colab` is only available when running inside Google Colab.
from google.colab import files
files.download("decoded_predictions_Prophetnet_nodups.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Write the prediction / reference / source comparison table to a CSV file.
comparisons.to_csv("comparisons_Prophetnet_nodups.csv")

In [None]:
# Download the comparisons CSV to the local machine (uses the Colab `files`
# helper imported in an earlier cell).
files.download("comparisons_Prophetnet_nodups.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>