# BART

This notebook is adapted from HuggingFace's summarisation notebook. https://huggingface.co/transformers/notebooks.html

# 1. Import Modules
# 2. Load Data
# 3. Preprocess the Data
# 4. Finetune the Model
# 5. Make Predictions

## 1. Import Modules

In [None]:
!pip install datasets transformers rouge-score nltk
%pip install optuna
!pip install ray==0.8.7
!pip install ray[tune]



In [None]:
import nltk
nltk.download('punkt')
from datasets import load_dataset, load_metric

%pip install pickle5
import pickle5 as pickle
import datasets
import random
import pandas as pd
from transformers import AutoTokenizer
from datasets import Dataset
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, TrainingArguments

import numpy as np

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Define Model to be loaded
# The same checkpoint name is reused further down to load both the tokenizer and the model.
model_checkpoint = "sshleifer/distilbart-cnn-6-6"

## 2. Loading the dataset

In [None]:
# Directory that holds the pickled train/validation/test splits
DATA_PATH = "/content"

# Load each split inside a `with` block so the file handle is closed right after reading.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
with open(DATA_PATH+'/tac_train_dataset_nodups2.pickle', 'rb') as train_file:
    tac_train = pickle.load(train_file)
with open(DATA_PATH+'/tac_valid_dataset_nodups2.pickle', 'rb') as valid_file:
    tac_valid = pickle.load(valid_file)
with open(DATA_PATH+'/tac_test_dataset_nodups2.pickle', 'rb') as test_file:
    tac_test = pickle.load(test_file)

# ROUGE metric object used by compute_metrics during evaluation
metric = load_metric("rouge")

In [None]:
metric

Metric(name: "rouge", features: {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')}, usage: """
Calculates average rouge scores for a list of hypotheses and references
Args:
    predictions: list of predictions to score. Each predictions
        should be a string with tokens separated by spaces.
    references: list of reference for each prediction. Each
        reference should be a string with tokens separated by spaces.
    rouge_types: A list of rouge types to calculate.
        Valid names:
        `"rouge{n}"` (e.g. `"rouge1"`, `"rouge2"`) where: {n} is the n-gram based scoring,
        `"rougeL"`: Longest common subsequence based scoring.
        `"rougeLSum"`: rougeLsum splits text using `"
"`.
        See details in https://github.com/huggingface/datasets/issues/617
    use_stemmer: Bool indicating whether Porter stemmer should be used to strip word suffixes.
    use_agregator: Return aggregates if this is set to True
Retu

## 3. Preprocessing the data

In [None]:
# Load the tokenizer published with the chosen checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

loading configuration file https://huggingface.co/sshleifer/distilbart-cnn-6-6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/98e51ece807bb08f235356791c26c1d775cc56c394304f0ddf1809c6bc45b391.a394a5757192281a4f3940a7ccf20051a750f630dd86fffbaa84d8cff7a0d496
Model config BartConfig {
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 2,
  "extra_pos_embeddings": 2,
  "force_bos_t

In [None]:
# Define task: text prepended to every input document in preprocess_function
# NOTE(review): the listed checkpoints are all BART variants - confirm that a "summarize: " prefix is really wanted for them.
prefix = (
    "summarize: "
    if model_checkpoint in ("sshleifer/distilbart-cnn-12-6", "facebook/bart-base", "sshleifer/distilbart-cnn-6-6")
    else ""
)

In [None]:
# Find max input length: the longest cell, counted in characters, across every column of the training frame
max(tac_train.astype('str').applymap(lambda x: len(x)).max())

546

In [None]:
# Find max target length: the longest "Summary" entry, counted in characters
max(pd.DataFrame(tac_train["Summary"].astype('str')).applymap(lambda x: len(x)).max())

176

In [None]:
# Define preprocessing function
# Length limits derived from the training frame. Both are character counts of the
# stringified cells: the first over all columns, the second over "Summary" only.
max_input_length = max(tac_train.astype('str').applymap(len).max())
max_target_length = max(tac_train[["Summary"]].astype('str').applymap(len).max())

def preprocess_function(examples):
    """Tokenize a batch of documents together with their reference summaries.

    Args:
        examples: batch indexed by "Original Text" (the documents) and
            "Summary" (the reference summaries).

    Returns:
        The tokenizer output for the prefixed documents, with the token ids of
        the summaries stored under the "labels" key.
    """
    inputs = [prefix + doc for doc in examples["Original Text"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        # Use the limit computed from the training summaries instead of a
        # hard-coded 20, so longer reference summaries are not cut short.
        # NOTE(review): max_target_length is computed in characters where it is
        # defined, so as a token budget it is a loose upper bound - confirm.
        labels = tokenizer(examples["Summary"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Wrap each pandas split in a Dataset, then group the three under their split names
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(split_frame) for split_frame in (tac_train, tac_valid, tac_test)
)

raw_datasets = datasets.DatasetDict(
    {
        "train": train_dataset,
        "validation": val_dataset,
        "test": test_dataset,
    }
)

In [None]:
# View the raw dataset
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['Summary', 'Original Text', '__index_level_0__'],
        num_rows: 607
    })
    validation: Dataset({
        features: ['Summary', 'Original Text', '__index_level_0__'],
        num_rows: 208
    })
    test: Dataset({
        features: ['Summary', 'Original Text', '__index_level_0__'],
        num_rows: 53
    })
})

In [None]:
# Tokenize every split of raw_datasets in batches with preprocess_function
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

## 4. Fine-tuning the model

In [None]:
# Load the pretrained sequence-to-sequence model for the chosen checkpoint
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

loading configuration file https://huggingface.co/sshleifer/distilbart-cnn-6-6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/98e51ece807bb08f235356791c26c1d775cc56c394304f0ddf1809c6bc45b391.a394a5757192281a4f3940a7ccf20051a750f630dd86fffbaa84d8cff7a0d496
Model config BartConfig {
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 2,
  "extra_pos_embeddings": 2,
  "force_bos_t

In [None]:
# Batch collator built from the tokenizer and the model; it is handed to the Seq2SeqTrainer below
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Define metric evaluation function

def compute_metrics(eval_pred):
    """Score generated summaries against the reference summaries with ROUGE.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token ids
            (assumed to be NumPy arrays as passed by the trainer - TODO confirm).

    Returns:
        dict with one entry per ROUGE score (mid F-measure scaled to 0-100)
        plus "gen_len", the mean number of non-padding tokens per prediction;
        every value is rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # If several outputs come back as a tuple, the token ids are taken from the first entry.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Replace -100 in predictions and labels as we can't decode them.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # No max_length argument here: decoding never truncates, the length of the
    # generated text is controlled by the generation settings instead.
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep only the mid F-measure of each score, as a percentage
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length (padding positions are not counted)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Install wandb to log the training training and validation loss and save the best model
%pip install wandb
import wandb

# 1. Start a W&B run
wandb.init(project='BERT', entity='belin')
run_name = wandb.run.name
wandb.config
wandb.log({'loss': 0.2, 'epoch': 1})
%env WANDB_LOG_MODEL=true



[34m[1mwandb[0m: Currently logged in as: [33mbelin[0m (use `wandb login --relogin` to force relogin)


env: WANDB_LOG_MODEL=true


In [None]:
# Define training arguments
batch_size = 1  # used for both the training and the evaluation batch size
args = Seq2SeqTrainingArguments(
    output_dir="test-summarization",
    num_train_epochs=5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=500,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    predict_with_generate=True,
    save_total_limit=3,
    fp16=True,
    report_to="wandb",
)

PyTorch: setting up devices


Then we just need to pass all of this along with our datasets to the `Seq2SeqTrainer`:

Once the trainer is defined, we can fine-tune our model by calling its `train` method:

In [None]:
# Define the trainer: model, training arguments, tokenized splits, collator, tokenizer and metric function
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Using amp fp16 backend


In [None]:
# Set generate hyperparameters on the model config, one attribute per entry
generation_settings = {
    "num_beams": 2,
    "max_length": 30,
    "min_length": 5,
    "length_penalty": 2.0,
    "early_stopping": True,
    "no_repeat_ngram_size": 3,
}
for setting_name, setting_value in generation_settings.items():
    setattr(model.config, setting_name, setting_value)

In [None]:
# The model is saved as "model.BART" inside wandb.run.dir & will be uploaded at the end of training
# NOTE(review): trainer.train() has not been called yet when this cell runs, so this
# snapshot holds the weights as loaded, before any fine-tuning.
import os
trainer.save_model(os.path.join(wandb.run.dir, "model.BART"))

Saving model checkpoint to /content/wandb/run-20210801_120708-3jwi3opd/files/model.BART
Configuration saved in /content/wandb/run-20210801_120708-3jwi3opd/files/model.BART/config.json
Model weights saved in /content/wandb/run-20210801_120708-3jwi3opd/files/model.BART/pytorch_model.bin
tokenizer config file saved in /content/wandb/run-20210801_120708-3jwi3opd/files/model.BART/tokenizer_config.json
Special tokens file saved in /content/wandb/run-20210801_120708-3jwi3opd/files/model.BART/special_tokens_map.json


In [None]:
# Train the model
train_output = trainer.train()

# Save again now that training is done, so the "model.BART" copy in wandb.run.dir
# holds the fine-tuned weights rather than the weights from before training.
import os
trainer.save_model(os.path.join(wandb.run.dir, "model.BART"))

# Keep the training summary as the cell's displayed value
train_output

The following columns in the training set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: __index_level_0__, Summary, Original Text.
***** Running training *****
  Num examples = 607
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 3035
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
  args.max_grad_norm,


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,3.2466,2.431912,33.6898,20.0745,31.2505,31.2561,18.625
2,2.0415,2.190782,37.0299,22.766,34.9516,35.0923,15.9952
3,1.2779,2.225284,38.9764,25.0834,37.2503,37.2079,15.8365
4,0.8301,2.607466,40.2964,26.1073,38.1973,38.2655,15.9615
5,0.2932,3.023801,40.5361,26.5354,38.8678,38.9014,16.0673


Saving model checkpoint to test-summarization/checkpoint-500
Configuration saved in test-summarization/checkpoint-500/config.json
Model weights saved in test-summarization/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test-summarization/checkpoint-500/tokenizer_config.json
Special tokens file saved in test-summarization/checkpoint-500/special_tokens_map.json
Deleting older checkpoint [test-summarization/checkpoint-1000] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: __index_level_0__, Summary, Original Text.
***** Running Evaluation *****
  Num examples = 208
  Batch size = 1
Saving model checkpoint to test-summarization/checkpoint-1000
Configuration saved in test-summarization/checkpoint-1000/config.json
Model weights saved in test-summarization/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in test-summarization/checkpoint-10

TrainOutput(global_step=3035, training_loss=1.3522083904637143, metrics={'train_runtime': 530.5157, 'train_samples_per_second': 5.721, 'train_steps_per_second': 5.721, 'total_flos': 207760868474880.0, 'train_loss': 1.3522083904637143, 'epoch': 5.0})

In [None]:
# Saving the model
# NOTE(review): save() is called without arguments and the cell output shows it
# returned True - confirm this call is what persists the model files to W&B.
wandb.run.save()



True

In [None]:
# Finish the W&B run
wandb.finish()

VBox(children=(Label(value=' 879.91MB of 879.91MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=…

0,1
loss,0.2
epoch,1.0
_runtime,540.0
_timestamp,1627820168.0
_step,12.0
train/loss,0.2932
train/learning_rate,0.0
train/epoch,5.0
train/global_step,3035.0
eval/loss,3.0238


0,1
loss,▁
epoch,▁
_runtime,▁▂▂▃▄▄▅▅▆▆▇██
_timestamp,▁▂▂▃▄▄▅▅▆▆▇██
_step,▁▂▂▃▃▄▅▅▆▆▇▇█
train/loss,█▅▃▂▁▁
train/learning_rate,█▇▅▄▂▁
train/epoch,▁▁▂▃▄▅▅▆▇███
train/global_step,▁▁▂▃▄▅▅▆▇███
eval/loss,▃▁▁▅█


## 5. Make Predictions

In [None]:
# See what's in the dataset
tokenized_datasets["test"]

Dataset({
    features: ['Original Text', 'Summary', '__index_level_0__', 'attention_mask', 'input_ids', 'labels'],
    num_rows: 53
})

In [None]:
# Make predictions
predictions = trainer.predict(tokenized_datasets["test"])

The following columns in the test set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: __index_level_0__, Summary, Original Text.
***** Running Prediction *****
  Num examples = 53
  Batch size = 1


In [None]:
# Create a function to decode predictions
def translate_predictions(prediction):
  """Decode the first element of `prediction` into text.

  Only the first item produced by iterating over `prediction` is decoded;
  any further items are ignored. Returns None when `prediction` is empty.
  """
  remaining = iter(prediction)
  try:
    first_item = next(remaining)
  except StopIteration:
    # Nothing to decode
    return None
  return tokenizer.batch_decode(first_item, skip_special_tokens=True)

In [None]:
# Decode predictions
# Only the generated token ids (the `predictions` field of the trainer's output) are
# turned into text; label ids and metrics are left alone. Decoding that field directly
# avoids building a ragged DataFrame from the whole output and keeps `predictions` intact.
decoded_predictions = pd.DataFrame(
    tokenizer.batch_decode(predictions.predictions, skip_special_tokens=True)
)

  values = np.array([convert(v) for v in values])


In [None]:
# Maximise column width: None removes the limit (negative values are deprecated)
pd.set_option('display.max_colwidth', None)

  


In [None]:
# Take a random sample of row positions from the test set
import random
random.seed(12)  # fixed seed so the same rows are picked on every run
num_test_rows = len(tokenized_datasets["test"])
# Valid positions are 0 .. num_test_rows - 1. Sampling without replacement keeps
# each row at most once, and the size is capped for test sets with fewer than 20 rows.
randomlist = random.sample(range(num_test_rows), k=min(20, num_test_rows))
print(randomlist)

[31, 18, 43, 34, 43, 23, 10, 25, 1, 24, 31, 18, 42, 52, 30, 45, 39, 15, 36, 1]


In [None]:
# Collect the decoded prediction for each sampled position, one row per sample
sampled_predictions = [decoded_predictions.iloc[position] for position in randomlist]
samples = pd.concat(sampled_predictions, axis="columns").transpose()

In [None]:
# Collect the matching test rows for the same sampled positions
sampled_originals = [tac_test.iloc[position] for position in randomlist]
originals = pd.concat(sampled_originals, axis="columns").transpose()

In [None]:
# Renumber the sample rows from 0, discarding the old index
samples = samples.reset_index(drop=True)

In [None]:
# Renumber the original rows from 0, discarding the old index
originals = originals.reset_index(drop=True)

In [None]:
# Place predictions and originals side by side to build the comparisons dataframe
comparisons = pd.concat((samples, originals), axis="columns")

In [None]:
# Label and look at dataframe
# Assigning .columns avoids set_axis(..., inplace=True), which is deprecated.
comparisons.columns = ['BART Prediction', 'Summary', 'Original Text']
comparisons

Unnamed: 0,BART Prediction,Summary,Original Text
0,the court of law governing the terms is in russia 191024,the court of law governing the terms is in location st petersburg russia,in these terms and other special documents the vk site administration hereinafter the site administration administration is understood as llc v kontakte a legal entity created under the laws of the russian federation and registered at prem 1 n bld 12 14 lit a khersonskaya st st petersburg russia 191024
1,mewe can remove any content without reason and may do it without prior notice,your content can be deleted if you violate the terms,mewe reserves the right to remove objectionable content without notice mewe can remove any content or information you post at mewe if we believe that it violates our terms of service
2,the service does not use third party targeted advertising or threatening content,users agree not to submit libelous harassing or threatening content,not use the brainly services do anything unlawful misleading malicious or discriminatory
3,this service employs third party cookies but with opt out instructions,this service tracks you on other websites,we also use retargeting cookies to present you with patreon advertising on other websites
4,the service does not use third party targeted advertising or threatening content,users agree not to submit libelous harassing or threatening content,not use the brainly services do anything unlawful misleading malicious or discriminatory
5,do not track dnt headers on this service,this service respects your browsers do not track dnt headers,one concrete way we commit to user privacy is by honoring do not track “dnt” browser settings there’s no consensus on how best to do this but we have adopted an approach that we believe honors the fundamental pro privacy aims of the dnt standard
6,you are solely responsible for your agreement to the service,users are subject to googles privacy policy,by using or visiting the youtube website or any youtube products software data feeds and services provided to you on from or through the youtube website collectively the service you signify your agreement to 1 these terms and conditions the terms of service 2 googles privacy policy found at and incorporated herein by reference and 3 youtubes community guidelines found at and also incorporated herein by reference
7,no promotion is accepted for the service,discogs does not condone any ideas contained in the items listed via the service,e we do not promote or condone any ideas or messages contained in the user generated content available through the service
8,any liability on behalf of the service is only limited to 1 000,any liability on behalf of the service is only limited to 1 000,and ii for any damages losses and or causes of action exceeding one thousand u s dollars us 1 000 in the aggregate
9,this service does not log in user generated content is not really necessary,user logs are never stored at any component of infrastructure,we will never keep logs at any component of our infrastructure


In [None]:
# Convert to csv
decoded_predictions.to_csv("decoded_predictions_BART_nodups.csv")

In [None]:
# Download predictions made
from google.colab import files
files.download("decoded_predictions_BART_nodups.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Comparisons dataframe to csv
comparisons.to_csv("comparisons_BART_nodups.csv")

In [None]:
# Download comparisons
files.download("comparisons_BART_nodups.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>