<a href="https://colab.research.google.com/github/wooohun/BERT-Summarizer/blob/main/BART_Abstractive.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# install dependencies
!pip install transformers
!pip install torch
!pip install datasets
!pip install evaluate

In [21]:
import pandas as pd
import torch
import nltk
import evaluate
from datasets import load_dataset, load_metric, DatasetDict
from evaluate import evaluator

In [3]:
# install kaggle
!pip install -q kaggle
!mkdir ~/.kaggle

# get kaggle api token from account -> API -> create new API Token
# move kaggle api token to kaggle folder
!cp -v kaggle.json ~/.kaggle

'kaggle.json' -> '/root/.kaggle/kaggle.json'


In [4]:
# download dataset
# !chmod 600 /root/.kaggle/kaggl
!kaggle datasets download -d gowrishankarp/newspaper-text-summarization-cnn-dailymail
!unzip newspaper-text-summarization-cnn-dailymail

Downloading newspaper-text-summarization-cnn-dailymail.zip to /content
100% 501M/503M [00:22<00:00, 26.4MB/s]
100% 503M/503M [00:22<00:00, 24.0MB/s]
Archive:  newspaper-text-summarization-cnn-dailymail.zip
  inflating: cnn_dailymail/test.csv  
  inflating: cnn_dailymail/train.csv  
  inflating: cnn_dailymail/validation.csv  


In [27]:
# grab datasets
# The dataset is published in several versions; name the configuration
# explicitly so the load does not depend on a default being available.
CNN_DM_CONFIG = "3.0.0"
train = load_dataset("cnn_dailymail", CNN_DM_CONFIG, split="train[:67%]")
test = load_dataset("cnn_dailymail", CNN_DM_CONFIG, split="test")
dataset = DatasetDict({'train': train, 'test': test})
# formatted as:
# DatasetDict{
#   train: Dataset{ - len = 287113 * x%
#     features:[]
#     num_rows: int
#   }
#   test: {} - len = 11490
#
# }





In [29]:
from transformers import BartTokenizerFast, BartForConditionalGeneration

# Tokenizer (fast implementation) and model are loaded from the same checkpoint.
checkpoint = 'facebook/bart-base'
tokenizer = BartTokenizerFast.from_pretrained(checkpoint)
model = BartForConditionalGeneration.from_pretrained(checkpoint)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [30]:
def preprocessing(dataset):
  """Tokenize a batch of records for sequence-to-sequence fine-tuning.

  Args:
    dataset: mapping with an 'article' list (model inputs) and a
      'highlights' list (reference summaries), as passed by a batched map.

  Returns:
    The tokenizer output for the articles (truncated to 1024 tokens) with an
    extra 'labels' entry holding the token ids of the highlights (truncated
    to 128 tokens).
  """
  tokenized_inputs = tokenizer(dataset['article'], max_length=1024, truncation=True)

  # text_target encodes the summaries as targets; it replaces the deprecated
  # tokenizer.as_target_tokenizer() context manager.
  labels = tokenizer(text_target=dataset['highlights'], max_length=128, truncation=True)

  tokenized_inputs['labels'] = labels['input_ids']
  return tokenized_inputs

In [None]:
# process all datasets in batches using fast tokenizer for efficiency
# Drop the raw columns: once tokenized, only the columns returned by
# preprocessing are needed, and string columns cannot be collated into tensors.
processed_dataset = dataset.map(
    preprocessing,
    batched=True,
    remove_columns=dataset['train'].column_names,
)

Map:   0%|          | 0/192366 [00:00<?, ? examples/s]



In [None]:
# Index the row first so only one example is read (column-first indexing loads
# every label), and decode the single sequence back into one string.
tokenizer.decode(processed_dataset['train'][0]['labels'], skip_special_tokens=True)

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

In [None]:
# create training args
batch_size = 1
gradient_accumulation_steps = 4
# Log about once per epoch. logging_steps is counted in optimizer updates, and
# each update consumes batch_size * gradient_accumulation_steps examples; using
# the raw example count would exceed the number of updates and never log.
logging_steps = max(
    1, len(processed_dataset['train']) // (batch_size * gradient_accumulation_steps)
)

args = Seq2SeqTrainingArguments(
    output_dir = "facebook-bart-base-finetuned-cnn-dailymail",
    learning_rate=5.6e-5,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    predict_with_generate=True,
    fp16=True,
    logging_steps=logging_steps
)

In [None]:
# install metrics
!pip install bert_score
!pip install rouge_score

In [None]:
# Fetch the two evaluation metrics by their identifiers.
rouge_id, bertscore_id = 'rouge', 'bertscore'
rouge = evaluate.load(rouge_id)
bert_score = evaluate.load(bertscore_id)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

In [None]:
import numpy as np


def _ensure_sentence_tokenizer():
  """Fetch NLTK's sentence-tokenizer data if sent_tokenize cannot find it."""
  try:
    nltk.sent_tokenize("probe.")
  except LookupError:
    # The resource name differs between NLTK releases, so request both.
    nltk.download("punkt", quiet=True)
    nltk.download("punkt_tab", quiet=True)


# metric computation function to pass into trainer object
def metric_compute(predicted):
  """Compute ROUGE scores (scaled to 0-100) for generated summaries.

  Args:
    predicted: pair (predictions, labels) of token-id arrays. predictions may
      also be a tuple whose first element holds the generated ids.

  Returns:
    Dict mapping each ROUGE key to its score times 100, rounded to 4 decimals.
  """
  predictions, labels = predicted
  if isinstance(predictions, tuple):
    predictions = predictions[0]

  # -100 marks ignored/padded positions and is not a valid token id, so swap
  # it for the pad token before decoding.
  pad_id = tokenizer.pad_token_id
  predictions = np.asarray(predictions)
  labels = np.asarray(labels)
  predictions = np.where(predictions != -100, predictions, pad_id)
  labels = np.where(labels != -100, labels, pad_id)

  # decode predictions, labels for metric computation
  decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  # batch_decode returns one string per sequence; put each sentence on its own
  # line before scoring.
  _ensure_sentence_tokenizer()
  decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
  decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

  # NOTE: only ROUGE is reported; the bert_score metric loaded elsewhere in the
  # notebook is not used here.
  rouge_res = rouge.compute(
      predictions = decoded_preds, references = decoded_labels, use_stemmer=True
  )

  result = {}
  for key, value in rouge_res.items():
    # Accept either a plain float score or an aggregate exposing .mid.fmeasure.
    score = value.mid.fmeasure if hasattr(value, "mid") else value
    result[key] = round(float(score) * 100, 4)
  return result

In [None]:
# Seq2seq collator built from the tokenizer and model defined above.
data_collator = DataCollatorForSeq2Seq(
    model=model,
    tokenizer=tokenizer,
)

In [None]:
# Gather the trainer's inputs in one place, then build it: the model, the
# training arguments, the train/test splits, and the batching/metric hooks.
trainer_kwargs = dict(
    model=model,
    args=args,
    train_dataset=processed_dataset['train'],
    eval_dataset=processed_dataset['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=metric_compute,
)
trainer = Seq2SeqTrainer(**trainer_kwargs)

In [None]:
# Keep the training result so its metrics can be passed to print_summary later;
# the bare name on the last line still displays it as the cell output.
train_result = trainer.train()
train_result



Step,Training Loss


In [None]:
!pip install pynvml nvidia-ml-py3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pynvml
  Downloading pynvml-11.5.0-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-ml-py3
  Downloading nvidia-ml-py3-7.352.0.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: nvidia-ml-py3
  Building wheel for nvidia-ml-py3 (setup.py) ... [?25l[?25hdone
  Created wheel for nvidia-ml-py3: filename=nvidia_ml_py3-7.352.0-py3-none-any.whl size=19188 sha256=f48fae0b5e09b113b6f5de71583f1528b558424cd6867302c528b5198e57369a
  Stored in directory: /root/.cache/pip/wheels/5c/d8/c0/46899f8be7a75a2ffd197a23c8797700ea858b9b34819fbf9e
Successfully built nvidia-ml-py3
Installing collected packages: nvidia-ml-py3, pynvml
Successfully installed nvidia-ml-py3-7.352.0 pynvml-11.5.0


In [None]:
from pynvml import *


def print_gpu_utilization(device_index=0):
    """Print the memory currently in use on one GPU, in whole MB.

    Args:
        device_index: NVML index of the device to query (defaults to 0).
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(device_index)
        info = nvmlDeviceGetMemoryInfo(handle)
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    finally:
        # Balance every nvmlInit with a shutdown, even if the query fails.
        nvmlShutdown()


def print_summary(result):
    """Report training runtime and throughput from result.metrics, then GPU memory.

    Args:
        result: object whose ``metrics`` mapping contains 'train_runtime' and
            'train_samples_per_second'.
    """
    metrics = result.metrics
    runtime = metrics['train_runtime']
    print(f"Time: {runtime:.2f}")
    throughput = metrics['train_samples_per_second']
    print(f"Samples/second: {throughput:.2f}")
    print_gpu_utilization()

In [None]:
!pip install accelerate

In [None]:
from accelerate import Accelerator
from torch.utils.data.dataloader import DataLoader

In [None]:
# Manual training loop with Accelerate, mirroring the Trainer settings in `args`.

# Iterate over the train split only (processed_dataset holds several splits) and
# keep just the columns the model consumes; raw text columns cannot be batched.
train_split = processed_dataset['train']
model_columns = ('input_ids', 'attention_mask', 'labels')
train_split = train_split.remove_columns(
    [name for name in train_split.column_names if name not in model_columns]
)
dataloader = DataLoader(
    train_split,
    batch_size=args.per_device_train_batch_size,
    collate_fn=data_collator,  # examples have different lengths, so collate per batch
)

if args.gradient_checkpointing:
  model.gradient_checkpointing_enable()

# Optimizer built from the same hyper-parameters as the training arguments.
optimizer = torch.optim.AdamW(
    model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay
)

accelerator = Accelerator(mixed_precision="fp16" if args.fp16 else "no")
model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)

model.train()
total_steps = len(dataloader)
for step, batch in enumerate(dataloader, start=1):
  loss = model(**batch).loss
  # Scale so the accumulated gradient matches one large batch.
  loss = loss / args.gradient_accumulation_steps
  accelerator.backward(loss)
  # Step on every accumulation boundary, and once more at the end so leftover
  # gradients from an incomplete group are not dropped.
  if step % args.gradient_accumulation_steps == 0 or step == total_steps:
    optimizer.step()
    optimizer.zero_grad()
