<a href="https://colab.research.google.com/github/wooohun/BERT-Summarizer/blob/main/BART_Abstractive.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# install dependencies
!pip install transformers
!pip install torch
!pip install datasets
!pip install evaluate

In [None]:
import pandas as pd
import torch
import nltk
import evaluate
from datasets import load_dataset, load_metric
from evaluate import evaluator

In [None]:
# install kaggle
!pip install -q kaggle
!mkdir ~/.kaggle

# get kaggle api token from account -> API -> create new API Token
# move kaggle api token to kaggle folder
!cp -v kaggle.json ~/.kaggle

mkdir: cannot create directory ‘/root/.kaggle’: File exists
'kaggle.json' -> '/root/.kaggle/kaggle.json'


In [None]:
# download dataset
# !chmod 600 /root/.kaggle/kaggl
!kaggle datasets download -d gowrishankarp/newspaper-text-summarization-cnn-dailymail
!unzip newspaper-text-summarization-cnn-dailymail

newspaper-text-summarization-cnn-dailymail.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  newspaper-text-summarization-cnn-dailymail.zip
replace cnn_dailymail/test.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
# grab datasets
# cnn_dailymail is published in several configurations; name one explicitly so the
# run does not depend on (or fail for lack of) a library-chosen default.
dataset = load_dataset("cnn_dailymail", "3.0.0")
# formatted as:
# DatasetDict{
#   train: Dataset{
#     features: ['article', 'highlights', ...]
#     num_rows: int
#   }
#   test: {}
#   validation: {}
# }



  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
from transformers import BartTokenizerFast, BartForConditionalGeneration

# tokenizer (fast implementation) and model are loaded from the same checkpoint
checkpoint = 'facebook/bart-base'
tokenizer = BartTokenizerFast.from_pretrained(checkpoint)
model = BartForConditionalGeneration.from_pretrained(checkpoint)

In [None]:
def preprocessing(dataset, max_input_length=1024, max_target_length=128):
  """Tokenize a batch of articles and attach the tokenized summaries as labels.

  Intended to be passed to `dataset.map(..., batched=True)`.

  Args:
    dataset: mapping with parallel sequences under 'article' (source text)
      and 'highlights' (reference summary).
    max_input_length: articles are truncated to this many tokens.
    max_target_length: summaries are truncated to this many tokens.

  Returns:
    The tokenizer output for the articles, with an added 'labels' entry
    holding the token ids of the summaries.
  """
  tokenized_inputs = tokenizer(
      list(dataset['article']), max_length=max_input_length, truncation=True
  )

  # `text_target=` encodes the summaries as targets; it replaces the deprecated
  # `with tokenizer.as_target_tokenizer():` context manager.
  labels = tokenizer(
      text_target=dataset['highlights'], max_length=max_target_length, truncation=True
  )

  tokenized_inputs['labels'] = labels['input_ids']
  return tokenized_inputs

In [None]:
# tokenize every split; batched=True hands `preprocessing` many rows per call,
# which is where the fast tokenizer pays off
processed_dataset = dataset.map(function=preprocessing, batched=True)



Map:   0%|          | 0/11490 [00:00<?, ? examples/s]



In [None]:
# sanity check: turn the first training example's label ids back into text.
# `decode` handles one sequence (batch_decode would treat every id as its own
# sequence), and indexing the row first avoids loading the whole 'labels' column.
tokenizer.decode(processed_dataset['train'][0]['labels'], skip_special_tokens=True)

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

In [None]:
# create training args
batch_size = 4
num_train_epochs = 2
# Log roughly once per epoch. logging_steps counts optimizer steps, not examples,
# so divide the example count by the batch size; using the raw example count put
# the first log beyond the end of training.
# (assumes one device and no gradient accumulation — TODO confirm)
logging_steps = max(1, len(processed_dataset['train']) // batch_size)

args = Seq2SeqTrainingArguments(
    output_dir = "facebook-bart-base-finetuned-cnn-dailymail",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    logging_steps=logging_steps
)

In [None]:
# install metrics
!pip install bert_score
!pip install rouge_score

In [None]:
# metric objects used for scoring generated summaries
rouge = evaluate.load(path='rouge')
bert_score = evaluate.load(path='bertscore')

In [None]:
import numpy as np

def _split_sentences(text):
  """Return `text` stripped, with its sentences joined by newlines."""
  text = text.strip()
  try:
    sentences = nltk.sent_tokenize(text)
  except LookupError:
    # the sentence-splitting model is a separate nltk download; fetch it once and retry
    for resource in ("punkt", "punkt_tab"):
      nltk.download(resource, quiet=True)
    sentences = nltk.sent_tokenize(text)
  return "\n".join(sentences)


def _fmeasure(score):
  """Return the F-measure of a ROUGE entry as a float.

  Accepts both a plain number and an aggregate object exposing `.mid.fmeasure`.
  """
  mid = getattr(score, "mid", None)
  return mid.fmeasure if mid is not None else float(score)


# metric computation function to pass into trainer object
def metric_compute(predicted):
  """Compute ROUGE scores for a batch of generated summaries.

  Args:
    predicted: pair `(predictions, labels)` of token-id arrays.

  Returns:
    dict mapping each ROUGE key to its F-measure scaled to 0-100 and rounded
    to 4 decimals.
  """
  predictions, labels = predicted
  if isinstance(predictions, tuple):
    # keep only the first element when extra outputs come along with the ids
    predictions = predictions[0]

  # -100 marks padded positions (the loss ignores them) and is not a vocabulary
  # id, so swap it for the pad token before decoding.
  pad_id = tokenizer.pad_token_id
  predictions = np.where(np.asarray(predictions) != -100, predictions, pad_id)
  labels = np.where(np.asarray(labels) != -100, labels, pad_id)

  # batch_decode returns one string per sequence
  decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  # put each sentence on its own line before scoring
  decoded_preds = [_split_sentences(pred) for pred in decoded_preds]
  decoded_labels = [_split_sentences(label) for label in decoded_labels]

  # TODO: BERTScore is loaded as `bert_score` but not reported yet
  rouge_res = rouge.compute(
      predictions=decoded_preds, references=decoded_labels, use_stemmer=True
  )
  return {key: round(_fmeasure(value) * 100, 4) for key, value in rouge_res.items()}

In [None]:
# collator that assembles tokenized examples into batches for the trainer
data_collator = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)

In [None]:
# wire the model, the tokenized splits and the metric function into one trainer
# NOTE(review): evaluation uses the 'test' split; the dataset also has a
# 'validation' split — confirm which one is intended here.
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    args=args,
    train_dataset=processed_dataset['train'],
    eval_dataset=processed_dataset['test'],
    compute_metrics=metric_compute,
)

In [None]:
# keep the training output so its metrics can be handed to print_summary(...) later;
# the bare name on the last line still displays it
train_result = trainer.train()
train_result

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


In [None]:
!pip install pynvml nvidia-ml-py3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from pynvml import *


def print_gpu_utilization(device_index=0):
    """Print the memory currently in use on one GPU, in MB, as reported by NVML.

    Args:
        device_index: NVML index of the GPU to query (defaults to device 0).
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(device_index)
        info = nvmlDeviceGetMemoryInfo(handle)
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    finally:
        # pair every nvmlInit() with a shutdown so repeated calls do not pile up
        nvmlShutdown()


def print_summary(result):
    """Print runtime and throughput from `result.metrics`, then the GPU memory in use.

    Args:
        result: object whose `metrics` mapping holds 'train_runtime' and
            'train_samples_per_second'.
    """
    stats = result.metrics
    print(f"Time: {stats['train_runtime']:.2f}")
    print(f"Samples/second: {stats['train_samples_per_second']:.2f}")
    print_gpu_utilization()