In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# !pip install transformers datasets rouge-score accelerate



In [None]:
from transformers import (
  BartForConditionalGeneration, BartTokenizer, DataCollatorForSeq2Seq
)
import datasets
# Module-level ROUGE metric shared by calc_rouge_scores, compute_metrics and
# the evaluation loop.
# NOTE(review): datasets.load_metric is deprecated upstream in favour of the
# separate `evaluate` package — TODO plan a migration.
metric = datasets.load_metric("rouge")
import nltk
import numpy as np
import pprint

  metric = datasets.load_metric("rouge")


In [None]:
# Token budgets applied when truncating tokenized articles (inputs) and
# highlights (targets) in preprocess().
MAX_INPUT_LENGTH = 768
MAX_OUTPUT_LENGTH = 100
# Shared pretty-printer: 80 columns wide, compact layout.
pp = pprint.PrettyPrinter(width=80, compact=True)

In [None]:
# Helper that scores candidate summaries against references using the ROUGE metric loaded above

def calc_rouge_scores(candidates, references):
    """Score candidate summaries against references with the shared ROUGE metric.

    Args:
        candidates: Predicted summaries, forwarded to ``metric.compute`` as
            ``predictions``.
        references: Reference summaries, forwarded to ``metric.compute`` as
            ``references``.

    Returns:
        dict mapping each key returned by the metric to its mid F-measure,
        expressed as a percentage rounded to one decimal place.
    """
    raw_scores = metric.compute(
        predictions=candidates,
        references=references,
        use_stemmer=True,
    )

    percentages = {}
    for name, aggregate in raw_scores.items():
        # Scale the 0-1 F-measure to a percentage before rounding.
        percentages[name] = round(aggregate.mid.fmeasure * 100, 1)
    return percentages


In [None]:
# Load the "3.0.0" configuration of cnn_dailymail; later cells read its
# "train" and "validation" splits.
dataset = datasets.load_dataset("cnn_dailymail", "3.0.0")

In [None]:
# Peek at the first training example. Index the row first and the column
# second: dataset["train"]["article"][0] would materialise the entire
# "article" column just to read one element.
first_train_example = dataset["train"][0]
print(first_train_example["article"])
print(first_train_example["highlights"])

LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don't think I'll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details of how

In [None]:
# Tokenizer for the facebook/bart-base checkpoint; used for both the articles
# and the highlight targets.
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

In [None]:
# Pretrained facebook/bart-base seq2seq model; this is the model the training
# loop fine-tunes.
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

In [None]:
def preprocess(examples):
    """Tokenize a batch of examples for seq2seq fine-tuning.

    Args:
        examples: Batch mapping with "article" and "highlights" entries, each
            a list of strings (the form ``Dataset.map(..., batched=True)``
            passes in).

    Returns:
        dict with "attention_mask", "input_ids" and "labels"; each value holds
        one variable-length list of token ids per example.

    Padding is deliberately not applied here. Padding the targets at this
    stage writes the tokenizer's pad id into "labels", so those positions are
    never replaced by the ignore index and the loss is computed on padding.
    It also pads every example to the longest sequence in the whole map
    batch. The seq2seq data collator pads each training batch instead and
    fills label padding with its ignore index.
    """
    # Articles are truncated to the input budget, highlights to the output budget.
    model_inputs = tokenizer(
        examples["article"], max_length=MAX_INPUT_LENGTH, truncation=True
    )
    labels = tokenizer(
        examples["highlights"], max_length=MAX_OUTPUT_LENGTH, truncation=True
    )

    return {
        "attention_mask": model_inputs["attention_mask"],
        "input_ids": model_inputs["input_ids"],
        "labels": labels["input_ids"],
    }


In [None]:
# Ensuring the tokenizer works
# Row-first indexing reads only this one example instead of loading the
# whole "article" column to pick its first element.
example_train_input = dataset["train"][0]["article"]
tokenizer(example_train_input)

{'input_ids': [0, 574, 4524, 6, 1156, 36, 1251, 43, 480, 3268, 10997, 999, 3028, 7312, 20152, 3077, 899, 7, 10, 431, 984, 844, 153, 1358, 4006, 4, 134, 153, 43, 13016, 25, 37, 4072, 504, 15, 302, 6, 53, 37, 9838, 5, 418, 351, 75, 2471, 10, 8921, 15, 123, 4, 3028, 7312, 20152, 25, 3268, 10997, 11, 22, 29345, 10997, 8, 5, 9729, 9, 5, 5524, 113, 598, 5, 10208, 9, 20445, 6730, 1952, 198, 5, 232, 6, 5, 664, 2701, 161, 37, 34, 117, 708, 7, 856, 3961, 1334, 39, 1055, 409, 15, 1769, 1677, 6, 4076, 8, 6794, 1799, 4, 22, 100, 218, 75, 563, 7, 28, 65, 9, 167, 82, 54, 6, 25, 1010, 25, 51, 1004, 504, 6, 6017, 907, 1235, 10, 2232, 1612, 512, 2783, 50, 402, 1122, 60, 37, 174, 41, 2059, 33242, 656, 42, 353, 4, 22, 100, 218, 75, 206, 38, 581, 28, 1605, 31879, 4, 22, 133, 383, 38, 101, 2159, 32, 383, 14, 701, 59, 158, 2697, 480, 2799, 8, 32570, 8, 37206, 72, 497, 504, 6, 7312, 20152, 40, 28, 441, 7, 23104, 11, 10, 10297, 6, 907, 10, 4076, 11, 10, 8881, 50, 192, 5, 8444, 822, 22, 40534, 523, 35, 4657, 30

In [None]:
# Keep the experiment small: the first 10,000 training examples and the first
# 1,000 validation examples.
reduced_train_dataset = dataset["train"].select(range(10000))
reduced_val_dataset = dataset["validation"].select(range(1000))

# Tokenize in batches and drop the raw text/id columns so that only the
# fields returned by preprocess remain.
tokenized_reduced_train_dataset = reduced_train_dataset.map(
    preprocess,
    batched=True,
    remove_columns=["article", "highlights", "id"],
)
tokenized_reduced_val_dataset = reduced_val_dataset.map(
    preprocess,
    batched=True,
    remove_columns=["article", "highlights", "id"],
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
from torch.utils.data import Dataset, Subset

class HFDataset(Dataset):
    """Thin ``torch.utils.data.Dataset`` adapter around an indexable dataset.

    Indexing and ``len()`` are delegated to the wrapped object, which is kept
    on the ``dataset`` attribute.
    """

    def __init__(self, dataset):
        """Store the wrapped dataset without copying it."""
        self.dataset = dataset

    def __len__(self):
        """Return the number of items in the wrapped dataset."""
        wrapped = self.dataset
        return len(wrapped)

    def __getitem__(self, index):
        """Return the wrapped dataset's item at ``index``."""
        wrapped = self.dataset
        return wrapped[index]


In [None]:
from torch import cuda
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Seq2seq collator built from the tokenizer and the model; it is the
# collate_fn of both DataLoaders.
data_collator = DataCollatorForSeq2Seq(tokenizer, model = model)

In [None]:
def compute_metrics(eval_pred):
    """Compute ROUGE percentages and mean generated length for an eval pass.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. The
            predictions may also arrive as a tuple whose first element holds
            the token ids.

    Returns:
        dict with one entry per key returned by the ROUGE metric (mid
        F-measure as a percentage) plus "gen_len", the mean number of
        non-padding tokens per prediction. All values are rounded to 4
        decimal places.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a padding/ignore sentinel, not a vocabulary id, so it cannot be
    # decoded. Replace it in the predictions as well as in the labels.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep only the mid F-measure of each score, as a percentage.
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Mean generated length, counting non-padding tokens only.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
from torch.utils.data import DataLoader

# Make both tokenized splits return torch tensors (changes them in place).
tokenized_reduced_train_dataset.set_format("torch")
tokenized_reduced_val_dataset.set_format("torch")

BATCH_SIZE = 16

# Training batches are reshuffled on every pass over the data.
train_dataloader = DataLoader(
    tokenized_reduced_train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=data_collator,
)

# Validation batches keep the dataset order.
eval_dataloader = DataLoader(
    tokenized_reduced_val_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator,
)

In [None]:
# transformers' own AdamW was deprecated in favour of the PyTorch
# implementation and is no longer exported by recent releases, so import it
# from torch.optim.
from torch.optim import AdamW

# eps and weight_decay are pinned to the defaults of the old
# transformers.AdamW (1e-6 and 0.0) so the optimisation settings stay the
# same; torch.optim.AdamW would otherwise apply its own defaults.
optimizer = AdamW(model.parameters(), lr=1e-4, eps=1e-6, weight_decay=0.0)



In [None]:
import locale

# Force UTF-8 as the reported preferred encoding (presumably a workaround for
# a non-UTF-8 locale in the hosted runtime — confirm it is still needed).
# The replacement accepts and ignores any arguments because the real function
# is getpreferredencoding(do_setlocale=True); a zero-argument lambda raises
# TypeError for callers that pass do_setlocale.
locale.getpreferredencoding = lambda *args, **kwargs: "UTF-8"

In [None]:
from accelerate import Accelerator

# Hand the model, optimizer and both dataloaders to Accelerate; the prepared
# objects returned by prepare() replace the originals under the same names.
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [None]:
from transformers import get_scheduler

num_train_epochs = 5
# The training loop steps the optimizer once per batch, so the number of
# updates per epoch equals the length of the training dataloader.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_update_steps_per_epoch * num_train_epochs

# "linear" schedule over all training steps, with no warmup steps.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=0,
)

In [None]:
def postprocess_text(preds, labels):
    """Reformat decoded summaries so that each sentence sits on its own line.

    Args:
        preds: Decoded predicted summaries (list of strings).
        labels: Decoded reference summaries (list of strings).

    Returns:
        Tuple ``(preds, labels)`` of new lists in which every string has been
        stripped of surrounding whitespace and had its sentences joined by
        newlines, the layout ROUGE expects.
    """

    def _one_sentence_per_line(text):
        # Trim first, then split into sentences and rejoin with newlines.
        return "\n".join(nltk.sent_tokenize(text.strip()))

    formatted_preds = [_one_sentence_per_line(pred) for pred in preds]
    formatted_labels = [_one_sentence_per_line(label) for label in labels]
    return formatted_preds, formatted_labels

In [None]:
# Folder that receives the saved model and tokenizer after each epoch.
output_dir = "/content/drive/MyDrive/FinTech Society News Summaries Project"

In [None]:
# Download the "punkt" package; presumably the sentence-tokenizer data that
# the nltk.sent_tokenize calls in this notebook rely on.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Move the model to the selected device.
# NOTE(review): the model has already gone through accelerator.prepare(), so
# this explicit move looks redundant — confirm before removing.
model.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=

In [None]:
from tqdm.auto import tqdm
import torch
import numpy as np

progress_bar = tqdm(range(num_training_steps))
results = []  # one dict of rounded ROUGE percentages per epoch
for epoch in range(num_train_epochs):

    # Training
    model.train()
    for step, batch in enumerate(train_dataloader):
        batch = batch.to(device)
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)
        # Drop the references to the loss and the model outputs before the
        # optimizer step.
        del loss
        del outputs
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for step, batch in enumerate(eval_dataloader):
        batch = batch.to(device)
        with torch.no_grad():
            # Cap generation at the same length the reference highlights are
            # truncated to. Without an explicit max_length, generate() falls
            # back to the generation default, which can cut summaries far
            # shorter than the references and depress ROUGE.
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
                max_length=MAX_OUTPUT_LENGTH,
            )

            # Pad predictions and labels to a common width across processes
            # so they can be gathered.
            generated_tokens = accelerator.pad_across_processes(
                generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
            )
            labels = accelerator.pad_across_processes(
                batch["labels"], dim=1, pad_index=tokenizer.pad_token_id
            )

            generated_tokens = accelerator.gather(generated_tokens).cpu().numpy()
            labels = accelerator.gather(labels).cpu().numpy()

            # Replace -100 in the labels as we can't decode them
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

            decoded_preds = tokenizer.batch_decode(
                generated_tokens, skip_special_tokens=True
            )
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            decoded_preds, decoded_labels = postprocess_text(
                decoded_preds, decoded_labels
            )

            metric.add_batch(predictions=decoded_preds, references=decoded_labels)

    # Compute metrics over every batch added during this epoch
    result = metric.compute()
    # Keep the mid F-measure of each ROUGE score, as a percentage
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    result = {k: round(v, 4) for k, v in result.items()}
    print(f"Epoch {epoch}:", result)
    results.append(result)

    # Save the unwrapped model and the tokenizer to output_dir after every epoch
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    tokenizer.save_pretrained(output_dir)

  0%|          | 0/3125 [00:00<?, ?it/s]



Epoch 0: {'rouge1': 22.8282, 'rouge2': 9.2263, 'rougeL': 19.032, 'rougeLsum': 21.1967}
Epoch 1: {'rouge1': 22.6134, 'rouge2': 9.6175, 'rougeL': 19.1242, 'rougeLsum': 21.0171}
Epoch 2: {'rouge1': 23.31, 'rouge2': 9.6181, 'rougeL': 19.5538, 'rougeLsum': 21.6684}
Epoch 3: {'rouge1': 23.416, 'rouge2': 9.6271, 'rougeL': 19.4492, 'rougeLsum': 21.6326}
Epoch 4: {'rouge1': 23.2365, 'rouge2': 9.6987, 'rougeL': 19.4985, 'rougeLsum': 21.5544}


In [None]:
import os

# Reuse output_dir rather than repeating the folder path literal, so the
# checkpoint always lands next to the save_pretrained() artefacts.
# NOTE(review): torch.save(model, ...) pickles the whole module object, and
# loading it later needs the same class definitions importable — consider
# relying on the save_pretrained() output instead.
torch.save(model, os.path.join(output_dir, "finetuned-bart-base.pt"))

In [None]:
# Row-first indexing reads only this example instead of the full "article" column.
dataset["train"][50]["article"]

'AMMAN, Jordan (CNN) -- In the sunbathed schoolyard of the Shmisani Institute for Girls in Amman, Jordan, principal Sanaa Abu Harb makes an announcement over the speaker system. Iraqi students at the Shmisani school in Amman gather around a teacher. One in 5 students there is Iraqi. "All Iraqi girls come outside now. All Iraqi girls. Iraqi girls only!" she repeats several times, making sure the message is clear and waving away Jordanian pupils attracted by the commotion. Dozens of girls in green apron-like uniforms pour out into the courtyard and cluster on the top level of a stone staircase overlooking a concrete playground. Harb wants the CNN crew to see how many Iraqi refugee girls her school is accommodating. This school year, she says, 145 students are Iraqi -- roughly 20 percent of the students at this state-funded institution -- with another 40 Iraqi children on a waiting list.  Watch Iraqi girls describe a long way from home » . The reason behind the jump in the number of Iraqi

In [None]:
# Tokenize one training article with the same truncation limit used for
# fine-tuning. Without truncation, a long article can exceed the model's
# learned position embeddings and generation fails.
encoded_article = tokenizer(
    dataset["train"][50]["article"],  # row-first: avoids loading the whole column
    max_length=MAX_INPUT_LENGTH,
    truncation=True,
    return_tensors="pt",
)
input_ids = encoded_article.input_ids

# Sampling is enabled, so the summary differs from run to run.
generated_ids = model.generate(
    input_ids.to(device),
    attention_mask=encoded_article.attention_mask.to(device),
    do_sample=True,
    max_length=MAX_OUTPUT_LENGTH,
)
# A single article was passed in, so the first row is the only summary.
summary = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(summary)

Jordan opens doors to all Iraqi children regardless of refugee status.
For first time since start of Iraq war, Jordan will accommodate 40,000 to 50,000 Iraqi kids.
Education minister: "All Iraqi girls come outside now. All Iraqi girls"


In [None]:
# Row-first indexing reads only this example instead of the full "article" column.
dataset["train"][0]["article"]

'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details o

In [None]:
import pickle
from pathlib import Path
FOLDER_PATH = Path("/content/drive/MyDrive/FinTech Society News Summaries Project")
RESULTS_FILE = Path("bart_base_results.pkl")
RESULTS_PATH = FOLDER_PATH.joinpath(RESULTS_FILE)

# Persist the per-epoch ROUGE dictionaries collected in `results`.
with open(RESULTS_PATH, "wb") as f:
    pickle.dump(results, f)