In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# !pip install transformers datasets rouge-score accelerate



In [None]:
from transformers import (
  AutoTokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq
)
import datasets
# Load the ROUGE metric once; the scoring code further down shares this module-level object.
# NOTE(review): `datasets.load_metric` is deprecated in newer `datasets` releases
# (the separate `evaluate` package replaces it) — confirm against the installed version.
metric = datasets.load_metric("rouge")
import nltk
import numpy as np
import pprint

  metric = datasets.load_metric("rouge")


In [None]:
# Token budgets used when tokenizing articles and summaries.
MAX_INPUT_LENGTH = 2048  # articles are truncated to at most this many tokens
MAX_OUTPUT_LENGTH = 100  # reference summaries are truncated to at most this many tokens
# Pretty-printer configured for 80-column, compact output.
pp = pprint.PrettyPrinter(width=80, compact=True)

In [None]:
# Helper that scores candidate summaries against references using the ROUGE metric

def calc_rouge_scores(candidates, references):
    """Score candidate summaries against reference summaries with ROUGE.

    Args:
        candidates: Predicted summary strings.
        references: Reference summary strings, aligned with ``candidates``.

    Returns:
        dict mapping each ROUGE variant name to its mid F-measure, expressed
        as a percentage rounded to one decimal place.
    """
    raw_scores = metric.compute(
        predictions=candidates,
        references=references,
        use_stemmer=True,
    )
    percentages = {}
    for rouge_name, aggregate in raw_scores.items():
        percentages[rouge_name] = round(aggregate.mid.fmeasure * 100, 1)
    return percentages


In [None]:
# Load configuration "3.0.0" of the CNN/DailyMail dataset; later cells read its
# "train" and "validation" splits and the "article" / "highlights" / "id" columns.
dataset = datasets.load_dataset("cnn_dailymail", "3.0.0")

In [None]:
# Show the first training article and its reference highlights.
# Index the row first, then the field: `dataset["train"]["article"][0]` reads the
# entire "article" column just to pick out element 0.
print(dataset["train"][0]["article"])
print(dataset["train"][0]["highlights"])

LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don't think I'll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details of how

In [None]:
# Tokenizer for the "t5-base" checkpoint, with its model_max_length set to MAX_INPUT_LENGTH.
tokenizer = AutoTokenizer.from_pretrained("t5-base", model_max_length = MAX_INPUT_LENGTH)

In [None]:
# Pretrained "t5-base" sequence-to-sequence model that the rest of the notebook fine-tunes.
model = T5ForConditionalGeneration.from_pretrained("t5-base")

In [None]:
def preprocess(examples):
    """Tokenize a batch of articles and their reference highlights.

    Intended for ``Dataset.map(..., batched=True)``.

    Sequences are truncated but deliberately NOT padded here. Padding is left to
    the ``DataCollatorForSeq2Seq`` used by the dataloaders, which pads every
    mini-batch to its own longest sequence and fills label padding with -100 so
    that padded positions are ignored by the loss. Padding the labels at this
    stage would bake ``pad_token_id`` into them, and the model would then be
    trained to predict padding tokens.

    Args:
        examples: Batch mapping with "article" and "highlights" lists of strings.

    Returns:
        dict with "attention_mask", "input_ids" and "labels"; each value is a
        list holding one variable-length token-id list per example.
    """
    model_inputs = tokenizer(
        examples["article"], max_length=MAX_INPUT_LENGTH, truncation=True
    )
    labels = tokenizer(
        examples["highlights"], max_length=MAX_OUTPUT_LENGTH, truncation=True
    )

    return {
        "attention_mask": model_inputs["attention_mask"],
        "input_ids": model_inputs["input_ids"],
        "labels": labels["input_ids"],
    }


In [None]:
# Ensuring the tokenizer works
# Fetch row 0 first and then its "article" field, so the whole article column is
# not read just to obtain a single example.
example_train_input = dataset["train"][0]["article"]
tokenizer(example_train_input)

{'input_ids': [301, 24796, 4170, 6, 2789, 41, 18844, 61, 1636, 8929, 16023, 2213, 4173, 6324, 12591, 15, 11391, 592, 12, 3, 9, 2196, 3996, 1755, 770, 8785, 591, 11039, 770, 61, 13462, 38, 3, 88, 5050, 507, 30, 2089, 6, 68, 3, 88, 10419, 7, 8, 540, 751, 31, 17, 4061, 3, 9, 10783, 30, 376, 5, 4173, 6324, 12591, 15, 38, 8929, 16023, 16, 96, 15537, 651, 16023, 11, 8, 5197, 13, 8, 12308, 121, 304, 8, 19142, 13, 29517, 6710, 343, 7, 300, 8, 296, 6, 8, 1021, 7556, 845, 3, 88, 65, 150, 1390, 12, 9030, 17, 449, 112, 1723, 550, 30, 1006, 2948, 6, 3281, 11, 17086, 2251, 5, 96, 196, 278, 31, 17, 515, 12, 36, 80, 13, 273, 151, 113, 6, 38, 1116, 38, 79, 919, 14985, 8247, 805, 1452, 3, 9, 3805, 2100, 443, 1232, 42, 424, 1126, 976, 3, 88, 1219, 46, 3746, 2772, 49, 2283, 48, 847, 5, 96, 196, 278, 31, 17, 317, 27, 31, 195, 36, 1989, 28887, 5, 96, 634, 378, 27, 114, 2611, 33, 378, 24, 583, 81, 335, 7051, 1636, 1335, 11, 3190, 7, 11, 5677, 7, 535, 486, 14985, 6324, 12591, 15, 56, 36, 3, 179, 12, 24068, 16

In [None]:
# Keep only the first 10,000 training and 1,000 validation examples.
reduced_train_dataset = dataset["train"].select(range(10000))
reduced_val_dataset = dataset["validation"].select(range(1000))

# Tokenize in batches and drop the raw columns so only the model inputs remain.
tokenized_reduced_train_dataset = reduced_train_dataset.map(
    preprocess,
    batched=True,
    remove_columns=["article", "highlights", "id"],
)
tokenized_reduced_val_dataset = reduced_val_dataset.map(
    preprocess,
    batched=True,
    remove_columns=["article", "highlights", "id"],
)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
from torch import cuda
# Use the GPU when PyTorch reports one as available, otherwise fall back to the CPU.
device = "cuda" if cuda.is_available() else "cpu"

In [None]:
# Collator used as `collate_fn` by both dataloaders below.
# NOTE(review): the model is passed in presumably so the collator can derive
# decoder inputs from the labels — confirm against the transformers documentation.
data_collator = DataCollatorForSeq2Seq(tokenizer, model = model)

In [None]:
def compute_metrics(eval_pred):
    """Turn raw (predictions, labels) token ids into ROUGE scores.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. Label
            positions equal to -100 are replaced by the pad token before decoding.

    Returns:
        dict of ROUGE mid F-measures (as percentages) plus "gen_len", the mean
        number of non-pad tokens per prediction; every value is rounded to
        4 decimal places.
    """
    predictions, labels = eval_pred
    pad_id = tokenizer.pad_token_id

    def one_sentence_per_line(text):
        # ROUGE expects a newline after each sentence.
        return "\n".join(nltk.sent_tokenize(text.strip()))

    pred_texts = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # -100 cannot be decoded, so swap it for the pad token first.
    decodable_labels = np.where(labels != -100, labels, pad_id)
    label_texts = tokenizer.batch_decode(decodable_labels, skip_special_tokens=True)

    pred_texts = [one_sentence_per_line(text) for text in pred_texts]
    label_texts = [one_sentence_per_line(text) for text in label_texts]

    rouge = metric.compute(
        predictions=pred_texts, references=label_texts, use_stemmer=True
    )
    scores = {}
    for name, aggregate in rouge.items():
        scores[name] = aggregate.mid.fmeasure * 100

    # Mean generated length, counting only non-pad tokens.
    token_counts = [np.count_nonzero(pred != pad_id) for pred in predictions]
    scores["gen_len"] = np.mean(token_counts)

    return {name: round(value, 4) for name, value in scores.items()}

In [None]:
from torch.utils.data import DataLoader

# Have both tokenized datasets return torch tensors when indexed.
tokenized_reduced_train_dataset.set_format("torch")
tokenized_reduced_val_dataset.set_format("torch")
BATCH_SIZE = 4  # examples per batch for both training and evaluation
# Training batches are shuffled; batching is delegated to `data_collator`.
train_dataloader = DataLoader(
    tokenized_reduced_train_dataset,
    shuffle = True,
    collate_fn = data_collator,
    batch_size = BATCH_SIZE
)

# Evaluation batches keep the dataset order (no shuffling).
eval_dataloader = DataLoader(
    tokenized_reduced_val_dataset,
    collate_fn = data_collator,
    batch_size = BATCH_SIZE
)

In [None]:
# `transformers.AdamW` is deprecated and no longer available in recent transformers
# releases, so use the PyTorch implementation instead. `eps` and `weight_decay` are
# set explicitly to the defaults of the transformers optimizer so the effective
# hyperparameters stay the same (PyTorch's own defaults are 1e-8 and 0.01).
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr = 1e-4, eps = 1e-6, weight_decay = 0.0)



In [None]:
import locale
# Monkey-patch locale.getpreferredencoding so that it always reports "UTF-8".
# NOTE(review): presumably a Colab workaround for encoding errors — confirm it is still needed.
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
from accelerate import Accelerator

# Hand the model, optimizer and both dataloaders to Accelerate; the prepared
# objects it returns replace the originals under the same names.
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [None]:
from transformers import get_scheduler

num_train_epochs = 5
# The training loop steps the optimizer once per batch, so the number of update
# steps per epoch equals the number of batches in the training dataloader.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# "linear" learning-rate schedule with zero warm-up steps, spanning the whole run.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
def postprocess_text(preds, labels):
    """Normalise decoded predictions and references for ROUGE scoring.

    Each text is stripped of surrounding whitespace and re-joined with one
    sentence per line, because ROUGE expects a newline after each sentence.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        Tuple ``(preds, labels)`` of lists of newline-separated strings.
    """
    stripped_preds = []
    for pred in preds:
        stripped_preds.append(pred.strip())
    stripped_labels = []
    for label in labels:
        stripped_labels.append(label.strip())

    def one_sentence_per_line(text):
        return "\n".join(nltk.sent_tokenize(text))

    formatted_preds = [one_sentence_per_line(text) for text in stripped_preds]
    formatted_labels = [one_sentence_per_line(text) for text in stripped_labels]
    return formatted_preds, formatted_labels

In [None]:
# Google Drive folder that receives the saved model and tokenizer after each epoch.
output_dir = "/content/drive/MyDrive/FinTech Society News Summaries Project"

In [None]:
# Fetch the "punkt" data package; nltk.sent_tokenize is used for sentence splitting in this notebook.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Move the model to the selected device.
# NOTE(review): the model was already passed through accelerator.prepare, so this
# call may be redundant — confirm before removing.
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [None]:
from tqdm.auto import tqdm
import torch
import numpy as np

# Fine-tune for `num_train_epochs` epochs. After every epoch the model is scored on
# the validation dataloader with ROUGE, the scores are appended to `results`, and
# the model and tokenizer are written to `output_dir`.
progress_bar = tqdm(range(num_training_steps))
results = []  # one dict of ROUGE scores per epoch
for epoch in range(num_train_epochs):

    # Training
    model.train()
    for step, batch in enumerate(train_dataloader):
        batch = batch.to(device)
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)
        # Drop the references once gradients exist (presumably to release memory early).
        del loss
        del outputs
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for step, batch in enumerate(eval_dataloader):
        batch = batch.to(device)
        with torch.no_grad():
            # Pass the summary budget explicitly. Without max_length, generate()
            # falls back to the library/model default generation length (20 tokens
            # unless the model's generation config says otherwise), so summaries
            # are cut far short of the references and ROUGE is understated.
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
                max_length=MAX_OUTPUT_LENGTH,
            )

            generated_tokens = accelerator.pad_across_processes(
                generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
            )

            # If we did not pad to max length, we need to pad the labels too
            labels = accelerator.pad_across_processes(
                batch["labels"], dim=1, pad_index=tokenizer.pad_token_id
            )

            generated_tokens = accelerator.gather(generated_tokens).cpu().numpy()
            labels = accelerator.gather(labels).cpu().numpy()

            # Replace -100 in the labels as we can't decode them
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(
                generated_tokens, skip_special_tokens=True
            )
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            decoded_preds, decoded_labels = postprocess_text(
                decoded_preds, decoded_labels
            )

            metric.add_batch(predictions=decoded_preds, references=decoded_labels)

    # Compute metrics over everything accumulated with add_batch during this epoch
    result = metric.compute()
    # Keep the mid F-measure of each ROUGE variant, as a percentage
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    result = {k: round(v, 4) for k, v in result.items()}
    print(f"Epoch {epoch}:", result)
    results.append(result)

    # Save the model and tokenizer (each epoch overwrites the previous save)
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    tokenizer.save_pretrained(output_dir)

  0%|          | 0/12500 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: {'rouge1': 24.1725, 'rouge2': 9.9193, 'rougeL': 20.1293, 'rougeLsum': 22.2994}
Epoch 1: {'rouge1': 24.4458, 'rouge2': 10.3268, 'rougeL': 20.3687, 'rougeLsum': 22.6078}
Epoch 2: {'rouge1': 24.5264, 'rouge2': 10.246, 'rougeL': 20.3284, 'rougeLsum': 22.5756}
Epoch 3: {'rouge1': 24.2816, 'rouge2': 10.2497, 'rougeL': 20.2482, 'rougeLsum': 22.3621}
Epoch 4: {'rouge1': 24.7187, 'rouge2': 10.5633, 'rougeL': 20.6125, 'rougeLsum': 22.693}


In [None]:
import os

# Save the whole model object alongside the per-epoch save_pretrained output.
# The folder comes from `output_dir` instead of a repeated path literal, so both
# artefacts always land in the same place.
torch.save(model, os.path.join(output_dir, "finetuned-base-t5.pt"))

In [None]:
# Display training article 50; indexing the row first avoids reading the whole column.
dataset["train"][50]["article"]

In [None]:
# Summarise training article 50 with the fine-tuned model.
# - Row-first indexing reads one record instead of the whole "article" column.
# - The article is truncated to MAX_INPUT_LENGTH, the same limit used in
#   preprocessing, so an over-long article cannot exceed the training input length.
# - The output budget reuses MAX_OUTPUT_LENGTH rather than repeating the literal 100.
input_ids = tokenizer(
    dataset["train"][50]["article"],
    return_tensors = "pt",
    truncation = True,
    max_length = MAX_INPUT_LENGTH,
).input_ids
# Sampling is enabled, so the summary can differ from run to run.
generated_ids = model.generate(input_ids.to(device), do_sample = True, max_length = MAX_OUTPUT_LENGTH)
summary = tokenizer.decode(generated_ids.squeeze(), skip_special_tokens = True)
print(summary)

Jordan allows all Iraqi children, regardless of refugee status, to enroll in state schools. Principal says 20 percent of school's students are Iraqi. Move cements a massive population shift in the Middle East. The U.N. estimates up to 250,000 school-age Iraqi children are in Jordan.


In [None]:
# Display the first training article; indexing the row first avoids reading the whole column.
dataset["train"][0]["article"]

In [None]:
import pickle
from pathlib import Path
# Persist the per-epoch ROUGE scores collected in `results` to Google Drive.
FOLDER_PATH = Path("/content/drive/MyDrive/FinTech Society News Summaries Project")
RESULTS_FILE = Path("t5_base_results.pkl")
RESULTS_PATH = FOLDER_PATH / RESULTS_FILE
# Binary mode is required for pickle; the context manager closes the file afterwards.
with RESULTS_PATH.open("wb") as f:
  pickle.dump(results, f)