In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers datasets rouge-score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import torch
import tensorflow as tf
from transformers import (
    AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer, T5ForConditionalGeneration,
    pipeline, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
)
import datasets
import torch.nn as nn

metric = datasets.load_metric("rouge")
from typing import Optional, Tuple, Union
import nltk
import numpy as np


  metric = datasets.load_metric("rouge")


In [4]:
MAX_INPUT_LENGTH = 2048
MAX_OUTPUT_LENGTH = 100

In [5]:
# Load the ROUGE scores metric

def calc_rouge_scores(candidates, references):
    result = metric.compute(predictions = candidates, references = references, 
        use_stemmer = True)
    result = {key: round(value.mid.fmeasure * 100, 1)
              for key, value in result.items()}
    return result


In [6]:
dataset = datasets.load_dataset("cnn_dailymail", "3.0.0")



  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
print(dataset["train"]["article"][0])
print(dataset["train"]["highlights"][0])

LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don't think I'll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details of how

In [8]:
tokenizer = AutoTokenizer.from_pretrained("t5-large", model_max_length = MAX_INPUT_LENGTH)
config = AutoConfig.from_pretrained("t5-large", d_model = 2048)

In [9]:
model = T5ForConditionalGeneration(config = config)

In [10]:
config

T5Config {
  "_name_or_path": "t5-large",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 4096,
  "d_kv": 64,
  "d_model": 2048,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "p

In [11]:
sample_input = tokenizer("There has been a misunderstanding on my position within the company.")
sample_input

{'input_ids': [290, 65, 118, 3, 9, 3, 28063, 30, 82, 1102, 441, 8, 349, 5, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [28]:
def preprocess(examples):
    article_split = examples["article"]
    labels_split = examples["highlights"]

    model_inputs = tokenizer(article_split, max_length = MAX_INPUT_LENGTH, truncation = True, padding = True)
    labels = tokenizer(labels_split, max_length = MAX_OUTPUT_LENGTH, truncation = True, padding = True)

    model_inputs["labels"] = labels["input_ids"]
    del model_inputs["article"]
    del model_inputs[]
    return model_inputs
    


In [29]:
example_train_input = dataset["train"]["article"][0]
tokenizer(example_train_input)

{'input_ids': [301, 24796, 4170, 6, 2789, 41, 18844, 61, 1636, 8929, 16023, 2213, 4173, 6324, 12591, 15, 11391, 592, 12, 3, 9, 2196, 3996, 1755, 770, 8785, 591, 11039, 770, 61, 13462, 38, 3, 88, 5050, 507, 30, 2089, 6, 68, 3, 88, 10419, 7, 8, 540, 751, 31, 17, 4061, 3, 9, 10783, 30, 376, 5, 4173, 6324, 12591, 15, 38, 8929, 16023, 16, 96, 15537, 651, 16023, 11, 8, 5197, 13, 8, 12308, 121, 304, 8, 19142, 13, 29517, 6710, 343, 7, 300, 8, 296, 6, 8, 1021, 7556, 845, 3, 88, 65, 150, 1390, 12, 9030, 17, 449, 112, 1723, 550, 30, 1006, 2948, 6, 3281, 11, 17086, 2251, 5, 96, 196, 278, 31, 17, 515, 12, 36, 80, 13, 273, 151, 113, 6, 38, 1116, 38, 79, 919, 14985, 8247, 805, 1452, 3, 9, 3805, 2100, 443, 1232, 42, 424, 1126, 976, 3, 88, 1219, 46, 3746, 2772, 49, 2283, 48, 847, 5, 96, 196, 278, 31, 17, 317, 27, 31, 195, 36, 1989, 28887, 5, 96, 634, 378, 27, 114, 2611, 33, 378, 24, 583, 81, 335, 7051, 1636, 1335, 11, 3190, 7, 11, 5677, 7, 535, 486, 14985, 6324, 12591, 15, 56, 36, 3, 179, 12, 24068, 16

In [30]:
tokenized_train_dataset = dataset["train"].map(lambda x: preprocess(x), batched = True)
tokenized_val_dataset = dataset["validation"].map(lambda x: preprocess(x), batched = True)
tokenized_test_dataset = dataset["test"].map(lambda x: preprocess(x), batched = True)


Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [None]:
tokenized_train_dataset = tokenized_train_dataset.remove_columns(
    dataset["train"].column_names
)

In [94]:
tokenized_val_dataset = tokenized_val_dataset.remove_columns(
    dataset["validation"].column_names
)

In [95]:
tokenized_train_dataset[0]

{'input_ids': tensor([  301, 24796,  4170,  ...,     0,     0,     0]),
 'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0]),
 'labels': tensor([ 8929, 16023,  2213,  4173,  6324, 12591,    15,  2347,  3996,  1755,
           329, 13462,    38,     3,    88,  5050,   507,  2089,     3,     5,
          5209,  7556,   845,     3,    88,    65,   150,  1390,    12,  9030,
            17,   449,   112,  1723,   550,     3,     5,  6324, 12591,    15,
            31,     7,  8783,    45,   166,   874, 16023,  4852,    43,   118,
          1213,    16,  2019,  3069,     3,     5,     1,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0])}

In [96]:
from torch import cuda
device = "cuda" if cuda.is_available() else "cpu"

In [97]:
BATCH_SIZE = 4
train_args = Seq2SeqTrainingArguments(
    output_dir = "./",
    do_train = True,
    do_eval = True,
    evaluation_strategy = "epoch",
    learning_rate = 1e-4,
    save_total_limit = 3,
    num_train_epochs = 5,
    fp16=True,
    predict_with_generate = True,
    per_device_train_batch_size = BATCH_SIZE,
    per_device_eval_batch_size = BATCH_SIZE,

)

data_collator = DataCollatorForSeq2Seq(tokenizer, model = model)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [98]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

In [99]:
trainer = Seq2SeqTrainer(
    model,
    train_args,
    train_dataset = tokenized_test_dataset,
    eval_dataset = tokenized_val_dataset,
    data_collator = data_collator,
    tokenizer = tokenizer,
    compute_metrics = compute_metrics
)

Using cuda_amp half precision backend


In [100]:
from torch.utils.data import DataLoader

tokenized_train_dataset.set_format("torch")
tokenized_val_dataset.set_format("torch")
train_dataloader = DataLoader(
    tokenized_train_dataset,
    shuffle = True,
    collate_fn = data_collator,
    batch_size = 4
)

eval_dataloader = DataLoader(
    tokenized_val_dataset,
    collate_fn = data_collator,
    batch_size = 4
)

In [101]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr = 1e-4)



In [102]:
!pip install accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [103]:
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [104]:
from transformers import get_scheduler

num_train_epochs = 5
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [105]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # ROUGE expects a newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels

In [106]:
output_dir = "/content/Drive/MyDrive/"

In [None]:
from tqdm.auto import tqdm
import torch
import numpy as np

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for step, batch in enumerate(train_dataloader):
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
            )

            generated_tokens = accelerator.pad_across_processes(
                generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
            )
            labels = batch["labels"]

            # If we did not pad to max length, we need to pad the labels too
            labels = accelerator.pad_across_processes(
                batch["labels"], dim=1, pad_index=tokenizer.pad_token_id
            )

            generated_tokens = accelerator.gather(generated_tokens).cpu().numpy()
            labels = accelerator.gather(labels).cpu().numpy()

            # Replace -100 in the labels as we can't decode them
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            if isinstance(generated_tokens, tuple):
                generated_tokens = generated_tokens[0]
            decoded_preds = tokenizer.batch_decode(
                generated_tokens, skip_special_tokens=True
            )
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            decoded_preds, decoded_labels = postprocess_text(
                decoded_preds, decoded_labels
            )

            metric.add_batch(predictions=decoded_preds, references=decoded_labels)

    # Compute metrics
    result = metric.compute()
    # Extract the median ROUGE scores
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    result = {k: round(v, 4) for k, v in result.items()}
    print(f"Epoch {epoch}:", result)

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    tokenizer.save_pretrained(output_dir)

In [109]:
torch.cuda.empty_cache()