In [15]:
import random
# import pandas as pd
# from IPython.display import display, HTML

import torch

import accelerate

# import huggingface_hub
from transformers import T5ForConditionalGeneration, T5Config, T5TokenizerFast, T5Tokenizer, AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

import datasets
from datasets import load_dataset #, load_from_disk
# import evaluate
# from evaluate import load


import tqdm as notebook_tqdm
import os
from dotenv import load_dotenv

In [16]:
load_dotenv()
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
# huggingface_token = os.getenv("HF_TOKEN")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
print(device)

cpu


In [17]:
torch.cuda.empty_cache()
# torch.cuda.set_per_process_memory_fraction(0.8)

In [18]:
## Load the configuration and set it to output hidden states (OR, can use approach w/: model.config.output_hidden_states = True)
model_checkpoint = "google/flan-t5-small"
# config = T5Config.from_pretrained(model_checkpoint, output_hidden_states=True)

## Load the tokenizer and model with the updated configuration
tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint)
# model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
# model = T5ForConditionalGeneration.from_pretrained(model_checkpoint, config=config)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [19]:
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
# model = model.to(device)

In [20]:
raw_datasets = load_dataset("xsum")
# metric = load("rouge")

raw_datasets["train"][0:2]

  'A fire alarm went off at the Holiday Inn in Hope Street at about 04:20 BST on Saturday and guests were asked to leave the hotel.\nAs they gathered outside they saw the two buses, parked side-by-side in the car park, engulfed by flames.\nOne of the tour groups is from Germany, the other from China and Taiwan. It was their first night in Northern Ireland.\nThe driver of one of the buses said many of the passengers had left personal belongings on board and these had been destroyed.\nBoth groups have organised replacement coaches and will begin their tour of the north coast later than they had planned.\nPolice have appealed for information about the attack.\nInsp David Gibson said: "It appears as though the fire started under one of the buses before spreading to the second.\n"While the exact cause is still under investigation, it is thought that the fire was started deliberately."'],
 'summary': ['Clean-up operations are continuing across the Scottish Borders and Dumfries and Galloway a

In [21]:
tokenizer(["Hello, this one sentence!", "This is another sentence."])

{'input_ids': [[8774, 6, 48, 80, 7142, 55, 1], [100, 19, 430, 7142, 5, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}

In [22]:
if model_checkpoint in ["t5-small", "t5-base", "t5-larg", "t5-3b", "t5-11b", "google/flan-t5-xl", "google/flan-t5-small"]:
    prefix = "summarize: "
else:
    prefix = ""

In [23]:
batch_size = 500  # Adjust this value based on your available memory
max_input_length = 1024
max_target_length = 128

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["document"]]
    model_inputs = tokenizer(inputs, padding='longest', truncation=True, max_length=max_input_length, return_tensors="pt")

    # Setup the tokenizer for targets
    labels = tokenizer(text_target=examples["summary"], padding='longest', truncation=True, max_length=max_target_length, return_tensors="pt")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = raw_datasets.map(preprocess_function, batched=True, batch_size=batch_size)
torch.save(tokenized_datasets["train"], 'train_data.pt')
torch.save(tokenized_datasets["validation"], 'valid_data.pt')

Map:   0%|          | 0/204045 [00:00<?, ? examples/s]

Map:   0%|          | 0/11332 [00:00<?, ? examples/s]

Map:   0%|          | 0/11334 [00:00<?, ? examples/s]

In [24]:
preprocess_function(raw_datasets['train'][:2])

{'input_ids': tensor([[21603,    10,    37,  ...,  1598,     5,     1],
        [21603,    10,    71,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[ 7433,    18,   413,  2673,    33,  6168,   640,     8, 12580, 17600,
             7,    11,   970,    51,    89,  2593,    11, 10987,    32,  1343,
           227, 18368,  2953,    57, 16133,  4937,     5,     1],
        [ 2759,  8548, 14264,    43,   118, 10932,    57,  1472,    16,     3,
             9, 18024,  1584,   739,  3211,    16, 27874,   690,  2050,     5,
             1,     0,     0,     0,     0,     0,     0,     0]])}

In [25]:
batch_size = 16
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-xsum",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
)

In [26]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [27]:
train_dataset = torch.load('train_data.pt')
eval_dataset = torch.load('valid_data.pt')
# train_dataset = tokenized_datasets["train"]
# eval_dataset = tokenized_datasets["validation"]

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    # compute_metrics=compute_metrics
)

In [28]:
# args.max_split_size_mb = 10
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


OutOfMemoryError: CUDA out of memory. Tried to allocate 32.00 MiB. GPU 0 has a total capacty of 14.58 GiB of which 7.56 MiB is free. Process 435192 has 14.57 GiB memory in use. Of the allocated memory 14.38 GiB is allocated by PyTorch, and 51.32 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF