In [322]:
import random
# import pandas as pd
# from IPython.display import display, HTML

import torch
from torch import utils
from torch.utils import data
from torch.utils.data import Dataset, DataLoader

import multiprocessing

import accelerate
from accelerate import Accelerator

# import huggingface_hub
from transformers import T5ForConditionalGeneration, T5Config, T5TokenizerFast, T5Tokenizer, AutoTokenizer
from transformers import DataCollatorWithPadding, DataCollatorForSeq2Seq
from transformers import TrainingArguments, Trainer, default_data_collator, get_scheduler
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments


import datasets
from datasets import load_dataset #, load_from_disk
# import evaluate
# from evaluate import load


import tqdm as notebook_tqdm
import os
from dotenv import load_dotenv

from tqdm.auto import tqdm
import math

In [323]:
load_dotenv()
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
# huggingface_token = os.getenv("HF_TOKEN")

device_type = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(device_type)
# print(torch.cuda.device_count())
cpu_cores = multiprocessing.cpu_count()
# print(cpu_cores)
# device = torch.device("cpu")

accelerator = Accelerator()
device = accelerator.state.device


In [324]:
# torch.cuda.empty_cache()
# # torch.cuda.set_per_process_memory_fraction(0.8)

In [325]:

## Load the configuration and set it to output hidden states (OR, can use approach w/: model.config.output_hidden_states = True)

model_checkpoint = "t5-small"
# config = T5Config.from_pretrained(model_checkpoint, output_hidden_states=True)

tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint)
tokenizer = tokenizer(max_length=512, truncation=True, padding="max_length", pad_to_multiple_of=8)  

model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
# model.to(device)

# model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
# model = T5ForConditionalGeneration.from_pretrained(model_checkpoint, config=config)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [326]:
raw_dataset = load_dataset("xsum")
print(raw_dataset)
print(raw_dataset["train"][0:2])

# metric = load("rouge")

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11334
    })
})


In [327]:
train_size = 1000
test_size = int(0.1 * train_size)

downsampled_dataset = raw_dataset["train"].train_test_split(train_size=train_size, test_size=test_size)

print(downsampled_dataset)
print(downsampled_dataset["train"][0:2])

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 100
    })
})
{'document': ["Ok, so it's not a real zebra, but a much loved statue called Gilbert.\nIt was part of Marwell Wildlife zoo's mass art extravaganza with 149 other sculptures marking a path for visitors to follow through the city.\nBut disaster struck - Gilbert was stolen and the zoo has launched a campaign for the statue to be returned.", 'As the form of Jose Mourinho\'s Chelsea side hit a new nadir with a 3-1 defeat by Liverpool, social media took great delight in mocking the under-pressure Blues boss.\nBut only around 3,700 tweets, of around 200,000 which were sent, contained both the words \'Mourinho\' and \'sack\'. Does, deep down, the public want to keep a man who tends to create talking points? Maybe. But even so, no one said they can\'t have a giggle.\nBy 16:00 GMT on S

In [328]:
tokenizer(["Hello, this one sentence!", "This is another sentence."])


{'input_ids': [[8774, 6, 48, 80, 7142, 55, 1], [100, 19, 430, 7142, 5, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}

In [329]:
if model_checkpoint in ["t5-small", "t5-base", "t5-larg", "t5-3b", "t5-11b", "google/flan-t5-xl", "google/flan-t5-small"]:
    prefix = "summarize: "
else:
    prefix = ""

In [401]:
# batch_size = 1000  # Adjust this value based on your available memory
max_input_length = 512
max_target_length = 128

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["document"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    labels = tokenizer(text_target=examples["summary"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = downsampled_dataset.map(preprocess_function, batched=True, num_proc=(cpu_cores-1))
train_dataset = tokenized_datasets["train"]
test_dataset = tokenized_datasets["test"]
# torch.save(tokenized_datasets["train"], 'train_data.pt')
# torch.save(tokenized_datasets["test"], 'test_data.pt')   

Map (num_proc=7):   0%|          | 0/1000 [00:00<?, ? examples/s]

  table = cls._concat_blocks(blocks, axis=0)


Map (num_proc=7):   0%|          | 0/100 [00:00<?, ? examples/s]

  table = cls._concat_blocks(blocks, axis=0)


In [400]:
# train_dataset = torch.load('train_data.pt')
# test_dataset = torch.load('test_data.pt')
# train_dataset = tokenized_datasets["train"]
# eval_dataset = tokenized_datasets["validation"]
print(tokenized_datasets)

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['document', 'summary', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
})


In [402]:
print(tokenized_datasets["train"][0:2])
print(tokenized_datasets)
print(preprocess_function(downsampled_dataset['train'][:2]))
print(train_dataset['input_ids'][0:2][0])

{'document': ["Ok, so it's not a real zebra, but a much loved statue called Gilbert.\nIt was part of Marwell Wildlife zoo's mass art extravaganza with 149 other sculptures marking a path for visitors to follow through the city.\nBut disaster struck - Gilbert was stolen and the zoo has launched a campaign for the statue to be returned.", 'As the form of Jose Mourinho\'s Chelsea side hit a new nadir with a 3-1 defeat by Liverpool, social media took great delight in mocking the under-pressure Blues boss.\nBut only around 3,700 tweets, of around 200,000 which were sent, contained both the words \'Mourinho\' and \'sack\'. Does, deep down, the public want to keep a man who tends to create talking points? Maybe. But even so, no one said they can\'t have a giggle.\nBy 16:00 GMT on Saturday the term \'Chelsea\' had been tweeted 578,000 times - the leading worldwide trend - while there were roughly 200,000 tweets sent containing the term \'Mourinho\' between midday and 15:00 GMT.\nGolf\'s world 

In [333]:
# batch_size = 64
# model_name = model_checkpoint.split("/")[-1]
# logging_steps = len(downsampled_dataset["train"]) // batch_size
# # args = TrainingArguments(
#     f"{model_name}-finetuned-xsum",
#     evaluation_strategy = "epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=batch_size,
#     per_device_eval_batch_size=batch_size,
#     weight_decay=0.01,
#     save_total_limit=3,
#     num_train_epochs=1,
#     # predict_with_generate=True,
#     fp16=True,
#     push_to_hub=False,
#     logging_steps=logging_steps,
# )

In [407]:
# Set optimizer, training_dataloader, scheduler, dataloader, and loss_function
batch_size = 64
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt", padding=True)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
training_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
eval_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

loss_function = torch.nn.CrossEntropyLoss()


# train_dataset = train_dataset.remove_columns(["document", "summary"])

print(train_dataset)
print(train_dataset[0:2])


print(tokenizer(["Hello, this one sentence!", "This is another sentence."]))
print(data_collator.tokenizer(["Hello, this one sentence!", "This is another sentence."]))

print(data_collator.tokenizer(["Hello, this one sentence!", "This is another sentence."], padding=True, max_length=max_input_length, truncation=True, return_tensors="pt"))




for batch in training_dataloader:
    print(batch)


Dataset({
    features: ['id', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})
{'id': ['36280761', '34688538'], 'input_ids': [[21603, 10, 8872, 6, 78, 34, 31, 7, 59, 3, 9, 490, 3, 776, 1939, 6, 68, 3, 9, 231, 1858, 12647, 718, 24378, 5, 94, 47, 294, 13, 1571, 2091, 18868, 3, 172, 32, 32, 31, 7, 3294, 768, 996, 900, 2565, 1629, 28, 3, 24816, 119, 10519, 7, 15285, 3, 9, 2071, 21, 2692, 12, 1130, 190, 8, 690, 5, 299, 6912, 10056, 3, 18, 24378, 47, 14244, 11, 8, 3, 172, 32, 32, 65, 3759, 3, 9, 2066, 21, 8, 12647, 12, 36, 3666, 5, 1], [21603, 10, 282, 8, 607, 13, 10854, 283, 1211, 23738, 31, 7, 14373, 596, 1560, 3, 9, 126, 3, 29, 9, 12594, 28, 3, 9, 220, 2292, 9589, 57, 15131, 6, 569, 783, 808, 248, 11235, 16, 17812, 53, 8, 365, 18, 26866, 2419, 7, 7930, 5, 299, 163, 300, 6180, 9295, 10657, 7, 6, 13, 300, 3, 22779, 84, 130, 1622, 6, 6966, 321, 8, 1234, 3, 31, 329, 1211, 23738, 31, 11, 3, 31, 15525, 31, 5, 3520, 6, 1659, 323, 6, 8, 452, 241, 12, 453, 3, 9, 388, 113, 2134, 7, 

RuntimeError: each element in list of batch should be of equal size

In [371]:
# train_features, train_labels = next(iter(training_dataloader))
# print(f"Feature batch shape: {train_features.size()}")
# print(f"Labels batch shape: {train_labels.size()}")
# img = train_features[0].squeeze()
# label = train_labels[0]

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`document` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [372]:
# print(training_dataloader)
# print(training_dataloader.dataset)
# # print(training_dataloader.dataset['document'])
# print(training_dataloader.dataset['input_ids'][0:2])

# for thing in data_collator:
#     print(thing)


for batch in training_dataloader:
    print(batch)

# testing_batchloaderabc = next(iter(training_dataloader))
# print(testing_batchloaderabc)   

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`document` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [None]:
model, optimizer, training_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(model, optimizer, training_dataloader, eval_dataloader, lr_scheduler)


In [None]:
# for batch in training_dataloader['']:
#     print(batch)



# testing_batchloaderabc = next(iter(training_dataloader))
# print(testing_batchloaderabc)   

# print the first batch in dataloader
# for batch in dataloader:
#     print(batch["document"])

In [None]:
num_train_epochs = 2
num_update_steps_per_epoch = len(training_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

# lr_scheduler = get_scheduler(
#     "linear",
#     optimizer=optimizer,
#     num_warmup_steps=0,
#     num_training_steps=num_training_steps,
#     gamma=0.95
# )

In [None]:
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in training_dataloader:
        inputs, targets = batch["input_ids"], batch["labels"]
        outputs = model(inputs)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    # model.eval()
    # losses = []
    # for step, batch in enumerate(eval_dataloader):
    #     with torch.no_grad():
    #         outputs = model(**batch)

    #     loss = outputs.loss
    #     losses.append(accelerator.gather(loss.repeat(batch_size)))

    # losses = torch.cat(losses)
    # losses = losses[: len(test_dataset)]
    # try:
    #     perplexity = math.exp(torch.mean(losses))
    # except OverflowError:
    #     perplexity = float("inf")

    # print(f">>> Epoch {epoch}: Perplexity: {perplexity}")



  0%|          | 0/32 [00:00<?, ?it/s]

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`document` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [None]:
# print(f">>> Perplexity: {math.exp(test_results['eval_loss']):.2f}")

In [None]:

# for batch in dataloader:
#     print(batch)

# test_batch = next(iter(dataloader))
# print((test_batch['input_ids']))

# print the first batch in dataloader
# for batch in dataloader:
#     print(batch["document"])


In [None]:
# for batch in training_dataloader:
#     inputs, targets = batch["input_ids"], batch["labels"]
#     inputs = torch.tensor(inputs).squeeze(1).long().to(device)
#     targets = torch.tensor(targets).squeeze(1).long().to(device)
#     outputs = model(inputs)
#     loss = loss_function(outputs, targets.view(-1))  # Convert targets to a tensor with a single dimension
#     accelerator.backward(loss)
#     optimizer.step()
#     scheduler.step()
#     optimizer.zero_grad()

In [None]:


# trainer = Seq2SeqTrainer(
#     model,
#     args,
#     train_dataset=train_dataset,
#     eval_dataset=eval_dataset,
#     data_collator=data_collator,
#     tokenizer=tokenizer,
#     # compute_metrics=compute_metrics
# )

In [None]:
# args.max_split_size_mb = 10
# trainer.train()