<a href="https://colab.research.google.com/github/vishal-burman/PyTorch-Architectures/blob/master/misc/test_HuggingFace(BenchmarkSeq2Seq).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Run `nvidia-smi` in a shell to report the GPU attached to this runtime, if any.
! nvidia-smi

In [None]:
! pip install transformers
! pip install datasets
! pip install sentencepiece

In [3]:
import inspect

import torch
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import TrainingArguments, Trainer
from transformers import Text2TextGenerationPipeline
from transformers.optimization import Adafactor, AdafactorSchedule

In [None]:
# Load the "squad" dataset through `datasets.load_dataset`; the cells below read
# its "train" and "validation" splits.
dataset = load_dataset("squad")

In [5]:
# Number of (context, question) pairs kept from the start of each split.
N_TRAIN_SAMPLES = 5850
N_VALID_SAMPLES = 650


def take_context_question_pairs(split, limit):
    """Collect the "context" and "question" fields of the first `limit` samples.

    Args:
        split: Iterable of samples, each indexable by "context" and "question".
        limit: Maximum number of samples to read from the start of `split`.

    Returns:
        A `(contexts, questions)` tuple of two equally long lists, in the
        order the samples were read.
    """
    contexts, questions = [], []
    # zip() stops as soon as range(limit) is exhausted, so at most `limit`
    # samples are pulled from `split`.
    for _, sample in zip(range(limit), split):
        contexts.append(sample["context"])
        questions.append(sample["question"])
    return contexts, questions


contexts_train, questions_train = take_context_question_pairs(dataset["train"], N_TRAIN_SAMPLES)
assert len(contexts_train) == len(questions_train)
# Fails loudly if the split holds fewer samples than requested.
assert len(contexts_train) == N_TRAIN_SAMPLES

contexts_valid, questions_valid = take_context_question_pairs(dataset["validation"], N_VALID_SAMPLES)
assert len(contexts_valid) == len(questions_valid)
assert len(contexts_valid) == N_VALID_SAMPLES

In [6]:
class CustomSeq2SeqLMDataset(Dataset):
    """Dataset of (input text, target text) pairs that are tokenized per batch.

    Items are returned as raw strings; tokenization, padding and truncation
    happen in `collate_fn`, so each batch is padded only to its own longest
    sequence (capped by the configured maximum lengths).

    Args:
        tokenizer: Callable tokenizer accepting a list of strings together with
            `max_length`, `padding`, `truncation` and `return_tensors`, and
            returning a mapping with 'input_ids' and 'attention_mask'. Its
            `pad_token_id` attribute is read to mask padded label positions.
        input_texts: Sequence of source strings.
        target_texts: Sequence of target strings, same length as `input_texts`.
        max_input_length: Truncation limit passed to the tokenizer for inputs.
        max_target_length: Truncation limit passed to the tokenizer for targets.

    Raises:
        AssertionError: If `input_texts` and `target_texts` differ in length.
    """

    def __init__(self, tokenizer, input_texts, target_texts, max_input_length=16, max_target_length=16):
        self.tokenizer = tokenizer
        self.input_texts = input_texts
        self.target_texts = target_texts
        assert len(self.input_texts) == len(self.target_texts), 'Input and Target texts sizes do not match'
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of (input, target) pairs."""
        return len(self.input_texts)

    def __getitem__(self, idx):
        """Return the untokenized pair at `idx` as {'sents': ..., 'labels': ...}."""
        return {
                'sents': self.input_texts[idx],
                'labels': self.target_texts[idx],
                }

    def collate_fn(self, batch):
        """Tokenize a list of items from `__getitem__` into one model-ready batch.

        Args:
            batch: List of dicts with 'sents' and 'labels' string entries.

        Returns:
            Dict with 'input_ids' and 'attention_mask' for the inputs and
            'labels' holding the target token ids, where padding positions
            are replaced by -100.
        """
        sents = [sample['sents'] for sample in batch]
        labels = [sample['labels'] for sample in batch]
        tokens_input = self.tokenizer(sents,
                max_length=self.max_input_length,
                padding=True,
                truncation=True,
                return_tensors='pt'
                )
        tokens_target = self.tokenizer(labels,
                max_length=self.max_target_length,
                padding=True,
                truncation=True,
                return_tensors='pt'
                )
        label_ids = tokens_target['input_ids']
        pad_token_id = self.tokenizer.pad_token_id
        if pad_token_id is not None:
            # Mask the token-id tensor (not the list of label strings): -100 is
            # the index the cross-entropy loss ignores, so padded target
            # positions no longer contribute to the training loss.
            label_ids = label_ids.masked_fill(label_ids == pad_token_id, -100)
        return {
                'input_ids': tokens_input['input_ids'],
                'attention_mask': tokens_input['attention_mask'],
                'labels': label_ids,
                }

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Tokenizer and model are loaded from the same pretrained checkpoint name.
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
model.to(device)

# Count only the parameters that will receive gradients.
params = 0
for weight in model.parameters():
    if weight.requires_grad:
        params += weight.numel()
print(f"Trainable Parameters --> {params}")

In [8]:
# Batch size used for the DataLoaders and as the per-device train/eval batch size.
BS = 8
# Truncation limit (in tokens) passed to the tokenizer for the input contexts.
INPUT_LENGTH = 512
# Truncation limit (in tokens) passed to the tokenizer for the target questions.
TARGET_LENGTH = 40

In [9]:
# Both splits share the tokenizer and the same truncation limits.
shared_dataset_kwargs = dict(
    tokenizer=tokenizer,
    max_input_length=INPUT_LENGTH,
    max_target_length=TARGET_LENGTH,
)

# Contexts are the model inputs, questions are the generation targets.
train_dataset = CustomSeq2SeqLMDataset(
    input_texts=contexts_train,
    target_texts=questions_train,
    **shared_dataset_kwargs,
)
valid_dataset = CustomSeq2SeqLMDataset(
    input_texts=contexts_valid,
    target_texts=questions_valid,
    **shared_dataset_kwargs,
)

# The Trainer further down is given the datasets rather than these loaders;
# here they are only used to report the number of batches per split.
train_loader = DataLoader(
    train_dataset,
    collate_fn=train_dataset.collate_fn,
    batch_size=BS,
    shuffle=True,
)
valid_loader = DataLoader(
    valid_dataset,
    collate_fn=valid_dataset.collate_fn,
    batch_size=BS,
    shuffle=False,
)

print('Length of Train Loader: ', len(train_loader))
print('Length of Valid Loader: ', len(valid_loader))

Length of Train Loader:  732
Length of Valid Loader:  82


In [10]:
# For AdaFactor
# Number of samples (per device) that contribute to one optimizer step:
# gradients are accumulated over EFFECTIVE_BATCH_SIZE // BS batches of BS samples.
EFFECTIVE_BATCH_SIZE = 128

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# The install cell does not pin a version, so pick whichever keyword the
# installed TrainingArguments accepts instead of failing with a TypeError.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_arg = 'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=BS,
    per_device_eval_batch_size=BS,
    logging_strategy='epoch',
    save_strategy='no',
    fp16=False,
    gradient_accumulation_steps=(EFFECTIVE_BATCH_SIZE // BS),
    # Evaluate once per epoch, under the keyword name resolved above.
    **{_eval_strategy_arg: 'epoch'},
)

In [11]:
# Adafactor configured with relative steps and warm-up initialisation; no
# explicit learning rate is supplied (lr=None).
optimizer = Adafactor(
    model.parameters(),
    lr=None,
    relative_step=True,
    warmup_init=True,
    scale_parameter=True,
)
# Scheduler object built around the optimizer so it can be handed to the Trainer.
lr_scheduler = AdafactorSchedule(optimizer)

In [12]:
# Wire the model, datasets and the hand-built optimizer/scheduler pair into the
# Trainer. The train dataset's collate_fn is used as the collator for both
# splits; both datasets were built with the same tokenizer and length limits.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=train_dataset.collate_fn,
    optimizers=(optimizer, lr_scheduler),
)

In [None]:
# Start fine-tuning with the Trainer configured above.
trainer.train()