<a href="https://colab.research.google.com/github/vishal-burman/PyTorch-Architectures/blob/master/T5_Math_Add_Sub.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install transformers
! pip install datasets

In [1]:
import time
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
from datasets import load_dataset
dataset = load_dataset('math_dataset', 'arithmetic__add_or_sub')

Reusing dataset math_dataset (/root/.cache/huggingface/datasets/math_dataset/arithmetic__add_or_sub/1.0.0/2f29b6d4f28d5ba488f8fd53c0306771b5e4c8c636521ef729ecc5a4b586c5e6)


In [2]:
tokenizer = T5Tokenizer.from_pretrained('t5-large')
model = T5ForConditionalGeneration.from_pretrained('t5-large').to(device)

In [3]:
class CustomDataset(Dataset):
  def __init__(self, texts, labels, tokenizer, max_len_src=16, max_len_tgt=4):
    self.texts = texts
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len_src = max_len_src
    self.max_len_tgt = max_len_tgt
    self.final_list = []
    self.build()
  
  def __len__(self):
    return len(self.final_list)
  
  def __getitem__(self, idx):
    tokens = self.final_list[idx]['tokens']
    labels = self.final_list[idx]['labels']
    src_ids = tokens['input_ids']
    src_mask = tokens['attention_mask']
    label_ids = labels['input_ids']
    return {
        'src_ids': torch.tensor(src_ids, dtype=torch.long),
        'src_mask': torch.tensor(src_mask, dtype=torch.long),
        'label_ids': torch.tensor(label_ids, dtype=torch.long)
    }
  
  def build(self):
    for text, label in zip(self.texts, self.labels):
      text = text + " </s>"
      tokens = tokenizer(text, max_length=self.max_len_src, truncation=True, padding='max_length')
      label = label + " </s>"
      label = tokenizer(label, max_length=self.max_len_tgt, truncation=True, padding='max_length')
      self.final_list.append({'tokens': tokens, 'labels': label})

In [4]:
train_texts = dataset['train']['question'][:10000]
train_labels = dataset['train']['answer'][:10000]
valid_texts = dataset['test']['question'][:1000]
valid_labels = dataset['test']['answer'][:1000]

In [5]:
# texts_train = [text.replace(" ", "") for text in train]
# labels_train = [text for text in train]

# texts_valid = [text.replace(" ", "") for text in valid]
# labels_valid = [text for text in valid]
for text, label in zip(train_texts, train_labels):
  print(text, label)
  break

What is -5 - 110911? -110916


In [6]:
train_dataset = CustomDataset(train_texts, train_labels, tokenizer)
valid_dataset = CustomDataset(valid_texts, valid_labels, tokenizer)

  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."


In [7]:
BATCH_SIZE = 8
LEARNING_RATE = 3e-5
EPOCHS = 5

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=BATCH_SIZE)
valid_loader = DataLoader(dataset=valid_dataset, shuffle=False, batch_size=BATCH_SIZE)
print("Length of Training DataLoader: ", len(train_loader))
print("Length of Valid DataLoader: ", len(valid_loader))

Length of Training DataLoader:  1250
Length of Valid DataLoader:  125


In [8]:
pad_token_id = tokenizer.pad_token_id
def compute_loss(model, data_loader, device):
  total_loss = 0
  for idx, sample in enumerate(data_loader):
    ids = sample['src_ids'].to(device)
    mask = sample['src_mask'].to(device)
    labels = sample['label_ids'].to(device)

    decoder_input_ids = model._shift_right(labels)
    outputs = model(ids, attention_mask=mask, decoder_input_ids=decoder_input_ids)
    lm_logits = outputs[0]
    ce_loss_fct = nn.CrossEntropyLoss(ignore_index=pad_token_id)
    loss = ce_loss_fct(lm_logits.view(-1, lm_logits.shape[-1]), labels.view(-1))
    total_loss += loss
  return total_loss / len(data_loader)

start_time = time.time()
for epoch in range(EPOCHS):
  model.train()
  for idx, sample in enumerate(train_loader):
    ids = sample['src_ids'].to(device)
    mask = sample['src_mask'].to(device)
    labels = sample['label_ids'].to(device)

    decoder_input_ids = model._shift_right(labels)

    optimizer.zero_grad()
    outputs = model(ids, attention_mask=mask, decoder_input_ids=decoder_input_ids, use_cache=False)
    lm_logits = outputs[0]
    ce_loss_fct = nn.CrossEntropyLoss(ignore_index=pad_token_id)
    loss = ce_loss_fct(lm_logits.view(-1, lm_logits.shape[-1]), labels.view(-1))

    # LOGGING
    if idx % 200 == 0:
      print("Batch: %03d/%03d || Epoch: %03d/%03d" % (idx, len(train_loader), epoch+1, EPOCHS))

    loss.backward()
    optimizer.step()
  
  model.eval()
  with torch.set_grad_enabled(False):
    valid_loss = compute_loss(model, valid_loader, device)
    print("Average Validation Loss: ", valid_loss.item())
  epoch_elapsed_time = (time.time() - start_time) / 60
  print("Epoch Elapsed Time: ", epoch_elapsed_time)
total_training_time = (time.time() - start_time) / 60

Batch: 000/1250 || Epoch: 001/005
Batch: 200/1250 || Epoch: 001/005
Batch: 400/1250 || Epoch: 001/005
Batch: 600/1250 || Epoch: 001/005
Batch: 800/1250 || Epoch: 001/005
Batch: 1000/1250 || Epoch: 001/005
Batch: 1200/1250 || Epoch: 001/005
Average Validation Loss:  1.124136209487915
Epoch Elapsed Time:  9.743300990263622
Batch: 000/1250 || Epoch: 002/005
Batch: 200/1250 || Epoch: 002/005
Batch: 400/1250 || Epoch: 002/005
Batch: 600/1250 || Epoch: 002/005
Batch: 800/1250 || Epoch: 002/005
Batch: 1000/1250 || Epoch: 002/005
Batch: 1200/1250 || Epoch: 002/005
Average Validation Loss:  1.0466009378433228
Epoch Elapsed Time:  19.39113906621933
Batch: 000/1250 || Epoch: 003/005
Batch: 200/1250 || Epoch: 003/005
Batch: 400/1250 || Epoch: 003/005
Batch: 600/1250 || Epoch: 003/005
Batch: 800/1250 || Epoch: 003/005
Batch: 1000/1250 || Epoch: 003/005
Batch: 1200/1250 || Epoch: 003/005
Average Validation Loss:  0.9864254593849182
Epoch Elapsed Time:  29.005708662668862
Batch: 000/1250 || Epoch: 00

In [25]:
texts = ["Calculate 7+3", "What is 2 + 4?"] 
for text in texts:
  print("Original text = ", text)
  tokens = tokenizer(text, max_length=16, truncation=True, padding='max_length')
  ids = torch.tensor(tokens['input_ids'], dtype=torch.long).unsqueeze(0)
  mask = torch.tensor(tokens['attention_mask'], dtype=torch.long).unsqueeze(0)
  model.eval()
  with torch.set_grad_enabled(False):
    outs = model.generate(input_ids=ids.to(device), attention_mask=mask.to(device), max_length=4, num_beams=1, early_stopping=True)
    print("T5's Answer = ", tokenizer.decode(outs[0], skip_special_tokens=True), "\n")

Original text =  Calculate 7+3
T5's Answer =  10 

Original text =  What is 2 + 4?
T5's Answer =  6 



In [26]:
# Needs more Training.