<a href="https://colab.research.google.com/github/vishal-burman/PyTorch-Architectures/blob/master/misc/test_sample_T5AutoComplete.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! nvidia-smi

Wed Jan 20 14:20:40 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
! pip install transformers
! pip install datasets
! pip install sentencepiece

In [None]:
# Restart the runtime after this !!
! python -m spacy download en_core_web_md

In [1]:
import time
import random

import spacy

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader, Dataset

from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [2]:
nlp = spacy.load('en_core_web_md')
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [3]:
dataset_quora = load_dataset('quora')
dataset_cnn = load_dataset('cnn_dailymail', '3.0.0')

Using custom data configuration default
Reusing dataset quora (/root/.cache/huggingface/datasets/quora/default/0.0.0/2be517cf0ac6de94b77a103a36b141347a13f40637fbebaccb56ddbe397876be)
Reusing dataset cnn_dailymail (/root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/0128610a44e10f25b4af6689441c72af86205282d26399642f7db38fa7535602)


In [4]:
start_time = time.time()
samples = dataset_cnn['train']['highlights']
samples_cnn = []
for sample in samples:
  if len(samples_cnn) >= 50000:
    break
  doc = nlp(sample)
  for sent in doc.sents:
    sentence = sent.string.strip()
    if len(sentence.split(' ')) >=5 and len(sentence.split(' ')) <= 16:
      samples_cnn.append(sentence)
elapsed_time = (time.time() - start_time) / 60
samples_cnn = samples_cnn[:50000]
print('Elapsed Time: %.2f min' % (elapsed_time))

Elapsed Time: 3.50 min


In [5]:
samples = dataset_quora['train']['questions']
samples_quora = [sample['text'][0] for sample in samples if len(sample['text'][0].split(' ')) >= 5 and len(sample['text'][0].split(' ')) <= 16]
samples_quora = samples_quora[:50000]

In [6]:
print('No. of samples from CNN Dailymail: ', len(samples_cnn))
print('No. of samples from Quora: ', len(samples_quora))

No. of samples from CNN Dailymail:  50000
No. of samples from Quora:  50000


In [7]:
samples = []
samples.extend(samples_cnn)
samples.extend(samples_quora)
random.shuffle(samples) # Shuffling statements and questions

train_samples = samples[:80000]
valid_samples = samples[80000:]

In [8]:
tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5ForConditionalGeneration.from_pretrained('t5-base')
model.to(device)

params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('Total Trainable Parameters: ', params)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1199.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=891691430.0, style=ProgressStyle(descri…


Total Trainable Parameters:  222903552


In [9]:
class AutoCompleteDataset(Dataset):
  def __init__(self, tokenizer, texts_list):
    self.tokenizer = tokenizer
    self.texts_list = texts_list
    self.list_samples = []
    self.build()
  
  def __len__(self):
    return len(self.list_samples)
  
  def __getitem__(self, idx):
    sample = self.list_samples[idx]
    return {
        'input': sample['input_sent'],
        'target': sample['target_sent'],
    }
  
  def build(self):
    for text in self.texts_list:
      sample = text.split(' ')
      rand_int = random.randint(1, 3)
      input = ' '.join(sample[:rand_int])
      target = ' '.join(sample[rand_int:])
      self.list_samples.append({
          'input_sent': input,
          'target_sent': target,
      })

In [10]:
train_dataset = AutoCompleteDataset(tokenizer, train_samples)
valid_dataset = AutoCompleteDataset(tokenizer, valid_samples)

In [11]:
# Hyperparameters
EPOCHS = 5
BATCH_SIZE = 32
LEARNING_RATE = 3e-5
MAX_INPUT_LENGTH = 3
MAX_TARGET_LENGTH = 16

train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
print('Length of Training Loader: ', len(train_loader))
print('Length of Valid Loader: ', len(valid_loader))

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

Length of Training Loader:  2500
Length of Valid Loader:  625


In [12]:
def compute_loss(model, data_loader, device):
  list_loss = []
  for sample in data_loader:
    tokens = tokenizer.prepare_seq2seq_batch(src_texts=sample['input'], tgt_texts=sample['target'], max_length=MAX_INPUT_LENGTH, max_target_length=MAX_TARGET_LENGTH, padding='max_length', truncation=True, return_tensors='pt')
    ids = tokens['input_ids'].to(device)
    mask = tokens['attention_mask'].to(device)
    tgt = tokens['labels'].to(device)
    loss = model(input_ids=ids, attention_mask=mask, labels=tgt).loss
    list_loss.append(loss.item())
  final_loss_mean = torch.tensor(list_loss).mean()
  return final_loss_mean

start_time = time.time()
for epoch in range(EPOCHS):
  model.train()
  for idx, sample in enumerate(train_loader):
    tokens = tokenizer.prepare_seq2seq_batch(src_texts=sample['input'], tgt_texts=sample['target'], max_length=MAX_INPUT_LENGTH, max_target_length=MAX_TARGET_LENGTH, padding='max_length', truncation=True, return_tensors='pt')
    ids = tokens['input_ids'].to(device)
    mask = tokens['attention_mask'].to(device)
    tgt = tokens['labels'].to(device)
    outputs = model(input_ids=ids, attention_mask=mask, labels=tgt)

    optimizer.zero_grad()
    loss = outputs.loss
    loss.backward()
    optimizer.step()

    # LOGGING
    if idx % 500 == 0:
      print("BATCH: %04d/%04d || Epoch: %04d/%04d || Loss: %.3f" % (idx, len(train_loader), epoch+1, EPOCHS, loss.item()))
  
  model.eval()
  with torch.set_grad_enabled(False):
    valid_loss = compute_loss(model, valid_loader, device)
    print('Valid Loss: %.3f' % (valid_loss))
  epoch_elapsed_time = (time.time() - start_time) / 60
  print('Epoch elapsed time: %.2f min' % (epoch_elapsed_time))
total_training_time = (time.time() - start_time) / 60
print('Total training time: ', total_training_time)

BATCH: 0000/2500 || Epoch: 0001/0005 || Loss: 8.682
BATCH: 0500/2500 || Epoch: 0001/0005 || Loss: 2.975
BATCH: 1000/2500 || Epoch: 0001/0005 || Loss: 2.942
BATCH: 1500/2500 || Epoch: 0001/0005 || Loss: 3.403
BATCH: 2000/2500 || Epoch: 0001/0005 || Loss: 2.813
Valid Loss: 2.732
Epoch elapsed time: 9.51 min
BATCH: 0000/2500 || Epoch: 0002/0005 || Loss: 3.236
BATCH: 0500/2500 || Epoch: 0002/0005 || Loss: 3.046
BATCH: 1000/2500 || Epoch: 0002/0005 || Loss: 3.084
BATCH: 1500/2500 || Epoch: 0002/0005 || Loss: 2.791
BATCH: 2000/2500 || Epoch: 0002/0005 || Loss: 2.896
Valid Loss: 2.663
Epoch elapsed time: 19.14 min
BATCH: 0000/2500 || Epoch: 0003/0005 || Loss: 3.093
BATCH: 0500/2500 || Epoch: 0003/0005 || Loss: 2.982
BATCH: 1000/2500 || Epoch: 0003/0005 || Loss: 3.227
BATCH: 1500/2500 || Epoch: 0003/0005 || Loss: 2.800
BATCH: 2000/2500 || Epoch: 0003/0005 || Loss: 2.911
Valid Loss: 2.617
Epoch elapsed time: 28.73 min
BATCH: 0000/2500 || Epoch: 0004/0005 || Loss: 2.687
BATCH: 0500/2500 || Epoch

In [33]:
model.eval()
with torch.set_grad_enabled(False):
  text = "Someone in"
  print("Your input: ", text)
  tokens = tokenizer(text, max_length=3, padding='max_length', return_tensors='pt')
  ids = tokens['input_ids'].to(device)
  mask = tokens['attention_mask'].to(device)
  outputs = model.generate(input_ids=ids, max_length=16, early_stopping=True)
  print("Your completed sentence: ", text,tokenizer.decode(outputs[0], skip_special_tokens=True))

Your input:  Someone in
Your completed sentence:  Someone in the United States is suspected of killing a woman in a car crash
