<a href="https://colab.research.google.com/github/vishal-burman/PyTorch-Architectures/blob/master/misc/t5_fluentQA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! nvidia-smi

In [1]:
! git clone https://github.com/google-research-datasets/Disfl-QA.git
! git clone https://github.com/vishal-burman/PyTorch-Architectures.git
%cd PyTorch-Architectures/
! cp ../Disfl-QA/*.json .

fatal: destination path 'Disfl-QA' already exists and is not an empty directory.
fatal: destination path 'PyTorch-Architectures' already exists and is not an empty directory.
/content/PyTorch-Architectures


In [None]:
! pip install transformers
! pip install datasets
! pip install sentencepiece
! pip install wget

In [2]:
from tqdm.auto import tqdm
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers.optimization import Adafactor, AdafactorSchedule
from toolkit.utils import get_optimal_batchsize, dict_to_device
from toolkit.metrics import nlp_compute_mean_loss

In [3]:
def get_samples(file_name: str):
  final_list = []
  with open(file_name, "r") as f:
    data = json.load(f)
  
  for key in list(data.keys()):
    sample = data[key]
    final_list.append((sample["original"], sample["disfluent"]))
  
  return final_list

In [4]:
train_list = get_samples("train.json")
valid_list = get_samples("dev.json")
test_list = get_samples("test.json")

In [5]:
class CustomDataset(Dataset):
  def __init__(self, 
               tokenizer, 
               list_samples: list,
               max_input_length: int = 16,
               max_target_length: int = 16,
               ):
    self.tokenizer = tokenizer
    self.list_samples = list_samples
    self.max_input_length = max_input_length
    self.max_target_length = max_target_length
  
  def __len__(self,):
    return len(self.list_samples)
  
  def __getitem__(self, idx):
    orig, df = self.list_samples[idx]
    return (orig, df)
  
  def collate_fn(self, batch):
    list_orig = []
    list_df = []
    
    for sample in batch:
      list_orig.append(sample[0])
      list_df.append(sample[1])
    
    tokens_target = self.tokenizer(
        list_orig,
        max_length=self.max_input_length,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    tokens_input = self.tokenizer(
        list_df,
        max_length=self.max_input_length,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )

    return {
        "input_ids": tokens_input["input_ids"],
        "attention_mask": tokens_input["attention_mask"],
        "labels": tokens_target["input_ids"],
    }

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_str = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_str)
model = T5ForConditionalGeneration.from_pretrained(model_str)
model.to(device)

In [7]:
params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable Parameters: {params}")

Trainable Parameters: 60506624


In [8]:
dataset_train = CustomDataset(tokenizer=tokenizer,
                              list_samples=train_list,
                              max_input_length=64,
                              max_target_length=64,
                              )

dataset_valid = CustomDataset(tokenizer=tokenizer,
                              list_samples=valid_list,
                              max_input_length=64,
                              max_target_length=64,
                              )

In [None]:
# get_optimal_batchsize(dataset_train, model) --> 192(fp32)

192

In [9]:
# Hyperparameter section
BS = 128
EPOCHS = 5

In [10]:
# Sanity check forward pass
sample_loader = DataLoader(dataset=dataset_train,
                           batch_size=4,
                           collate_fn=dataset_train.collate_fn,
                           )
model.eval()
with torch.set_grad_enabled(False):
  for sample in sample_loader:
    outputs = model(**dict_to_device(sample, device=device))
    loss = outputs.loss.item()
    print(f"Loss: {loss:.2f}")
    break

Loss: 3.05


In [11]:
train_loader = DataLoader(dataset_train,
                          batch_size=BS,
                          shuffle=True,
                          collate_fn=dataset_train.collate_fn,
                          )

valid_loader = DataLoader(dataset_valid,
                          batch_size=BS,
                          shuffle=True,
                          collate_fn=dataset_valid.collate_fn,
                          )
print(f"Length of Train Loader: {len(train_loader)}")
print(f"Length of Valid Loader: {len(valid_loader)}")

Length of Train Loader: 57
Length of Valid Loader: 8


In [12]:
optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)
scheduler = AdafactorSchedule(optimizer)

In [13]:
num_training_steps = len(train_loader) * EPOCHS
progress_bar = tqdm(range(num_training_steps))

for epoch in range(EPOCHS):
  model.train()
  for sample in train_loader:
    outputs = model(**dict_to_device(sample, device=device))
    loss = outputs.loss
    loss.backward()

    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
    progress_bar.update(1)

  model.eval()
  with torch.set_grad_enabled(False):
    valid_loss = nlp_compute_mean_loss(model, valid_loader, device)
  print(f"Valid Perplexity: {valid_loss: .2f}")

  0%|          | 0/285 [00:00<?, ?it/s]

Valid Perplexity:  6.89
Valid Perplexity:  1.93
Valid Perplexity:  1.34
Valid Perplexity:  1.21
Valid Perplexity:  1.19


In [30]:
index = 7
text = valid_list[index][1]
model.eval()
with torch.set_grad_enabled(False):
  tokens = tokenizer(text, return_tensors="pt")
  input_ids = tokens["input_ids"].to(device)
  outputs = model.generate(input_ids)
  print(f"Original Text: {text}")
  print(f"Predicted Text: {tokenizer.decode(outputs[0], skip_special_tokens=True)}")

Original Text: When or uh what did European empires rely on to supply them with resources?
Predicted Text: What did European empires depend on to supply them with resources?
