<a href="https://colab.research.google.com/github/vishal-burman/PyTorch-Architectures/blob/master/misc/t5_fluentQA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Fetch the Disfl-QA dataset repository and the PyTorch-Architectures code repository.
# On a re-run where the directories already exist, git prints a "destination path
# already exists" error for each clone and the cell continues.
! git clone https://github.com/google-research-datasets/Disfl-QA.git
! git clone https://github.com/vishal-burman/PyTorch-Architectures.git
# Make the code repository the working directory for the rest of the notebook.
%cd PyTorch-Architectures/
# Copy the JSON files from the top level of the Disfl-QA checkout into the working
# directory; later cells open train.json, dev.json and test.json by relative path.
! cp ../Disfl-QA/*.json .

fatal: destination path 'Disfl-QA' already exists and is not an empty directory.
fatal: destination path 'PyTorch-Architectures' already exists and is not an empty directory.
/content/PyTorch-Architectures


In [None]:
# Install the third-party packages used by this notebook.
# NOTE(review): versions are not pinned, so results may differ between runs — consider pinning.
! pip install transformers
! pip install datasets
! pip install sentencepiece

In [2]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [3]:
def get_samples(file_name: str):
  """Load (original, disfluent) text pairs from a JSON file.

  The file must contain a single JSON object. Each value of that object is
  itself an object with "original" and "disfluent" entries; the keys of the
  outer object are ignored.

  Args:
    file_name: Path of the JSON file to read.

  Returns:
    A list of ``(original, disfluent)`` tuples, in the order the entries
    appear in the file. An empty JSON object yields an empty list.

  Raises:
    OSError: If the file cannot be opened.
    json.JSONDecodeError: If the file is not valid JSON.
    KeyError: If an entry lacks "original" or "disfluent".
  """
  # JSON text is UTF-8; pin the encoding so decoding does not depend on the
  # platform's locale default.
  with open(file_name, "r", encoding="utf-8") as f:
    data = json.load(f)

  return [(sample["original"], sample["disfluent"]) for sample in data.values()]

In [4]:
# Read the (original, disfluent) pairs of each split from the working directory.
train_list = get_samples(file_name="train.json")
valid_list = get_samples(file_name="dev.json")
test_list = get_samples(file_name="test.json")

In [5]:
class CustomDataset(Dataset):
  """Dataset of (original, disfluent) text pairs with a batch-level tokenizer.

  Items are returned as raw strings; tokenization happens once per batch in
  ``collate_fn`` (presumably passed as a ``DataLoader``'s ``collate_fn`` —
  confirm against the caller).

  Args:
    tokenizer: Callable invoked as
      ``tokenizer(texts, max_length=..., padding=True, truncation=True,
      return_tensors="pt")`` and returning a mapping with "input_ids" and
      "attention_mask" entries.
    list_samples: Sequence of ``(original, disfluent)`` pairs.
    max_input_length: ``max_length`` passed to the tokenizer for the
      original texts.
    max_target_length: ``max_length`` passed to the tokenizer for the
      disfluent texts.
  """

  def __init__(self, 
               tokenizer, 
               list_samples: list,
               max_input_length: int = 16,
               max_target_length: int = 16,
               ):
    self.tokenizer = tokenizer
    self.list_samples = list_samples
    self.max_input_length = max_input_length
    self.max_target_length = max_target_length
  
  def __len__(self,):
    """Return the number of stored pairs."""
    return len(self.list_samples)
  
  def __getitem__(self, idx):
    """Return the ``(original, disfluent)`` pair stored at ``idx``."""
    orig, df = self.list_samples[idx]
    return (orig, df)
  
  def collate_fn(self, batch):
    """Tokenize a batch of ``(original, disfluent)`` pairs.

    Args:
      batch: Iterable of pairs as produced by ``__getitem__``.

    Returns:
      A dict with:
        "orig_ids": "input_ids" of the tokenized original texts,
        "orig_mask": "attention_mask" of the tokenized original texts,
        "df_ids": "input_ids" of the tokenized disfluent texts.
      The attention mask of the disfluent texts is not returned.
    """
    list_orig = [sample[0] for sample in batch]
    list_df = [sample[1] for sample in batch]
    
    tokens_input = self.tokenizer(
        list_orig,
        max_length=self.max_input_length,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    # The disfluent side has its own length limit; using max_input_length
    # here would silently ignore max_target_length.
    tokens_target = self.tokenizer(
        list_df,
        max_length=self.max_target_length,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )

    return {
        "orig_ids": tokens_input["input_ids"],
        "orig_mask": tokens_input["attention_mask"],
        "df_ids": tokens_target["input_ids"],
    }

In [None]:
# Checkpoint name shared by the tokenizer and the model.
model_str = "t5-small"
# Use the first CUDA device when one is available; otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

tokenizer = T5Tokenizer.from_pretrained(model_str)
# Load the pretrained weights and move the module onto the chosen device.
model = T5ForConditionalGeneration.from_pretrained(model_str).to(device)
# Leave the module as the cell's last expression so its structure is displayed.
model