<a href="https://colab.research.google.com/github/vishal-burman/PyTorch-Architectures/blob/master/misc/SentenceSimilarity(DistilRoBERTa_DistilRoBERTa).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install transformers
! pip install datasets

In [None]:
# Clone the PyTorch-Architectures repository and make it the working directory.
# NOTE(review): both lines use relative paths, so re-running this cell after the
# %cd has taken effect would presumably clone into (and enter) a nested copy;
# run it once per fresh session.
! git clone https://github.com/vishal-burman/PyTorch-Architectures.git
%cd PyTorch-Architectures/

In [6]:
import random
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel

In [None]:
# Load the "quora" dataset. The split-building cell iterates dataset["train"] and
# reads each sample's "questions" -> "text" pair and its "is_duplicate" flag.
dataset = load_dataset("quora")

In [5]:
# Build a class-balanced training list and a held-out test list of question pairs.
# Every stored item is a tuple (text_1, text_2, label) with label 1 for duplicates
# and 0 otherwise.
TRAIN_PER_CLASS = 10000  # cap for duplicate pairs and, separately, for non-duplicate pairs
TEST_SIZE = 10000        # cap for held-out pairs
SHUFFLE_SEED = 42        # fixed seed so the training order is the same on every run

train_p = []    # duplicate (positive) training pairs
train_n = []    # non-duplicate (negative) training pairs
test_list = []  # held-out pairs of either class
# Not used in this cell; retained in case cells outside this view read them.
count_p, count_n = 0, 0
for sample in dataset["train"]:
  # Once every bucket is full nothing more can be appended, so stop scanning
  # instead of walking the rest of the split.
  if (len(train_p) >= TRAIN_PER_CLASS
      and len(train_n) >= TRAIN_PER_CLASS
      and len(test_list) >= TEST_SIZE):
    break
  pair = sample["questions"]["text"]
  text_1, text_2 = pair[0], pair[1]
  label = 1 if sample["is_duplicate"] else 0
  if label and len(train_p) < TRAIN_PER_CLASS:
    train_p.append((text_1, text_2, 1))
  elif not label and len(train_n) < TRAIN_PER_CLASS:
    train_n.append((text_1, text_2, 0))
  elif len(test_list) < TEST_SIZE:
    # NOTE(review): a pair only lands here after the training bucket for its own
    # class is full, so the test set's class mix depends on dataset order --
    # confirm this is intended.
    test_list.append((text_1, text_2, label))

train_list = train_p + train_n
# A dedicated seeded generator keeps the shuffle reproducible without touching
# the global `random` state.
random.Random(SHUFFLE_SEED).shuffle(train_list)
print(len(train_list), len(test_list))

20000 10000


In [None]:
# Checkpoint name shared by the tokenizer and both models.
path_str = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(path_str)
# Two separate model objects are created from the same checkpoint name:
# model_1 and model_2 are distinct instances, not one shared object.
model_1 = AutoModel.from_pretrained(path_str)
model_2 = AutoModel.from_pretrained(path_str)

In [32]:
class CustomDataset(Dataset):
  """Dataset of (text_1, text_2, is_duplicate) samples with a tokenizing collate step.

  Args:
    tokenizer: callable invoked on a list of strings with the keyword arguments
      ``max_length``, ``padding``, ``truncation`` and ``return_tensors``; its
      result must expose ``"input_ids"`` and ``"attention_mask"`` entries.
    list_samples: indexable collection whose items hold the first text at
      position 0, the second text at position 1 and the label at position 2.
    max_input_length: value forwarded to the tokenizer as ``max_length``.
  """

  def __init__(self, tokenizer, list_samples: list, max_input_length: int = 16):
    self.tokenizer = tokenizer
    self.list_samples = list_samples
    self.max_input_length = max_input_length

  def __len__(self):
    """Return the number of stored samples."""
    return len(self.list_samples)

  def __getitem__(self, idx):
    """Return the sample at ``idx`` as a dict of its two raw texts and its label."""
    record = self.list_samples[idx]
    first, second, label = record[0], record[1], record[2]
    return dict(text_1=first, text_2=second, is_duplicate=label)

  def _encode(self, texts):
    """Tokenize ``texts`` and return the ``(input_ids, attention_mask)`` pair."""
    encoded = self.tokenizer(
        texts,
        max_length=self.max_input_length,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    return encoded["input_ids"], encoded["attention_mask"]

  def collate_fn(self, batch):
    """Turn a list of ``__getitem__`` dicts into one dict of batched tensors.

    The first texts and the second texts are tokenized in two separate calls,
    so each side gets its own ids and attention mask; the labels become a
    ``torch.long`` tensor.
    """
    first_texts = [item["text_1"] for item in batch]
    second_texts = [item["text_2"] for item in batch]
    targets = [item["is_duplicate"] for item in batch]

    # Keep the call order: first-side texts are tokenized before second-side texts.
    ids_1, att_1 = self._encode(first_texts)
    ids_2, att_2 = self._encode(second_texts)

    return {
        "input_ids_1": ids_1,
        "attention_mask_1": att_1,
        "input_ids_2": ids_2,
        "attention_mask_2": att_2,
        "labels": torch.tensor(targets, dtype=torch.long),
    }