<a href="https://colab.research.google.com/github/vishal-burman/PyTorch-Architectures/blob/master/modeling_NNLM/test_sample_NNLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install datasets

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from datasets import load_dataset
# Load the 'quora' dataset via `datasets.load_dataset`; later cells read its
# 'train' split.
dataset = load_dataset('quora')

Using custom data configuration default
Reusing dataset quora (/root/.cache/huggingface/datasets/quora/default/0.0.0/2be517cf0ac6de94b77a103a36b141347a13f40637fbebaccb56ddbe397876be)


In [2]:
# Corpus limits, named so they can be tuned in one place.
# MIN_WORDS equals CustomDataset's default max_inp_length (4), so every kept
# sentence yields a full window: 3 input words plus 1 target word.
MAX_SENTENCES = 10000
MIN_WORDS = 4

sentences = []
for sample in dataset['train']:
  # Stop scanning as soon as enough sentences have been collected.
  if len(sentences) == MAX_SENTENCES:
    break
  # Only the first text entry of each sample's 'questions' field is used.
  sent = sample['questions']['text'][0]
  if len(sent.split()) >= MIN_WORDS:
    sentences.append(sent)

In [3]:
# Vocabulary: every distinct whitespace-separated token in the corpus.
# The unique tokens are sorted so the word <-> index mapping is the same on
# every run. Iterating a bare set of strings depends on string hashing, which
# Python randomizes per process, so ids would otherwise change between kernel
# restarts.
word_list = sorted(set(' '.join(sentences).split()))

word_dict = {w: i for i, w in enumerate(word_list)}    # token -> id
number_dict = {i: w for i, w in enumerate(word_list)}  # id -> token
n_class = len(word_dict)
print('Vocabulary Size: ', n_class)

Vocabulary Size:  18198


In [4]:
class CustomDataset(Dataset):
  """Dataset that turns each sentence into one context/target example.

  A sentence is split on whitespace and truncated to its first
  ``max_inp_length`` words. All but the last of those words become the input
  ids; the last word becomes the target id.

  Args:
    list_sentences: indexable collection of sentence strings.
    max_inp_length: number of leading words kept per sentence (the input
      words plus the single target word).
    vocab: optional word -> index mapping. When omitted, the module-level
      ``word_dict`` is looked up at tokenization time.
  """

  def __init__(self, list_sentences, max_inp_length=4, vocab=None):
    self.list_sentences = list_sentences
    self.max_inp_length = max_inp_length
    self.vocab = vocab

  def __len__(self):
    """Return the number of sentences held by the dataset."""
    return len(self.list_sentences)

  def __getitem__(self, idx):
    """Return a dict with the 'input_batch' and 'target_batch' tensors for sentence ``idx``."""
    tokens = self.tokenize_into_tensors(self.list_sentences[idx])
    return {
        'input_batch': tokens['inp_batch'],
        'target_batch': tokens['tgt_batch'],
    }

  def tokenize_into_tensors(self, sentence):
    """Convert one sentence into id tensors.

    Args:
      sentence: whitespace-separated string.

    Returns:
      Dict with 'inp_batch' (shape ``(1, k - 1)``) and 'tgt_batch' (shape
      ``(1,)``), where ``k`` is the number of words kept after truncation.

    Raises:
      IndexError: if no words remain after splitting and truncation.
      KeyError: if a word is missing from the vocabulary.
    """
    # Resolve the fallback lazily so the global mapping may be defined (or
    # rebuilt) after the dataset object is constructed.
    vocab = word_dict if self.vocab is None else self.vocab
    words = sentence.split()[:self.max_inp_length]
    if not words:
      raise IndexError('cannot build an example from a sentence with no words')
    input_tokens = [vocab[w] for w in words[:-1]]
    target_token = vocab[words[-1]]
    # Each result is wrapped in one extra list level, which gives the tensors
    # a leading dimension of size 1.
    return {
        'inp_batch': torch.tensor([input_tokens]),
        'tgt_batch': torch.tensor([target_token]),
    }

In [7]:
# 90/10 split in corpus order: the first 90% of the sentences are used for
# training and the remainder is held out for validation.
lim = (len(sentences) * 90) // 100
train_sentences, valid_sentences = sentences[:lim], sentences[lim:]
print('Train Samples: ', len(train_sentences))
print('Valid Samples: ', len(valid_sentences))

Train Samples:  9000
Valid Samples:  1000


In [8]:
# Wrap each split in a CustomDataset using the default max_inp_length.
train_dataset, valid_dataset = (
    CustomDataset(train_sentences),
    CustomDataset(valid_sentences),
)