<a href="https://colab.research.google.com/github/vishal-burman/PyTorch-Architectures/blob/master/modeling_NNLM/test_sample_NNLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Clone the repository and make its NNLM folder the working directory.
# A later cell runs `from model import NNLM`, which presumably resolves to a
# module inside this folder -- so this cell has to run first on a fresh kernel.
! git clone https://github.com/vishal-burman/PyTorch-Architectures.git
%cd PyTorch-Architectures/modeling_NNLM/

In [None]:
! pip install datasets

In [None]:
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from datasets import load_dataset
from model import NNLM
# Load the 'quora' dataset through the `datasets` library; only its 'train'
# split is read by the cells below.
dataset = load_dataset('quora')

In [4]:
# Collect 10000 usable sentences: the first question of each record,
# kept only when it has at least 4 whitespace-separated words.
sentences = []
for sample in dataset['train']:
  first_question = sample['questions']['text'][0]
  if len(first_question.split()) < 4:
    continue
  sentences.append(first_question)
  if len(sentences) == 10000:
    break

In [5]:
# Vocabulary: every distinct whitespace-separated token found in `sentences`.
# The tokens are sorted because plain `set` iteration order depends on
# per-process string hash randomisation; without sorting, the same word would
# receive a different id after every kernel restart.
word_list = sorted(set(' '.join(sentences).split()))

# word -> id and id -> word lookups over the same ordering.
word_dict = {w: i for i, w in enumerate(word_list)}
number_dict = dict(enumerate(word_list))
n_class = len(word_dict)
print('Vocabulary Size: ', n_class)

Vocabulary Size:  18198


In [6]:
class CustomDataset(Dataset):
  """Map-style dataset that turns a sentence into a (context words, next word) pair.

  Each sentence is split on whitespace and truncated to its first
  `max_inp_length` words. All but the last of those words form the model
  input; the last one is the prediction target.

  Args:
    list_sentences: sequence of sentence strings.
    max_inp_length: number of leading words kept per sentence
      (context words + 1 target word).
    vocab: optional word -> id mapping. When omitted, the module-level
      `word_dict` is looked up at item-access time, which is the behaviour
      existing callers rely on.
  """

  def __init__(self, list_sentences, max_inp_length=4, vocab=None):
    self.list_sentences = list_sentences
    self.max_inp_length = max_inp_length
    self.vocab = vocab

  def __len__(self):
    """Number of sentences in the dataset."""
    return len(self.list_sentences)

  def __getitem__(self, idx):
    """Return the tensors for sentence `idx`.

    Returns:
      dict with 'input_batch' (LongTensor of shape (1, k-1)) and
      'target_batch' (LongTensor of shape (1,)), where k is the number of
      words kept from the sentence.
    """
    tokens = self.tokenize_into_tensors(self.list_sentences[idx])
    return {
        'input_batch': tokens['inp_batch'],
        'target_batch': tokens['tgt_batch'],
    }

  def tokenize_into_tensors(self, sentence):
    """Convert one sentence into id tensors.

    Raises:
      ValueError: if no word is left after splitting/truncation. This is
        raised instead of letting an IndexError escape, because an
        IndexError from `__getitem__` can be mistaken for "end of sequence"
        when the dataset is iterated directly.
      KeyError: if a word is missing from the vocabulary.
    """
    vocab = word_dict if self.vocab is None else self.vocab
    words = sentence.split()[:self.max_inp_length]
    if not words:
      raise ValueError(
          'sentence yields no tokens (empty sentence or max_inp_length < 1): %r' % (sentence,))
    input_tokens = [vocab[w] for w in words[:-1]]
    target_token = vocab[words[-1]]
    # The extra leading dimension on the input is kept on purpose: downstream
    # cells call `.squeeze(1)` on the collated batch.
    return {
        'inp_batch': torch.tensor([input_tokens]),
        'tgt_batch': torch.tensor([target_token]),
    }

In [7]:
# Ordered 90/10 split (no shuffling): the first 90% of the sentences are used
# for training, the remainder for validation.
lim = (len(sentences) * 90) // 100
train_sentences, valid_sentences = sentences[:lim], sentences[lim:]
print('Train Samples: ', len(train_sentences))
print('Valid Samples: ', len(valid_sentences))

Train Samples:  9000
Valid Samples:  1000


In [8]:
# Keep the first 3 words of every sentence: 2 context words plus the word to predict.
train_dataset, valid_dataset = [
    CustomDataset(split_sentences, max_inp_length=3)
    for split_sentences in (train_sentences, valid_sentences)
]

In [9]:
# Hyperparameters
# -- model --
n_step = 2      # number of context words fed to the model per sample
m = 200         # embedding size per word
n_hidden = 100  # width of the hidden layer
# -- optimisation --
EPOCHS = 5
BATCH_SIZE = 32
LEARNING_RATE = 1e-3

In [10]:
# Prefer the first CUDA device when one is visible; otherwise stay on the CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
model = NNLM(n_class=n_class, m=m, n_hidden=n_hidden, n_step=n_step).to(device)
model  # last expression of the cell: displays the module summary

NNLM(
  (C): Embedding(18198, 200)
  (H): Linear(in_features=400, out_features=100, bias=False)
  (U): Linear(in_features=100, out_features=18198, bias=False)
  (W): Linear(in_features=400, out_features=18198, bias=False)
)

In [18]:
# Only the training loader shuffles; validation batches keep a fixed order.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Sanity check DataLoader
for sample in train_loader:
  # Only the first batch is inspected; `sample` stays bound afterwards and is
  # reused by the model sanity-check cell.
  assert sample['input_batch'].squeeze(1).ndim == 2
  assert sample['target_batch'].ndim == 2
  break

In [21]:
# Sanity check model outputs
# Runs a single forward pass on the batch left over from the DataLoader check
# and verifies the model emits one score per vocabulary entry.
model.eval()
with torch.no_grad():
  # The model was moved to `device`, so the batch has to follow it there;
  # otherwise the forward pass fails whenever a GPU is in use.
  outputs = model(sample['input_batch'].squeeze(1).to(device))
  assert outputs.size(1) == n_class

In [None]:
# Adam over every model parameter, using the configured learning rate.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
def compute_loss(model, data_loader, device):
  """Return the mean per-sample cross-entropy of `model` over `data_loader`.

  Args:
    model: callable mapping a (batch, n_step) LongTensor to (batch, n_class) logits.
    data_loader: iterable of dicts with 'input_batch' and 'target_batch'
      tensors, as produced by the loaders in this notebook.
    device: device the batches are moved to before the forward pass.

  Returns:
    float: summed loss divided by the number of samples, or NaN when the
    loader yields no samples.

  Gradients are disabled inside; switching the model to eval mode is left to
  the caller.
  """
  total_loss = 0.0
  n_samples = 0
  with torch.no_grad():
    for sample in data_loader:
      features = sample['input_batch'].squeeze(1).to(device)
      targets = sample['target_batch'].reshape(-1).to(device)
      logits = model(features)
      # Sum (not mean) per batch so a smaller last batch is weighted correctly.
      total_loss += F.cross_entropy(logits, targets, reduction='sum').item()
      n_samples += targets.size(0)
  if n_samples == 0:
    return float('nan')
  return total_loss / n_samples

start_time = time.time()
for epoch in range(EPOCHS):
  model.train()
  for idx, sample in enumerate(train_loader):
    # Batches come out of the loader on the CPU; the model lives on `device`.
    features = sample['input_batch'].squeeze(1).to(device)
    # The loader yields targets as (batch, 1); cross_entropy expects (batch,).
    targets = sample['target_batch'].reshape(-1).to(device)

    logits = model(features)
    loss = F.cross_entropy(logits, targets)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # LOGGING
    if idx % 50 == 0:
      print('Batch: %04d/%04d || Epoch: %04d/%04d || Loss: %.2f' % (idx, len(train_loader), epoch+1, EPOCHS, loss.item()))

  # Validation after every epoch.
  model.eval()
  valid_loss = compute_loss(model, valid_loader, device)
  print('Epoch: %04d/%04d || Valid Loss: %.2f' % (epoch+1, EPOCHS, valid_loss))
  print('Time elapsed: %.2f min' % ((time.time() - start_time) / 60))