<a href="https://colab.research.google.com/github/vishal-burman/PyTorch-Architectures/blob/master/modeling_NNLM/test_sample_NNLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the repository; later cells change into its modeling_NNLM directory and import `NNLM` from there.
! git clone https://github.com/vishal-burman/PyTorch-Architectures.git

In [None]:
! pip install datasets

In [1]:
# Work from the model directory of the cloned repository (presumably so that `from model import NNLM` resolves).
%cd PyTorch-Architectures/modeling_NNLM/

/content/PyTorch-Architectures/modeling_NNLM


In [2]:
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from model import NNLM

from datasets import load_dataset
# Load the 'quora' dataset; the sentence-collection loop reads its 'train' split.
dataset = load_dataset('quora')

Using custom data configuration default
Reusing dataset quora (/root/.cache/huggingface/datasets/quora/default/0.0.0/2be517cf0ac6de94b77a103a36b141347a13f40637fbebaccb56ddbe397876be)


In [3]:
# Corpus-building knobs, named so they can be tuned in one place.
MAX_SENTENCES = 10000  # stop collecting once this many sentences are kept
MIN_WORDS = 4          # shorter sentences cannot supply 3 context words + 1 target word

# Collect the first question of each record until MAX_SENTENCES usable sentences are found.
sentences = []
for sample in dataset['train']:
  if len(sentences) == MAX_SENTENCES:
    break
  sent = sample['questions']['text'][0]
  # Whitespace tokenisation; keep only sentences long enough for one training example.
  if len(sent.split()) >= MIN_WORDS:
    sentences.append(sent)

In [4]:
# Build the vocabulary from every whitespace-separated token in the corpus.
# The unique words are sorted so that the word -> id mapping is identical on
# every run; iterating a bare set of strings depends on per-process hash
# randomisation, which would silently change ids between kernel restarts.
word_list = sorted(set(' '.join(sentences).split()))

# word_dict maps word -> id, number_dict is the inverse mapping id -> word.
word_dict = {w: i for i, w in enumerate(word_list)}
number_dict = {i: w for w, i in word_dict.items()}
# Number of output classes of the language model equals the vocabulary size.
n_class = len(word_dict)
print('Vocabulary Size: ', n_class)

Vocabulary Size:  18198


In [5]:
class CustomDataset(Dataset):
  """Next-word-prediction dataset built from raw sentences.

  Each item uses the first `max_inp_length` whitespace-separated words of a
  sentence: all but the last are the input context, the last is the target.

  Args:
    list_sentences: sequence of sentence strings.
    max_inp_length: number of leading words used per sentence
      (context words + 1 target word).
    vocab: optional word -> id mapping. When omitted, the module-level
      `word_dict` is looked up at tokenisation time (the original behaviour).
  """

  def __init__(self, list_sentences, max_inp_length=4, vocab=None):
    self.list_sentences = list_sentences
    self.max_inp_length = max_inp_length
    self.vocab = vocab
  
  def __len__(self):
    """Return the number of sentences in the dataset."""
    return len(self.list_sentences)
  
  def __getitem__(self, idx):
    """Return the tokenised sample for sentence `idx`.

    Returns:
      dict with 'input_batch' (tensor of shape (1, n_context)) and
      'target_batch' (tensor of shape (1,)).
    """
    tokens = self.tokenize_into_tensors(self.list_sentences[idx])
    return {
        'input_batch': tokens['inp_batch'],
        'target_batch': tokens['tgt_batch'],
    }
  
  def tokenize_into_tensors(self, sentence):
    """Convert one sentence into context ids and a target id.

    Raises:
      KeyError: if a word is missing from the vocabulary.
      IndexError: if `sentence` contains no words.
    """
    # Resolve the vocabulary lazily so the global fallback keeps working.
    vocab = word_dict if self.vocab is None else self.vocab
    words = sentence.split()[:self.max_inp_length]
    input_tokens = [vocab[w] for w in words[:-1]]
    target_token = vocab[words[-1]]
    # The extra leading dimension of size 1 is kept on purpose: downstream
    # code removes it with `.squeeze(1)` after batching.
    return {
        'inp_batch': torch.tensor([input_tokens]),
        'tgt_batch': torch.tensor([target_token]),
    }

In [6]:
# Index of the 90% mark: everything before it trains, the tail validates.
lim = (len(sentences) * 90) // 100
train_sentences, valid_sentences = sentences[:lim], sentences[lim:]
print('Train Samples: ', len(train_sentences))
print('Valid Samples: ', len(valid_sentences))

Train Samples:  9000
Valid Samples:  1000


In [7]:
# Wrap both splits with the same window size (first 4 words of each sentence).
train_dataset, valid_dataset = (
    CustomDataset(split, max_inp_length=4)
    for split in (train_sentences, valid_sentences)
)

In [8]:
# Hyperparameters
m = 200  # passed to NNLM as `m`; the printed model shows it as the embedding width
n_hidden = 100  # passed to NNLM as `n_hidden`; the printed model shows it as the hidden layer width
n_step = 3  # passed to NNLM as `n_step`; presumably the number of context words per prediction
BATCH_SIZE = 32  # samples per batch for both DataLoaders
LEARNING_RATE = 0.001  # learning rate given to the SGD optimizer
EPOCHS = 10  # number of passes over the training loader

In [9]:
# Use the first CUDA device when available, otherwise the CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
# Build the language model and place its parameters on the chosen device.
model = NNLM(n_class=n_class, m=m, n_hidden=n_hidden, n_step=n_step).to(device)
# Trailing expression so the notebook displays the module summary.
model

NNLM(
  (C): Embedding(18198, 200)
  (H): Linear(in_features=600, out_features=100, bias=False)
  (U): Linear(in_features=100, out_features=18198, bias=False)
  (W): Linear(in_features=600, out_features=18198, bias=False)
)

In [10]:
# Training batches are reshuffled every epoch; validation keeps a fixed order.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Sanity check DataLoader: inspect a single batch only.
# `sample` is deliberately left bound so the next cell can reuse this batch.
for sample in train_loader:
  assert sample['input_batch'].squeeze(1).ndim == 2
  assert sample['target_batch'].ndim == 2
  break

print('Length of Train Loader: ', len(train_loader))
print('Length of Valid Loader: ', len(valid_loader))

Length of Train Loader:  282
Length of Valid Loader:  32


In [11]:
# Sanity check model outputs
model.eval()
with torch.set_grad_enabled(False):
  # `sample` is the batch left bound by the DataLoader sanity-check loop.
  # DataLoader batches live on the CPU, so move the input to the model's
  # device before the forward pass (required when the model is on CUDA).
  outputs = model(sample['input_batch'].squeeze(1).to(device))
  # One logit per vocabulary entry is expected for every sample.
  assert outputs.size(1) == n_class

In [12]:
# Plain SGD over all model parameters, using the learning rate from the hyperparameter cell.
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)

In [13]:
def compute_loss(model, data_loader, device):
  """Return the average cross-entropy loss of `model` over `data_loader`.

  Args:
    model: callable mapping a batch of inputs to logits of shape
      (batch, n_class).
    data_loader: iterable of dicts holding 'input_batch' and 'target_batch'
      tensors, each with a singleton dimension at index 1.
    device: device every batch is moved to before the forward pass; it
      should be the device the model lives on.

  Returns:
    0-dim float tensor with the per-sample mean loss (NaN when
    `data_loader` yields no samples).
  """
  total_loss = 0.0
  n_samples = 0
  with torch.no_grad():
    for sample in data_loader:
      # Batches are created on the CPU; the model may live on a GPU.
      features = sample['input_batch'].squeeze(1).to(device)
      targets = sample['target_batch'].squeeze(1).to(device)

      logits = model(features)
      # Accumulate the summed loss so that a smaller final batch is weighted
      # by its actual size instead of counting as a full batch.
      total_loss += F.cross_entropy(logits, targets, reduction='sum').item()
      n_samples += targets.size(0)
  if n_samples == 0:
    return torch.tensor(float('nan'))
  return torch.tensor(total_loss / n_samples)

# Train for EPOCHS passes; after each pass report the loss on both splits.
start_time = time.time()
for epoch in range(EPOCHS):
  model.train()
  for idx, sample in enumerate(train_loader):
    # DataLoader batches are created on the CPU; move them to the device the
    # model was placed on, otherwise the forward pass fails on CUDA.
    features = sample['input_batch'].squeeze(1).to(device)
    targets = sample['target_batch'].squeeze(1).to(device)

    logits = model(features)
    loss = F.cross_entropy(logits, targets)

    # Clear stale gradients, backpropagate, then apply the update.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # LOGGING
    if idx % 50 == 0:
      print('Batch: %04d/%04d || Epoch: %04d/%04d || Loss: %.2f' % (idx, len(train_loader), epoch+1, EPOCHS, loss.item()))
  
  # Evaluate with the model in eval mode and gradients disabled.
  model.eval()
  with torch.set_grad_enabled(False):
    train_loss = compute_loss(model, train_loader, device)
    valid_loss = compute_loss(model, valid_loader, device)

    print('Train Loss: %.2f' % (train_loss.item()))
    print('Valid Loss: %.2f' % (valid_loss.item()))
  # Measured from the start of training, so this value is cumulative across epochs.
  epoch_elapsed_time = (time.time() - start_time) / 60
  print('Epoch Elapsed Time: %.2f min' % (epoch_elapsed_time))
total_training_time = (time.time() - start_time) / 60
print('Total Training Time: %.2f min' % (total_training_time))

Batch: 0000/0282 || Epoch: 0001/0010 || Loss: 9.95
Batch: 0050/0282 || Epoch: 0001/0010 || Loss: 9.98
Batch: 0100/0282 || Epoch: 0001/0010 || Loss: 9.90
Batch: 0150/0282 || Epoch: 0001/0010 || Loss: 9.46
Batch: 0200/0282 || Epoch: 0001/0010 || Loss: 9.51
Batch: 0250/0282 || Epoch: 0001/0010 || Loss: 9.49
Train Loss: 9.22
Valid Loss: 9.22
Epoch Elapsed Time: 0.65 min
Batch: 0000/0282 || Epoch: 0002/0010 || Loss: 9.35
Batch: 0050/0282 || Epoch: 0002/0010 || Loss: 9.36
Batch: 0100/0282 || Epoch: 0002/0010 || Loss: 8.77
Batch: 0150/0282 || Epoch: 0002/0010 || Loss: 8.60
Batch: 0200/0282 || Epoch: 0002/0010 || Loss: 8.47
Batch: 0250/0282 || Epoch: 0002/0010 || Loss: 9.19
Train Loss: 8.90
Valid Loss: 8.90
Epoch Elapsed Time: 1.28 min
Batch: 0000/0282 || Epoch: 0003/0010 || Loss: 9.78
Batch: 0050/0282 || Epoch: 0003/0010 || Loss: 8.93
Batch: 0100/0282 || Epoch: 0003/0010 || Loss: 7.81
Batch: 0150/0282 || Epoch: 0003/0010 || Loss: 8.87
Batch: 0200/0282 || Epoch: 0003/0010 || Loss: 9.08
Batch: 

In [21]:
# Predict the word that follows a short prompt.
model.eval()
with torch.set_grad_enabled(False):
  # Every prompt word must exist in `word_dict` (a KeyError is raised otherwise);
  # the prompt length presumably has to equal n_step.
  text = "What will be".split()
  input_tokens = [word_dict[n] for n in text]
  # Add a batch dimension and move the ids to the model's device so the
  # forward pass also works when the model is on CUDA.
  input_tokens = torch.tensor(input_tokens).unsqueeze(0).to(device)
  logits = model(input_tokens)
  probas = F.softmax(logits, dim=1)
  # Highest-probability vocabulary entry for the single prompt in the batch.
  _, predicted_word_idx = torch.max(probas, 1)
  print('Your input --> ', ' '.join(text))
  print('Predicted next token --> ', number_dict[predicted_word_idx.item()])

Your input -->  What will be
Predicted next token -->  the
