In [1]:
import argparse
import logging
import time
import torch
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
from torchtext.data.utils import get_tokenizer, ngrams_iterator
from torchtext.datasets import DATASETS
from torchtext.prototype.transforms import load_sp_model, PRETRAINED_SP_MODEL, SentencePieceTokenizer
from torchtext.utils import download_from_url
from torchtext.vocab import build_vocab_from_iterator
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from torchviz import make_dot
from tqdm import tqdm

### Information
- torchtext repo: https://github.com/pytorch/text/tree/main/torchtext
- torchtext documentation: https://pytorch.org/text/stable/index.html

### Constants

In [2]:
DATASET = "WikiText2"
DATA_DIR = ".data"
DEVICE = "mps"
LR = 4.0
BATCH_SIZE = 16
NUM_EPOCHS = 5
MIN_FREQUENCY = 5
PADDING_VALUE = 0
PADDING_IDX = PADDING_VALUE

# n-gram level
n = 3
# Hidden layer dimension
h = 100
# Word embedding dimension
m = 100

### Get the tokenizer

In [3]:
basic_english_tokenizer = get_tokenizer("basic_english")

In [4]:
basic_english_tokenizer("This is some text ...")

['this', 'is', 'some', 'text', '.', '.', '.']

In [5]:
# Needed later.
TOKENIZER = basic_english_tokenizer

### Get the data and get the vocabulary.

In [6]:
def yield_tokens(data_iter):
    for text in data_iter:
        yield TOKENIZER(text)

In [7]:
train_iter = DATASETS[DATASET](root=DATA_DIR, split="train")
VOCAB = build_vocab_from_iterator(yield_tokens(train_iter), min_freq = MIN_FREQUENCY, specials=['<unk>'])

# Set the default index to 1. Otherwise, VOCAB['unknownbigword'] will raise an Exception.
VOCAB.set_default_index(VOCAB['<unk>'])

Examples

In [8]:
VOCAB['yoyooyoyoy'], VOCAB['house'], VOCAB['<pad>'], VOCAB['<unk>']

(0, 324, 0, 0)

In [9]:
print(len(VOCAB))

20409


In [10]:
VOCAB.lookup_indices(TOKENIZER("House house houses ThisisnotaKNownWord"))

[324, 324, 1374, 0]

### Helper functions

In [8]:
def text_pipeline(x):
    return VOCAB(TOKENIZER(x))

Nice link on collate_fn and DataLoader in PyTorch: https://python.plainenglish.io/understanding-collate-fn-in-pytorch-f9d1742647d3

In [9]:
def collate_batch(batch):
    source_list, target_list = [], []
        
    for sentence in batch:
                        
        tokens = text_pipeline(sentence)
        
        for i in range(len(tokens)):
            if i + n -1 <= len(tokens) - 1:
                source, target = tokens[i:i+n-1], tokens[i+n-1]
                source_list.append(torch.tensor(source))
                target_list.append(torch.tensor(target))
            else:
                break
                
    source_list = torch.vstack(source_list) if source_list else torch.empty()
    target_list = torch.vstack(target_list) if target_list else torch.empty()
            
    return source_list.to(DEVICE), target_list.to(DEVICE)

### Set up the model
- nn.Embedding(V, D) is like a hash map.
- It takes in data generally of the shape N X T where N is the batch size and returns a tensor of shape N X T X D.

In [10]:
# Data is of size 16 by 10 with a vocabulary of size 10.
# Imagine that each token / word has a mapping of the sort {word -> id}.
x = torch.randint(0, 10, (16, 10))
e = nn.Embedding(10, 5)
print(x.shape)

torch.Size([16, 10])


In [22]:
x.shape

torch.Size([16, 10])

In [14]:
e(torch.tensor(3))

tensor([ 0.5317, -0.2339,  0.5199,  0.0933,  0.5804],
       grad_fn=<EmbeddingBackward0>)

In [15]:
F.one_hot(torch.tensor(3), num_classes=10).shape

torch.Size([10])

In [16]:
F.one_hot(torch.tensor(3), num_classes=10).float() @ e.weight

tensor([ 0.5317, -0.2339,  0.5199,  0.0933,  0.5804],
       grad_fn=<SqueezeBackward3>)

In [17]:
e = nn.Embedding(10, 5)
# N X T X D - PyTorch is smart enough to realize you are passing in a batch.
print(e(x).shape)

torch.Size([16, 10, 5])


In [18]:
# One of the first Neural language models!
class NeuralLanguageModel(nn.Module):
    def __init__(self, V, m, h, n):
        super(NeuralLanguageModel, self).__init__()
        
        # Vocabulary size.
        self.V = V
        
        # Embedding dimension, per word.
        self.m = m
        
        # Hidden dimension.
        self.h = h
        
        # n in "n-gram".
        self.n = n
        
        # Can you change all this stuff to use nn.Linear?
        # Can also use nn.Parameter(torch.zeros(V, m)) for self.C but then we need one-hot and this is slow.
        self.C = nn.Embedding(V, m)
        
        # nn.Linear((n-1) * m, h, bias=False) would give the same thing for the first one below, and similarly later. 
        self.H = nn.Parameter(torch.zeros((n-1) * m, h))
        self.W = nn.Parameter(torch.zeros((n-1) * m, V))
        self.U = nn.Parameter(torch.zeros(h, V))
        
        self.b = torch.nn.Parameter(torch.ones(V))
        self.d = torch.nn.Parameter(torch.ones(h))
        
    def forward(self, x):
        
        # x is initially of dimension N X n-1 since batch is size N and context is of size n-1.
        
        # N X (n-1) X m 
        x = self.C(x)
        
        # N
        B = x.shape[0]
        
        # N X (n-1) * m
        x = x.view(B, -1)
    
        # N X V
        y = self.b + torch.matmul(x, self.W) + torch.matmul(nn.Tanh()(self.d + torch.matmul(x, self.H)), self.U)
        
        return y

### Set up the 

In [19]:
criterion = torch.nn.CrossEntropyLoss().to(DEVICE)
model = NeuralLanguageModel(len(VOCAB), m, h, n).to(DEVICE)
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)

### Set up the data

In [20]:
train_iter = DATASETS[DATASET](root=DATA_DIR, split="train")
test_iter = DATASETS[DATASET](root=DATA_DIR, split="test")

train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = random_split(train_dataset, [num_train, len(train_dataset) - num_train])

train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)

### Train the model

In [21]:
def train(dataloader, model, optimizer, criterion, epoch):
    model.train()
    total_loss, total_batches = 0.0, 0.0
    log_interval = 100

    for idx, (x, y) in tqdm(enumerate(dataloader)):
        optimizer.zero_grad()
        
        if x.nelement() == 0:
            continue
        
        logits = model(x)
                        
        # Get the loss.
        loss = criterion(input=logits, target=y.squeeze(-1))

        # Do back propagation.
        loss.backward()
                        
        # Clip the gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        
        # Do an optimization step.
        optimizer.step()
        total_loss += loss.item()
        total_batches += 1
                
        if idx % log_interval == 0 and idx > 0:
            perplexity = torch.exp(torch.tensor(total_loss / total_batches)).item()
            print(
                "| epoch {:3d} "
                "| {:5d}/{:5d} batches "
                "| perplexity {:8.3f} "
                "| loss {:8.3f} "
                .format(
                    epoch,
                    idx,
                    len(dataloader),
                    perplexity,
                    total_loss / total_batches,
                )
            )
            total_loss, total_batches = 0.0, 0

In [22]:
def evaluate(dataloader, model, criterion):
    model.eval()
    total_loss, total_batches = 0.0, 0

    with torch.no_grad():
        for idx, (x, y) in enumerate(dataloader):
            logits = model(x)
            total_loss += criterion(input=logits, target=y.squeeze(-1)).item()
            total_batches += 1
    return total_loss / total_batches, torch.exp(torch.tensor(total_loss / total_batches)).item()

In [23]:
for epoch in range(1, NUM_EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader, model, optimizer, criterion, epoch)
    loss_val, perplexity_val = evaluate(valid_dataloader, model, criterion)
    scheduler.step()
    print("-" * 59)
    print(
        "| end of epoch {:3d} "
        "| time: {:5.2f}s "
        "| valid perplexity {:8.3f} "
        "| valid loss {:8.3f}".format(
            epoch,
            time.time() - epoch_start_time,
            perplexity_val,
            loss_val
        )
    )
    print("-" * 59)

print("Checking the results of test dataset.")
loss_test, perplexity_test = evaluate(test_dataloader, model, criterion)
print("test perplexity {:8.3f} | test loss {:8.3f} ".format(perplexity_test, loss_test))

102it [00:12,  6.63it/s]

| epoch   1 |   100/ 2181 batches | perplexity 1355.038 | loss    7.212 


201it [00:30,  6.54it/s]

| epoch   1 |   200/ 2181 batches | perplexity  821.589 | loss    6.711 


301it [00:54,  3.17it/s]

| epoch   1 |   300/ 2181 batches | perplexity  710.921 | loss    6.567 


403it [01:22,  4.79it/s]

| epoch   1 |   400/ 2181 batches | perplexity  658.290 | loss    6.490 


501it [01:54,  2.84it/s]

| epoch   1 |   500/ 2181 batches | perplexity  627.470 | loss    6.442 


600it [02:31,  2.07it/s]

| epoch   1 |   600/ 2181 batches | perplexity  611.853 | loss    6.416 


701it [03:16,  2.50it/s]

| epoch   1 |   700/ 2181 batches | perplexity  585.083 | loss    6.372 


801it [03:59,  2.75it/s]

| epoch   1 |   800/ 2181 batches | perplexity  554.135 | loss    6.317 


901it [04:36,  2.70it/s]

| epoch   1 |   900/ 2181 batches | perplexity  557.891 | loss    6.324 


1002it [05:17,  2.08it/s]

| epoch   1 |  1000/ 2181 batches | perplexity  560.686 | loss    6.329 


1102it [05:53,  2.79it/s]

| epoch   1 |  1100/ 2181 batches | perplexity  536.561 | loss    6.285 


1201it [06:32,  2.14it/s]

| epoch   1 |  1200/ 2181 batches | perplexity  513.865 | loss    6.242 


1302it [07:17,  7.51it/s]

| epoch   1 |  1300/ 2181 batches | perplexity  515.425 | loss    6.245 


1402it [07:58,  3.33it/s]

| epoch   1 |  1400/ 2181 batches | perplexity  490.106 | loss    6.195 


1502it [08:34,  2.43it/s]

| epoch   1 |  1500/ 2181 batches | perplexity  488.864 | loss    6.192 


1578it [09:03,  2.90it/s]


KeyboardInterrupt: 

## Questions:
- What is wrong with this implementation?
- Preprocess! ... Or Normalize Correctly!
- My batches are sentences, and a sentence might give rise to many (x, y) pairs.
- Each sentence - batch has 16 sentences.
- I.e. if I have batch 1 maybe it gives rise to 200 (x, y) pairs and another gives rise to 100 (x, y) pairs.
- Assume the sum of the cross-etropy is $L_1$ for the first batch and $L_2$ for the second batch.
- If I have two batches, one of size 200 and another of size 100, I get ($L_1$/200 + $L_2$/100) / 2 != ($L_1$ + $L_2$)(200 + 100)
- Can you fix this?
- HW!