### Simple Language Models Continued...

In [1]:
# load dataset and store the words in a list
words = open('names.txt', 'r').read().split()

print(f"Total number of words: {len(words)}")
print(f"Shortest word: {min([len(w) for w in words])}")
print(f"Longest word: {max([len(w) for w in words])}")

Total number of words: 32033
Shortest word: 2
Longest word: 15


#### We looked at the count based bi-gram language model and the equivalent simple neural network based bi-gram model which takes in a one-hot encoded character as input. Now we will try to build a language model using a neural network with one hidden layer which learns to predict the next character giving a sequence of multiple previous characters, so larger context window than bi-gram model. We will also represent each character of the vocabulary with a learned embedding vector.  

In [2]:
# now lets create an indexed vocabulary of characters
pad_token = '*'
vocab = sorted(set([pad_token] + list("".join(words))))
ctoi = {vocab[i]:i for i in range(len(vocab))}
ctoi[pad_token] = 0 # special token for padding
itoc = {i:vocab[i] for i in range(len(vocab))}
print(f"Vocabulary: {vocab}")
print(f"Character indices: {ctoi}")

Vocabulary: ['*', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
Character indices: {'*': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}


In [3]:
import torch 

# Now lets prepare the training dataset
block_size = 3 # size of context window, i.e. number of previous characters in input sequence

X, Y = [], []
for w in words:
    # word is padded on the left with block_size padding tokens and on the right with one padding token 
    w_ix = [ctoi[c] for c in w]
    chars = [0]*block_size + w_ix + [0]
    for i in range(len(chars)-block_size):
        # context characters in window of size block_size
        xc = chars[i:i+block_size]
        # target character
        yc = chars[i+block_size]
        X.append(xc)
        Y.append(yc)
        #print(xc, "--> ", yc)
        
# convert to pytorch tensors
X = torch.tensor(X)
Y = torch.tensor(Y)

In [4]:
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([228146, 3]), torch.int64, torch.Size([228146]), torch.int64)

In [5]:
# matrix of embedding vectors, randomly initialized
embedding_dims = 2
C = torch.randn((len(vocab), embedding_dims))

In [6]:
# convert input character indices to embedding vectors and concatenate
X_emb = C[X]
shape = X_emb.shape
X_emb = X_emb.view(-1, block_size*embedding_dims)
X_emb.shape

torch.Size([228146, 6])

In [7]:
# intiialize parameters of the hidden layer
hidden_dims = 100
W1 = torch.randn((block_size*embedding_dims, hidden_dims))
b1 = torch.randn(hidden_dims)

In [8]:
# compute hidden layer activations
h = torch.tanh(X_emb @ W1 + b1)
h.shape

torch.Size([228146, 100])

In [9]:
# initialize output layer parameters
W2 = torch.randn((hidden_dims, len(vocab)))
b2 = torch.randn(len(vocab))

In [10]:
# compute output logits
logits = h @ W2 + b2

In [11]:
import torch.nn.functional as F

# compute average negative log-likelihood loss
loss = F.cross_entropy(logits, Y)
loss

tensor(13.4277)

#### Training the model

In [30]:
# first, create train-dev-test (80-10-10) splits of the dataset
block_size = 8

def build_dataset(words):
    X, Y = [], []
    for w in words:
        # word is padded on the left with block_size padding tokens and on the right with one padding token 
        '''
        context = [0] * block_size
        for ch in w + '*':
            ix = ctoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]
        '''
        w_ix = [ctoi[c] for c in w]
        chars = [0]*block_size + w_ix + [0]
        for i in range(len(chars)-block_size):
            # context characters in window of size block_size
            xc = chars[i:i+block_size]
            # target character
            yc = chars[i+block_size]
            X.append(xc)
            Y.append(yc)

    # convert to pytorch tensors
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    
    return X, Y

import random

random.seed(1)
random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtrain, Ytrain = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xtest, Ytest = build_dataset(words[n2:])

Note: When initialiazing model parameters, we would like to ensure that the output logits during the beginning of training don't vary across a large range of values, otherwise we would get high loss values at the start. Therefore it's a good idea to initialize biases to zero and weigts to small random values close to zero.

In [31]:
# create a pytorch generator
g = torch.Generator().manual_seed(123)

num_epochs = 50000
lr = 0.01            # gradient descent learning rate
batch_size = 32
embedding_dims = 10
hidden_dims = 200
vocab_size = len(vocab)

# initialize model parameters
C = torch.randn((vocab_size, embedding_dims), generator=g)
W1 = torch.randn((block_size*embedding_dims, hidden_dims), generator=g) * 0.01
# b1 = torch.randn(hidden_dims, generator=g) * 0
W2 = torch.randn((hidden_dims, vocab_size), generator=g) * 0.01
b2 = torch.randn(vocab_size, generator=g) * 0

# parameters for batch normalization
bngain = torch.ones((1,hidden_dims))
bnbias = torch.zeros((1,hidden_dims))

parameters = [C, W1, W2, b2, bngain, bnbias] # don't need to include b1 since we're not using it
for p in parameters:
    p.requires_grad=True

print(f"Total number of params: {sum(p.nelement() for p in parameters)}")

Total number of params: 22097


In [32]:
def train(num_epochs=1000, lr=0.00):
    print("Training...")
    for i in range(num_epochs):
        # randomly select a batch of input instances
        batch_ix = torch.randint(0, Xtrain.shape[0], (batch_size,), generator=g)
        X_batch, Y_batch = Xtrain[batch_ix], Ytrain[batch_ix] 

        # reset parameter gradients
        for p in parameters:
            p.grad = None
        
        # forward pass
        X_emb = C[X_batch].view(-1,block_size*embedding_dims)
        h_linear = X_emb @ W1 # + b1

        # before passing into tanh activation function, we apply batch normalization
        # batch norm involves normalizing the linear layer output neuron to have zero mean and unit variance 
        # over the entire batch of inputs. This is followed by scaling and shifting the values. The advantage 
        # of batch norm is that the linear layer outputs passed into the tanh will be mostly concentrated in 
        # the non-flat regions of the tanh function and so we avoid vanishing gradients during packprop and dead neurons
        # Note: when using batch norm on the output of a linear layer, we don't need to use any bias in that linear layer
        # because the batch norm shifting term 'bnbias' will do the job of a bias term in the linear layer, so having a bias
        # in the linear layer is just redundant and not needed.  

        # normalize
        bnmeani = h_linear.mean(dim=0, keepdims=True)
        bnstdi = h_linear.std(dim=0, keepdims=True)
        h_linear = (h_linear - bnmeani) / bnstdi 
        # scale and shift
        h_linear = bngain * h_linear  + bnbias

        h = torch.tanh(h_linear)
        logits = h @ W2 + b2
        loss = F.cross_entropy(logits, Y_batch)

        # backward pass
        loss.backward()

        # reduce learning rate by factor of 10 halfway through training
        if(i//num_epochs == 2):
            lr = 0.1 * lr

        # update parameters via gradient descent
        for p in parameters:
            p.data -= lr * p.grad
        
        if i % 500 == 0:
            print(f"Epoch# {i}, Mini Batch Average Loss: {loss.item()}")

    return h_linear, h

In [33]:
# training loop
lr = 0.01
num_epochs = 200000
h_linear, h = train(num_epochs=num_epochs, lr=lr)

Training...
Epoch# 0, Mini Batch Average Loss: 3.294069528579712


Epoch# 500, Mini Batch Average Loss: 2.6730916500091553
Epoch# 1000, Mini Batch Average Loss: 2.764364242553711
Epoch# 1500, Mini Batch Average Loss: 2.3868703842163086
Epoch# 2000, Mini Batch Average Loss: 2.2646641731262207
Epoch# 2500, Mini Batch Average Loss: 2.175185441970825
Epoch# 3000, Mini Batch Average Loss: 2.229996681213379
Epoch# 3500, Mini Batch Average Loss: 2.2371535301208496
Epoch# 4000, Mini Batch Average Loss: 1.9388017654418945
Epoch# 4500, Mini Batch Average Loss: 2.5900118350982666
Epoch# 5000, Mini Batch Average Loss: 2.445107936859131
Epoch# 5500, Mini Batch Average Loss: 2.2273054122924805
Epoch# 6000, Mini Batch Average Loss: 2.4328255653381348
Epoch# 6500, Mini Batch Average Loss: 2.18696928024292
Epoch# 7000, Mini Batch Average Loss: 2.333089828491211
Epoch# 7500, Mini Batch Average Loss: 2.240892171859741
Epoch# 8000, Mini Batch Average Loss: 2.2384283542633057
Epoch# 8500, Mini Batch Average Loss: 2.191762685775757
Epoch# 9000, Mini Batch Average Loss: 2.3

Since we've used batch normalization during training, we need to calibrate the batch normalization parameters so that they can be used correctly during inference. The calibration involves estimating the mean and standard deviations of the hidden linear layer neurons and using these values during inference for normalization. 

In [34]:
# batch norm calibration using training set
with torch.no_grad():
    X_emb = C[Xtrain].view(-1,block_size*embedding_dims)
    h_linear = X_emb @ W1 + b1
    # estimate mean and std over the entire training set
    bnmean = h_linear.mean(dim=0, keepdims=True)
    bnstd = h_linear.std(dim=0, keepdims=True)
    

In [35]:
@torch.no_grad() # disable gradient tracking
def split_loss(split='train'):
    X, Y = {'train' : (Xtrain, Ytrain), 'val': (Xdev, Ydev), 'test': (Xtest, Ytest)}[split]
    X_emb = C[X].view(-1,block_size*embedding_dims)
    h_linear = X_emb @ W1 + b1
    h_linear = (h_linear - bnmean) / bnstd 
    h_linear = bngain * h_linear  + bnbias
    h = torch.tanh(h_linear)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y)
    print(f"{split} loss: {loss}")

In [36]:
split_loss('train')
split_loss('val')
split_loss('test')

train loss: 1.9706335067749023
val loss: 2.0401453971862793
test loss: 2.0596413612365723


### Sampling from our trained model

In [37]:
@torch.no_grad() # disable gradient tracking
def generate_word_nn(start_char=None):

    generated_word = []
    end_ix = ctoi[pad_token]

    # start with the special token
    if start_char is None:
        context = [end_ix]*block_size
    else:
        assert start_char in vocab, "Error! Start character must be a letter from the English alphabet."
        context = [end_ix]*(block_size-1) + [ctoi[start_char]]
        generated_word.append(start_char)

    # generate characters one by one by sampling from the probability distribution p(c2|c1)
    while True:
        # get probabilities from the neural network
        X_emb = C[torch.tensor([context])].view(1,-1)

        h_linear = X_emb @ W1 + b1
        h_linear = (h_linear - bnmean) / bnstd 
        h_linear = bngain * h_linear  + bnbias
        h = torch.tanh(h_linear)
        logits = h @ W2 + b2
        p = F.softmax(logits, dim=-1)
        
        # sample the next character 
        cnext = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()

        # stop if we generate the special token
        if cnext==end_ix:
            break
        
        context = context[1:] + [cnext]

        generated_word.append(itoc[cnext])

    return ''.join(generated_word) 

In [38]:
# generate a bunch of names
for i in range(20):
    print(generate_word_nn())

nyrel
emrekd
raella
elscael
besten
blytha
aryia
rakell
tesiah
ellisa
alis
kehly
jaquan
janeel
gumrri
braiken
tifkai
tabiena
madel
hehli
