### Simple Language Models Continued...

In [195]:
# load dataset and store the words in a list
words = open('names.txt', 'r').read().split()

print(f"Total number of words: {len(words)}")
print(f"Shortest word: {min([len(w) for w in words])}")
print(f"Longest word: {max([len(w) for w in words])}")

Total number of words: 32033
Shortest word: 2
Longest word: 15


#### We looked at the count based bi-gram language model and the equivalent simple neural network based bi-gram model which takes in a one-hot encoded character as input. Now we will try to build a language model using a neural network with one hidden layer which learns to predict the next character giving a sequence of multiple previous characters, so larger context window than bi-gram model. We will also represent each character of the vocabulary with a learned embedding vector.  

In [196]:
# now lets create an indexed vocabulary of characters
pad_token = '*'
vocab = sorted(set([pad_token] + list("".join(words))))
ctoi = {vocab[i]:i for i in range(len(vocab))}
ctoi[pad_token] = 0 # special token for padding
itoc = {i:vocab[i] for i in range(len(vocab))}
print(f"Vocabulary: {vocab}")
print(f"Character indices: {ctoi}")

Vocabulary: ['*', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
Character indices: {'*': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}


In [211]:
import torch 

# Now lets prepare the training dataset
block_size = 3 # size of context window, i.e. number of previous characters in input sequence

X, Y = [], []
for w in words:
    # word is padded on the left with block_size padding tokens and on the right with one padding token 
    w_ix = [ctoi[c] for c in w]
    chars = [0]*block_size + w_ix + [0]
    for i in range(len(chars)-block_size):
        # context characters in window of size block_size
        xc = chars[i:i+block_size]
        # target character
        yc = chars[i+block_size]
        X.append(xc)
        Y.append(yc)
        #print(xc, "--> ", yc)
        
# convert to pytorch tensors
X = torch.tensor(X)
Y = torch.tensor(Y)

In [198]:
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([228146, 3]), torch.int64, torch.Size([228146]), torch.int64)

In [199]:
# matrix of embedding vectors, randomly initialized
embedding_dims = 2
C = torch.randn((len(vocab), embedding_dims))

In [200]:
# convert input character indices to embedding vectors and concatenate
X_emb = C[X]
shape = X_emb.shape
X_emb = X_emb.view(-1, block_size*embedding_dims)
X_emb.shape

torch.Size([228146, 6])

In [201]:
# intiialize parameters of the hidden layer
hidden_dims = 100
W1 = torch.randn((block_size*embedding_dims, hidden_dims))
b1 = torch.randn(hidden_dims)

In [202]:
# compute hidden layer activations
h = torch.tanh(X_emb @ W1 + b1)
h.shape

torch.Size([228146, 100])

In [203]:
# initialize output layer parameters
W2 = torch.randn((hidden_dims, len(vocab)))
b2 = torch.randn(len(vocab))

In [204]:
# compute output logits
logits = h @ W2 + b2

In [205]:
import torch.nn.functional as F

# compute average negative log-likelihood loss
loss = F.cross_entropy(logits, Y)
loss

tensor(16.4521)

#### Training the model

In [212]:
# first, create train-dev-test (80-10-10) splits of the dataset
block_size = 3 

def build_dataset(words):
    X, Y = [], []
    for w in words:
        # word is padded on the left with block_size padding tokens and on the right with one padding token 
        '''
        context = [0] * block_size
        for ch in w + '*':
            ix = ctoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]
        '''
        w_ix = [ctoi[c] for c in w]
        chars = [0]*block_size + w_ix + [0]
        for i in range(len(chars)-block_size):
            # context characters in window of size block_size
            xc = chars[i:i+block_size]
            # target character
            yc = chars[i+block_size]
            X.append(xc)
            Y.append(yc)

    # convert to pytorch tensors
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    
    return X, Y

import random

random.seed(1)
random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtrain, Ytrain = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xtest, Ytest = build_dataset(words[n2:])

In [253]:
# create a pytorch generator
g = torch.Generator().manual_seed(123)

num_epochs = 50000
lr = 0.01            # gradient descent learning rate
batch_size = 32
embedding_dims = 10
hidden_dims = 200

# initialize model parameters
C = torch.randn((len(vocab), embedding_dims), generator=g)
W1 = torch.randn((block_size*embedding_dims, hidden_dims), generator=g)
b1 = torch.randn(hidden_dims, generator=g)
W2 = torch.randn((hidden_dims, len(vocab)), generator=g)
b2 = torch.randn(len(vocab), generator=g)
parameters = [C, W1, b1, W2, b2]
for p in parameters:
    p.requires_grad=True

print(f"Total number of params: {sum(p.nelement() for p in parameters)}")
print("Training...")


Total number of params: 11897
Training...


In [260]:
# training loop
lr = 0.005
num_epochs = 50000
for i in range(num_epochs):

    # randomly select a batch of input instances
    batch_ix = torch.randint(0, Xtrain.shape[0], (batch_size,))
    X_batch = Xtrain[batch_ix] 

    # reset parameter gradients
    for p in parameters:
        p.grad = None
    
    # forward pass
    X_emb = C[X_batch].view(-1,block_size*embedding_dims)
    h = torch.tanh(X_emb @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Ytrain[batch_ix])

    # backward pass
    loss.backward()

    # update parameters via gradient descent
    for p in parameters:
        p.data -= lr * p.grad
    
    if i % 5 == 0:
        print(f"Epoch# {i}, Mini Batch Average Loss: {loss.item()}")

Epoch# 0, Mini Batch Average Loss: 2.2035439014434814
Epoch# 5, Mini Batch Average Loss: 2.7356204986572266
Epoch# 10, Mini Batch Average Loss: 2.189345359802246
Epoch# 15, Mini Batch Average Loss: 2.5006918907165527
Epoch# 20, Mini Batch Average Loss: 2.1742568016052246
Epoch# 25, Mini Batch Average Loss: 2.1284828186035156
Epoch# 30, Mini Batch Average Loss: 2.544597864151001
Epoch# 35, Mini Batch Average Loss: 2.2844319343566895
Epoch# 40, Mini Batch Average Loss: 2.312385320663452
Epoch# 45, Mini Batch Average Loss: 2.2803292274475098
Epoch# 50, Mini Batch Average Loss: 2.228780508041382
Epoch# 55, Mini Batch Average Loss: 2.0597105026245117
Epoch# 60, Mini Batch Average Loss: 2.3032867908477783
Epoch# 65, Mini Batch Average Loss: 2.2449727058410645
Epoch# 70, Mini Batch Average Loss: 2.1648638248443604
Epoch# 75, Mini Batch Average Loss: 2.2720022201538086
Epoch# 80, Mini Batch Average Loss: 2.4217476844787598
Epoch# 85, Mini Batch Average Loss: 2.555250883102417
Epoch# 90, Mini B

In [262]:
X_emb = C[Xtrain].view(-1,block_size*embedding_dims)
h = torch.tanh(X_emb @ W1 + b1)
logits = h @ W2 + b2
loss_train = F.cross_entropy(logits, Ytrain)
print(f"Average loss over entire training set: {loss_train}")

X_emb = C[Xdev].view(-1,block_size*embedding_dims)
h = torch.tanh(X_emb @ W1 + b1)
logits = h @ W2 + b2
loss_dev = F.cross_entropy(logits, Ydev)
print(f"Average loss over dev set: {loss_dev}")

X_emb = C[Xtest].view(-1,block_size*embedding_dims)
h = torch.tanh(X_emb @ W1 + b1)
logits = h @ W2 + b2
loss_test = F.cross_entropy(logits, Ytest)
print(f"Average loss over dev set: {loss_test}")

Average loss over entire training set: 2.267941951751709
Average loss over dev set: 2.271620512008667
Average loss over dev set: 2.282083749771118


### Sampling from our trained model

In [263]:
def generate_word_nn(start_char=None):

    generated_word = []
    end_ix = ctoi[pad_token]

    # start with the special token
    if start_char is None:
        context = [end_ix]*3
    else:
        assert start_char in vocab, "Error! Start character must be a letter from the English alphabet."
        context = [end_ix]*2 + [ctoi[start_char]]
        generated_word.append(start_char)

    # generate characters one by one by sampling from the probability distribution p(c2|c1)
    while True:
        # get probabilities from the neural network
        X_emb = C[context].view(-1,block_size*embedding_dims)
        h = torch.tanh(X_emb @ W1 + b1)
        logits = h @ W2 + b2
        counts = logits.exp()
        p = counts / counts.sum()
        
        # sample the next character 
        cnext = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()

        # stop if we generate the special token
        if cnext==end_ix:
            break
        
        context = context[1:] + [cnext]

        generated_word.append(itoc[cnext])

    return ''.join(generated_word) 

In [265]:
# generate a bunch of names
for i in range(20):
    print(generate_word_nn())

jealu
toafiannes
con
carwen
rubekrooum
jayleana
amarsinorio
brenna
vacka
rise
kanaritn
drey
alyn
jasealauhofvente
edelech
kami
den
kin
magix
avie
