# WaveNet

In this notebook, I have tried to implement the Makemore: Part 5 from Andrej Karpathy's series. This notebook is my implementation, along with the notes that I took while following along the lecture.

In [45]:
# imports
import torch
import matplotlib.pyplot as plt
import torch.nn.functional as F
%matplotlib inline

### Building the Dataset

Same as always- we're creating training, test, and validation datasets where each training input is a three character sequence and the output is the 4th character that we want to predict.

In [46]:
# Read Data
words = open(r'../names.txt', 'r').read().splitlines()
len(words), words[:8]

(32033,
 ['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia'])

In [47]:
# Building vocabulary and Basic Character encoding and decoding

chars = sorted(list(set(''.join(words)))) # Get all the unique chars in sorted order

stoi = {s:i+1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s, i in stoi.items()}
vocab_size = len(itos)
print("itos: ", itos)
print("Vocab Size is: ", vocab_size)

itos:  {1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
Vocab Size is:  27


In [48]:
import random

random.seed(42)
random.shuffle(words)

In [49]:
block_size = 3 

def build_dataset(words):
    X, Y = [ ], [ ]

    for w in words:
        context = [0] * block_size # For start of the word, have a padded context
        for char in w + ".":
            idx = stoi[char]
            X.append(context)
            Y.append(idx)

            context = context[1:] + [idx]

    X = torch.tensor(X)
    Y = torch.tensor(Y)

    return X, Y

In [50]:
# Train, Validation, and test split
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

X_train, Y_train = build_dataset(words=words[:n1]) # 80% of data
X_val, Y_val = build_dataset(words=words[n1:n2]) # 10% of data
X_test, Y_test = build_dataset(words=words[n2:]) # 10% of data

print("Train sizes: ", X_train.shape, Y_train.shape)
print("Validation Sizes: ", X_val.shape, Y_val.shape)
print("Test Sizes: ", X_test.shape, Y_test.shape)

Train sizes:  torch.Size([182625, 3]) torch.Size([182625])
Validation Sizes:  torch.Size([22655, 3]) torch.Size([22655])
Test Sizes:  torch.Size([22866, 3]) torch.Size([22866])


## PyTorch-ifying

In part 3, we PyTorch-ified some of our code. We defined layers and modules, but there is more simplification that we can do. We do it in this section.

This is what we had so far.

In [51]:
class Linear:
    def __init__(self, fan_in, fan_out, bias=True):
        self.weight = torch.randn((fan_in, fan_out)) / (fan_in **0.5) # Divide by sqrt(fan_in)
        self.bias = torch.zeros(fan_out) if bias else None

    def __call__(self, x):
        self.out = x @ self.weight
        if self.bias is not None:
            self.out += self.bias
        return self.out
    
    def parameters(self):
        return [self.weight] + ([] if self.bias is None else [self.bias])

class BatchNorm1d:
    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps = eps # Small number to avoid zero division error when normalization
        self.momentum = momentum
        self.training = True

        # Parameters
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

        # Not parameters- "buffers"
        self.running_mean = torch.ones(dim)
        self.running_var = torch.zeros(dim)

    def __call__(self, x:torch.Tensor):
        if self.training:
            xmean = x.mean(0, keepdim=True) # Batch mean
            xvar = x.var(0, keepdim=True, unbiased=True) # Batch variance

        else:
            xmean = self.running_mean
            xvar = self.running_var

        xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # Normalize to zero mean and unit variance

        self.out = self.gamma * xhat +  self.beta
        
        if self.training:
            with torch.no_grad():
                self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
                self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar
        return self.out
    
    def parameters(self):
        return [self.gamma, self.beta]

class Tanh:
    def __call__(self, x):
        self.out = torch.tanh(x)
        return self.out

    def parameters(self):
        return []

With these modules, we were defining the layers, like this. But before going there, let's define the manual seed for PyTorch.

In [52]:
torch.manual_seed(42)

<torch._C.Generator at 0x225fdd3fab0>

In [53]:
n_emb = 10
n_hidden = 200

C = torch.randn((vocab_size, n_emb))
layers = [
    Linear(n_emb * block_size, n_hidden, bias=False), 
    BatchNorm1d(n_hidden), 
    Tanh(), 
    Linear(n_hidden, vocab_size)
]

with torch.no_grad():
    layers[-1].weight *= 0.1 # Make the last layer less confident

parameters = [ C ] + [ p for layer in layers for p in layer.parameters() ]

for p in parameters:
    p.requires_grad = True

In [54]:
epochs = 10_000
batch_size = 32
lossi = [ ]

for i in range(epochs):

    ix = torch.randint(0, X_train.shape[0], (batch_size, ))
    x_batch, y_batch = X_train[ix], Y_train[ix]

    # Observe these two lines- no PyTorch-ifying them yet
    embeddings = C[x_batch]
    x = embeddings.view(embeddings.shape[0], -1)

    for layer in layers:
        x = layer(x)

    loss = F.cross_entropy(x, y_batch)

    for p in parameters:
        p.grad = None

    loss.backward()

    for p in parameters:
        p.data -= 0.01 * p.grad

    break

But note that the embedding matrix and the `torch.view()` operation weren't part of the layers. So we build these modules.

In [55]:
class Embedding:
    def __init__(self, num_embeddings, embedding_dim):
        self.weight = torch.randn((num_embeddings, embedding_dim))

    def __call__(self, ix):
        self.out = self.weight[ix]
        return self.out
    
    def parameters(self):
        return [self.weight]
    
class Flatten:
    def __call__(self, x) -> None:
        self.out = x.view(x.shape[0], -1)
        return self.out
    
    def parameters(self):
        return []

Now, we can get rid of the `C` matrix and `embeddings.view()` method, and define these two classes inside the layers list:

In [56]:
n_emb = 10
n_hidden = 200

layers = [
    Embedding(vocab_size, n_emb),
    Flatten(),
    Linear(n_emb * block_size, n_hidden, bias=False), 
    BatchNorm1d(n_hidden), 
    Tanh(), 
    Linear(n_hidden, vocab_size)
]

with torch.no_grad():
    layers[-1].weight *= 0.1 # Make the last layer less confident

parameters = [ p for layer in layers for p in layer.parameters() ] # No need to add 'C' in this list, bcoz it's included
for p in parameters:
    p.requires_grad = True

We can get rid of those two lines in the training loop also:

In [57]:
epochs = 10_000
batch_size = 32
lossi = [ ]

for i in range(epochs):

    ix = torch.randint(0, X_train.shape[0], (batch_size, ))
    x_batch, y_batch = X_train[ix], Y_train[ix]

    x = x_batch # Added this line to pass the x_batch as input to the layers
    for layer in layers:
        x = layer(x)

    loss = F.cross_entropy(x, y_batch)

    for p in parameters:
        p.grad = None

    loss.backward()

    for p in parameters:
        p.data -= 0.01 * p.grad

    break

print(loss.item())

3.313666343688965


We can do even better! Instead of defining the `layers` list and then looping over it in the loop, we can define a class that does it for us. We define container classes, such as `Sequential` or `Module`, which are also present in the PyTorch's `nn` module. Since we want to iterate over the layers sequentially, we will define the `Sequential` class.

In [63]:
class Sequential:
    def __init__(self, layers):
        self.layers = layers

    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        
        self.out = x
        return self.out
    
    def parameters(self):
        return [p for layer in self.layers for p in layer.parameters()]

Now, our model definition and training loop simplifies even more:

In [67]:
n_emb = 10
n_hidden = 200

model = Sequential([
    Embedding(vocab_size, n_emb),
    Flatten(),
    Linear(n_emb * block_size, n_hidden, bias=False), 
    BatchNorm1d(n_hidden), 
    Tanh(), 
    Linear(n_hidden, vocab_size)
])

with torch.no_grad():
    model.layers[-1].weight *= 0.1 # Make the last layer less confident

parameters = model.parameters()
for p in parameters:
    p.requires_grad = True


In [69]:
epochs = 10_000
batch_size = 32
lossi = [ ]

for i in range(epochs):

    ix = torch.randint(0, X_train.shape[0], (batch_size, ))
    x_batch, y_batch = X_train[ix], Y_train[ix]

    logits = model(x_batch)
    loss = F.cross_entropy(logits, y_batch)

    for p in parameters:
        p.grad = None

    loss.backward()

    for p in parameters:
        p.data -= 0.01 * p.grad

    break

print(loss.item())

3.285482883453369


For inference, however, we need to set that each of the layers are not in training mode. Remember how the batch normalization layer has a different behavior when it is in training mode as compared to in testing mode?

In [70]:
# Set "evaluation" mode for each layer
for layer in model.layers:
    layer.training = False

In [71]:
@torch.no_grad()
def split_loss(split):
    x, y = {
        'train': [X_train, Y_train], 
        'test': [X_test, Y_test], 
        'val': [X_val, Y_val]
    }[split]

    logits = model(x)

    loss = F.cross_entropy(logits, y) # PyTorch will apply softmax internally

    print(f"For {split}, loss was: {loss.item()}")

Sampling from the model also simplifies:

In [76]:
for _ in range(20):
    out = [ ]
    context = [ 0 ] * block_size

    while True:
        logits = model(torch.tensor([context]))
        probabilities = F.softmax(logits, dim=1)

        ix = torch.multinomial(probabilities, num_samples=1).item()

        context = context[1:] + [ix]
        out.append(ix)

        if ix == 0:
            break
    print(''.join(itos[i] for i in out))

oszeezxzkzopvpvyqiienjytrklcbpgmykxluu.
.
bwdit.
djswv.
rsewg.
iat.
ixlcebimytygshgv.
zmzaexnablmsydfqpxoqyhvhylgwchguovw.
xmkrhdpkymssxzcvasfrnqhrjxofaqrzconoo.
puufzecwrovytavbbwssmzsnv.
ojykljozjfiqqmizlfamwrrmpuoornmuxtkklzqgcpptfrssouwzvbkfntlca.
txufgherhqtxnhirhbjamhvmr.
ryjtdcnlenebpsympnujprgfgvowtuxblvosbadgwceursj.
dazmkkbqg.
yymtqbnxieuvefjbdecwcefhzdjrefvhlfvoyi.
oqvwqzebdnccoypxunqqykvgpgdsxefwgsvzigcdtlbx.
cphykamwuijubkswgvldegslvfogqeaax.
vtctjosmxupgdukrnyq.
jcjebdzghq.
hfdovvupkuehkehqtskoqs.
