In [1]:
# Given the training loss and val loss from before
# train loss : 2.0583250522613525
# val loss : 2.1065292358398438
#
# we can see that the losses are very similar
# so we can see that we are not overfitting too much
#
# so we can try to get better performance by scaling up the size of the NN
# bigger and deeper
#
# the model we are using is a simple 1 hidden layer MLP
# we don't have a naive way of making it bigger in a productive way
# of course we can make the model deeper
# but that doesn't take away from the fact that we are forcing
# all the context characters of each example to be crammed into the first layer right at the beginning
# It's kind of silly to squash all that information SO FAST, even if we increase the size of the first layer
#
# What we want our network to look like is WaveNet
# In WaveNet the inputs are not all crushed directly into one layer
# Instead they are crushed a lot more slowly
# 2 characters are fused in the first layer, the next layer fuses 2 of those pairs (2*2 characters), and so on
# WaveNet implements this with dilated causal convolutional layers
# the basic idea is progressive fusion


In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
import torch
import torch.nn.functional as F

In [4]:
# Read the names dataset (one name per line). The context manager closes the
# file handle as soon as the read is done instead of leaving it open.
with open('names.txt') as f:
    words = f.read().splitlines()
len(words), words[:8]

(32033,
 ['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia'])

In [5]:
# Vocabulary: a single special token '.' lives at index 0,
# the regular characters are numbered from 1 in sorted order.
chars = sorted(set(''.join(words)))
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
# inverse lookup: index -> character
itos = {idx: ch for ch, idx in stoi.items()}
vocab_size = len(itos)
num_classes = len(stoi)
print(vocab_size, itos)

27 {1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [6]:
def build_dataset(words, block_size):
    """Build (context, next-character) pairs from a list of words.

    Each word is extended with the end token '.' and scanned left to right.
    For every character, the `block_size` preceding character indices form
    the input row and the character's own index is the label. Contexts are
    left-padded with index 0.

    Args:
        words: iterable of strings whose characters are keys of the
            module-level `stoi` mapping.
        block_size: number of context characters per example.

    Returns:
        (X, Y): integer tensors of shape (N, block_size) and (N,), where N is
        the total number of characters including one end token per word.
        An empty `words` yields shapes (0, block_size) and (0,).
    """
    X, Y = [], []  # X, input | Y, labels

    for w in words:
        context = [0] * block_size  # start from an all-padding window
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]  # crop and append moving window

    # Explicit dtype and shape: without them an empty input produces float32
    # tensors of shape (0,), which cannot be used as embedding indices.
    X = torch.tensor(X, dtype=torch.long).reshape(len(X), block_size)
    Y = torch.tensor(Y, dtype=torch.long)
    return X, Y

In [7]:
# Splits: 80% train / 10% validation / 10% test, split at word level
import random
random.seed(42)
# NOTE: shuffle works in place, so re-running this cell without reloading `words`
# shuffles an already shuffled list and produces different splits
random.shuffle(words)

n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

block_size = 8 # context length: how many characters we take to predict the next one (here 8 chars predict the 9th)

Xtr, Ytr = build_dataset(words[:n1], block_size)
Xva, Yva = build_dataset(words[n1:n2], block_size)
Xte, Yte = build_dataset(words[n2:], block_size)

Xtr.shape, Ytr.shape, Xva.shape, Yva.shape, Xte.shape, Yte.shape

(torch.Size([182625, 8]),
 torch.Size([182625]),
 torch.Size([22655, 8]),
 torch.Size([22655]),
 torch.Size([22866, 8]),
 torch.Size([22866]))

In [8]:
# Print the first 20 training pairs as readable text: context ---> target
for x, y in zip(Xtr[:20], Ytr[:20]):
    context_text = ''.join(itos[ix.item()] for ix in x)
    print(context_text, '--->', itos[y.item()])

........ ---> y
.......y ---> u
......yu ---> h
.....yuh ---> e
....yuhe ---> n
...yuhen ---> g
..yuheng ---> .
........ ---> d
.......d ---> i
......di ---> o
.....dio ---> n
....dion ---> d
...diond ---> r
..diondr ---> e
.diondre ---> .
........ ---> x
.......x ---> a
......xa ---> v
.....xav ---> i
....xavi ---> e


In [9]:
class Linear:
    """Affine layer: out = x @ weight (+ bias).

    The weight has shape (fan_in, fan_out) and is drawn from a standard
    normal scaled by 1/sqrt(fan_in). The bias, when enabled, starts at zero.
    The most recent output is kept in `self.out`.
    """

    def __init__(self, fan_in, fan_out, bias=True):
        scale = fan_in**0.5
        self.weight = torch.randn((fan_in, fan_out)) / scale
        if bias:
            self.bias = torch.zeros(fan_out)
        else:
            self.bias = None

    def __call__(self, x):
        result = x @ self.weight
        if self.bias is not None:
            result = result + self.bias
        self.out = result
        return result

    def parameters(self):
        """Return the trainable tensors: weight first, then bias if present."""
        params = [self.weight]
        if self.bias is not None:
            params.append(self.bias)
        return params

class BatchNorm1D:
    """Batch normalisation over the last (channel) dimension.

    In training mode (`self.training` is True) each channel is normalised
    with the mean and variance of the current batch, and the running
    buffers are updated with an exponential moving average. Otherwise the
    running buffers are used for normalisation.

    Inputs may be (N, C) or carry extra leading dimensions such as
    (N, T, C). Statistics are always computed per channel, over every
    dimension except the last.

    Args:
        dim: number of channels (size of the last input dimension).
        eps: constant added to the variance before taking the square root.
        momentum: weight of the current batch in the running-buffer update.
    """

    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps = eps
        self.momentum = momentum
        self.training = True
        # learnable parameters
        self.gamma = torch.ones(dim)  # gain
        self.beta  = torch.zeros(dim) # bias
        # buffers (calculated with a running 'momentum update')
        self.running_mean = torch.zeros(dim)
        self.running_var  = torch.ones(dim)

    def __call__(self, x):
        # forward pass
        if self.training:
            # Reduce over all leading dimensions, not just dim 0. With a
            # (N, T, C) input, reducing over dim 0 alone would keep separate
            # statistics for every position T instead of one set per channel.
            reduce_dims = tuple(range(max(x.ndim - 1, 1)))
            xmean = x.mean(dim=reduce_dims, keepdim=True)
            xvar  = x.var(dim=reduce_dims, keepdim=True)
        else:
            xmean = self.running_mean
            xvar  = self.running_var
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
        self.out = self.gamma * xhat + self.beta

        # updating running buffers (kept out of the autograd graph)
        if self.training:
            with torch.no_grad():
                self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
                self.running_var  = (1 - self.momentum) * self.running_var + self.momentum * xvar

        return self.out

    def parameters(self):
        """Return the trainable tensors: gain and bias."""
        return [self.gamma, self.beta]

class Tanh:
    """Element-wise tanh activation; the last output is kept in `self.out`."""

    def __call__(self, x):
        activated = torch.tanh(x)
        self.out = activated
        return activated

    def parameters(self):
        """This layer has nothing to train."""
        return list()


class Embedding:
    """Lookup table mapping integer indices to rows of a weight matrix.

    The table has shape (num_embeddings, embeddings_dim) and is initialised
    from a standard normal. Indexing appends one trailing dimension of size
    `embeddings_dim` to the shape of the index tensor.
    """

    def __init__(self, num_embeddings, embeddings_dim):
        table_shape = (num_embeddings, embeddings_dim)
        self.weight = torch.randn(table_shape)

    def __call__(self, IX):
        looked_up = self.weight[IX]
        self.out = looked_up
        return looked_up

    def parameters(self):
        """Return the embedding table as the only trainable tensor."""
        return [self.weight]
    

class Flatten:
    """Collapse every dimension after the first into a single one."""

    def __call__(self, x):
        batch = x.shape[0]
        flat = x.view(batch, -1)
        self.out = flat
        return flat

    def parameters(self):
        """This layer has nothing to train."""
        return list()

class Sequential:
    """Container that applies a list of layers one after another."""

    def __init__(self, layers):
        self.layers = layers

    def __call__(self, x):
        # thread the activation through each layer in order
        activation = x
        for layer in self.layers:
            activation = layer(activation)
        self.out = activation
        return activation

    def parameters(self):
        """Return the parameters of all layers, flattened into one list."""
        collected = []
        for layer in self.layers:
            collected.extend(layer.parameters())
        return collected

In [10]:
# fix the torch RNG so weight initialisation and batch sampling below are reproducible
torch.manual_seed(42);

In [11]:
n_embed = 10    # size of each character's embedding vector
n_hidden = 200  # width of the hidden layer

# baseline MLP: embed -> flatten the whole context -> one hidden block -> logits
model = Sequential([
    Embedding(vocab_size, n_embed),
    Flatten(),
    Linear(n_embed * block_size, n_hidden, bias=False),
    BatchNorm1D(n_hidden),
    Tanh(),
    Linear(n_hidden, vocab_size),
])

# initialisation: scale down the output layer so the first predictions are less confident
with torch.no_grad():
    model.layers[-1].weight *= 0.1

# gather every parameter tensor, report the total element count, enable gradients
parameters = model.parameters()
print(sum(p.numel() for p in parameters))

for p in parameters:
    p.requires_grad_(True)

22097


In [12]:
# Let's look at a batch of just 4 examples
ix = torch.randint(0, Xtr.shape[0], (4,))
Xb, Yb = Xtr[ix], Ytr[ix]
logits = model(Xb)
print(Xb.shape)
Xb

torch.Size([4, 8])


tensor([[ 0,  0,  0,  0,  0,  0, 12, 15],
        [ 0,  0,  0,  0,  0,  0, 20,  1],
        [ 0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0, 11]])

In [13]:
model.layers[0].out.shape # Embedding output: (batch, block_size, n_embed) - every character index became a 10-dim vector

torch.Size([4, 8, 10])

In [14]:
model.layers[1].out.shape # Flatten output: the 8 embeddings of 10 dims are concatenated into 80 numbers per example

torch.Size([4, 80])

In [15]:
model.layers[2].out.shape # Linear output: 80 inputs mapped to n_hidden = 200 features per example

torch.Size([4, 200])

In [16]:
# what the Linear layer computes, shape-wise: (4, 80) @ (80, 200) + (200,) -> (4, 200)
(torch.randn((4, 80)) @ torch.randn((80, 200)) + torch.randn(200)).shape

torch.Size([4, 200])

In [17]:
# with a 2-D right operand, `@` contracts only the last dimension of the left tensor;
# all leading dimensions act as batch dimensions and are left untouched, pretty neat
(torch.randn((20, 30, 40, 4, 80)) @ torch.randn((80, 200)) + torch.randn(200)).shape

torch.Size([20, 30, 40, 4, 200])

In [18]:
# more tricks: let's fuse two characters at a time
# (batch of 4, 4 groups of 2 chars, each group = 2 * 10 = 20 numbers)
(torch.randn((4, 4, 20)) @ torch.randn((20, 200)) + torch.randn(200)).shape

torch.Size([4, 4, 200])

In [19]:
# Before
# (1, 2, 3, 4, 5, 6, 7, 8)

# After
# ((1, 2), (3, 4), (5, 6), (7, 8))

In [20]:
# Lets see how to implement this

In [21]:
# Right now the input to the flatten layer is (4, 8, 10) - straight out of the embedding layer
# and the flatten layer collapses it to (4, 80)
e = torch.randn((4, 8, 10))
e.shape, e.view(e.shape[0], -1).shape

(torch.Size([4, 8, 10]), torch.Size([4, 80]))

In [22]:
# In Python, slicing with a step pulls out the even-indexed / odd-indexed elements of a list
list(range(10)), list(range(10))[::2], list(range(10))[1::2]

([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [0, 2, 4, 6, 8], [1, 3, 5, 7, 9])

In [23]:
# same idea on the character axis: even positions, odd positions,
# and the two concatenated along the last (embedding) dimension
e = torch.randn((4, 8, 10))
e[:, ::2, :].shape, e[:, 1::2, :].shape, torch.cat([e[:, ::2, :], e[:, 1::2, :]], dim=2).shape

(torch.Size([4, 4, 10]), torch.Size([4, 4, 10]), torch.Size([4, 4, 20]))

In [24]:
# a plain view gives exactly the same tensor as the explicit even/odd concatenation
(e.view(4, 4, 20) == torch.cat([e[:, ::2, :], e[:, 1::2, :]], dim=2)).all()

tensor(True)

In [25]:
# Lets implement all this in flatten

In [26]:
class FlattenConsecutive:
    """Concatenate every `n` consecutive positions of a (B, T, C) tensor.

    The result has shape (B, T // n, C * n). When only one group remains,
    the middle dimension is dropped and the result is (B, C * n).
    """

    def __init__(self, n):
        self.n = n

    def __call__(self, x):
        batch, steps, channels = x.shape
        grouped = x.view(batch, steps // self.n, channels * self.n)
        # a single remaining group leaves a spurious size-1 dimension: drop it
        self.out = grouped.squeeze(1) if grouped.shape[1] == 1 else grouped
        return self.out

    def parameters(self):
        """This layer has nothing to train."""
        return list()

In [27]:
# Use the new layer

In [28]:
n_embed = 10    # size of each character's embedding vector
n_hidden = 200  # width of the hidden layer

# same MLP as before, but the flattening is done by FlattenConsecutive over the whole context
model = Sequential([
    Embedding(vocab_size, n_embed),
    FlattenConsecutive(block_size),
    Linear(n_embed * block_size, n_hidden, bias=False),
    BatchNorm1D(n_hidden),
    Tanh(),
    Linear(n_hidden, vocab_size),
])

# initialisation: scale down the output layer so the first predictions are less confident
with torch.no_grad():
    model.layers[-1].weight *= 0.1

# gather every parameter tensor, report the total element count, enable gradients
parameters = model.parameters()
print(sum(p.numel() for p in parameters))

for p in parameters:
    p.requires_grad_(True)

22097


In [29]:
# Let's look at a batch of just 4 examples
ix = torch.randint(0, Xtr.shape[0], (4,))
Xb, Yb = Xtr[ix], Ytr[ix]
logits = model(Xb)
print(Xb.shape)
Xb

torch.Size([4, 8])


tensor([[ 0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0, 10,  1, 11, 15, 18],
        [ 0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0, 13,  1, 11]])

In [30]:
# report the output shape each layer stored during the last forward pass
for layer in model.layers:
    layer_name = type(layer).__name__
    print(layer_name, ':', tuple(layer.out.shape))

Embedding : (4, 8, 10)
FlattenConsecutive : (4, 80)
Linear : (4, 200)
BatchNorm1D : (4, 200)
Tanh : (4, 200)
Linear : (4, 27)


In [31]:
# Time to try this with new shape of flattening

In [32]:
n_embed = 10    # size of each character's embedding vector
n_hidden = 200  # width of the hidden layer

# one block that fuses pairs of characters instead of the whole context
model = Sequential([
    Embedding(vocab_size, n_embed),
    FlattenConsecutive(2),
    Linear(n_embed * 2, n_hidden, bias=False),
    BatchNorm1D(n_hidden),
    Tanh(),
    Linear(n_hidden, vocab_size),
])

# initialisation: scale down the output layer so the first predictions are less confident
with torch.no_grad():
    model.layers[-1].weight *= 0.1

# gather every parameter tensor, report the total element count, enable gradients
parameters = model.parameters()
print(sum(p.numel() for p in parameters))

for p in parameters:
    p.requires_grad_(True)

10097


In [33]:
# Let's look at a batch of just 4 examples
ix = torch.randint(0, Xtr.shape[0], (4,))
Xb, Yb = Xtr[ix], Ytr[ix]
logits = model(Xb)
print(Xb.shape)
Xb

torch.Size([4, 8])


tensor([[ 0,  0,  0,  0,  0,  0, 13,  9],
        [ 0,  0,  0, 19, 21, 18,  9, 14],
        [ 0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0,  0]])

In [34]:
# report the output shape each layer stored during the last forward pass
for layer in model.layers:
    layer_name = type(layer).__name__
    print(layer_name, ':', tuple(layer.out.shape))

Embedding : (4, 8, 10)
FlattenConsecutive : (4, 4, 20)
Linear : (4, 4, 200)
BatchNorm1D : (4, 4, 200)
Tanh : (4, 4, 200)
Linear : (4, 4, 27)


In [35]:
# That's looking good, now let's stack up the flattens and linears to get that WaveNet-like shape

In [36]:
n_embed = 10    # size of each character's embedding vector
n_hidden = 200  # width of every hidden layer

# three fuse-by-two blocks: 8 characters -> 4 pairs -> 2 groups -> 1 vector -> logits
model = Sequential([
    Embedding(vocab_size, n_embed),
    # block 1: fuse pairs of character embeddings
    FlattenConsecutive(2),
    Linear(n_embed * 2, n_hidden, bias=False),
    BatchNorm1D(n_hidden),
    Tanh(),
    # block 2: fuse pairs of block-1 outputs
    FlattenConsecutive(2),
    Linear(n_hidden * 2, n_hidden, bias=False),
    BatchNorm1D(n_hidden),
    Tanh(),
    # block 3: fuse the remaining two groups
    FlattenConsecutive(2),
    Linear(n_hidden * 2, n_hidden, bias=False),
    BatchNorm1D(n_hidden),
    Tanh(),
    Linear(n_hidden, vocab_size),
])

# initialisation: scale down the output layer so the first predictions are less confident
with torch.no_grad():
    model.layers[-1].weight *= 0.1

# gather every parameter tensor, report the total element count, enable gradients
parameters = model.parameters()
print(sum(p.numel() for p in parameters))

for p in parameters:
    p.requires_grad_(True)

170897


In [37]:
# Let's look at a batch of just 4 examples
ix = torch.randint(0, Xtr.shape[0], (4,))
Xb, Yb = Xtr[ix], Ytr[ix]
logits = model(Xb)
print(Xb.shape)
Xb

torch.Size([4, 8])


tensor([[ 0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0, 26,  9,  4,  1, 14],
        [ 0,  0,  0, 18,  1,  6,  9,  4],
        [ 0,  0,  0,  0,  0,  0,  0,  0]])

In [38]:
# report the output shape each layer stored during the last forward pass
for layer in model.layers:
    layer_name = type(layer).__name__
    print(layer_name, ':', tuple(layer.out.shape))

Embedding : (4, 8, 10)
FlattenConsecutive : (4, 4, 20)
Linear : (4, 4, 200)
BatchNorm1D : (4, 4, 200)
Tanh : (4, 4, 200)
FlattenConsecutive : (4, 2, 400)
Linear : (4, 2, 200)
BatchNorm1D : (4, 2, 200)
Tanh : (4, 2, 200)
FlattenConsecutive : (4, 400)
Linear : (4, 200)
BatchNorm1D : (4, 200)
Tanh : (4, 200)
Linear : (4, 27)


In [39]:
# shapes look good
# let's change the number of hidden units (n_hidden) so we have about the same number of parameters as the previous model, ~22000
# that makes it easier to compare different network architectures at the same capacity

In [40]:
n_embed = 10   # size of each character's embedding vector
n_hidden = 68  # width of every hidden layer

# three fuse-by-two blocks: 8 characters -> 4 pairs -> 2 groups -> 1 vector -> logits
model = Sequential([
    Embedding(vocab_size, n_embed),
    # block 1: fuse pairs of character embeddings
    FlattenConsecutive(2),
    Linear(n_embed * 2, n_hidden, bias=False),
    BatchNorm1D(n_hidden),
    Tanh(),
    # block 2: fuse pairs of block-1 outputs
    FlattenConsecutive(2),
    Linear(n_hidden * 2, n_hidden, bias=False),
    BatchNorm1D(n_hidden),
    Tanh(),
    # block 3: fuse the remaining two groups
    FlattenConsecutive(2),
    Linear(n_hidden * 2, n_hidden, bias=False),
    BatchNorm1D(n_hidden),
    Tanh(),
    Linear(n_hidden, vocab_size),
])

# initialisation: scale down the output layer so the first predictions are less confident
with torch.no_grad():
    model.layers[-1].weight *= 0.1

# gather every parameter tensor, report the total element count, enable gradients
parameters = model.parameters()
print(sum(p.numel() for p in parameters))

for p in parameters:
    p.requires_grad_(True)

22397


In [41]:
# the parameter counts of the two models are now comparable (22397 vs 22097)
# let's train it in the next notebook