In [1]:
import torch
import torch.nn.functional as F
import random
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# read in all words
words = open('names.txt', 'r').read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [3]:
len(words)

32033

In [4]:
# build vocabulary of characters and mapping to/from integers
chars = sorted(set(''.join(words)))
stoi = {s:i+1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s, i in stoi.items()}
vocab_size = len(itos)
print(itos, vocab_size)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'} 27


In [84]:
# build dataset
block_size = 8

def build_dataset(words):
    X, Y = [], []
    
    for w in words:
        context = [0] * block_size
        for char in w + '.':
            ix = stoi[char]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix] # crop and append
    
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X, Y

random.seed(42)
random.shuffle(words)
n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))

Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xtest, Ytest = build_dataset(words[n2:])

torch.Size([182437, 8]) torch.Size([182437])
torch.Size([22781, 8]) torch.Size([22781])
torch.Size([22928, 8]) torch.Size([22928])


In [85]:
for x, y in zip(Xtr[:20], Ytr[:20]):
    print(''.join(itos[ix.item()] for ix in x), '---->', itos[y.item()])

........ ----> t
.......t ----> a
......ta ----> u
.....tau ----> r
....taur ----> e
...taure ----> n
..tauren ----> .
........ ----> s
.......s ----> u
......su ----> l
.....sul ----> e
....sule ----> m
...sulem ----> a
..sulema ----> n
.suleman ----> .
........ ----> z
.......z ----> e
......ze ----> r
.....zer ----> e
....zere ----> n


In [160]:
# building out the lego blocks

class Linear: 
    
    def __init__(self, fan_in, fan_out, bias=True):
        self.weight = torch.randn((fan_in, fan_out), generator=g) / fan_in**0.5
        self.bias = torch.randn(fan_out) if bias else None
    
    def __call__(self, x):
        self.out = x @ self.weight
        if self.bias is not None:
            self.out += self.bias
        return self.out

    def parameters(self):
        return [self.weight] + ([] if self.bias is None else [self.bias])

class BatchNorm1d:
    
    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps = eps
        self.momentum = momentum
        self.training = True
        # parameters (trained with backprop)
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)
        # buffers (trained with a running 'momentum update')
        self.running_mean = torch.zeros(dim)
        self.running_var = torch.ones(dim)
    
    def __call__(self, x):
        # calculate the forward pass
        if self.training:
            if x.ndim == 2:
                dim = 0
            elif x.ndim == 3:
                dim = (0,1)
            xmean = x.mean(dim, keepdim=True) # batch mean
            xvar = x.var(dim, keepdim=True) # batch variance
        else:
            xmean = self.running_mean
            xvar = self.running_var
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
        self.out = self.gamma * xhat + self.beta
        # update the buffers
        if self.training:
            with torch.no_grad():
                self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
                self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar
        return self.out

    def parameters(self):
        return [self.gamma, self.beta]
    
class Tanh:
    def __call__(self, x):
        self.out = torch.tanh(x)
        return self.out

    def parameters(self):
        return []

class Embedding:
    
    def __init__(self, num_embeddings, embedding_dim):
        self.weight = torch.randn((num_embeddings, embedding_dim))
        
    def __call__(self, IX):
        self.out = self.weight[IX]
        return self.out
    
    def parameters(self):
        return [self.weight]
    
class FlattenConsecutive:
    
    def __init__(self, n):
        self.n = n
    
    def __call__(self, x):
        B, T, C = x.shape 
        x = x.view(B, T//self.n, C*self.n)
        # having 1 in second dimension is spurious; 1st dim is example batch, 2nd dim is context length batch
        if x.shape[1] == 1:
            x = x.squeeze(1)
        self.out = x
        return self.out
    
    def parameters(self):
        return []
    
class Sequential: 
    
    def __init__(self, layers):
        self.layers = layers
    
    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        self.out = x
        return self.out
    
    def parameters(self):
        # get parameters of all layers and stretch them out into one list
        return [p for layer in self.layers for p in layer.parameters()]

In [161]:
torch.manual_seed(42);

In [170]:
n_emb = 24
n_hidden = 128

model = Sequential([
    Embedding(vocab_size, n_emb), 
    FlattenConsecutive(2), Linear(n_emb * 2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    FlattenConsecutive(2), Linear(n_hidden * 2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    FlattenConsecutive(2), Linear(n_hidden * 2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(), 
    Linear(n_hidden, vocab_size), 
])

with torch.no_grad():
    # last layer: make less confident (scale down layer that produces logits)
    layers[-1].weight *= 0.1

parameters = model.parameters()
print(sum(p.nelement() for p in parameters))
for p in parameters:
    p.requires_grad = True

76579


In [171]:
# Optimization
iters = 200001
batch_size = 32
lossi = []

for i in range(iters):
    
    # minibatch construct
    ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
    Xb, Yb = Xtr[ix], Ytr[ix] # batch X, Y
    
    # forward pass
    logits = model(Xb)
    loss = F.cross_entropy(logits, Yb) # loss function
    
    
    # backward pass
    for layer in layers:
        layer.out.retain_grad()
    for p in parameters:
        p.grad = None
    loss.backward()
    
    # update
    lr = 0.1 if i < 100000 else 0.01
    for p in parameters:
        p.data += -lr * p.grad
    
    # track stats
    if i % 10000 == 0: # print every 10000 steps
        print(f'{i:7d} / {iters-1:7d}: {loss.item():.4f}')
    lossi.append(loss.log10().item())
    

      0 /  200000: 3.7436
  10000 /  200000: 1.8505
  20000 /  200000: 1.6150
  30000 /  200000: 2.2447
  40000 /  200000: 1.9385
  50000 /  200000: 2.0382
  60000 /  200000: 2.3026
  70000 /  200000: 1.8267
  80000 /  200000: 1.9178
  90000 /  200000: 2.2357
 100000 /  200000: 1.8755
 110000 /  200000: 1.5926
 120000 /  200000: 1.7391
 130000 /  200000: 1.8908
 140000 /  200000: 1.4250
 150000 /  200000: 1.9776
 160000 /  200000: 1.6460
 170000 /  200000: 1.7171
 180000 /  200000: 1.7328
 190000 /  200000: 1.9445
 200000 /  200000: 1.7913


In [167]:
for layer in model.layers:
    print(layer.__class__.__name__, ":", tuple(layer.out.shape))

Embedding : (32, 8, 10)
FlattenConsecutive : (32, 4, 20)
Linear : (32, 4, 68)
BatchNorm1d : (32, 4, 68)
Tanh : (32, 4, 68)
FlattenConsecutive : (32, 2, 136)
Linear : (32, 2, 68)
BatchNorm1d : (32, 2, 68)
Tanh : (32, 2, 68)
FlattenConsecutive : (32, 136)
Linear : (32, 68)
BatchNorm1d : (32, 68)
Tanh : (32, 68)
Linear : (32, 27)


In [168]:
model.layers[3].running_mean.shape

torch.Size([1, 1, 68])

In [159]:
e = torch.randn(32, 68)
em = e.mean(0,keepdim=True).shape
print(em)
emean = e.mean((0,1), keepdim=True) # (1, 1, 68)
evar = e.var((0,1), keepdim=True) # (1, 1, 68)
ehat = (e - emean) / torch.sqrt(evar + 1e-5) # (32, 4, 68)
model.layers[3].running_mean.shape

# Batch norm takes mean and variance between batches, so examples and in this case bigrams, to give statistics
# for each output channel. The expected shape of it should be 68 means and 68 variances, one for each neuron 
# between examples and context length.

torch.Size([1, 68])


torch.Size([1, 4, 68])

In [172]:
# plot average of every 1000 iterations 
plt.plot(torch.tensor(lossi).view(-1, 1000).mean(1))

RuntimeError: shape '[-1, 1000]' is invalid for input of size 200001

In [173]:
# put layers into eval mode (needed for batchnorm especially)
for l in model.layers:
    l.training = False

In [174]:
@torch.no_grad() # this disables gradient tracking, makes evaluating faster, torch doesn't track computation graph
def split_loss(split):
    x,y = {'train': (Xtr, Ytr), 
           'val': (Xdev, Ydev), 
           'test': (Xtest, Ytest)}[split]
    logits = model(x)
    
    loss = F.cross_entropy(logits, y)
    print(split, loss.item())

split_loss('train')
split_loss('val')

train 1.7869387865066528
val 1.9898453950881958


In [175]:
# sample from the model
for _ in range(20):
    
    out = []
    context = [0] * block_size # initialize with '...'
    while True:

        logits = model(torch.tensor([context]))
        probs = F.softmax(logits, dim=1)
        
        # sample from distribution
        ix = torch.multinomial(probs, num_samples=1).item()
        
        context = context[1:] + [ix]
        out.append(ix)
        
        if ix == 0:
            break
            
    print(''.join(itos[i] for i in out))

jaikol.
christopher.
brentley.
trigus.
jayjah.
jedimani.
adreb.
adibzus.
cajiyana.
munuim.
melinn.
cami.
asali.
omar.
nihar.
jarevion.
shehan.
ahmi.
rheko.
elise.
