# N-gram MLP (Makemore)
A character-level language model using an MLP with learned embeddings.

In [2]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as pl

In [3]:
g = torch.Generator().manual_seed(2147483647)

In [24]:
class Linear:

    def __init__(self, fan_in, fan_out, bias=True):
        self.weight = torch.randn((fan_in, fan_out), generator=g) / fan_in ** 0.5 # normalize it
        self.bias = torch.zeros(fan_out) if bias else None
    
    def __call__(self, x):
        self.out = x @ self.weight
        if self.bias is not None:
            self.out += self.bias
        return self.out
    
    def parameters(self):
        return [self.weight] + ([self.bias] if self.bias is not None else [])


In [9]:
class Tanh:
    def __call__(self, x):
        self.out = torch.tanh(x)
        return self.out 
    def parameters(self):
        return []

In [None]:
class BatchNorm1D:

    def __init__(self, dim, eps=1e-5, mom=0.1):
        self.eps = eps
        self.mom = mom
        self.training = True

        ## params that are modified via backprop to keep gauss distribution and slide it
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

        # running mean / std
        self.running_mean = torch.zeros(dim) 
        self.running_var = torch.ones(dim)
    
    def __call__(self, x):
        # do forward pass
        if self.training:
            # batch values
            xmean = x.mean(0, keepdim=True)
            xvar = x.var(0, keepdim=True)
        else:
            xmean = self.running_mean
            xvar = self.running_var

        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
        self.out = self.gamma * xhat + self.beta

        # update params
        if self.training:
            with torch.no_grad():
                self.running_mean = xmean * (1 - self.mom) + self.mom * xmean
                self.running_var = xvar * (1 - self.mom) + self.mom * xvar

        return self.out
    
    def parameters(self):
        return [self.gamma, self.beta]

In [41]:

# How many embeddings per character
n_emb = 10 

# Number of hidden neurons in the layer
n_hidden = 200

# How many letters we're looking at before making the next prediction
context_size = 4

alphabet = '.abcdefghijklmnopqrstuvwxyz'
C = torch.randn([len(alphabet), n_emb], generator=g)

layers = [
    Linear(n_emb * context_size, n_hidden, bias=False), BatchNorm1D(n_hidden), Tanh(),
    Linear(n_hidden, n_hidden, bias=False), BatchNorm1D(n_hidden), Tanh(),
    Linear(n_hidden, n_hidden, bias=False), BatchNorm1D(n_hidden), Tanh(),
    Linear(n_hidden, n_hidden, bias=False), BatchNorm1D(n_hidden), Tanh(),
    Linear(n_hidden, n_hidden, bias=False), BatchNorm1D(n_hidden), Tanh(),
    Linear(n_hidden, len(alphabet), bias=False), BatchNorm1D(len(alphabet))
]

# layers = [
#     Linear(n_emb * context_size, n_hidden), Tanh(),
#     Linear(n_hidden, n_hidden), Tanh(),
#     Linear(n_hidden, n_hidden), Tanh(),
#     Linear(n_hidden, n_hidden), Tanh(),
#     Linear(n_hidden, n_hidden), Tanh(),
#     Linear(n_hidden, len(alphabet))
# ]
with torch.no_grad():
    # last layer make less confident
    layers[-1].gamma *= 0.1
    # layers[-1].weight *= 0.1
    # apply gain for other layers
    # for layer in layers[:-1]:
    #     if isinstance(layer, Linear):
    #         layer.weight *= 5/3 # for tanh, 5/3

parameters = [C] + [p for layer in layers for p in layer.parameters()]
print(f'total params={sum(p.nelement() for p in parameters)}')

for p in parameters:
    p.requires_grad = True


total params=175724


In [42]:
intMap = {ch: i for i, ch in enumerate(alphabet)}


def build_dataset(words):
    context = [0] * context_size
    xs = []
    ys = []
    for word in words:
        word = word + '.'
        for ch in word:
            ix = intMap[ch]
            xs.append(context)
            ys.append(ix)
            context = context[1:] + [ix]
    xs = torch.tensor(xs)
    ys = torch.tensor(ys)
    return xs, ys

with open('names.txt', 'r') as f:
    lines = f.read().splitlines()
import random
random.seed(42)
random.shuffle(lines)
n1 = int(0.8*len(lines))
n2 = int(0.9*len(lines))

Xtr,  Ytr  = build_dataset(lines[:n1])     # 80%
Xdev, Ydev = build_dataset(lines[n1:n2])   # 10%
Xtest,  Ytest  = build_dataset(lines[n2:])     # 10%

print('training shape=' + str(Xtr.shape))
print('dev shape=' + str(Xdev.shape))
print('test shape=' + str(Xtest.shape))


training shape=torch.Size([182625, 4])
dev shape=torch.Size([22655, 4])
test shape=torch.Size([22866, 4])


In [43]:
max_steps = 200000
batch_size = 32
lossi = []

for i in range(max_steps):
    # batch
    ix = torch.randint(0, Xtr.shape[0], (batch_size, ), generator=g)
    Xb, Yb = Xtr[ix], Ytr[ix]

    # forward pass
    emb = C[Xb]
    x = emb.view(emb.shape[0], -1) # concat the vectors
    for layer in layers:
        x = layer(x)
    loss = F.cross_entropy(x, Yb)

    # Zero out gradients
    for p in parameters:
        p.grad = None

    # backward pass
    loss.backward() 

    # update
    lr = 0.1 if i < 10000 else 0.01
    # lr = 1.0
    for p in parameters:
        p.data += -lr * p.grad
    
    if i % 10000 == 0:
        print(f'{i}/{max_steps}: loss={loss.item(): .4f}')
    lossi.append(loss.log10().item())



0/200000: loss= 3.2855
10000/200000: loss= 1.8399
20000/200000: loss= 2.1485
30000/200000: loss= 2.2711
40000/200000: loss= 1.9604
50000/200000: loss= 2.1631
60000/200000: loss= 1.5669
70000/200000: loss= 1.8773
80000/200000: loss= 1.9442
90000/200000: loss= 2.1188
100000/200000: loss= 1.5803
110000/200000: loss= 1.9925
120000/200000: loss= 1.6744
130000/200000: loss= 1.5926
140000/200000: loss= 1.8045
150000/200000: loss= 1.8437
160000/200000: loss= 2.1743
170000/200000: loss= 2.1408
180000/200000: loss= 2.3565
190000/200000: loss= 2.1711


In [None]:
@torch.no_grad() # this decorator disables gradient tracking
def split_loss(split):
  x,y = {
    'train': (Xtr, Ytr),
    'val': (Xdev, Ydev),
    'test': (Xtest, Ytest),
  }[split]
  emb = C[x] # (N, block_size, n_embd)
  x = emb.view(emb.shape[0], -1) # concat into (N, block_size * n_embd)
  for layer in layers:
    x = layer(x)
  loss = F.cross_entropy(x, y)
  print(split, loss.item())

# put layers into eval mode
for layer in layers:
  layer.training = False
split_loss('train')
split_loss('val')

In [None]:
# sample from the model
g = torch.Generator().manual_seed(2147483647 + 10)

# chatMap = {i: ch for i, ch in enumerate(alphabet)}
for _ in range(20):
    
    out = []
    context = [0] * context_size # initialize with all ...
    while True:
        emb = C[torch.tensor([context])] # (1,block_size,d)
        x = emb.view(emb.shape[0], -1) # concat into (N, block_size * n_embd)
        for layer in layers:
            x = layer(x)
         # h = torch.tanh(emb.view(1, -1) @ W1 + b1)
        probs = F.softmax(x, dim=1)
        ix = torch.multinomial(probs, num_samples=1, generator=g).item()
        context = context[1:] + [ix]
        if ix == 0:
            break
        else:
            out.append(ix)

        
    print(''.join(alphabet[i] for i in out))

narez.
fatu.
harlette.
shrey.
yassandra.
jazhnee.
amerynci.
aqui.
ollonia.
chaiir.
asley.
phram.
port.
quintis.
ozleighan.
aura.
elogierryx.
makaellins.
bost.
edi.
