# N-gram MLP (Makemore)
A character-level language model using an MLP with learned embeddings.

In [106]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as pl

In [3]:
# Dedicated random generator with a fixed seed, passed to the layer initializers below
g = torch.Generator()
g.manual_seed(2147483647)

In [107]:
class Linear:
    """Affine layer: ``x @ weight`` plus an optional bias.

    The weight is sampled from a standard normal with the module-level
    generator ``g`` and divided by ``sqrt(fan_in)``; the bias, when enabled,
    starts at zero.
    """

    def __init__(self, fan_in, fan_out, bias=True):
        # divide by sqrt(fan_in) to normalize the initial weights
        scale = fan_in ** 0.5
        self.weight = torch.randn((fan_in, fan_out), generator=g) / scale
        self.bias = None
        if bias:
            self.bias = torch.zeros(fan_out)

    def __call__(self, x):
        """Apply the layer to ``x``; the result is also kept on ``self.out``."""
        projected = x @ self.weight
        if self.bias is not None:
            projected += self.bias
        self.out = projected
        return self.out

    def parameters(self):
        """Return the tensors of this layer: weight, then bias if present."""
        params = [self.weight]
        if self.bias is not None:
            params.append(self.bias)
        return params


In [108]:
class Tanh:
    """Parameter-free layer that applies ``torch.tanh`` element-wise."""

    def __call__(self, x):
        """Return ``tanh(x)``; the result is also kept on ``self.out``."""
        activated = torch.tanh(x)
        self.out = activated
        return activated

    def parameters(self):
        """Return an empty list: this layer holds no tensors."""
        return []

In [109]:
class BatchNorm1D:
    """Batch normalization with learnable scale (gamma) and shift (beta).

    While ``training`` is True the input is normalized with statistics of the
    current batch and exponential running averages of those statistics are
    updated. While ``training`` is False the stored running averages are used
    instead and nothing is updated.

    Args:
        dim: number of features; size of ``gamma``, ``beta`` and the initial
            running statistics.
        eps: constant added to the variance before the square root.
        mom: weight given to the newest batch statistic in the running
            averages.
    """

    def __init__(self, dim, eps=1e-5, mom=0.1):
        self.eps = eps
        self.mom = mom
        self.training = True

        # scale and shift applied after normalization; returned by parameters()
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

        # running mean / variance, used in place of batch statistics in eval mode
        self.running_mean = torch.zeros(dim)
        self.running_var = torch.ones(dim)

    def __call__(self, x):
        """Normalize ``x`` and return ``gamma * xhat + beta`` (also kept on ``self.out``).

        Raises:
            ValueError: in training mode, when ``x`` is neither 2D nor 3D.
        """
        if self.training:
            # reduce over every dimension except the last one
            if x.ndim == 2:
                dim = 0
            elif x.ndim == 3:
                dim = (0, 1)
            else:
                # previously `dim` stayed unassigned here and the next line
                # failed with an opaque UnboundLocalError
                raise ValueError(
                    f'BatchNorm1D expects a 2D or 3D input in training mode, got {x.ndim}D'
                )
            # batch values
            xmean = x.mean(dim, keepdim=True)
            xvar = x.var(dim, keepdim=True)
        else:
            xmean = self.running_mean
            xvar = self.running_var

        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
        self.out = self.gamma * xhat + self.beta

        # update the running statistics (no gradient tracking needed)
        if self.training:
            with torch.no_grad():
                # xmean / xvar were computed with keepdim=True, so the running
                # statistics take on that broadcastable shape after the first update
                self.running_mean = self.running_mean * (1 - self.mom) + self.mom * xmean
                self.running_var = self.running_var * (1 - self.mom) + self.mom * xvar

        return self.out

    def parameters(self):
        """Return the tensors of this layer: ``[gamma, beta]``."""
        return [self.gamma, self.beta]

In [110]:
class Sequential:
    """Container that applies a list of layers one after another."""

    def __init__(self, layers):
        self.layers = layers

    def __call__(self, x):
        """Feed ``x`` through every layer in order; the final result is kept on ``self.out``."""
        activation = x
        for stage in self.layers:
            activation = stage(activation)
        self.out = activation
        return self.out

    def parameters(self):
        """Return the parameters of all layers as one flat list, in layer order."""
        collected = []
        for stage in self.layers:
            collected.extend(stage.parameters())
        return collected
    
class Embedding:
    """Lookup table that maps integer indices to rows of a random-initialized matrix."""

    def __init__(self, num_embeddings, dim_embeddings, g = None):
        # one row per index, drawn from a standard normal with the optional generator
        table_shape = (num_embeddings, dim_embeddings)
        self.weights = torch.randn(table_shape, generator=g)

    def __call__(self, x):
        """Index the table with ``x``; the result is also kept on ``self.out``."""
        looked_up = self.weights[x]
        self.out = looked_up
        return self.out

    def parameters(self):
        """Return the single tensor of this layer: ``[weights]``."""
        return [self.weights]


class Flatten:
    """Parameter-free layer that collapses everything after the first dimension."""

    def __call__(self, x):
        """Return a view of ``x`` shaped ``(x.shape[0], -1)``; also kept on ``self.out``."""
        leading = x.shape[0]
        flat = x.view(leading, -1)
        self.out = flat
        return flat

    def parameters(self):
        """Return an empty list: this layer holds no tensors."""
        return []
    

In [111]:
# Seed torch's global RNG for reproducibility.
# NOTE(review): the trailing semicolon presumably just suppresses the echoed
# return value in the notebook - confirm before removing it.
torch.manual_seed(42); # seed rng for reproducibility

In [122]:

# Size of the embedding vector learned for each character
n_emb = 10

# Width of the hidden layer
n_hidden = 200

# Number of preceding characters the model looks at for each prediction
context_size = 8

# Index 0 ('.') is the padding / end-of-word marker
alphabet = '.abcdefghijklmnopqrstuvwxyz'

# Embedding lookup -> flatten -> one hidden stage -> output logits.
# Further stages of the form
#     Linear(n_hidden, n_hidden, bias=False), BatchNorm1D(n_hidden), Tanh()
# can be inserted before the output layer to deepen the network.
layers = [
    Embedding(len(alphabet), n_emb, g),
    Flatten(),
    Linear(n_emb * context_size, n_hidden, bias=False),
    BatchNorm1D(n_hidden),
    Tanh(),
    Linear(n_hidden, len(alphabet)),
]

model = Sequential(layers)

with torch.no_grad():
    # shrink the output layer's weights so the initial predictions are less confident
    # (a tanh gain of 5/3 on the other Linear layers is left disabled)
    layers[-1].weight.mul_(0.1)

parameters = model.parameters()
print(f'total params={sum(p.numel() for p in parameters)}')

# turn on gradient tracking for every tensor the training loop updates
for p in parameters:
    p.requires_grad_(True)


total params=22097


In [115]:
# read in all the words; the context manager closes the file handle,
# which the bare open(...).read() form left open
with open('names.txt', 'r') as f:
    words = f.read().splitlines()
print(len(words))                   # number of words
print(max(len(w) for w in words))   # length of the longest word
print(words[:8])                    # first few entries

32033
15
['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']


In [121]:
# character -> integer index, following the order of `alphabet`
intMap = dict(zip(alphabet, range(len(alphabet))))


def build_dataset(words):
    """Turn words into (context, next-character) training pairs.

    Every word is terminated with '.' and preceded by ``context_size`` zeros;
    each character then becomes a target whose input is the ``context_size``
    indices immediately before it. Characters are mapped through ``intMap``.

    Returns:
        A pair of tensors ``(xs, ys)``: one context row and one target index
        per character.
    """
    contexts, targets = [], []
    for word in words:
        # zero padding followed by the encoded word and its terminator
        encoded = [0] * context_size + [intMap[ch] for ch in word + '.']
        for pos in range(context_size, len(encoded)):
            contexts.append(encoded[pos - context_size:pos])
            targets.append(encoded[pos])
    return torch.tensor(contexts), torch.tensor(targets)

import random

with open('names.txt', 'r') as f:
    lines = f.read().splitlines()

# fixed-seed shuffle so the split below is the same on every run
random.seed(42)
random.shuffle(lines)
print(lines[:8])

# split boundaries: first 80% / next 10% / last 10%
n1, n2 = int(0.8*len(lines)), int(0.9*len(lines))
print(n1)
print(n2)

Xtr, Ytr = build_dataset(lines[:n1])        # 80%
Xdev, Ydev = build_dataset(lines[n1:n2])    # 10%
Xtest, Ytest = build_dataset(lines[n2:])    # 10%

print(f'training shape={Xtr.shape}')
print(f'dev shape={Xdev.shape}')
print(f'test shape={Xtest.shape}')


['yuheng', 'diondre', 'xavien', 'jori', 'juanluis', 'erandi', 'phia', 'samatha']
25626
28829
training shape=torch.Size([182625, 8])
dev shape=torch.Size([22655, 8])
test shape=torch.Size([22866, 8])


In [None]:
max_steps = 200000
batch_size = 32
lossi = []  # log10 of the loss at every step

# training mode: the flag is set on every layer
# (among the layer classes in this file only BatchNorm1D reads it)
for layer in model.layers:
    layer.training = True

for i in range(max_steps):
    # sample a random minibatch of training rows
    # (the per-step debug print of `ix` was removed: it dumped a tensor on each of the 200000 steps)
    ix = torch.randint(0, Xtr.shape[0], (batch_size, ))
    Xb, Yb = Xtr[ix], Ytr[ix]

    # forward pass
    logits = model(Xb)
    loss = F.cross_entropy(logits, Yb)

    # drop the gradients of the previous step
    for p in parameters:
        p.grad = None

    # backward pass
    loss.backward()

    # update: step size drops from 0.1 to 0.01 after 150000 steps
    lr = 0.1 if i < 150000 else 0.01
    for p in parameters:
        p.data += -lr * p.grad

    # progress report every 10000 steps
    if i % 10000 == 0:
        print(f'{i}/{max_steps}: loss={loss.item(): .4f}')
    lossi.append(loss.log10().item())



0/200000: loss= 3.2944
10000/200000: loss= 2.1288
20000/200000: loss= 2.2613
30000/200000: loss= 2.0484
40000/200000: loss= 1.9412
50000/200000: loss= 2.2380
60000/200000: loss= 2.1907
70000/200000: loss= 2.4009
80000/200000: loss= 1.7440
90000/200000: loss= 2.1807


KeyboardInterrupt: 

In [89]:
@torch.no_grad() # this decorator disables gradient tracking
def split_loss(split):
    """Print the model's cross-entropy loss on one named data split.

    ``split`` must be 'train', 'val' or 'test'; any other key raises KeyError.
    """
    splits = {
        'train': (Xtr, Ytr),
        'val': (Xdev, Ydev),
        'test': (Xtest, Ytest),
    }
    inputs, targets = splits[split]
    loss = F.cross_entropy(model(inputs), targets)
    print(split, loss.item())

# switch every layer to eval mode before measuring the losses
for layer in layers:
    layer.training = False

# report the loss on the training and validation splits
split_loss('train')
split_loss('val')

train 2.0092804431915283
val 2.130854368209839


In [88]:
# sample from the model
g = torch.Generator().manual_seed(2147483647)

# Each sampling step feeds a single context row. BatchNorm1D must therefore use
# its running statistics: the variance of a one-row batch is undefined.
# Setting the flag here removes the dependency on another cell having done so.
for layer in model.layers:
    layer.training = False

# inference only: no gradient tracking needed, as in split_loss
with torch.no_grad():
    for _ in range(20):

        out = []
        context = [0] * context_size  # start from all '.' (index 0)
        while True:
            # forward pass on a batch holding just the current context
            logits = model(torch.tensor([context]))
            probs = F.softmax(logits, dim=1)
            # draw the next character index from the predicted distribution
            ix = torch.multinomial(probs, num_samples=1, generator=g).item()
            # slide the context window forward by one character
            context = context[1:] + [ix]
            # index 0 is '.', which ends the word
            if ix == 0:
                break
            out.append(ix)

        print(''.join(alphabet[i] for i in out))

cexzer
yalynn
yoviya
hyllin
mittain
lunyn
kyvia
ryciyah
ytthimi
ttilio
yviyah
vifgieda
kyllyn
ystyly
ttaviyah
ftbstid
yivendta
lyscinso
vyantil
yyal
