# N-gram MLP (Makemore)
A character-level language model using an MLP with learned embeddings.

In [2]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as pl

In [3]:
# Shared random generator with a fixed seed, so every run draws the same numbers.
g = torch.Generator()
g.manual_seed(2147483647)

In [7]:
class Linear:
    """Affine layer computing ``x @ weight`` plus an optional bias.

    Args:
        fan_in: number of input features (rows of ``weight``).
        fan_out: number of output features (columns of ``weight``).
        bias: when False, no bias tensor is created and ``self.bias`` is None.

    Attributes:
        weight: tensor of shape ``(fan_in, fan_out)``.
        bias: tensor of shape ``(fan_out,)`` filled with zeros, or None.
        out: result of the most recent forward call (set by ``__call__``).
    """

    def __init__(self, fan_in, fan_out, bias=True):
        # Standard-normal draws from the module-level generator `g`, scaled by
        # 1/sqrt(fan_in) so the output scale does not grow with the input width.
        self.weight = torch.randn((fan_in, fan_out), generator=g) / fan_in ** 0.5
        self.bias = torch.zeros(fan_out) if bias else None

    def __call__(self, x):
        """Apply the layer to ``x``, cache the result in ``self.out`` and return it."""
        self.out = x @ self.weight
        if self.bias is not None:
            self.out += self.bias
        return self.out

    def parameters(self):
        """Return the layer's tensors: ``[weight, bias]``, or ``[weight]`` without bias."""
        # The bias is included only when it exists; the previous condition was
        # inverted, which dropped a real bias and returned None for bias=False.
        if self.bias is None:
            return [self.weight]
        return [self.weight, self.bias]


In [9]:
class Tanh:
    """Element-wise tanh activation with no trainable tensors."""

    def __call__(self, x):
        """Apply tanh to ``x``, remember the result in ``self.out`` and return it."""
        activated = torch.tanh(x)
        self.out = activated
        return activated

    def parameters(self):
        """Return an empty list: this layer has nothing to train."""
        no_params = []
        return no_params

In [None]:
class BatchNorm1D:
    """Batch normalization over dimension 0 of the input.

    In training mode the input is normalized with the statistics of the
    current batch, and exponential moving averages of those statistics are
    maintained. With ``self.training = False`` the stored running statistics
    are used instead.

    Args:
        dim: number of features; size of gamma, beta and the running buffers.
        eps: constant added to the variance before the square root.
        mom: weight given to the newest batch statistic in the running average.
    """

    def __init__(self, dim, eps=1e-5, mom=0.1):
        self.eps = eps
        self.mom = mom
        self.training = True

        # Trainable scale and shift applied after normalization.
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

        # Running estimates of mean and variance (not returned by parameters()).
        self.running_mean = torch.zeros(dim)
        self.running_var = torch.ones(dim)

    def __call__(self, x):
        """Normalize ``x``, cache the result in ``self.out`` and return it."""
        if self.training:
            # Statistics of the current batch, reduced over dimension 0.
            xmean = x.mean(0, keepdim=True)
            xvar = x.var(0, keepdim=True)
        else:
            xmean = self.running_mean
            xvar = self.running_var

        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
        self.out = self.gamma * xhat + self.beta

        if self.training:
            # Exponential moving average: keep (1 - mom) of the OLD running
            # value and blend in mom of the new batch statistic. The previous
            # code blended the batch statistic with itself, which simply
            # overwrote the running buffers with the latest batch.
            with torch.no_grad():
                self.running_mean = (1 - self.mom) * self.running_mean + self.mom * xmean
                self.running_var = (1 - self.mom) * self.running_var + self.mom * xvar

        return self.out

    def parameters(self):
        """Return the trainable tensors ``[gamma, beta]``."""
        return [self.gamma, self.beta]

In [None]:

# Dimensionality of the embedding vector learned for each character
n_emb = 10 

# Number of hidden neurons in each hidden layer
n_hidden = 200

# How many letters we're looking at before making the next prediction
context_size = 3

alphabet = '.abcdefghijklmnopqrstuvwxyz'
# Embedding table: one row of n_emb values per character in the alphabet
C = torch.randn([len(alphabet), n_emb], generator=g)

# Five Linear+Tanh hidden stages followed by a Linear output layer that
# produces one score per character in the alphabet.
layers = [
    Linear(n_emb * context_size, n_hidden), Tanh(),
    Linear(n_hidden, n_hidden), Tanh(),
    Linear(n_hidden, n_hidden), Tanh(),
    Linear(n_hidden, n_hidden), Tanh(),
    Linear(n_hidden, n_hidden), Tanh(),
    Linear(n_hidden, len(alphabet))
]

# Rescale the initial weights in place without recording the operations for
# autograd. The context manager is torch.no_grad(); torch.nograd does not
# exist and raised AttributeError.
with torch.no_grad():
    # last layer make less confident
    layers[-1].weight *= 0.1
    # apply gain for other layers
    for layer in layers[:-1]:
        if isinstance(layer, Linear):
            layer.weight *= 5/3 # for tanh, 5/3



In [12]:
# Sanity check: vocabulary size (the 26 lowercase letters plus the '.' character).
print(len(alphabet))

27
