In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

In [46]:
words = open('names.txt', 'r').read().splitlines()

In [47]:
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
vocab_size = len(itos)
vocab_size

27

In [48]:
block_size = 8 # context length: how many characters do we take to predict the next one?

def build_dataset(words):  
  X, Y = [], []
  
  for w in words:
    context = [0] * block_size
    for ch in w + '.':
      ix = stoi[ch]
      X.append(context)
      Y.append(ix)
      context = context[1:] + [ix] # crop and append

  X = torch.tensor(X)
  Y = torch.tensor(Y)
  print(X.shape, Y.shape)
  return X, Y

import random
random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr,  Ytr  = build_dataset(words[:n1])     # 80%
Xdev, Ydev = build_dataset(words[n1:n2])   # 10%
Xte,  Yte  = build_dataset(words[n2:])

torch.Size([182625, 8]) torch.Size([182625])
torch.Size([22655, 8]) torch.Size([22655])
torch.Size([22866, 8]) torch.Size([22866])


In [49]:
class Linear:
    def __init__(self, fan_in, fan_out, bias=True):
        self.weight = torch.randn((fan_in, fan_out)) / fan_in**0.5
        self.bias = torch.zeros(fan_out) if bias else None

    def __call__(self, x):
        self.out = x @ self.weight
        if self.bias is not None:
            self.out += self.bias
        return self.out

    def parameters(self):
        if self.bias != None:
            return self.weight + self.bias
        return self.weight

class BatchNorm1D:
    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps = eps
        self.momentum = momentum
        self.training = True
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)
        self.running_mean = torch.zeros(dim)
        self.running_var = torch.ones(dim)

    def __call__(self, x):

        if self.training:
            xmean = x.mean(0, keepdim=True)
            xvar = x.var(0, keepdim=True)
        else:
            xmean = self.running_mean
            xvar = self.running_var
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
        self.out = self.gamma * xhat + self.beta

        if self.training:
            with torch.no_grad():
                self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
                self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar
        return self.out

    def parameters(self):
        return [self.gamma, self.beta]

class Tanh:
    def __call__(self, x):
        self.out = torch.tanh(x)
        return self.out
        
    def parameters(self):
        return []

class Embedding:
    def __init__(self, num_embeddings, embeddings_dim):
        self.weight = torch.randn((num_embeddings, embeddings_dim))
    
    def __call__(self, x):
        self.out = self.weight[x]
        return self.out

    def parameters(self):
        return [self.weight]

# Not equal to Flatten in pytorch
class Flatten:
    def __init__(self, n):
        self.n = n

    def __call__(self, x):
        fD, sD, tD = x.shape
        x = x.view(fD, sD//self.n, tD*self.n)
        if x.shape[1] == 1:
            x = x.squeeze(1)
        self.out = x
        return self.out
    def parameters(self):
        return []

class Sequential:
  
  def __init__(self, layers):
    self.layers = layers
  
  def __call__(self, x):
    for layer in self.layers:
      x = layer(x)
    self.out = x
    return self.out
  
  def parameters(self):
    return [p for layer in self.layers for p in layer.parameters()]


In [52]:
n_embd = 24
n_hidden_layers = 128


model = Sequential([
    Embedding(vocab_size, n_embd),
    Flatten(2), Linear(n_embd * 2, n_hidden_layers, bias=False), BatchNorm1D(n_hidden_layers), Tanh(),
    Flatten(2), Linear(n_hidden_layers * 2, n_hidden_layers, bias=False), BatchNorm1D(n_hidden_layers), Tanh(),
    Flatten(2), Linear(n_hidden_layers * 2, n_hidden_layers, bias=False), BatchNorm1D(n_hidden_layers), Tanh(),
    Linear(n_hidden_layers, vocab_size),
])

with torch.no_grad():
  model.layers[-1].weight *= 0.1 # last layer make less confident

parameters = model.parameters()
print(sum(p.nelement() for p in parameters))
for p in parameters:
    p.requires_grad = True

76552


In [41]:
max_steps = 200000
batch_size = 32
lossi = []

for i in range(max_steps):
    ix = torch.randint(0, Xtr.shape[0], (batch_size, ))
    Xb, Yb = Xtr[ix], Ytr[ix]
    

    logits = model(Xb)        
    loss = F.cross_entropy(logits, Yb)
  
    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()
    
    # update
    lr = 0.1 if i < 100000 else 0.01 # step learning rate decay
    for p in parameters:
        p.data += -lr * p.grad

    # track stats
    if i % 10000 == 0: # print every once in a while
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
    lossi.append(loss.log10().item())
    break


TypeError: unsupported operand type(s) for *: 'float' and 'NoneType'

In [8]:
for layer in model.layers:
    layer.training = False

In [9]:
# evaluate the loss
@torch.no_grad() # this decorator disables gradient tracking inside pytorch
def split_loss(split):
  x,y = {
    'train': (Xtr, Ytr),
    'val': (Xdev, Ydev),
    'test': (Xte, Yte),
  }[split]
  logits = model(x)
  loss = F.cross_entropy(logits, y)
  print(split, loss.item())

split_loss('train')
split_loss('val')

train 3.103217363357544
val 3.1043572425842285


In [10]:
for _ in range(20):
    
    out = []
    context = [0] * block_size
    while True:
        # forward pass the neural net
        logits = model(torch.tensor([context]))
        probs = F.softmax(logits, dim=1)
        # sample from the distribution
        ix = torch.multinomial(probs, num_samples=1).item()
        # shift the context window and track the samples
        context = context[1:] + [ix]
        out.append(ix)
        # if we sample the special '.' token, break
        if ix == 0:
            break

    print("".join(itos[i] for i in out))
    

y.
y.
igkeborym.
xu.
fesijyvbuo.
tgauykxtrn.
.
cprjgtodclqsdwrnsz.
blyh.
dvzhoefvdwyixai.
maolreshxf.
eyjdqjjeapyqda.
de.
khxtlbhushihfgeqldgzvnyd.
.
uxxdnyenyey.
awdnlxrwx.
dfvhvpdyxewu.
julqwfujiyjbc.
bzolrsrgkr.


In [16]:
e = torch.randn(4, 8, 10)
e.view(4, 4, 20)

tensor([[[-2.2512e-01, -2.2393e-01, -1.4633e+00, -2.3942e+00,  1.7683e-01,
          -1.4251e-01, -2.3589e-01,  4.3456e-01, -4.7320e-01,  2.1212e-01,
           2.1429e-01,  2.5476e-01,  7.3470e-01,  6.2176e-01, -3.5796e-01,
          -2.3217e+00,  1.6595e-01, -2.7017e-01, -9.2860e-01, -3.3029e-02],
         [-6.5910e-01,  6.1995e-01, -7.2287e-01, -2.0508e+00,  1.7791e+00,
           7.9939e-01, -2.4499e-01,  1.2096e-01, -8.5495e-01, -6.1114e-01,
          -2.1237e-01, -3.9972e-01,  6.8885e-04,  1.5386e+00, -1.0314e+00,
          -1.1214e-01,  8.1772e-01,  4.3274e-01,  3.8807e-01,  7.6808e-01],
         [ 6.5319e-01,  6.6067e-01,  1.2611e+00, -1.4447e-01,  1.5423e+00,
          -5.1042e-01, -3.7211e-01,  5.9883e-01,  1.6525e+00, -1.0708e+00,
          -3.4717e-01, -1.7310e-01,  3.7033e-01, -2.1152e+00, -4.1501e-01,
          -4.3801e-02, -6.7324e-01, -1.0372e-01, -7.4317e-01,  7.7065e-01],
         [ 9.4933e-01,  8.3428e-01,  9.5943e-02,  1.0666e-01, -3.4945e-01,
          -1.0201e+00,