In [None]:
# Read the corpus, one entry per line. Line endings are kept on each entry;
# they are stripped wherever the entries are consumed.
with open('names.txt', 'r') as f:
    names = list(f)

# Vocabulary: '.' is pinned to index 0, followed by every distinct character
# that occurs in the stripped names, in sorted order.
distinct_chars = set(''.join(line.strip() for line in names))
chars = ['.'] + sorted(distinct_chars)

print(type(chars))
print(chars)

# Two-way lookup tables between characters and their integer indices.
stoi = dict(zip(chars, range(len(chars))))
itos = dict(enumerate(chars))

print(stoi)
print(itos)



In [None]:
# --- model shape ---
context_length = 3    # number of preceding characters fed to the model
embedding_size = 10   # columns of the embedding table C
hidden_size = 100     # width of each hidden Linear layer

# --- optimisation ---
mini_batch = 32       # examples sampled per training step
epochs = 20000        # iterations of the training loop (one minibatch each)
lr = 0.1              # step size of the plain gradient-descent update
weight_decay = 0.1    # NOTE(review): not referenced by the visible training loop

In [None]:
import random

import torch


def build_dataset(name_list, block_size):
    """Turn names into (context, next-character) example pairs.

    Each name is stripped, terminated with '.', and scanned left to right.
    For every character, the current context (the indices of the
    ``block_size`` preceding characters, left-padded with 0) is recorded as
    the input and the character's own index as the target. Indices come from
    the module-level ``stoi`` table.

    Args:
        name_list: iterable of name strings; surrounding whitespace is stripped.
        block_size: number of preceding characters in each context.

    Returns:
        A pair ``(X, Y)`` of tensors built with ``torch.tensor`` from the
        collected contexts and targets.

    Raises:
        KeyError: if a name contains a character missing from ``stoi``.
    """
    contexts, targets = [], []
    for raw_name in name_list:
        context = [0] * block_size
        for ch in raw_name.strip() + '.':
            target = stoi[ch]
            contexts.append(context)
            targets.append(target)
            # Slide the window. This builds a new list, so the context that
            # was just appended is never mutated afterwards.
            context = context[1:] + [target]
    return torch.tensor(contexts), torch.tensor(targets)


# Shuffle a copy rather than `names` itself: the source list stays untouched,
# so re-running this cell always reproduces the same split.
random.seed(42)
shuffled_names = list(names)
random.shuffle(shuffled_names)

# 90% / 10% train / validation split.
total_len = len(shuffled_names)
train_len = int(total_len * 0.9)
val_len = total_len - train_len
train_names = shuffled_names[:train_len]
val_names = shuffled_names[train_len:]

X_train, Y_train = build_dataset(train_names, context_length)
X_val, Y_val = build_dataset(val_names, context_length)

g = torch.Generator().manual_seed(2147483647) # for reproducibility

# Embedding table with one row per character index.
# NOTE(review): 27 is presumably the vocabulary size; confirm it equals len(stoi).
C = torch.randn((27, embedding_size), generator=g)

In [None]:
import torch

# Seeded generator that every Linear layer draws its initial values from.
g = torch.Generator().manual_seed(2147483647)


class Linear:
    """Affine layer computing ``x @ w`` plus an optional bias ``b``.

    Weights are drawn from a standard normal (using the module-level
    generator ``g``) and scaled by ``5/3`` and by ``1 / sqrt(n_in)``. The
    bias, when enabled, is drawn from an unscaled standard normal. The most
    recent output is kept on ``self.out``.
    """

    def __init__(self, n_in, n_out, bias=True):
        gain = 5 / 3
        fan_in_scale = 1 / (n_in) ** 0.5
        # Draw order matters for reproducibility: weights first, then bias.
        self.w = torch.randn((n_in, n_out), generator=g) * gain * fan_in_scale
        self.b = torch.randn(n_out, generator=g) if bias else None
        self.bias = bias

    def __call__(self, x):
        """Apply the layer to ``x`` and remember the result in ``self.out``."""
        result = x @ self.w
        if self.bias:
            result += self.b
        self.out = result
        return result

    def parameters(self):
        """Return the trainable tensors: ``[w, b]`` with bias, else ``[w]``."""
        return [self.w, self.b] if self.bias else [self.w]
    
class Tanh:
    """Element-wise tanh activation; keeps its latest output on ``self.out``."""

    def __call__(self, x):
        activated = x.tanh()
        self.out = activated
        return activated

    def parameters(self):
        """Return an empty list: this layer has nothing to train."""
        return list()
    

class BatchNorm1d:
    """Batch normalisation over dimension 0 with a learnable scale and shift.

    In training mode the input is normalised with the statistics of the
    current batch, and running estimates of mean and variance are updated.
    In evaluation mode (``self.training = False``) the running estimates are
    used instead.

    Args:
        dim: number of features (size of the normalised trailing dimension).
        eps: constant added to the variance before the square root.
        momentum: weight given to the newest batch statistic in the running
            estimates; the previous estimate keeps ``1 - momentum``.
    """

    def __init__(self, dim, eps=1e-05, momentum=0.1):
        self.eps = eps
        self.momentum = momentum
        self.training = True
        # Learnable affine parameters.
        self.scale = torch.ones(dim)
        self.shift = torch.zeros(dim)
        # Running statistics, updated outside of autograd.
        self.running_mean = torch.zeros(dim)
        self.running_var = torch.ones(dim)

    def __call__(self, x):
        """Normalise ``x``, apply scale/shift, and store the result in ``self.out``."""
        if self.training:
            xmean = x.mean(dim=0, keepdim=True)
            xvar = x.var(dim=0, keepdim=True)
        else:
            xmean = self.running_mean
            xvar = self.running_var
        # eps goes inside the square root so the denominator is sqrt(var + eps).
        x_hat = (x - xmean) / torch.sqrt(xvar + self.eps)
        self.out = self.scale * x_hat + self.shift
        if self.training:
            with torch.no_grad():
                # Exponential moving averages. The variance buffer must track
                # the variance itself, because evaluation mode takes its
                # square root above. keepdim=True makes both buffers (1, dim).
                self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
                self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar
        return self.out

    def parameters(self):
        """Return the trainable tensors ``[scale, shift]``."""
        return [self.scale, self.shift]
    
class CrossEntropyLoss:
    """Callable wrapper that forwards to ``torch.nn.functional.cross_entropy``."""

    def __call__(self, x, y):
        # x: logits, y: target class indices; both are passed through untouched.
        loss_fn = torch.nn.functional.cross_entropy
        return loss_fn(x, y)
    

            
        
    

In [None]:

# Network: an input block, four identical hidden blocks, and an output block.
# Each block is Linear -> BatchNorm1d (-> Tanh for all but the output block).
# Layers are constructed in the same order as they appear in the stack.
layers = [Linear(context_length * embedding_size, hidden_size), BatchNorm1d(hidden_size), Tanh()]
for _hidden_block in range(4):
    layers += [Linear(hidden_size, hidden_size, bias=False), BatchNorm1d(hidden_size), Tanh()]
layers += [Linear(hidden_size, 27, bias=False), BatchNorm1d(27)]


# Alternative stack without the intermediate BatchNorm layers:
# layers = [
#     Linear(context_length * embedding_size, hidden_size),  Tanh(),
#     Linear(hidden_size, hidden_size, bias = False),  Tanh(),
#     Linear(hidden_size, hidden_size, bias = False),  Tanh(),
#     Linear(hidden_size, hidden_size, bias = False),  Tanh(),
#     Linear(hidden_size, hidden_size, bias = False),  Tanh(),
#     Linear(hidden_size, 27, bias = False), BatchNorm1d(27),
# ]


with torch.no_grad():
    # last layer: make less confident by shrinking the final BatchNorm scale
    layers[-1].scale.mul_(0.1)
    # all other layers: apply gain (currently 1.0, i.e. a no-op; the
    # commented-out alternative in the original was 5/3)
    for module in layers[:-1]:
        if isinstance(module, Linear):
            module.w.mul_(1.0)

# Everything the optimiser updates: the embedding table plus each layer's tensors.
parameters = [C]
for module in layers:
    parameters.extend(module.parameters())

print(sum(p.numel() for p in parameters))
for p in parameters:
    p.requires_grad_(True)


In [None]:
losses = []  # training loss of every step
ud = []      # per step: log10(std(lr * grad) / std(param)) for every parameter
um = []
loss_fn = CrossEntropyLoss()  # built once instead of once per step
log_every = 1000              # print the loss only every `log_every` steps
for i in range(epochs):
    # Sample the minibatch from the seeded generator `g` so that a fresh
    # run of the notebook reproduces the same sequence of batches.
    samples = torch.randint(0, X_train.shape[0], (mini_batch,), generator=g)
    mini_batch_X = X_train[samples]
    mini_batch_Y = Y_train[samples]

    # Forward pass: look up embeddings, flatten each context into one row,
    # then run the result through every layer in order.
    E = C[mini_batch_X]
    E = E.view(-1, context_length * embedding_size)
    for layer in layers:
        E = layer(E)
    loss = loss_fn(E, mini_batch_Y)

    # Backward pass. retain_grad keeps gradients on the intermediate layer
    # outputs so they can be inspected after training.
    for layer in layers:
        layer.out.retain_grad()
    for p in parameters:
        p.grad = None

    loss.backward()
    losses.append(loss.item())
    if i % log_every == 0 or i == epochs - 1:
        print("training loss: ", loss.item())

    # Plain gradient-descent update with a constant learning rate.
    for p in parameters:
        p.data -= p.grad * lr

    with torch.no_grad():
        ud.append([((lr*p.grad).std() / p.data.std()).log10().item() for p in parameters])

        
    

In [None]:
# Visualize the distribution of each Tanh layer's most recent output.
import matplotlib.pyplot as plt
plt.figure(figsize=(20, 4)) # width and height of the plot
legends = []
for i, layer in enumerate(layers[:-1]): # note: exclude the output layer
    if isinstance(layer, Tanh):
        t = layer.out
        # "saturated" is the share of activations with magnitude above 0.97.
        print('layer %d (%10s): mean %+.2f, std %.2f, saturated: %.2f%%' % (i, layer.__class__.__name__, t.mean(), t.std(), (t.abs() > 0.97).float().mean()*100))
        hy, hx = torch.histogram(t, density=True)
        # hx holds bin edges (one more than hy), so drop the last edge.
        plt.plot(hx[:-1].detach(), hy.detach())
        legends.append(f'layer {i} ({layer.__class__.__name__})')
plt.legend(legends);
plt.title('activation distribution');

In [None]:
# Visualize the distribution of the gradient retained on each Tanh output.
plt.figure(figsize=(20, 4)) # width and height of the plot
legends = []
for i, layer in enumerate(layers[:-1]): # note: exclude the output layer
    if isinstance(layer, Tanh):
        # Populated because the training loop calls retain_grad() on layer.out.
        t = layer.out.grad
        print('layer %d (%10s): mean %+f, std %e' % (i, layer.__class__.__name__, t.mean(), t.std()))
        hy, hx = torch.histogram(t, density=True)
        # hx holds bin edges (one more than hy), so drop the last edge.
        plt.plot(hx[:-1].detach(), hy.detach())
        legends.append(f'layer {i} ({layer.__class__.__name__})')
plt.legend(legends);
plt.title('gradient distribution');

In [None]:
# Visualize the gradient distribution of every 2-D parameter of every layer.
plt.figure(figsize=(20, 4)) # width and height of the plot
legends = []
# Enumerate the layers so each legend entry carries the index of the layer it
# belongs to, instead of whatever value a loop in another cell left behind.
for layer_idx, layer in enumerate(layers):
    for p in layer.parameters():
        t = p.grad
        if p.ndim == 2:
            print('weight %10s | mean %+f | std %e | grad:data ratio %e' % (tuple(p.shape), t.mean(), t.std(), t.std() / p.std()))
            hy, hx = torch.histogram(t, density=True)
            # hx holds bin edges (one more than hy), so drop the last edge.
            plt.plot(hx[:-1].detach(), hy.detach())
            legends.append(f'{layer_idx} {tuple(p.shape)}')
plt.legend(legends)
plt.title('weights gradient distribution');

In [None]:
# Plot, for every 2-D parameter, the log10 update-to-data ratio recorded at
# each training step in `ud`.
plt.figure(figsize=(20, 4))
legends = []
for i, p in enumerate(parameters):
    if p.ndim != 2:
        continue
    plt.plot([step_ratios[i] for step_ratios in ud])
    legends.append('param %d' % i)
# Horizontal guide at -3: these ratios should be ~1e-3.
plt.plot([0, len(ud)], [-3, -3], 'k')
plt.legend(legends);


In [None]:
# # BatchNorm forward pass as a widget

# from ipywidgets import interact, interactive, fixed, interact_manual
# import ipywidgets as widgets
# import scipy.stats as stats
# import numpy as np

# def normshow(x0):
  
#   g = torch.Generator().manual_seed(2147483647+1)
#   x = torch.randn(5, generator=g) * 5
#   x[0] = x0 # override the 0th example with the slider
#   mu = x.mean()
#   sig = x.std()
#   y = (x - mu)/sig

#   plt.figure(figsize=(10, 5))
#   # plot 0
#   plt.plot([-6,6], [0,0], 'k')
#   # plot the mean and std
#   xx = np.linspace(-6, 6, 100)
#   plt.plot(xx, stats.norm.pdf(xx, mu, sig), 'b')
#   xx = np.linspace(-6, 6, 100)
#   plt.plot(xx, stats.norm.pdf(xx, 0, 1), 'r')
#   # plot little lines connecting input and output
#   for i in range(len(x)):
#     plt.plot([x[i],y[i]], [1, 0], 'k', alpha=0.2)
#   # plot the input and output values
#   plt.scatter(x.data, torch.ones_like(x).data, c='b', s=100)
#   plt.scatter(y.data, torch.zeros_like(y).data, c='r', s=100)
#   plt.xlim(-6, 6)
#   # title
#   plt.title('input mu %.2f std %.2f' % (mu, sig))

# interact(normshow, x0=(-30,30,0.5));
