In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline
import random


In [2]:
# read in all the words
words = open('names.txt', 'r').read().splitlines()

print(len(words))

# build the vocabulary of characters and mappings to/from integers
char_set = ['.'] + sorted(list(set([ch for word in words for ch in list(word)])))

stoi = {char: i for i, char in enumerate(char_set)}  #encode(s)
itos = {i: char for i, char in enumerate(char_set)}
print("encoder: ", stoi)
print("decoder: ", itos)

vocab_size = len(itos)
print(vocab_size)

32033
encoder:  {'.': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}
decoder:  {0: '.', 1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z'}
27


In [5]:
# build the dataset
block_size = 8 # context length: how many characters do we take to predict the next one?

def build_dataset(words):  
    
    X, Y = [], []
    for w in words:
        pad_word = '.' * block_size + w + '.'
        for i in range(block_size, len(pad_word)):
            # print(pad_word[i-block_size:i], " --> ", pad_word[i])
            X.append([stoi[s] for s in pad_word[i-block_size:i]])
            Y.append(stoi[pad_word[i]])            
    return torch.tensor(X), torch.tensor(Y)


x, y = build_dataset(words[:3])
print(x)
for i in range(len(x)):
    # print("".join([itos[int(ci)] for ci in x[i]]), " -> ", itos[int(y[i])])
    print("".join([itos[ci.item()] for ci in x[i]]), " -> ", itos[y[i].item()])   # use item() to convert tensor int to int

tensor([[ 0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0, 25],
        [ 0,  0,  0,  0,  0,  0, 25, 21],
        [ 0,  0,  0,  0,  0, 25, 21,  8],
        [ 0,  0,  0,  0, 25, 21,  8,  5],
        [ 0,  0,  0, 25, 21,  8,  5, 14],
        [ 0,  0, 25, 21,  8,  5, 14,  7],
        [ 0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0,  4],
        [ 0,  0,  0,  0,  0,  0,  4,  9],
        [ 0,  0,  0,  0,  0,  4,  9, 15],
        [ 0,  0,  0,  0,  4,  9, 15, 14],
        [ 0,  0,  0,  4,  9, 15, 14,  4],
        [ 0,  0,  4,  9, 15, 14,  4, 18],
        [ 0,  4,  9, 15, 14,  4, 18,  5],
        [ 0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0, 24],
        [ 0,  0,  0,  0,  0,  0, 24,  1],
        [ 0,  0,  0,  0,  0, 24,  1, 22],
        [ 0,  0,  0,  0, 24,  1, 22,  9],
        [ 0,  0,  0, 24,  1, 22,  9,  5],
        [ 0,  0, 24,  1, 22,  9,  5, 14]])
........  ->  y
.......y  ->  u
......yu  ->  h
.....yuh  ->  e
....yuhe  -

In [4]:
random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr,  Ytr  = build_dataset(words[:n1])     # 80%
Xdev, Ydev = build_dataset(words[n1:n2])   # 10%
Xte,  Yte  = build_dataset(words[n2:])     # 10%

In [6]:
#--------- mimic pytorch.nn ----------
# ------------------------------------------------------------
class Embedding: # serve as the look up table, for each vocab, embed it
  
    def __init__(self, num_embeddings, embedding_dim):   # num_embeddings = vocab_size 
      self.embeds = torch.randn((num_embeddings, embedding_dim))

    def __call__(self, x):   # x shape (batch_size, steps)
        self.out = self.embeds[x]  # return  (batch_size, steps, embedding_dim)
        return self.out

    def parameters(self):
        return [self.embeds]

# ---------------------------------------------------------
class FlattenConsecutive:  # similar to pytorch Flatten but different dims. 
    def __init__(self, n_steps):
        self.wave_steps = n_steps
        
    def __call__(self, x):
        B, T, C = x.shape  # batch_size, steps, embedding_dim
        x = x.view(B, T//self.wave_steps, self.wave_steps*C)
        if x.shape[1] == 1:
            x = x.squeeze(1)  # if L//self.wave_steps = 1, WE JUST RETURN (B, C*N), squeeze out dim=1
        self.out = x    
        return self.out

    def parameters(self):
        return []

# ----------------------------------------------------------
class Linear:
    def __init__(self, fan_in, fan_out, bias=True):
        # use he kaiming normal distribution as initialization, instead of standard normal ~N(0, 1)
        # where weight ~ N (0, std^2)  where std = gain/sqrt(fan_mode) where fan_mode = "fan_in" (default) or "fan_out"
       
        self.weights = torch.randn((fan_in, fan_out))/(fan_in)**0.5
        # self.bias = None
        # if bias:
        #     self.bias = torch.zeros(fan_out)
        self.bias = torch.zeros(fan_out) if bias else None

    def __call__(self, x):
        # if x is passed as batch_size, time_steps, n_embed, flatten it to batch_size * time_steps, n_embed
        #x = x.view(-1, fan_in) @ self.weights
        # if self.bias:
        #     x += self.bias
        self.out = x @ self.weights   # dont forget to make self.out, relate out to weights for backprop
        if self.bias is not None:
            self.out += self.bias
            
        return self.out
        
    def parameters(self):
        # return [self.weight] + ([] if self.bias is None else [self.bias])
        if self.bias is None:
            return [self.weights]
        else:
            return [self.weights] + [self.bias]
            
# -----------------------------------------------------------------------------------
class BatchNorm1d:
    def __init__(self, n_hidden, eps=1e-5, momentum=0.1):
        self.eps = eps
        self.momentum = momentum   # for calculating running mean and var
        self.training = True   # if not training, no grad on running_mean and var, they are not learnable parameter, but are used during backprop
        
        '''The mean and standard-deviation are calculated per-dimension over the mini-batches and 
        γ and β are learnable parameter vectors of size C (where C is the number of features or channels of the input). 
        By default, the elements of γ are set to 1 and the elements of β are set to 0. 
        y = ( (x - E(x))/ sqrt(var(x) + eps) ) *  γ +  β 
        At train time in the forward pass, the standard-deviation is calculated via the biased estimator, equivalent to torch.var(input, unbiased=False). 
        However, the value stored in the moving average of the standard-deviation is calculated via the unbiased estimator, equivalent to torch.var(input, unbiased=True).
        '''
        # parameters (trained with backprop)
        self.gamma = torch.ones(n_hidden)
        self.beta = torch.zeros(n_hidden)
        
        # buffers (trained with a running 'momentum update')
        self.running_mean = torch.zeros(n_hidden)
        # wrong: self.running_var = torch.zeros(n_hidden)
        self.running_var = torch.ones(n_hidden)
        # these are initialized, shared and remain same for one instance of BatchNorm1d,
        # xmean and xvar are not self. because they differ with x ???

    def __call__(self, x):
        # calculate the forward pass
        if self.training:
            if x.ndim == 2:  # x (batch_size, n_hidden)
                dim = 0
            elif x.ndim == 3:  # x (batch_size, wave_steps, n_hidden)   note: in pytorch, they use (batch_size, n_hiddden, steps)
                dim = (0,1)
            # (wrong: not self ): self.x_mean = torch.mean(x, dim=1, keepdim=True)
            # (wrong: not self): self.x_var = torch.var(x, dim=1, keepdim=True, unbiased=False)
            xmean = x.mean(dim=dim, keepdim=True)  # batch mean
            xvar = x.var(dim=dim, keepdim=True)    # batch variation
        else: # like for calculate validation loss
            xmean = self.running_mean
            xvar = self.running_var
        
        #self.out = ((x - self.x_mean)/ (self.x_var + self.eps)**0.5 ) * self.gamma + self.beta
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
        self.out = self.gamma * xhat + self.beta

        if self.training:
            with torch.no_grad():  # x shape of (batch_size, n_hidden)
                self.running_mean = self.running_mean * (1-self.momentum) + xmean * self.momentum
                self.running_var = self.running_var * (1-self.momentum) + xvar * self.momentum

        return self.out

    def parameters(self):
        return [self.gamma] + [self.beta]   
        # as you can see here xmean and xvar are not parameters, they are torch tensor who participate in backprop, but not parameters,

# -----------------------------------------------------------
class Tanh:
    # def __init__(self):

    def __call__(self, x):
        self.out = torch.tanh(x)
        return self.out
        
    def parameters(self):
        return []


# -------------------------------------------------
class Sequential:
  
    def __init__(self, layers):
        self.layers = layers
  
    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        self.out = x
        return self.out
  
    def parameters(self):
        # get parameters of all layers and stretch them out into one list
        return [p for layer in self.layers for p in layer.parameters()]

In [7]:
vocab_size = 27
embedding_dim = 24
n_hidden = 128
# wave_steps = block_size
wave_steps = 2

# note that we have Flatten for all 3 repeated blocks 
model = Sequential([Embedding(vocab_size, embedding_dim),
                    FlattenConsecutive(wave_steps), Linear(embedding_dim * wave_steps, n_hidden), BatchNorm1d(n_hidden), Tanh(),
                    FlattenConsecutive(wave_steps), Linear(n_hidden * wave_steps, n_hidden), BatchNorm1d(n_hidden), Tanh(),
                    FlattenConsecutive(wave_steps), Linear(n_hidden * wave_steps, n_hidden), BatchNorm1d(n_hidden), Tanh(),
                    Linear(n_hidden, vocab_size)]
                  )
# dont forget the parameters
# parameter init
with torch.no_grad():
    model.layers[-1].weights *= 0.1 # last layer make less confident

parameters = model.parameters()
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
  p.requires_grad = True

76963


In [8]:
g = torch.Generator().manual_seed(2147483647)
max_steps = 200000
batch_size = 32
lossi = []

for i in range(max_steps):
    # randomly select ix
    ix = torch.randint(low =0, high=len(Xtr), size=(batch_size,), generator=g)
    x = Xtr[ix]
    y = Ytr[ix]
    logits = model(x)
    loss = F.cross_entropy(logits, y)
    
    # backprop
    for p in parameters:
        p.grad = None

    loss.backward()
    # update: simple SGD
    lr = 0.1 if i < 150000 else 0.01 # step learning rate decay
    for p in parameters:
        p.data += -lr * p.grad
    
    # track stats
    if i % 10000 == 0: # print every once in a while
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
        
    lossi.append(loss.log10().item())

    # if i >= 2000:
    #     break

      0/ 200000: 3.3078
  10000/ 200000: 1.9507
  20000/ 200000: 2.0075
  30000/ 200000: 2.0083
  40000/ 200000: 1.7478
  50000/ 200000: 2.0748
  60000/ 200000: 1.8329
  70000/ 200000: 1.8637
  80000/ 200000: 2.0316
  90000/ 200000: 1.6232
 100000/ 200000: 1.7761
 110000/ 200000: 1.9160
 120000/ 200000: 1.8390
 130000/ 200000: 1.4186
 140000/ 200000: 2.0360
 150000/ 200000: 2.1057
 160000/ 200000: 1.7750
 170000/ 200000: 1.5316
 180000/ 200000: 2.3058
 190000/ 200000: 1.8961


In [9]:
for layer in model.layers:
    print(layer.__class__.__name__, ": ", tuple(layer.out.shape))

Embedding :  (32, 8, 24)
FlattenConsecutive :  (32, 4, 48)
Linear :  (32, 4, 128)
BatchNorm1d :  (32, 4, 128)
Tanh :  (32, 4, 128)
FlattenConsecutive :  (32, 2, 256)
Linear :  (32, 2, 128)
BatchNorm1d :  (32, 2, 128)
Tanh :  (32, 2, 128)
FlattenConsecutive :  (32, 256)
Linear :  (32, 128)
BatchNorm1d :  (32, 128)
Tanh :  (32, 128)
Linear :  (32, 27)


In [97]:
print(model.layers[0].out.shape) # Embed
print(model.layers[1].out.shape) #Flatten
print(model.layers[2].out.shape) # Linear

# instead of (1,2,3,4,5,6,7,8)
# we want (1,2), (3,4), (5,6), (7,8) wave_step_n = 2
wave_step_n = 2
flat = torch.cat([model.layers[0].out[:, ::2, :], model.layers[0].out[:, 1::2, :]], dim=2)
print(flat.shape)

(flat == model.layers[0].out.view(batch_size, block_size//wave_step_n, wave_step_n * embedding_dim)).all()

# so go back to change Flatten

torch.Size([32, 8, 24])
torch.Size([32, 192])
torch.Size([32, 128])
torch.Size([32, 4, 48])


tensor(True)

In [141]:
x = torch.randint(10, (4, 8, 24))
# print(x)
x1 = x.view(4, 4, 48)
print(x1)
shifted = torch.roll(x, shifts=1, dims=1)
# print(shifted)
x2 = shifted.view(4, 4, 48)
print(x2)
print(torch.cat([x1, x2], dim=1))
print(torch.cat([x1, x2], dim=1).shape)

tensor([[[4, 1, 5, 3, 9, 6, 0, 4, 7, 3, 0, 7, 8, 6, 6, 5, 2, 1, 2, 2, 4, 3, 0,
          4, 5, 7, 0, 2, 9, 0, 3, 9, 3, 7, 5, 7, 3, 9, 8, 9, 5, 0, 0, 2, 9, 3,
          0, 0],
         [9, 0, 4, 4, 0, 0, 6, 5, 4, 2, 5, 6, 3, 0, 3, 5, 7, 3, 9, 4, 5, 1, 7,
          2, 1, 3, 9, 0, 2, 1, 8, 2, 0, 1, 8, 4, 4, 4, 1, 3, 1, 6, 9, 2, 4, 4,
          4, 4],
         [5, 0, 8, 2, 3, 0, 5, 1, 0, 3, 4, 9, 3, 2, 9, 8, 6, 5, 6, 9, 8, 5, 1,
          8, 4, 4, 8, 8, 1, 6, 8, 4, 8, 7, 2, 3, 7, 1, 7, 3, 0, 5, 9, 6, 0, 8,
          3, 6],
         [8, 8, 1, 0, 3, 4, 0, 2, 6, 3, 7, 3, 7, 9, 4, 3, 7, 0, 3, 3, 9, 7, 9,
          4, 4, 8, 2, 1, 2, 2, 7, 3, 5, 8, 0, 4, 9, 7, 8, 8, 9, 0, 0, 7, 2, 7,
          2, 9]],

        [[0, 0, 7, 9, 1, 7, 1, 6, 4, 1, 2, 4, 2, 9, 6, 6, 7, 3, 2, 4, 4, 3, 6,
          7, 7, 3, 1, 1, 0, 3, 9, 8, 8, 0, 0, 4, 2, 9, 7, 9, 4, 5, 4, 3, 7, 7,
          8, 6],
         [1, 1, 5, 9, 7, 6, 5, 6, 7, 7, 8, 7, 6, 5, 3, 0, 3, 2, 0, 1, 4, 2, 2,
          0, 1, 6, 3, 4, 0, 0, 2, 0, 7, 5, 5

In [10]:
# eval
# put layers into eval mode (needed for batchnorm especially)
for layer in model.layers:
    layer.training = False

@torch.no_grad()
def split_loss(split):
    sets = {'train': (Xtr, Ytr), 'val': (Xdev, Ydev), 'test': (Xte, Yte)}
    x, y = sets[split]
    loss = F.cross_entropy(model(x), y)
    print(split, loss.item())
    
split_loss('train')
split_loss('val')

train 1.7690614461898804
val 1.9970234632492065


# Andrej's performance log
original (3 character context + 128 hidden neurons, 47267 params): train 1.989, val 2.078

context: 3 -> 8 (62627 params): train 1.918, val 2.027

flat -> hierarchical (44195 params): train 1.941, val 2.029

fix bug in batchnorm: train 1.912, val 2.022

scale up the network: n_embd 24, n_hidden 128 (76K params): train 1.769, val 1.993


In [159]:
print(block_size)
context = torch.tensor([stoi['.']]*block_size)
print(context)

8
tensor([0, 0, 0, 0, 0, 0, 0, 0])


In [19]:
for _ in range(20):
    out = []
    context = [stoi['.']]*block_size
    # print("context: ", context)

    while True:
        # context = torch.tensor([context])   # first example is context
        # print("context tensor: ", context)
        # print("context.shape: ", context.shape)
        
        logits = model(torch.tensor([context]))   # torch.tensor([context]): shape ([1, 8, 27])  # keep context as a list so we can add & update for next one easily
        # print("logits.shape: ", logits.shape)   
        # training loss: cross_entropy  = - log (softmax(logits))
        # wrong: probs = F.softmax(logits) dont forget to choose dimension, on dim=1, [1,8(chosen), 27]
        probs = F.softmax(logits, dim=1)
        # print("probs.shape: ", probs.shape)
        #sample_i = torch.multinomial(probs, num_samples=1)
        sample_i = torch.multinomial(probs, num_samples=1).item()  # .item() convert to int
        # print("sample_i: ", sample_i)
        
        out.append(sample_i)
        # don't forget to shift context for next generation
        # shift the context window and track the samples
        context = context[1:] + [sample_i]
        # print("next context: ", context)
        if sample_i == 0:
            break

    print("".join([itos[int(sample_i)] for sample_i in out]))

stellar.
yusher.
shreed.
alleana.
kroberto.
normah.
harlie.
ami.
taileigh.
kaelan.
xarney.
bownlyn.
mossah.
addilindsia.
shine.
claley.
wyett.
jamila.
mamad.
artelys.
