In [76]:
import random
import string
from typing import Any, Dict, List, Tuple
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures



g = torch.Generator().manual_seed(2147483647)


class Linear:
    """Affine layer computing ``x @ weights (+ bias)``.

    Weights are drawn from the module-level generator ``g`` as standard normal
    samples divided by ``sqrt(fan_in)``; the optional bias starts at zero.
    The most recent result is kept on ``self.out``.
    """

    def __init__(self, fan_in, fan_out, bias=True) -> None:
        weight_shape = (fan_in, fan_out)
        self.weights = torch.randn(weight_shape, generator=g) / fan_in**0.5
        if bias:
            self.bias = torch.zeros(fan_out)
        else:
            self.bias = None

    def __call__(self, x) -> torch.Tensor:
        """Apply the layer to ``x`` and cache the result on ``self.out``."""
        projected = x @ self.weights
        if self.bias is not None:
            # In-place add on the freshly created matmul result.
            projected += self.bias
        self.out = projected
        return projected

    def parameters(self):
        """Return the tensors of this layer: weights, then bias if present."""
        params = [self.weights]
        if self.bias is not None:
            params.append(self.bias)
        return params

class Tanh:
    """Element-wise hyperbolic tangent; has no parameters."""

    def __call__(self, x) -> torch.Tensor:
        """Apply ``torch.tanh`` to ``x`` and cache the result on ``self.out``."""
        activated = torch.tanh(x)
        self.out = activated
        return activated

    def parameters(self):
        """Return an empty list."""
        return list()

class BatchNormal1D:
    """Batch normalisation over the trailing (channel) dimension.

    In training mode the mean and (unbiased, ``Tensor.var`` default) variance
    are computed over every dimension except the last, so inputs of shape
    ``(B, C)``, ``(B, T, C)`` or any higher rank ``(..., C)`` are accepted.
    In evaluation mode (``self.training = False``) the running statistics
    are used instead.

    Args:
        dim: number of channels (size of the last input dimension).
        eps: constant added to the variance before the square root.
        momentum: weight given to the current batch statistics when
            updating the running buffers.

    Attributes:
        gamma, beta: scale and shift, returned by ``parameters()``.
        running_mean, running_var: 1-D buffers of length ``dim``.
        out: result of the most recent call.
    """

    def __init__(self, dim, eps=1e-5, momentum=0.1) -> None:
        self.eps = eps
        self.momentum = momentum
        self.training = True
        # parameters (trained with backprop)
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)
        # buffers (trained with a running 'momentum update')
        self.running_mean = torch.zeros(dim)
        self.running_var = torch.ones(dim)

    def __call__(self, x: torch.Tensor) -> torch.Tensor:
        """Normalise ``x`` and, in training mode, update the running buffers.

        Raises:
            ValueError: in training mode when ``x`` has fewer than 2
                dimensions (there is no batch dimension to reduce over).
        """
        if self.training:
            if x.ndim < 2:
                raise ValueError(
                    f"BatchNormal1D expects input with at least 2 dimensions, got {x.ndim}"
                )
            # Reduce over every leading dimension; the last one is the channel axis.
            reduce_dims = tuple(range(x.ndim - 1))
            xmean = x.mean(reduce_dims, keepdim=True)
            xvar = x.var(reduce_dims, keepdim=True)
        else:
            xmean = self.running_mean
            xvar = self.running_var

        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
        self.out = self.gamma * xhat + self.beta

        # update the buffers
        if self.training:
            with torch.no_grad():
                # Flatten the keepdim statistics so the buffers stay 1-D. Otherwise
                # they inherit the rank of the training input and broadcast to the
                # wrong output rank if evaluation input has a different rank.
                batch_mean = xmean.reshape(-1)
                batch_var = xvar.reshape(-1)
                self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * batch_mean
                self.running_var = (1 - self.momentum) * self.running_var + self.momentum * batch_var

        return self.out

    def parameters(self):
        """Return the trainable tensors ``[gamma, beta]``."""
        return [self.gamma, self.beta]

class Embedding:
    """Lookup table mapping integer indices to rows of ``self.weights``.

    The table has shape ``(vocab_size, no_of_embeddings)`` and is filled with
    standard normal samples drawn from the module-level generator ``g``.
    """

    def __init__(self, vocab_size: int, no_of_embeddings: int) -> None:
        table_shape = (vocab_size, no_of_embeddings)
        self.weights = torch.randn(table_shape, generator=g)

    def __call__(self, x) -> Any:
        """Index the table with ``x`` and cache the result on ``self.out``.

        Indexing appends the embedding dimension to the shape of ``x``.
        """
        looked_up = self.weights[x]
        self.out = looked_up
        return looked_up

    def parameters(self):
        """Return the embedding table as a single-element list."""
        return [self.weights]
    
class FlattenConsecutive:
    """Merge every ``n`` consecutive time steps into the channel dimension.

    A ``(B, T, C)`` input is viewed as ``(B, T // n, C * n)``; when the
    resulting time dimension has length 1 it is squeezed away, giving
    ``(B, C * n)``.
    """

    def __init__(self, n) -> None:
        self.n = n

    def __call__(self, x) -> Any:
        """Regroup ``x`` and cache the result on ``self.out``."""
        batch, steps, channels = x.shape
        grouped = x.view((batch, steps // self.n, channels * self.n))
        # Drop the time axis once it has collapsed to a single step.
        self.out = grouped.squeeze(1) if grouped.shape[1] == 1 else grouped
        return self.out

    def parameters(self):
        """Return an empty list; this layer has no parameters."""
        return list()
    
class Sequential:
    """Container that applies a list of layers one after another.

    Args:
        layers: callables applied in order; each receives the previous
            layer's output.

    Attributes:
        layers: the list passed to the constructor (not copied).
        out: result of the most recent call.
    """

    def __init__(self, layers: List) -> None:
        self.layers = layers

    def __call__(self, x) -> Any:
        """Feed ``x`` through every layer in order and return the final output."""
        for layer in self.layers:
            x = layer(x)
        self.out = x
        return x

    def parameters(self) -> List:
        """Return the parameters of all layers, concatenated in layer order.

        Gives the container the same ``parameters()`` interface that each
        layer it holds is expected to provide.
        """
        return [p for layer in self.layers for p in layer.parameters()]
    
# Model / data hyperparameters shared by the cells below.
n_embed = 10     # width of each character embedding (second dim of the Embedding table)
n_hidden = 100   # output width of every hidden Linear layer
block_size = 8   # context length: number of character indices in each input row

def get_names() -> List[str]:
    """Read ``../names.txt`` and return its lines without line terminators."""
    with open('../names.txt') as names_file:
        return names_file.read().splitlines()

# 26 lowercase letters plus the '.' token (index 0).
vocab_size = 27

# Hierarchical model: each FlattenConsecutive(2) halves the time dimension by
# concatenating neighbouring steps. Shapes in the trailing comments are per
# example, written as (time x channels) at the output of that row.
model: Sequential = Sequential([
    Embedding(vocab_size, n_embed),                                                                    # 8 x 10
    FlattenConsecutive(2), Linear(n_embed * 2, n_hidden, bias=False), BatchNormal1D(n_hidden), Tanh(), # flatten: 4 x 20 -> 4 x 100
    FlattenConsecutive(2), Linear(n_hidden * 2, n_hidden, bias=False), BatchNormal1D(n_hidden), Tanh(),    # flatten: 2 x 200 -> 2 x 100
    FlattenConsecutive(2), Linear(n_hidden * 2, n_hidden, bias=False), BatchNormal1D(n_hidden), Tanh(),    # flatten: 200 (time axis squeezed) -> 100
    Linear(n_hidden, vocab_size)                                                                       # logits: 27
])

with torch.no_grad():
    # layers[-3] is the last BatchNormal1D (before the final Tanh and Linear);
    # shrinking its gamma scales down that layer's initial output.
    model.layers[-3].gamma *= 0.1
    for layer in model.layers[:-1]:
        if isinstance(layer, Linear):
            # Multiplying by 1.0 leaves the weights unchanged.
            # NOTE(review): presumably a placeholder for an init gain - confirm.
            layer.weights *= 1.0

# Flat list of every trainable tensor, in layer order.
parameters: List[torch.Tensor] = [p for layer in model.layers for p in layer.parameters()]
no_of_params = sum(p.nelement() for p in parameters)
print(f"{no_of_params=}")
# Enable gradient tracking only after the in-place initialisation tweaks above.
for p in parameters:
    p.requires_grad = True
    
# building stoi
def get_stoi() -> Dict[str, int]:
    """Return the character-to-index map: 'a'..'z' -> 1..26 and '.' -> 0."""
    letters = string.ascii_lowercase
    mapping = dict(zip(letters, range(1, len(letters) + 1)))
    # '.' is added last, after the letters.
    mapping['.'] = 0
    return mapping


# building itos
def get_itos() -> List[str]:
    """Return the index-to-character list: '.' at 0, then 'a'..'z' at 1..26."""
    return ['.', *string.ascii_lowercase]

# Module-level lookup tables; get_xs_ys_from_name reads `stoi` as a global.
stoi = get_stoi()
itos = get_itos()
def get_xs_ys_from_name(name: str, block_size: int) -> Tuple[List[List[int]], List[int]]:
    """Turn one name into (context, target) training pairs.

    A window of ``block_size`` indices, initially all 0, slides over the name.
    For each character the window is shifted to end with that character's
    index, and the target is the index of the following character ('.' after
    the last one). Characters are encoded with the module-level ``stoi``.

    Returns:
        ``(xs, ys)`` with one entry per character of ``name``; ``xs[i]`` is an
        independent list of length ``block_size``.

    NOTE(review): the first emitted context already contains the first
    character, so no pair has the all-zero context predicting the first
    character - confirm this is intended.
    """
    window = [0] * block_size
    contexts, targets = [], []
    # Pair every character with its successor; the last one is followed by '.'.
    successors = list(name[1:]) + ['.']
    for current, following in zip(name, successors):
        current_ix = stoi[current]
        following_ix = stoi[following]

        # Append, then drop the oldest entry, keeping the window length fixed.
        window = (window + [current_ix])[1:]

        contexts.append(list(window))
        targets.append(following_ix)

    return contexts, targets

def build_dataset(names: List[str]) -> Tuple[torch.Tensor, torch.Tensor]:
    """Concatenate the training pairs of every name into two tensors.

    Each name is expanded with ``get_xs_ys_from_name`` using the module-level
    ``block_size``; the contexts and targets are collected in input order and
    converted with ``torch.tensor``.
    """
    contexts: List[List[int]] = []
    targets: List[int] = []
    for word in names:
        word_contexts, word_targets = get_xs_ys_from_name(word, block_size)
        contexts += word_contexts
        targets += word_targets
    return torch.tensor(contexts), torch.tensor(targets)


names = get_names()
print("names retrieved")
# Fixed seed so the shuffle, and therefore the split below, is repeatable.
random.seed(42)
random.shuffle(names)
# 80% train / 10% validation / 10% test, split by name (not by example).
n1 = int(0.8 * len(names))
n2 = int(0.9 * len(names))
Xtr, Ytr = build_dataset(names[:n1])
Xval, Yval = build_dataset(names[n1:n2])
Xtest, Ytest = build_dataset(names[n2:])

# training
max_steps = 200000  # number of minibatch updates
batch_size = 32     # examples sampled per update
# NOTE(review): lossi and ud are not appended to in the cells visible here.
lossi = []
ud = []

no_of_params=45597
names retrieved


In [70]:
# Smoke test: push one random minibatch through the model and inspect the
# logits shape. Only the final expression of the cell is displayed.
# NOTE(review): this forward pass runs with the layers in training mode, so it
# updates the BatchNormal1D running buffers and draws from the shared
# generator `g` before the training loop does.
Xtr.shape
ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
Xb = Xtr[ix]
x: torch.Tensor = model(Xb)
x.shape

torch.Size([32, 27])

In [77]:
# Minibatch SGD: sample a batch, compute cross-entropy, backprop, update.
for i in range(max_steps):
    # generating new batch
    ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
    Xb, Yb = Xtr[ix], Ytr[ix]
    # forward
    x: torch.Tensor = model(Xb)
    loss = F.cross_entropy(x, Yb)

    # backward
    # Clear old gradients so backward() does not accumulate into them.
    for p in parameters:
        p.grad = None
    loss.backward()

    # update
    # Step learning-rate decay; the 100000 boundary is hard-coded rather than
    # derived from max_steps.
    lr = 0.1 if i < 100000 else 0.01
    for p in parameters:
        p.data += -lr * p.grad

    # The printed value is the loss of the current minibatch only.
    if i % 10000 == 0:
         print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
    

      0/ 200000: 3.3217
  10000/ 200000: 1.8252
  20000/ 200000: 1.7584
  30000/ 200000: 2.2835
  40000/ 200000: 1.9354
  50000/ 200000: 2.2560
  60000/ 200000: 1.7906
  70000/ 200000: 1.7284
  80000/ 200000: 1.9224
  90000/ 200000: 1.6209
 100000/ 200000: 1.3516
 110000/ 200000: 2.0380
 120000/ 200000: 1.3887
 130000/ 200000: 1.7233
 140000/ 200000: 1.9910
 150000/ 200000: 1.5905
 160000/ 200000: 1.8258
 170000/ 200000: 1.9257
 180000/ 200000: 1.8604
 190000/ 200000: 2.2050


In [78]:
# Switch to evaluation mode. The flag is set on every layer, but among the
# layer classes defined in this notebook only BatchNormal1D reads `training`
# (it then uses its running statistics instead of batch statistics).
for layer in model.layers:
    layer.training = False

# evaluate the loss
@torch.no_grad() # this decorator disables gradient tracking inside pytorch
def split_loss(split):
    """Print the cross-entropy loss of the model over one whole data split.

    ``split`` must be 'train', 'val' or 'test'; any other key raises
    ``KeyError``. The datasets and ``model`` are read from module globals.
    Nothing is returned.
    """
    datasets = {
        'train': (Xtr, Ytr),
        'val': (Xval, Yval),
        'test': (Xtest, Ytest),
    }
    inputs, targets = datasets[split]
    split_logits = model(inputs)
    split_value = F.cross_entropy(split_logits, targets)
    print(split, split_value.item())

# Report the loss on the training and validation splits; the test split is
# not evaluated in this cell.
split_loss('train')
split_loss('val')

train 1.6902440786361694
val 1.8543689250946045
