In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import random
# import seaborn as sns

### 1. Build Dataset based on block size

In [5]:
# read names
with open('names.txt', 'r') as f:
    names = f.readlines()
names = [name.strip() for name in names]

In [9]:
# build chars to int mapping
chars = '.abcdefghijklmnopqrstuvwxyz'
ctoi = {c: i for i, c in enumerate(chars)}
itoc = {i: c for i, c in enumerate(chars)}

In [15]:
# --- build dataset based on block size ie how many chars do we take to predict the next one
def build_dataset(names: [str], block_size: int = 3):
    X, Y = [], []
    for name in names:
        context = [0] * block_size
        for ch in list(name) + ['.']:
            idx = ctoi[ch]
            X.append(context)
            Y.append(idx)
            context = context[1:] + [idx]
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    return X, Y

In [25]:
# --- split into training and testing set

# creating dataset
X, Y = build_dataset(names=names, block_size=3)

# generating random indexes
indexes = list(range(X.shape[0]))
random.shuffle(indexes)
train_perc = 0.8
train_size = int(X.shape[0] * train_perc)

# spliting 
X_train, Y_train = X[:train_size], Y[:train_size]
X_test, Y_test = X[train_size:], Y[train_size:]

In [28]:
print(X_train.shape, Y_train.shape)

torch.Size([182516, 3]) torch.Size([182516])


In [None]:
# build chars-int dictionary: mapping from subsequence of chars to int


In [None]:
# encode with one-hot encoding (?)



### 2. Build and Train Embedding

In [49]:
# initialize embedding -> each character is represented by a vector of {vector_size}
# initialize MLP layers -> we have two layers
# layer 1: dim W1: (vector_size * block_size, t1)
# layer 2: dim W2: (1, 27) ie 27: num of chars
vector_size = 10
C = torch.randn((27, vector_size))
embedding = C[X] # shape: [228146, 3, 2]
W1 = torch.randn((30, 200))
b1 = torch.randn(200)
W2 = torch.randn((200, 27))
b2 = torch.randn(27)
params = [W1, b1, W2, b2]

In [50]:
# num of parameters in total
sum(p.nelement() for p in params)

11627

In [52]:
# set parameters to training mode
for p in params:
    p.requires_grad = True

In [75]:
# mini-batch gradient descent + learning rate decay
num_epochs = 200000
n = X_train.shape[0]

losses = []
for i in range(num_epochs):
    # indexing mini-batch
    indexes = torch.randint(low=0, high=n, size=(32,))
    
    # forward pass
    emb = C[X_train[indexes]]
    h = torch.tanh(emb.view(-1, 30) @ W1 + b1)
    logits = h @ W2 + b2
    loss = torch.cross_entropy(logits, Y_train[indexes])
    
    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # update
    lr = 0.1 if i < 150000 else 0.01
    for p in parameters:
        p.data += lr * p.grad

    # print loss
    losses
    

### 3. Visualize Embedding