In [124]:
import random
import torch

# --- Load the raw names (one per line) ---
# The context manager closes the file handle instead of leaking it.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()

# De-duplicate, then sort BEFORE shuffling. Iteration order of a set of strings
# varies between interpreter sessions (string hash randomization), so without
# the sort the seeded shuffle below would still produce a different order -- and
# therefore a different train/val/test split -- after every kernel restart.
words = sorted(set(words))
random.seed(42)
random.shuffle(words)

# --- Vocabulary: every character seen in the names plus the '.' boundary token ---
chs = sorted(set(''.join(words + ['.'])))
stoi = {ch: i for i, ch in enumerate(chs)}  # character -> integer id
itos = {i: ch for i, ch in enumerate(chs)}  # integer id -> character

# --- Build the dataset: predict the next token from the previous `block_size` tokens ---
block_size = 3
X, Y = [], []

for w in words:
    # Every name starts from a context made only of boundary tokens.
    context = '.' * block_size
    for ch in w + '.':
        X.append([stoi[c] for c in context])
        Y.append(stoi[ch])
        # Slide the window: drop the oldest character, append the one just predicted.
        context = context[1:] + ch

X = torch.tensor(X)
Y = torch.tensor(Y)

# 80% / 10% / 10% split. The cut points index examples, not names, so the
# examples of a single name can straddle a split boundary.
n1, n2 = int(0.8 * len(X)), int(0.9 * len(X))

X_train, X_val, X_test = X.tensor_split([n1, n2])
Y_train, Y_val, Y_test = Y.tensor_split([n1, n2])

# Sanity check: the three splits partition the dataset exactly.
assert len(X_train) + len(X_val) + len(X_test) == len(X)
assert len(Y_train) + len(Y_val) + len(Y_test) == len(Y)

X_train.shape, X_val.shape, X_test.shape, Y_train.shape, Y_val.shape, Y_test.shape


(torch.Size([169062, 3]),
 torch.Size([21133, 3]),
 torch.Size([21133, 3]),
 torch.Size([169062]),
 torch.Size([21133]),
 torch.Size([21133]))

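# Normalize the initial parameters

Otherwise the initial loss will be too large. With 27 output classes, a model that starts out predicting a uniform distribution has a loss of $-\ln(1/27) \approx 3.30$. With unscaled `randn` initialization the first step instead reports a training loss of about 24.3, because the large random logits make the model confidently wrong; scaling `w2` down and zeroing `b2` brings the first-step loss to about 3.29.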

In [147]:
import torch.nn.functional as F

# --- Baseline MLP: every parameter is drawn from an unscaled standard normal ---
n_embd = 10    # dimensionality of each character embedding
n_hidden = 200 # number of hidden units

# Derive the sizes from the data instead of hard-coding 27 and 3.
vocab_size = len(stoi)
context_len = X_train.shape[1]

torch.manual_seed(42)
C = torch.randn(vocab_size, n_embd)              # embedding table
w1 = torch.randn(n_embd * context_len, n_hidden) # hidden layer weights
b1 = torch.randn(n_hidden)                       # hidden layer bias
w2 = torch.randn(n_hidden, vocab_size)           # output layer weights
b2 = torch.randn(vocab_size)                     # output layer bias
params = [C, w1, b1, w2, b2]
for p in params:
    p.requires_grad = True

bs = 32              # minibatch size
n_steps = 200000     # total SGD steps
eval_every = 5000    # how often to report the validation loss
lr_decay_step = 100000  # step at which the learning rate drops from 0.1 to 0.01


for step in range(n_steps):
    # Sample a random minibatch of training examples (with replacement).
    idx = torch.randint(0, X_train.shape[0], (bs,))
    x = X_train[idx]
    y = Y_train[idx]

    # Forward pass: look up embeddings, flatten the context, tanh hidden layer, logits.
    emb = C[x].view(x.shape[0], -1)
    logits = torch.tanh(emb @ w1 + b1) @ w2 + b2
    loss = F.cross_entropy(logits, y)

    if step % eval_every == 0:
        # Full validation pass; separate names keep the training activations intact.
        with torch.no_grad():
            val_emb = C[X_val].view(X_val.shape[0], -1)
            val_logits = torch.tanh(val_emb @ w1 + b1) @ w2 + b2
            val_loss = F.cross_entropy(val_logits, Y_val)
        # Note: the reported train loss is that of the current minibatch only.
        print(f'step: {step}, train loss: {loss.item()}, val loss: {val_loss.item()}')

    loss.backward()
    lr = 0.1 if step < lr_decay_step else 0.01
    # Plain SGD step. Updating in place under no_grad keeps the update out of the
    # autograd graph without going through `.data`, which skips autograd's checks.
    with torch.no_grad():
        for p in params:
            p -= lr * p.grad
            p.grad = None
    

step: 0, train loss: 24.335227966308594, val loss: 26.72002410888672
step: 5000, train loss: 2.751279592514038, val loss: 2.7108864784240723
step: 10000, train loss: 2.34926176071167, val loss: 2.532747507095337
step: 15000, train loss: 2.5620970726013184, val loss: 2.4622185230255127
step: 20000, train loss: 2.406903028488159, val loss: 2.406139373779297
step: 25000, train loss: 2.750678539276123, val loss: 2.4181902408599854
step: 30000, train loss: 2.4322433471679688, val loss: 2.4484927654266357
step: 35000, train loss: 2.6349072456359863, val loss: 2.420732259750366
step: 40000, train loss: 1.9264978170394897, val loss: 2.3425233364105225
step: 45000, train loss: 1.9710713624954224, val loss: 2.338916778564453
step: 50000, train loss: 2.170379638671875, val loss: 2.339043378829956
step: 55000, train loss: 2.1003916263580322, val loss: 2.3541834354400635
step: 60000, train loss: 3.1957919597625732, val loss: 2.3791303634643555
step: 65000, train loss: 2.4687180519104004, val loss: 

In [148]:
# Evaluate the trained parameters on the held-out test split.
# The result is stored as `test_loss` so it does not overwrite the validation
# loss (`val_loss`) left behind by the training cell.
with torch.no_grad():
    test_emb = C[X_test].view(X_test.shape[0], -1)
    test_logits = torch.tanh(test_emb @ w1 + b1) @ w2 + b2
    test_loss = F.cross_entropy(test_logits, Y_test)

test_loss

tensor(2.1569)

In [145]:
import torch.nn.functional as F

# --- Same MLP, but with the output layer scaled down at initialization ---
# Small `w2` and zero `b2` make the initial logits close to zero, i.e. a nearly
# uniform prediction, so the first-step loss is not inflated by confident
# wrong guesses.
n_embd = 10    # dimensionality of each character embedding
n_hidden = 200 # number of hidden units

# Derive the sizes from the data instead of hard-coding 27 and 3.
vocab_size = len(stoi)
context_len = X_train.shape[1]

torch.manual_seed(42)
C = torch.randn(vocab_size, n_embd)              # embedding table
w1 = torch.randn(n_embd * context_len, n_hidden) # hidden layer weights
b1 = torch.randn(n_hidden)                       # hidden layer bias
w2 = torch.randn(n_hidden, vocab_size) * 0.01    # output weights, scaled down
# Kept as `randn * 0` rather than `torch.zeros`: it still draws from the seeded
# generator, so the minibatches sampled below match those of the unscaled run.
b2 = torch.randn(vocab_size) * 0
params = [C, w1, b1, w2, b2]
for p in params:
    p.requires_grad = True

bs = 32              # minibatch size
n_steps = 200000     # total SGD steps
eval_every = 5000    # how often to report the validation loss
lr_decay_step = 100000  # step at which the learning rate drops from 0.1 to 0.01


for step in range(n_steps):
    # Sample a random minibatch of training examples (with replacement).
    idx = torch.randint(0, X_train.shape[0], (bs,))
    x = X_train[idx]
    y = Y_train[idx]

    # Forward pass: look up embeddings, flatten the context, tanh hidden layer, logits.
    emb = C[x].view(x.shape[0], -1)
    logits = torch.tanh(emb @ w1 + b1) @ w2 + b2
    loss = F.cross_entropy(logits, y)

    if step % eval_every == 0:
        # Full validation pass; separate names keep the training activations intact.
        with torch.no_grad():
            val_emb = C[X_val].view(X_val.shape[0], -1)
            val_logits = torch.tanh(val_emb @ w1 + b1) @ w2 + b2
            val_loss = F.cross_entropy(val_logits, Y_val)
        # Note: the reported train loss is that of the current minibatch only.
        print(f'step: {step}, train loss: {loss.item()}, val loss: {val_loss.item()}')

    loss.backward()
    lr = 0.1 if step < lr_decay_step else 0.01
    # Plain SGD step. Updating in place under no_grad keeps the update out of the
    # autograd graph without going through `.data`, which skips autograd's checks.
    with torch.no_grad():
        for p in params:
            p -= lr * p.grad
            p.grad = None
    

step: 0, train loss: 3.290433406829834, val loss: 3.300619125366211
step: 5000, train loss: 2.7291624546051025, val loss: 2.379938840866089
step: 10000, train loss: 2.177419424057007, val loss: 2.343893051147461
step: 15000, train loss: 2.50797176361084, val loss: 2.305969476699829
step: 20000, train loss: 2.309122323989868, val loss: 2.2802305221557617
step: 25000, train loss: 2.6506752967834473, val loss: 2.31345796585083
step: 30000, train loss: 2.178121566772461, val loss: 2.3158769607543945
step: 35000, train loss: 2.4785425662994385, val loss: 2.3063669204711914
step: 40000, train loss: 1.9919476509094238, val loss: 2.28222918510437
step: 45000, train loss: 1.9372347593307495, val loss: 2.2604293823242188
step: 50000, train loss: 2.2875349521636963, val loss: 2.275184154510498
step: 55000, train loss: 2.08091402053833, val loss: 2.2790064811706543
step: 60000, train loss: 3.0925285816192627, val loss: 2.2785468101501465
step: 65000, train loss: 2.2678749561309814, val loss: 2.299

In [146]:
# Evaluate the trained parameters on the held-out test split.
# The result is stored as `test_loss` so it does not overwrite the validation
# loss (`val_loss`) left behind by the training cell.
with torch.no_grad():
    test_emb = C[X_test].view(X_test.shape[0], -1)
    test_logits = torch.tanh(test_emb @ w1 + b1) @ w2 + b2
    test_loss = F.cross_entropy(test_logits, Y_test)

test_loss

tensor(2.1420)