In [9]:

import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F

In [None]:
words = open("names.txt").read().splitlines()

In [None]:
import json

with open("pokemons.json", "r") as file:
    words = json.load(file)
words = words.get("pokemons")


['bulbasaur', 'ivysaur', 'venusaur', 'charmander', 'charmeleon']

In [7]:
def chtoidx(ch):
    if ord(ch) >= ord('a') and ord(ch) <= ord('z'): 
        return ord(ch) - ord('a') + 1
    else: 
        return 0

def idxtoch(idx):
    if idx - 1 + ord('a') >= ord('a') and idx - 1 + ord('a') <= ord('z'):
        return chr(idx - 1 + ord('a'))
    else:
        return '.'

In [46]:
## create datasets
block_size = 8 ## context window fed into the nn to predict the next character

def build_dataset(words):
    X, Y = [], []

    for w in words:
        ex = [chtoidx('.')] * block_size
        for ch in w + '.':
            X.append(ex)
            Y.append(chtoidx(ch))
            ex = ex[1:] + [chtoidx(ch)]

    X = torch.tensor(X)
    Y = torch.tensor(Y)
    return X, Y

import random
random.seed(129387)
random.shuffle(words)

n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))
Xtr, Ytr = build_dataset(words[:n1]) # training dataset
Xdev, Ydev = build_dataset(words[n1:n2]) # development dataset
Xte, Yte = build_dataset(words[n2:]) # test dataset
print(Ytr.shape)
print(Ydev.shape)
print(Yte.shape)

torch.Size([10852])
torch.Size([1388])
torch.Size([1346])


In [57]:
## generate neural network
## structure:
## - convert each input context window character into vector with a lookup table (C)
## - feed vectors into hidden layer 1 (L1)
## - feed L1 outputs into logit output layer (L2)

g = torch.Generator().manual_seed(187246324)
ch_embed_dim = 7 ## dimensions of vectors to embed input characters into
n_L1_neurons = 200 ## number of neurons in L1

C = torch.randn((27, ch_embed_dim)) ## lookup table to index character into higher dimensional vectors
W1 = torch.randn((block_size * ch_embed_dim, n_L1_neurons)) ## weights in L1 neurons
b1 = torch.randn(n_L1_neurons) ## biases in L1 neurons
W2 = torch.randn((n_L1_neurons, 27)) ## weights in L2 neurons
b2 = torch.randn(27) ## biases in L2 neurons
params = [C, W1, b1, W2, b2]
for p in params: 
    p.requires_grad = True
sum(p.nelement() for p in params)

17016

In [70]:
## train

lr = 0.0001 ## learning rate
batch_size = 1000

for i in range(10000):

    batch = torch.randint(0, Xtr.shape[0], (batch_size,))

    ## compute loss
    logits = (C[Xtr[batch]].view(-1, ch_embed_dim * block_size) @ W1 + b1).tanh() @ W2 + b2 ## output of the nn over all exemples in X
    loss = F.cross_entropy(logits, Ytr[batch]) ## applies softmax and negative log likelihood over output of the nn and labels in Y

    print(loss.item())

    ## calculate gradients
    for p in params: 
        p.grad = None
    
    loss.backward()

    ## update params
    for p in params:
        p.data += lr * -p.grad

2.349882125854492
2.4262094497680664
2.4364125728607178
2.4240269660949707
2.4671154022216797
2.4197757244110107
2.4082260131835938
2.3477845191955566
2.3495588302612305
2.4323620796203613
2.452176332473755
2.369331121444702
2.3955602645874023
2.4453134536743164
2.4246437549591064
2.4470810890197754
2.4226818084716797
2.393284559249878
2.3496224880218506
2.3802101612091064
2.3557286262512207
2.430905342102051
2.3734374046325684
2.436652898788452
2.380591869354248
2.5152032375335693
2.4966211318969727
2.3772056102752686
2.4417343139648438
2.414614677429199
2.357058525085449
2.385157823562622
2.4232985973358154
2.391946792602539
2.421337127685547
2.3266372680664062
2.4234111309051514
2.430508613586426
2.413848638534546
2.3638079166412354
2.414907455444336
2.4290292263031006
2.3784937858581543
2.4560816287994385
2.3383569717407227
2.4275732040405273
2.432352066040039
2.455416679382324
2.4742612838745117
2.4258017539978027
2.451505661010742
2.48183536529541
2.380967378616333
2.409780979156

In [71]:
logits = (C[Xtr].view(-1, ch_embed_dim * block_size) @ W1 + b1).tanh() @ W2 + b2 ## output of the nn over all exemples in X
loss = F.cross_entropy(logits, Ytr) ## applies softmax and negative log likelihood over output of the nn and labels in Y
loss.item()

2.415415048599243

In [72]:
logits = (C[Xdev].view(-1, ch_embed_dim * block_size) @ W1 + b1).tanh() @ W2 + b2 ## output of the nn over all exemples in X
loss = F.cross_entropy(logits, Ydev) ## applies softmax and negative log likelihood over output of the nn and labels in Y
loss.item()

2.593229293823242

In [73]:
logits = (C[Xte].view(-1, ch_embed_dim * block_size) @ W1 + b1).tanh() @ W2 + b2 ## output of the nn over all exemples in X
loss = F.cross_entropy(logits, Yte) ## applies softmax and negative log likelihood over output of the nn and labels in Y
loss.item()

2.6599574089050293

In [74]:
## sample from model

for _ in range(20):
    ctx = [chtoidx('.')] * block_size
    out = []
    while True:
        probs = F.softmax((C[torch.tensor(ctx)].view(-1) @ W1 + b1).tanh() @ W2 + b2, dim=0)
        next_character = torch.multinomial(probs, 1)
        next_character = next_character[0].item()
        if next_character == 0:
            break
        else:
            ctx = ctx[1:] + [next_character]
            out.append(next_character)
    name = ''.join(idxtoch(idx) for idx in out)
    print(name)

dapohilasdt
dunhanuy
torlilgort
suwac
lorcooga
conviaselot
ruuiogu
nrelten
zarevad
ragbico
latozunath
mqdaduni
rarepa
amorary
uromlar
hokore
veisntee
puchiile
duldpoge
gcerfidk
