In [9]:

import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F

In [87]:
words = open("names.txt").read().splitlines()
words[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [88]:
import json

with open("pokemons.json", "r") as file:
    words = json.load(file)
words = words.get("pokemons")
words[:5]


['bulbasaur', 'ivysaur', 'venusaur', 'charmander', 'charmeleon']

In [89]:
import pandas as pd

df = pd.read_csv("nomes.csv")
words = [w.lower() for w in df["group_name"].to_list()]
words[:5]

['aline', 'arao', 'aron', 'ada', 'abade']

In [90]:
def chtoidx(ch):
    if ord(ch) >= ord('a') and ord(ch) <= ord('z'): 
        return ord(ch) - ord('a') + 1
    else: 
        return 0

def idxtoch(idx):
    if idx - 1 + ord('a') >= ord('a') and idx - 1 + ord('a') <= ord('z'):
        return chr(idx - 1 + ord('a'))
    else:
        return '.'

In [174]:
## create datasets
block_size = 10 ## context window fed into the nn to predict the next character

def build_dataset(words):
    X, Y = [], []

    for w in words:
        ex = [chtoidx('.')] * block_size
        for ch in w + '.':
            X.append(ex)
            Y.append(chtoidx(ch))
            ex = ex[1:] + [chtoidx(ch)]

    X = torch.tensor(X)
    Y = torch.tensor(Y)
    return X, Y

import random
random.seed(129387)
random.shuffle(words)

n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))
Xtr, Ytr = build_dataset(words[:n1]) # training dataset
Xdev, Ydev = build_dataset(words[n1:n2]) # development dataset
Xte, Yte = build_dataset(words[n2:]) # test dataset
print(Ytr.shape)
print(Ydev.shape)
print(Yte.shape)


torch.Size([638227])
torch.Size([79923])
torch.Size([79855])


In [195]:
## generate neural network
## structure:
## - convert each input context window character into vector with a lookup table (C)
## - feed vectors into hidden layer 1 (L1)
## - feed L1 outputs into logit output layer (L2)

g = torch.Generator().manual_seed(187246324)
ch_embed_dim = 8 ## dimensions of vectors to embed input characters into
n_L1_neurons = 220 ## number of neurons in L1

C = torch.randn((27, ch_embed_dim)) ## lookup table to index character into higher dimensional vectors
W1 = torch.randn((block_size * ch_embed_dim, n_L1_neurons)) ## weights in L1 neurons
b1 = torch.randn(n_L1_neurons) ## biases in L1 neurons
W2 = torch.randn((n_L1_neurons, 27)) ## weights in L2 neurons
b2 = torch.randn(27) ## biases in L2 neurons
params = [C, W1, b1, W2, b2]
for p in params: 
    p.requires_grad = True
sum(p.nelement() for p in params)

24003

In [210]:
## train

lr = 0.001 ## learning rate
batch_size = 700

for i in range(5000):

    batch = torch.randint(0, Xtr.shape[0], (batch_size,))

    ## compute loss
    logits = (C[Xtr[batch]].view(-1, ch_embed_dim * block_size) @ W1 + b1).tanh() @ W2 + b2 ## output of the nn over all exemples in X
    loss = F.cross_entropy(logits, Ytr[batch]) ## applies softmax and negative log likelihood over output of the nn and labels in Y

    print(loss.item())

    ## calculate gradients
    for p in params: 
        p.grad = None
    
    loss.backward()

    ## update params
    for p in params:
        p.data += lr * -p.grad

2.2365100383758545
2.144059896469116
2.1893391609191895
2.2124292850494385
2.1470189094543457
2.2209126949310303
2.223052501678467
2.144575834274292
2.148543119430542
2.2047226428985596
2.2631335258483887
2.2695131301879883
2.143684148788452
2.160024404525757
2.3024532794952393
2.2204971313476562
1.9878700971603394
2.2954792976379395
2.1334335803985596
2.1163835525512695
2.1750121116638184
2.247309923171997
2.1907238960266113
2.171023368835449
2.204822540283203
2.232327699661255
2.1894240379333496
2.205866575241089
2.2458064556121826
2.2575089931488037
2.140509843826294
2.146388530731201
2.1108639240264893
2.2211782932281494
2.2125256061553955
2.2182135581970215
2.1784260272979736
2.1747055053710938
2.178640842437744
2.158595561981201
2.1820428371429443
2.110888957977295
2.0500926971435547
2.149932622909546
2.2053139209747314
2.067169189453125
2.216031551361084
2.104379415512085
2.2617645263671875
2.2208170890808105
2.1894893646240234
2.1194047927856445
2.198805809020996
2.151689291000

In [211]:
logits = (C[Xtr].view(-1, ch_embed_dim * block_size) @ W1 + b1).tanh() @ W2 + b2 ## output of the nn over all exemples in X
loss = F.cross_entropy(logits, Ytr) ## applies softmax and negative log likelihood over output of the nn and labels in Y
loss.item()

2.180731773376465

In [212]:
logits = (C[Xdev].view(-1, ch_embed_dim * block_size) @ W1 + b1).tanh() @ W2 + b2 ## output of the nn over all exemples in X
loss = F.cross_entropy(logits, Ydev) ## applies softmax and negative log likelihood over output of the nn and labels in Y
loss.item()

2.185800313949585

In [213]:
logits = (C[Xte].view(-1, ch_embed_dim * block_size) @ W1 + b1).tanh() @ W2 + b2 ## output of the nn over all exemples in X
loss = F.cross_entropy(logits, Yte) ## applies softmax and negative log likelihood over output of the nn and labels in Y
loss.item()

2.1823036670684814

In [218]:
## sample from model
init = '' ## desired first characters in the name
init = [chtoidx(ch) for ch in init]
for _ in range(20):
    ctx = [chtoidx('.')] * (block_size - len(init)) + init
    out = init + []
    
    while True:
        probs = F.softmax((C[torch.tensor(ctx)].view(-1) @ W1 + b1).tanh() @ W2 + b2, dim=0)
        next_character = torch.multinomial(probs, 1)
        next_character = next_character[0].item()
        if next_character == 0:
            break
        else:
            ctx = ctx[1:] + [next_character]
            out.append(next_character)
    name = ''.join(idxtoch(idx) for idx in out)
    print(name)


ducrnana
weliao
ruoit
ancaton
srabichi
carilo
daolva
loucil
rafenile
ledisly
elivda
jemaniine
aleminone
caciei
maizali
idalocia
runilia
dismiliene
pelcirdii
dalib
