In this notebook, a neural network approach will be used.

This notebook is based on this [video](https://youtu.be/PaCmpygFfXo?si=jlreZYqCOdqHNMwA&t=5177).

In [1]:
# Load data: one name per line of the text file.
# A context manager is used so the file handle is closed as soon as the
# contents have been read, instead of staying open until garbage collection.
with open('./data/names.txt', 'r') as f:
    words = f.read().splitlines()

# Number of names loaded (displayed as the cell output)
len(words)

32033

In [2]:


# Vocabulary: every distinct character that occurs in the names, sorted.
chars = sorted(set(''.join(words)))

# Character -> index. Indices start at 1 so that 0 stays free for '.',
# the token used to mark the boundaries of a name.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0

# Index -> character (inverse mapping of stoi).
itos = {idx: ch for ch, idx in stoi.items()}

## - prepare training set

In [3]:
import torch

# Build the bigram training set: for every pair of consecutive characters,
# xs holds the index of the first one and ys the index of the one after it.

xs, ys = [], []

for w in words:
    # Surround the name with the '.' boundary token, then map to indices
    padded = '.' + w + '.'
    idxs = [stoi[c] for c in padded]

    # Every index except the last is an input; its successor is the target
    xs.extend(idxs[:-1])
    ys.extend(idxs[1:])

xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()

print(f'number of examples: {num}')


number of examples: 228146


## - initialize the network

In [4]:
# Weights of a single linear layer: a 27x27 matrix, i.e. 27 neurons that
# each receive 27 inputs. A dedicated, seeded generator makes the random
# initialization reproducible.
g = torch.Generator()
g.manual_seed(2147483647)

# requires_grad=True so that autograd tracks operations on W
W = torch.randn(27, 27, generator=g, requires_grad=True)

## - training

In [60]:
import torch.nn.functional as F

# Gradient descent on the regularized negative log-likelihood.

# The one-hot encoding of the inputs does not depend on W, so it is built
# once here instead of being re-allocated on every iteration.
xenc = F.one_hot(xs, num_classes=27).float()  # input to the network: one-hot encoding

for k in range(100):

    # forward pass

    logits = xenc @ W      # semantics: log-counts

    counts = logits.exp()  # semantics: counts, equivalent to N
    probs = counts / counts.sum(1, keepdim=True)  # probability for next character

    # Mean negative log-probability assigned to the correct next character,
    # plus a penalty on the mean squared weight (regularization).
    loss = -probs[torch.arange(num), ys].log().mean() + 0.01*(W**2).mean()

    # backward pass

    W.grad = None  # reset the gradient before accumulating a new one
    loss.backward()

    # update: step against the gradient. The in-place change is made under
    # no_grad() so autograd does not record it; this replaces writing
    # through `W.data`, which bypasses autograd's safety checks.
    with torch.no_grad():
        W -= 10 * W.grad

# Loss computed in the last iteration (before that iteration's update)
print(loss.item())


2.4810140132904053


## - sample

In [68]:
# Fresh generator with a fixed seed so the sampled names are reproducible
g = torch.Generator()
g.manual_seed(2147483647)

for i in range(5):
    out = []
    ix = 0  # index 0 is '.', so every name starts from the boundary token
    finished = False
    while not finished:
        # forward pass for a single character:
        # one-hot input -> log-counts -> counts
        xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float()
        logits = xenc @ W
        counts = logits.exp()
        # normalize the counts into probabilities for the next character
        p = counts / counts.sum(dim=1, keepdim=True)

        # draw the next character index from that distribution
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix])
        # drawing index 0 ('.') ends the name; the '.' stays in the output
        finished = ix == 0
    print(''.join(out))
        

cexze.
momasurailezityha.
konimittain.
llayn.
ka.
