In [29]:
import torch
import torch.nn.functional as F
import matplotlib

%matplotlib inline

In [30]:
# Load the dataset: one name per line. The context manager guarantees the file
# handle is closed as soon as the contents have been read.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

# Every distinct character that occurs in the dataset, in sorted order.
chars = sorted(set(''.join(words)))

# Character -> index lookup. Real characters start at 1 so that index 0 can be
# reserved for the '.' terminator/padding character.
stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi['.'] = 0

# Index -> character lookup (inverse of `stoi`).
itos = {i: s for s, i in stoi.items()}


In [31]:
# Number of preceding character indices stored in each example's context (each row of X).
context_size = 3

def build_xy(words, context_len=None, char_to_index=None):
    """
    Return X, Y tuple of training data and labels given words.

    X will contain one row for each example. Each example will contain `context_len`
    elements representing character indices.

    Y will contain a character index label for each example.

    Every word contributes one example per character plus one final example whose
    label is 0 (the terminator). At the start of each word the context is filled
    with 0s and then slides forward one character at a time.

    Args:
        words: Iterable of strings to turn into examples.
        context_len: Number of context characters per example. When omitted, the
            notebook-level `context_size` is used.
        char_to_index: Mapping from character to integer index. When omitted, the
            notebook-level `stoi` is used.

    Returns:
        A tuple `(X, Y)` of int64 tensors with shapes `(num_examples, context_len)`
        and `(num_examples,)`. With no words, the shapes are `(0, context_len)`
        and `(0,)`.

    Raises:
        KeyError: If a word contains a character missing from the mapping.
    """
    # Fall back to the notebook globals lazily so existing `build_xy(words)` calls
    # behave exactly as before.
    if context_len is None:
        context_len = context_size
    if char_to_index is None:
        char_to_index = stoi

    xs = []
    ys = []
    for word in words:
        context = [0] * context_len
        for ch in word:
            ich = char_to_index[ch]
            xs.append(context)
            ys.append(ich)
            # Builds a new list, so the rows already stored in `xs` are never mutated.
            context = context[1:] + [ich]
        # Final example of the word: predict the terminator.
        xs.append(context)
        ys.append(0)
    assert len(xs) == len(ys)

    # An explicit dtype and shape keep the result well-formed even when there are no
    # examples (an empty list would otherwise become a float tensor of shape (0,)).
    X = torch.tensor(xs, dtype=torch.long).view(len(xs), context_len)
    Y = torch.tensor(ys, dtype=torch.long)
    return X, Y

# Build a small dataset from only the first three words and show the contexts
# followed by their labels.
X, Y = build_xy(words[:3])
print(X, Y, sep="\n")

tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13],
        [ 5, 13, 13],
        [13, 13,  1],
        [ 0,  0,  0],
        [ 0,  0, 15],
        [ 0, 15, 12],
        [15, 12,  9],
        [12,  9, 22],
        [ 9, 22,  9],
        [22,  9,  1],
        [ 0,  0,  0],
        [ 0,  0,  1],
        [ 0,  1, 22],
        [ 1, 22,  1]])
tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0])


In [32]:

# Number of training examples (one label per example).
num_examples = Y.shape[0]

# Number of characters in our alphabet (index 0 is the terminator character).
# Derived from the vocabulary mapping rather than hard-coded, so the embedding
# table and output layer always cover every index that `stoi` can produce.
vocab_size = len(stoi)

# Number of dimensions in vector space that we map each character to.
embedding_dims = 2

# The length of a context as a "flattened" array of each of its character's embeddings.
embedded_context_dims = context_size * embedding_dims

# Fixed seed so the random initialisation below is reproducible.
g = torch.Generator().manual_seed(2147483647)

# Matrix containing a "lookup table" from character indices to their embeddings in the vector space.
C = torch.randn((vocab_size, embedding_dims), dtype=torch.float, generator=g)

# Number of neurons in the hidden layer
w1_neurons = 10

# Hidden tanh layer
W1 = torch.randn((embedded_context_dims, w1_neurons), dtype=torch.float, generator=g)

# Final softmax layer
W2 = torch.randn((w1_neurons, vocab_size), dtype=torch.float, generator=g)

# Every tensor that gradient descent is allowed to adjust.
params = [C, W1, W2]

for param in params:
    param.requires_grad = True



In [34]:
# Number of iterations the training loop runs; each one is a forward pass over X
# followed by a parameter update.
num_training_rounds = 100

# Factor by which each parameter's gradient is scaled when the parameter is updated.
learning_rate = 0.1

def forward(X):
    """
    Run the network on a batch of contexts and return next-character probabilities.

    Each context's character indices are looked up in the embedding table `C`,
    flattened into a single row, passed through the tanh hidden layer (`W1`) and
    then through the output layer (`W2`), whose logits are normalised with softmax.

    Args:
        X: Integer tensor of character indices with one row per example and
            `context_size` entries per row.

    Returns:
        Tensor of shape `(num_examples, vocab_size)`; each row is a probability
        distribution over the next character.
    """
    num_examples = X.shape[0]

    # Each row is an example consisting of a "flattened" tensor of each character in the context.
    CX = C[X].view(num_examples, embedded_context_dims)

    # Sanity-check the flattening: the leading `embedding_dims` values of the first
    # row must be the embedding of that example's first context character. Comparing
    # against the actual index (rather than assuming it is the terminator) keeps the
    # check valid for arbitrary contexts.
    if num_examples > 0:
        assert torch.equal(CX[0, :embedding_dims], C[X[0, 0]])

    hidden = torch.tanh(CX @ W1)

    assert hidden.shape == (num_examples, w1_neurons)

    logits = hidden @ W2

    assert logits.shape == (num_examples, vocab_size)

    # Library softmax is numerically stable; exponentiating raw logits by hand
    # overflows to inf (and then NaN) once a logit gets large.
    probs = F.softmax(logits, dim=1)

    # Ensure the probabilities of all characters in the first example sum to approximately 1.0.
    # The absolute value matters: without it any sum *below* 1.0 would pass.
    if num_examples > 0:
        assert abs(probs[0].sum().item() - 1.0) < 1e-5

    return probs

# Full-batch gradient descent: every round runs the whole dataset through the
# network, measures the loss, and nudges each parameter against its gradient.
for i in range(num_training_rounds):
    probs = forward(X)

    # Average negative log-likelihood of the correct next character.
    # The row index is derived from Y itself so the loss cannot silently go out of
    # sync with a stale example count computed in another cell.
    # TODO: Use torch's cross-entropy loss here to improve efficiency.
    loss = -probs[torch.arange(Y.shape[0]), Y].log().mean()

    print("LOSS: ", loss.item())

    # Clear gradients left over from the previous round before backpropagating.
    for param in params:
        param.grad = None

    loss.backward()

    # Update in place with autograd disabled; this is the supported alternative to
    # writing through `param.data`.
    with torch.no_grad():
        for param in params:
            param -= learning_rate * param.grad



LOSS:  0.46320825815200806
LOSS:  0.45963940024375916
LOSS:  0.45616501569747925
LOSS:  0.4527813196182251
LOSS:  0.44948506355285645
LOSS:  0.4462727904319763
LOSS:  0.44314125180244446
LOSS:  0.4400876462459564
LOSS:  0.4371088743209839
LOSS:  0.43420249223709106
LOSS:  0.4313656687736511
LOSS:  0.4285960793495178
LOSS:  0.425891250371933
LOSS:  0.4232490062713623
LOSS:  0.4206671416759491
LOSS:  0.41814374923706055
LOSS:  0.4156767427921295
LOSS:  0.4132642149925232
LOSS:  0.4109044671058655
LOSS:  0.4085957705974579
LOSS:  0.4063364863395691
LOSS:  0.4041249752044678
LOSS:  0.40195977687835693
LOSS:  0.39983952045440674
LOSS:  0.3977626860141754
LOSS:  0.3957280218601227
LOSS:  0.393734335899353
LOSS:  0.39178016781806946
LOSS:  0.38986462354660034
LOSS:  0.38798633217811584
LOSS:  0.38614434003829956
LOSS:  0.3843376338481903
LOSS:  0.38256514072418213
LOSS:  0.38082587718963623
LOSS:  0.379118949174881
LOSS:  0.37744346261024475
LOSS:  0.3757985532283783
LOSS:  0.3741833567619324

In [48]:
# Probability that a name starts with 'e', i.e. the next character given a context
# made up entirely of terminator characters.
# This should be relatively high, because 'emma' is in the training set.
# The context is built from `context_size` so the probe stays valid if the context
# length is changed.
forward(torch.tensor([[stoi['.']] * context_size]))[0][stoi['e']]

tensor(0.2842, grad_fn=<SelectBackward0>)