In [1]:
import torch
import torch.nn.functional as F
import matplotlib

%matplotlib inline

In [2]:
# Load the dataset: one word per line. The context manager closes the file
# handle as soon as the read finishes instead of leaving it open until the
# garbage collector happens to reclaim it.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

# Character <-> index lookup tables. Characters found in the data get indices
# 1..N in sorted order; index 0 is reserved for '.', which the cells below use
# both as left-padding of the context and as the end-of-word label.
chars = sorted(set(''.join(words)))
stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i: s for s, i in stoi.items()}


In [None]:
# Default number of preceding characters used to predict the next one.
context_size = 3

def build_xy(words, context_len=None, char_to_index=None):
    """
    Return X, Y tuple of training data and labels given words.

    X will contain one row for each example. Each example will contain
    `context_len` elements representing character indices. The context is
    left-padded with index 0 at the start of every word.

    Y will contain a character index label for each example. Every word also
    contributes one final example whose label is 0 (end of word).

    Args:
        words: Iterable of strings to turn into examples.
        context_len: Number of context characters per example. Defaults to
            the module-level `context_size` when omitted.
        char_to_index: Mapping from character to integer index. Defaults to
            the module-level `stoi` when omitted.

    Returns:
        (X, Y) where X is an int64 tensor of shape (num_examples, context_len)
        and Y is an int64 tensor of shape (num_examples,). Empty `words`
        yields tensors with zero examples but the same dtype and rank.

    Raises:
        ValueError: If the effective context length is smaller than 1.
        KeyError: If a word contains a character missing from the mapping.
    """
    # Globals are only consulted when the caller did not supply an override.
    n_ctx = context_size if context_len is None else context_len
    lookup = stoi if char_to_index is None else char_to_index

    if n_ctx < 1:
        # With a non-positive length the sliding window below would grow by
        # one element per step and produce ragged rows.
        raise ValueError(f"context length must be >= 1, got {n_ctx}")

    xs = []
    ys = []
    for word in words:
        context = [0] * n_ctx
        # The trailing 0 is the end-of-word label that closes every word.
        for ich in [lookup[ch] for ch in word] + [0]:
            xs.append(context)
            ys.append(ich)
            # Slide the window: drop the oldest index, append the newest.
            # This builds a new list, so rows already stored in xs are safe.
            context = context[1:] + [ich]

    # Explicit dtype and shape keep the result well-formed even when there are
    # no examples (torch.tensor([]) alone would be a float tensor of shape (0,)).
    X = torch.tensor(xs, dtype=torch.long).view(len(xs), n_ctx)
    Y = torch.tensor(ys, dtype=torch.long)
    return X, Y

# Build examples from the first word only so the printed tensors stay small
# enough to check by eye.
X, Y = build_xy(words[:1])
print(X, Y, sep="\n")

tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13],
        [ 5, 13, 13],
        [13, 13,  1]])
tensor([ 5, 13, 13,  1,  0])


In [None]:

num_examples = Y.shape[0]

# Number of characters in our alphabet (index 0 is the terminator character).
# Derived from the lookup table instead of being hard-coded so the embedding
# table always has a row for every index that can appear in X.
vocab_size = max(stoi.values()) + 1

# Number of dimensions in vector space that we map each character to.
embedding_dims = 2

# Fixed seed so the randomly initialised embedding table is reproducible.
g = torch.Generator().manual_seed(2147483647)
C = torch.randn((vocab_size, embedding_dims), dtype=torch.float, generator=g)

# C[X] looks up one embedding per context position; the view flattens each
# example's context embeddings into a single row of
# context_size * embedding_dims values.
C_lookup = C[X].view(num_examples, context_size * embedding_dims)

# Make sure the very first example's first context item is the terminator character.
terminator = C[0]
assert torch.equal(C_lookup[0, :embedding_dims], terminator)





