In [1]:
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import torch
import torch.nn.functional as F

In [3]:
words = open('names.txt').read().splitlines()
len(words), words[:8]

(32033,
 ['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia'])

In [4]:
# let's only have one special token, and let's have it at index 0, offset others by 1
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
num_classes = len(stoi)

In [5]:
# test build the dataset
block_size = 3 # How many characters do we take to predict the next one : 3 chars to predict the 4th
for w in words[:5]:
    print(w)
    context = [0] * block_size
    for ch in w + '.':
        ix = stoi[ch]
        print(''.join(itos[i] for i in context), '-------->', itos[ix])
        context = context[1:] + [ix] # crop and append moving window

emma
... --------> e
..e --------> m
.em --------> m
emm --------> a
mma --------> .
olivia
... --------> o
..o --------> l
.ol --------> i
oli --------> v
liv --------> i
ivi --------> a
via --------> .
ava
... --------> a
..a --------> v
.av --------> a
ava --------> .
isabella
... --------> i
..i --------> s
.is --------> a
isa --------> b
sab --------> e
abe --------> l
bel --------> l
ell --------> a
lla --------> .
sophia
... --------> s
..s --------> o
.so --------> p
sop --------> h
oph --------> i
phi --------> a
hia --------> .


In [6]:
# build the dataset (only for N words right now)

block_size = 3 # How many characters do we take to predict the next one : 3 chars to predict the 4th
X, Y, = [], [] # X, input | Y, labels

for w in words[:5]:
    context = [0] * block_size
    for ch in w + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        context = context[1:] + [ix] # crop and append moving window

X = torch.tensor(X)
Y = torch.tensor(Y)

X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

In [7]:
# similar to paper cramming 17000 words into 30 dim space
# we have 5 words so lets cram it into a 2 dimensional space

In [8]:
C = torch.randn(27, 2) # each of the 27 chars will have a 2 dimensional embedding
C.shape, C.dtype

(torch.Size([27, 2]), torch.float32)

In [9]:
# to access the embeddings for the 5th character
C[5]

tensor([ 2.0808, -0.0085])

In [10]:
# the other way to do it is to use one hot encoded vectors to lookup via matrix multplication
# they're equivalent interpretations
F.one_hot(torch.tensor(5), num_classes=27).float() @ C

tensor([ 2.0808, -0.0085])

In [11]:
C.shape, X.shape

(torch.Size([27, 2]), torch.Size([32, 3]))

In [12]:
C[1].shape, C[[1,2,3]].shape

(torch.Size([2]), torch.Size([3, 2]))

In [13]:
C[X].shape

torch.Size([32, 3, 2])

In [14]:
X[13,2]

tensor(1)

In [15]:
C[X][13,2], C[X[13,2]] 

(tensor([-1.5018, -0.6028]), tensor([-1.5018, -0.6028]))

In [16]:
# this pytorch indexing is still a bit confusing for me
# each value in X(index of the char) is being indexed into the C tensor which has a 2 dim embedding for each char possible
# hence this indexing pulls out the embedding for each char in the X tensor, given 32 rows, 3 chars each with 2 dims
# think a bit more and it makes sense
emb = C[X]
# what's useful though is thinking of the embedding matrix needed
# as seen from above 32 samples with 3 characters each with 2 dim embedding
emb.shape

torch.Size([32, 3, 2])

In [17]:
W1 = torch.randn(3*2, 100) # 2 dim embedding, 3 of them, say 100 neurons
b1 = torch.randn(100)
W1.shape, b1.shape

(torch.Size([6, 100]), torch.Size([100]))

In [18]:
# note that the shapes for embedding and W1 are not multipliable
# has to be reshaped

In [19]:
# let try concatenating first
emb[:, 0, :].shape

torch.Size([32, 2])

In [20]:
# using torch.cat and ugly stacking
torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 1, :]], dim=1).shape

torch.Size([32, 6])

In [21]:
# using torch.unbind (on dim 1) and then cat-ing it
torch.cat(torch.unbind(emb, dim=1), dim=1).shape

torch.Size([32, 6])

In [22]:
# short intro to view and storage on a tensor
# tensor's internal storage is always a 1 dim tensor
# view is how you can change the way you arrange it
# storage offset, strides, shapes
# good reading here
# http://blog.ezyang.com/2019/05/pytorch-internals/

In [23]:
# view is pretty efficient
emb.view(32, 6).shape

torch.Size([32, 6])

In [24]:
# Finally we can go back to multiplication of weights and biases
# -1 requires view to infer the shape
# lets get the activations
h = torch.tanh(emb.view(-1, W1.shape[0]) @ W1 + b1)
h.shape

torch.Size([32, 100])

In [25]:
# CHECK YO DTYPES
# CHECK YO SHAPES
# CHECK YO BROADCASTS

In [26]:
# final layer - input 100 from hidden layer, output 27 possible chars
W2 = torch.rand(100, 27)
b2 = torch.rand(27)

W2.shape, b2.shape

(torch.Size([100, 27]), torch.Size([27]))

In [27]:
# Final logits you
logits = h @ W2 + b2
logits.shape

torch.Size([32, 27])

In [28]:
# Lets get the counts
counts = logits.exp()

In [29]:
# finally the softmax
probs = counts / counts.sum(dim=1, keepdims=True)
probs.shape

torch.Size([32, 27])

In [30]:
# check that rows sum to 1
probs.sum(dim=1)

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000])

In [31]:
# Now that we have probs, time to get the loss
# for each Y (which is an int)
# need to pick out the corresponding prob for it from each corresponding row
torch.arange(32)

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])

In [32]:
# to pick the prob for each Y for each row, using the arange as the row and Y as the col lookup
probs[torch.arange(len(Y)), Y]

tensor([1.2748e-03, 2.4018e-03, 1.2570e-04, 1.5168e-02, 8.5282e-03, 5.6672e-05,
        3.2609e-01, 6.7816e-03, 1.3303e-02, 1.2102e-02, 1.9635e-03, 2.7434e-02,
        7.8319e-04, 9.5160e-02, 3.9812e-05, 3.8317e-02, 1.0551e-02, 2.5108e-02,
        3.5615e-03, 2.6272e-01, 2.7493e-03, 5.1367e-03, 2.6550e-06, 9.3671e-03,
        9.8317e-03, 1.0469e-02, 5.0911e-05, 5.3057e-03, 8.0725e-02, 4.6308e-03,
        9.1132e-03, 2.2840e-02])

In [33]:
# now for the loss
# get the log of probs(likelihood), then the average, and obv negative
loss = -probs[torch.arange(len(Y)), Y].log().mean()
loss

tensor(5.3921)

In [34]:
# cleanup on next nb