### Build makemore MLP yay.

In [None]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

# read all the words in the file; the context manager closes the file handle
# as soon as its contents have been read
with open("../names.txt", encoding="utf-8") as f:
    words = f.read().splitlines()

# build the vocabulary of characters and mapping to/from integers
chars = sorted(set(''.join(words)))                 # every distinct character, in sorted order
stoi = {ch: i + 1 for i, ch in enumerate(chars)}    # characters are numbered from 1 ...
stoi["."] = 0                                       # ... so that "." can take index 0
itos = {i: ch for ch, i in stoi.items()}            # inverse mapping: index -> character

# build the dataset from every word (no per-word printing: echoing each word
# of the whole file only floods the cell output)
block_size = 3  # context length: how many characters do we take to predict the next one
X, Y = [], []
for w in words:
    context = [0] * block_size      # every word starts from a context of index 0 (".")
    for ch in w + ".":              # the appended "." is the final target of the word
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        context = context[1:] + [ix]    # crop and append

# convert the collected Python lists to tensors
X = torch.tensor(X)
Y = torch.tensor(Y)

In [None]:
# Illustration only: print the (context ---> target) pairs of the first three
# words. The pairs are collected in X_demo / Y_demo so that the tensors X and Y
# built from the full word list are not replaced by short Python lists, which
# the cells that call X.shape or F.cross_entropy(..., Y) cannot use.
block_size = 3  # using contiguous 3 characters to predict the next one
X_demo, Y_demo = [], []
for w in words[:3]:
    print(w)
    context = [0] * block_size
    for ch in w + ".":
        ix = stoi[ch]
        X_demo.append(context)
        Y_demo.append(ix)
        print(''.join(itos[i] for i in context), '--->', itos[ix])
        context = context[1:] + [ix]    # crop and append

In [None]:
# dedicated, seeded generator so the initial weights are reproducible
g = torch.Generator().manual_seed(2147483647)   # consistent with Andrej's settings

# layer sizes
n_input = 6             # 3 characters * 2D embedding
n_hidden = 100
n_output = 27

# All tensors are drawn from `g` in the order C, W1, b1, W2, b2; the values
# each one receives depend on that order.
C = torch.randn(27, 2, generator=g, requires_grad=True)                 # embedding table
W1 = torch.randn(n_input, n_hidden, generator=g, requires_grad=True)    # hidden layer weights
b1 = torch.randn(n_hidden, generator=g, requires_grad=True)             # hidden layer bias
W2 = torch.randn(n_hidden, n_output, generator=g, requires_grad=True)   # output layer weights
b2 = torch.randn(n_output, generator=g, requires_grad=True)             # output layer bias

# everything the optimisation loop has to update
parameters = [C, W1, b1, W2, b2]

In [None]:
# full-batch gradient descent: every step uses all of X / Y
for _ in range(5000):

    # forward pass: look up the embeddings, flatten them to 6 features per
    # row, then apply the tanh hidden layer and the output layer
    emb = C[X]
    h = (emb.view(-1, 6) @ W1 + b1).tanh()
    logits = h @ W2 + b2
    loss = F.cross_entropy(input=logits, target=Y)

    # backward pass: drop stale gradients before backpropagating
    for p in parameters:
        p.grad = None
    loss.backward()

    # update: step against the gradient with a step size of 1
    for p in parameters:
        p.data -= 1 * p.grad

# loss computed in the last iteration (before that iteration's update)
print(loss.item())

In [None]:
# 1000 exponents evenly spaced from -3 to 0, and the matching powers of ten
# (i.e. values running from 10**-3 up to 10**0)
lre = torch.linspace(start=-3.0, end=0.0, steps=1000)
lrs = 10 ** lre
lrs

In [None]:
# rebuild the dataset
def build_dataset(words, block_size=3):
    """Turn words into (context, next-character) index pairs.

    Each word is extended with a trailing "." and walked character by
    character. For every character, the indices of the `block_size`
    characters before it (padded with index 0 at the start of the word) form
    one row of X, and the character's own index is the matching entry of Y.
    Characters are converted with the module-level `stoi` mapping.

    Args:
        words: iterable of strings whose characters are all keys of `stoi`.
        block_size: how many preceding characters make up one context
            (default 3, the value that used to be hard-coded).

    Returns:
        A tuple (X, Y) of int64 tensors: X has shape (N, block_size) and Y has
        shape (N,), where N is the number of characters over all words,
        counting one "." per word. Empty input yields shapes (0, block_size)
        and (0,).

    Raises:
        ValueError: if `block_size` is smaller than 1.
        KeyError: if a character is missing from `stoi`.
    """
    if block_size < 1:
        # with an empty context the rows would grow by one entry per character
        raise ValueError("block_size must be at least 1")

    X, Y = [], []
    for w in words:
        context = [0] * block_size
        for ch in w + ".":
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]    # crop and append

    # explicit dtype and shape keep the result well-formed for empty input too
    X = torch.tensor(X, dtype=torch.long).reshape(len(Y), block_size)
    Y = torch.tensor(Y, dtype=torch.long)
    return X, Y

# shuffle the words in place with a fixed seed, then cut them 80% / 10% / 10%
import random

random.seed(2147483647)
random.shuffle(words)

n1 = int(0.8 * len(words))      # end of the training slice
n2 = int(0.9 * len(words))      # end of the dev slice

# one (X, Y) pair per slice: train, dev, test
(Xtrain, Ytrain), (Xdev, Ydev), (Xtest, Ytest) = (
    build_dataset(part) for part in (words[:n1], words[n1:n2], words[n2:])
)

In [None]:
# stochastic gradient descent on random batches of 32 training rows
lr = 0.1    # step size, the same for every iteration
for i in range(10000):

    # pick 32 random row indices into the training split
    ix = torch.randint(low=0, high=Xtrain.shape[0], size=(32,))

    # forward pass on the selected rows and their labels
    emb = C[Xtrain[ix]]
    h = (emb.view(-1, 6) @ W1 + b1).tanh()
    logits = h @ W2 + b2
    loss = F.cross_entropy(input=logits, target=Ytrain[ix])

    # backward pass: drop stale gradients before backpropagating
    for p in parameters:
        p.grad = None
    loss.backward()

    # update
    for p in parameters:
        p.data -= lr * p.grad

# loss of the last batch only
print(loss.item())

In [None]:
# evaluation on the dev/validation split with the current parameters
emb = C[Xdev]
h = (emb.view(-1, 6) @ W1 + b1).tanh()
logits = h @ W2 + b2
loss_dev = F.cross_entropy(input=logits, target=Ydev)
print(loss_dev.item())

In [None]:
# evaluation on the test split with the current parameters
emb = C[Xtest]
h = (emb.view(-1, 6) @ W1 + b1).tanh()
logits = h @ W2 + b2
loss_test = F.cross_entropy(input=logits, target=Ytest)
print(loss_test.item())

In [None]:
# loss over all of X / Y with the current parameters
emb = C[X]                                  # embedding lookup
h = (emb.view(-1, 6) @ W1 + b1).tanh()      # hidden layer
logits = h @ W2 + b2                        # output layer
loss = F.cross_entropy(input=logits, target=Y)
print(loss.item())

In [None]:
# visualize dimensions 0 and 1 of the embedding matrix C for all characters
plt.figure(figsize=(4,4))
plt.scatter(C[:,0].data, C[:,1].data, s=200)
# label every point with its character, centred on the marker
for i in range(C.shape[0]):
    plt.text(C[i,0].item(), C[i,1].item(), itos[i], ha="center", va="center", color='white')
# the first argument of plt.grid is the on/off flag, so pass a real boolean
plt.grid(True)

In [None]:
# Same network, but with every character embedded in 3D space.
# dedicated, seeded generator so the initial weights are reproducible
g = torch.Generator().manual_seed(2147483647)   # consistent with Andrej's settings

# layer sizes
n_input = 9             # 3 characters * 3D embedding
n_hidden = 100
n_output = 27

# All tensors are drawn from `g` in the order C, W1, b1, W2, b2; the values
# each one receives depend on that order.
C = torch.randn(27, 3, generator=g, requires_grad=True)                 # embedding table
W1 = torch.randn(n_input, n_hidden, generator=g, requires_grad=True)    # hidden layer weights
b1 = torch.randn(n_hidden, generator=g, requires_grad=True)             # hidden layer bias
W2 = torch.randn(n_hidden, n_output, generator=g, requires_grad=True)   # output layer weights
b2 = torch.randn(n_output, generator=g, requires_grad=True)             # output layer bias

# everything the optimisation loop has to update
parameters = [C, W1, b1, W2, b2]

In [None]:
# stochastic gradient descent on random batches of 32 rows
# NOTE(review): the batches are drawn from X / Y, not from Xtrain / Ytrain —
# confirm that training on the unsplit data is intended here.
for _ in range(100000):

    # pick 32 random row indices
    ix = torch.randint(low=0, high=X.shape[0], size=(32,))

    # forward pass on the selected rows and their labels
    emb = C[X[ix]]
    h = (emb.view(-1, 9) @ W1 + b1).tanh()
    logits = h @ W2 + b2
    loss = F.cross_entropy(input=logits, target=Y[ix])

    # backward pass: drop stale gradients before backpropagating
    for p in parameters:
        p.grad = None
    loss.backward()

    # update with a step size of 0.1
    for p in parameters:
        p.data -= 0.1 * p.grad

In [None]:
# loss over all of X / Y with the current parameters
emb = C[X]                                  # embedding lookup
h = (emb.view(-1, 9) @ W1 + b1).tanh()      # hidden layer
logits = h @ W2 + b2                        # output layer
loss = F.cross_entropy(input=logits, target=Y)
print(loss.item())

In [None]:
# visualize 3D space of the embedding matrix C for all characters
from mpl_toolkits.mplot3d import Axes3D  # kept from the original cell; not referenced by name below
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
ax.scatter(C[:,0].data, C[:,1].data, C[:,2].data, s=200)
# label every point with its character, centred on the marker
for i in range(C.shape[0]):
    ax.text(C[i,0].item(), C[i,1].item(), C[i,2].item(), itos[i], ha="center", va="center", color='white')
# the first argument of plt.grid is the on/off flag, so pass a real boolean
plt.grid(True)

In [None]:
# The same 3D scatter of the embedding matrix C, drawn with Plotly
import plotly.graph_objects as go

fig = go.Figure(
    data=[
        go.Scatter3d(
            x=C[:, 0].data,
            y=C[:, 1].data,
            z=C[:, 2].data,
            mode='markers',
            # one character label per row of C
            text=[itos[k] for k in range(len(C))],
        )
    ]
)
fig.show()

In [None]:
# The same 3D scatter of the embedding matrix C, drawn with holoviews
import holoviews as hv
hv.extension('matplotlib')

# the three coordinate columns of C, passed as one tuple
hv.Scatter3D(tuple(C[:, k].data for k in range(3))).opts(size=10)



In [None]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

# read all the words in the file; the context manager closes the file handle
# as soon as its contents have been read
with open("../names.txt", encoding="utf-8") as f:
    words = f.read().splitlines()

# build the vocabulary of characters and mapping to/from integers
chars = sorted(set(''.join(words)))                 # every distinct character, in sorted order
stoi = {ch: i + 1 for i, ch in enumerate(chars)}    # characters are numbered from 1 ...
stoi["."] = 0                                       # ... so that "." can take index 0
itos = {i: ch for ch, i in stoi.items()}            # inverse mapping: index -> character

# build the dataset
def build_dataset(words, block_size=3):
    """Turn words into (context, next-character) index pairs.

    Each word is extended with a trailing "." and walked character by
    character. For every character, the indices of the `block_size`
    characters before it (padded with index 0 at the start of the word) form
    one row of X, and the character's own index is the matching entry of Y.
    Characters are converted with the module-level `stoi` mapping.

    Args:
        words: iterable of strings whose characters are all keys of `stoi`.
        block_size: how many preceding characters make up one context
            (default 3, the value that used to be hard-coded).

    Returns:
        A tuple (X, Y) of int64 tensors: X has shape (N, block_size) and Y has
        shape (N,), where N is the number of characters over all words,
        counting one "." per word. Empty input yields shapes (0, block_size)
        and (0,).

    Raises:
        ValueError: if `block_size` is smaller than 1.
        KeyError: if a character is missing from `stoi`.
    """
    if block_size < 1:
        # with an empty context the rows would grow by one entry per character
        raise ValueError("block_size must be at least 1")

    X, Y = [], []
    for w in words:
        context = [0] * block_size
        for ch in w + ".":
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]    # crop and append

    # explicit dtype and shape keep the result well-formed for empty input too
    X = torch.tensor(X, dtype=torch.long).reshape(len(Y), block_size)
    Y = torch.tensor(Y, dtype=torch.long)
    return X, Y

# shuffle the words in place with a fixed seed, then cut them 80% / 10% / 10%
import random

random.seed(2147483647)
random.shuffle(words)

n1 = int(0.8 * len(words))      # end of the training slice
n2 = int(0.9 * len(words))      # end of the dev slice

# one (X, Y) pair per slice: train, dev, test
(Xtrain, Ytrain), (Xdev, Ydev), (Xtest, Ytest) = (
    build_dataset(part) for part in (words[:n1], words[n1:n2], words[n2:])
)

# Same network, but with every character embedded in 100D space.
# dedicated, seeded generator so the initial weights are reproducible
g = torch.Generator().manual_seed(2147483647)   # consistent with Andrej's settings

# layer sizes
n_input = 300             # 3 characters * 100D embedding
n_hidden = 200
n_output = 27

# All tensors are drawn from `g` in the order C, W1, b1, W2, b2; the values
# each one receives depend on that order.
C = torch.randn(27, 100, generator=g, requires_grad=True)               # embedding table
W1 = torch.randn(n_input, n_hidden, generator=g, requires_grad=True)    # hidden layer weights
b1 = torch.randn(n_hidden, generator=g, requires_grad=True)             # hidden layer bias
W2 = torch.randn(n_hidden, n_output, generator=g, requires_grad=True)   # output layer weights
b2 = torch.randn(n_output, generator=g, requires_grad=True)             # output layer bias

# everything the optimisation loop has to update
parameters = [C, W1, b1, W2, b2]

# stochastic gradient descent on random batches of 32 training rows
lr = 0.1    # step size, the same for every iteration
for i in range(100000):

    # pick 32 random row indices into the training split
    ix = torch.randint(low=0, high=Xtrain.shape[0], size=(32,))

    # forward pass on the selected rows and their labels
    emb = C[Xtrain[ix]]
    h = (emb.view(-1, n_input) @ W1 + b1).tanh()
    logits = h @ W2 + b2
    loss = F.cross_entropy(input=logits, target=Ytrain[ix])

    # backward pass: drop stale gradients before backpropagating
    for p in parameters:
        p.grad = None
    loss.backward()

    # update
    for p in parameters:
        p.data -= lr * p.grad

# loss of the last batch only
print(f"the training's loss {loss.item():.4f}")

# evaluation on the dev/validation split with the current parameters
emb = C[Xdev]
h = (emb.view(-1, n_input) @ W1 + b1).tanh()
logits = h @ W2 + b2
loss_dev = F.cross_entropy(input=logits, target=Ydev)
print(f"the validation's loss {loss_dev.item():.4f}")

# evaluation on the test split with the current parameters
emb = C[Xtest]
h = (emb.view(-1, n_input) @ W1 + b1).tanh()
logits = h @ W2 + b2
loss_test = F.cross_entropy(input=logits, target=Ytest)
print(f"the test's loss {loss_test.item():.4f}")

In [None]:
# draw 20 samples from the model, one character at a time
g = torch.Generator().manual_seed(2147483647 + 10)

for _ in range(20):
    outs = []               # sampled character indices, including the final 0
    context = [0] * 3       # start from a context of three index-0 entries
    while True:
        # forward pass for the single current context
        emb = C[torch.tensor([context])]
        h = (emb.view(1, n_input) @ W1 + b1).tanh()
        logits = h @ W2 + b2
        probs = F.softmax(logits, dim=1)

        # sample the next index from the predicted distribution
        next_char = torch.multinomial(probs, num_samples=1, generator=g).item()

        # slide the context window and record the sample
        context = [*context[1:], next_char]
        outs.append(next_char)

        # index 0 ends the sample
        if next_char == 0:
            break
    print(''.join(itos[k] for k in outs))