## makemore: part 5.c

In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt  # for making figures
%matplotlib inline

In [7]:
# read in all the words
words = open('names.txt', 'r').read().splitlines()
print(f'{len(words)} words: {", ".join(words[:8])}... Max word len:', max(len(w) for w in words))

32033 words: emma, olivia, ava, isabella, sophia, charlotte, mia, amelia... Max word len: 15


In [9]:
# build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i: s for s, i in stoi.items()}
vocab_size = len(itos)
print(itos)
print(vocab_size)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
27


In [10]:
# shuffle up the words
import random
random.seed(42)
random.shuffle(words)

In [115]:
import itertools

max_word_len = max(len(w) for w in words)
# adjusting to the nearest power of 2:
word_buffer_size = 2 ** (list(itertools.takewhile((lambda p: 2**p < max_word_len + 1), itertools.count(1)))[-1] + 1)
print(word_buffer_size)

16


In [116]:
# build the dataset
from dataclasses import dataclass

@dataclass
class Dataset:
    x: torch.tensor
    y: torch.tensor

@dataclass
class Datasets:
    train: Dataset
    dev: Dataset
    test: Dataset

def build_dataset(words) -> Dataset:
    x = torch.zeros((len(words), word_buffer_size), dtype=torch.int)  # 32033 x 16
    y = torch.ones(len(words), dtype=torch.int)

    for wi, word in enumerate(words):
        for ci, ch in enumerate(word):
            x[wi][ci + 1] = stoi[ch]

    ds = Dataset(x, y)
    print(ds.x.shape, ds.y.shape)
    return ds

n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))
datasets = Datasets(
    build_dataset(words[:n1]),    # 80%,
    build_dataset(words[n1:n2]),  # 10%
    build_dataset(words[n2:])     # 10%
)

torch.Size([25626, 16]) torch.Size([25626])
torch.Size([3203, 16]) torch.Size([3203])
torch.Size([3204, 16]) torch.Size([3204])


In [126]:
torch.manual_seed(42)

batch_size = 32
embedding_dim = 10
hidden_size = 24

xb = datasets.train.x[:batch_size]  # 32 words, with each word consisting of 15 character vectors
yb = datasets.train.y[:batch_size]
print(f'{xb.shape=}, {yb.shape=}')

emb_l = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
emb = emb_l(xb)
print(f'{emb.shape=}')

c1 = nn.Conv1d(embedding_dim, hidden_size, 2, padding='same')
c1_out = c1(emb.transpose(2, 1))
print(f'{c1_out.shape=}')  # batch_size, n_hidden, max_word_len+1

p1 = nn.AvgPool1d(2)
p1_out = p1(c1_out)
print(f'{p1_out.shape=}')

c2 = nn.Conv1d(hidden_size, hidden_size, 2, padding='same')
c2_out = c2(p1_out)
print(f'{c2_out.shape=}')  # batch_size, n_hidden, max_word_len+1

p2 = nn.AvgPool1d(2)
p2_out = p1(c2_out)
print(f'{p2_out.shape=}')

c3 = nn.Conv1d(hidden_size, hidden_size, 2, padding='same')
c3_out = c3(p2_out)
print(f'{c3_out.shape=}')  # batch_size, n_hidden, max_word_len+1

p3 = nn.AvgPool1d(2)
p3_out = p1(c3_out)
print(f'{p3_out.shape=}')

c4 = nn.Conv1d(hidden_size, hidden_size, 2, padding='same')
c4_out = c4(p3_out)
print(f'{c4_out.shape=}')  # batch_size, n_hidden, max_word_len+1

p4 = nn.AvgPool1d(2)
p4_out = p1(c4_out)
print(f'{p4_out.shape=}')

flatten = nn.Flatten(start_dim=1)
flatten_out = flatten(p4_out)
print(f'{flatten_out.shape=}')

# gru = nn.GRU(p2_out.shape[-1], hidden_size, 1, dropout=0.01)
# gru_out, res = gru(p2_out)
# print(f'{gru_out.shape=}, {res.shape=}')

linear = nn.Linear(hidden_size, 1)
logits = nn.Tanh()(linear(flatten_out))
print(f'{logits.shape=}')

# loss = nn.MSELoss()(logits.squeeze(), yb)
# print(loss)

xb.shape=torch.Size([32, 16]), yb.shape=torch.Size([32])
emb.shape=torch.Size([32, 16, 10])
c1_out.shape=torch.Size([32, 24, 16])
p1_out.shape=torch.Size([32, 24, 8])
c2_out.shape=torch.Size([32, 24, 8])
p2_out.shape=torch.Size([32, 24, 4])
c3_out.shape=torch.Size([32, 24, 4])
p3_out.shape=torch.Size([32, 24, 2])
c4_out.shape=torch.Size([32, 24, 2])
p4_out.shape=torch.Size([32, 24, 1])
flatten_out.shape=torch.Size([32, 24])
logits.shape=torch.Size([32, 1])


torch.Size([16, 15])
torch.Size([16, 15, 10])


In [192]:


model = nn.Sequential(
    nn.Embedding(vocab_size, n_embd),
    nn.Conv1d(n_embd, n_hidden, kernel_size=2),
)


model = Sequential([
    Embedding(vocab_size, n_embd),
    FlattenConsecutive(2), Linear(n_embd * 2,   n_hidden,     bias=False), BatchNorm1d(n_hidden), Tanh(),
    FlattenConsecutive(2), Linear(n_hidden * 2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    FlattenConsecutive(2), Linear(n_hidden * 2, n_hidden, bias=False), BatchNorm1d(n_hidden),     Tanh(),
    Linear(n_hidden, vocab_size),
])
with torch.no_grad():
    if isinstance(model.layers[-1], BatchNorm1d):
        model.layers[-1].gain *= 0.1
    if isinstance(model.layers[-1], Linear):
        model.layers[-1].w *= 0.1
    
    for l in model.layers:
        if isinstance(l, Linear):
            # tanh is a shrinking function, so need to initialise wights a bit larger
            # to compensate this shrinkage and keep weights a unit gaussian on each step
            l.w *= 5/3

for p in model.parameters():
    p.requires_grad = True

print('Total model parameters:', sum(p.nelement() for p in model.parameters()))

Total model parameters: 22370


In [100]:
torch.manual_seed(42)

batch_size = 32
embedding_dim = 10
n_conv_channels = 24

model = nn.Sequential(
    nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim),
    nn.Conv1d(in_channels=embedding_dim, out_channels=n_conv_channels, kernel_size=2, padding='same'),
    nn.Conv1d(in_channels=n_conv_channels, out_channels=n_conv_channels, kernel_size=4, padding='same'),
    nn.Conv1d(in_channels=n_conv_channels, out_channels=n_conv_channels, kernel_size=8, padding='same'),
    nn.MaxPool1d(2),
    nn.Flatten(start_dim=1, end_dim=2),
    nn.Linear(in_features=flatten.shape[-1], out_features=1),
    nn.BatchNorm1d(1),
    nn.Tanh(),
)

In [101]:
DEBUG = False

from collections import defaultdict

n_steps = 1001
learning_rate = 0.1

losses = []


for step_i in range(n_steps):
    batch_ix = torch.randint(high=datasets.train.x.shape[0], size=(batch_size,))
    xb = datasets.train.x[batch_ix]
    yb = datasets.train.y[batch_ix]

    logits = model(xb)
    loss = nn.MSELoss()(logits.squeeze(), yb)

    if n_steps == 1 or step_i % (n_steps // 10) == 0:
        print(f'Step {step_i}: training loss: {loss.item()}')
    losses.append(loss.item())

    for p in model.parameters():
        p.grad = None
    loss.backward()

    lr = learning_rate if (step_i < (n_steps / 2)) else learning_rate / 10
    for p in model.parameters():
        p.data -= lr * p.grad


@torch.no_grad()
def loss_for_split(split: str):
    ds = datasets.__getattribute__(split)
    logits = model(ds.x)
    loss = F.MSELoss()(logits.squeeze(), ds.y)
    print(f'{split} loss={loss}')

loss_for_split('train')
loss_for_split('test')
loss_for_split('dev')


RuntimeError: Given groups=1, weight of size [24, 10, 2], expected input[32, 16, 11] to have 10 channels, but got 16 channels instead