Based on Karpathy's blogpost http://karpathy.github.io/2015/05/21/rnn-effectiveness


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt  # for making figures
%matplotlib inline

# read in all the words
words = open('names.txt', 'r').read().splitlines()

# build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i: s for s, i in stoi.items()}
vocab_size = len(itos)

# shuffle up the words
import random
random.seed(42)
random.shuffle(words)

block_size = 8  # blocks for training

# build the dataset
from dataclasses import dataclass

@dataclass
class Dataset:
    x: torch.Tensor
    y: torch.Tensor

@dataclass
class Datasets:
    train: Dataset
    dev: Dataset
    test: Dataset

def build_dataset(words) -> Dataset:
    xs, ys = [], []

    for wi, word in enumerate(words):
        context = [0] * block_size
        for ch in [stoi[ch] for ch in word] + [0]:
            xs.append(context)
            ys.append(ch)
            context = context[1:] + [ch]

    ds = Dataset(torch.tensor(xs), torch.tensor(ys))
    return ds

n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))
datasets = Datasets(
    build_dataset(words[:n1]),    # 80%,
    build_dataset(words[n1:n2]),  # 10%
    build_dataset(words[n2:])     # 10%
)

for x, y in zip(datasets.train.x[:20], datasets.train.y[:20]):
    print(''.join(itos[ch.item()] for ch in x) + ' -> ' + itos[y.item()])

........ -> y
.......y -> u
......yu -> h
.....yuh -> e
....yuhe -> n
...yuhen -> g
..yuheng -> .
........ -> d
.......d -> i
......di -> o
.....dio -> n
....dion -> d
...diond -> r
..diondr -> e
.diondre -> .
........ -> x
.......x -> a
......xa -> v
.....xav -> i
....xavi -> e


In [77]:
torch.manual_seed(42)

batch_size = 32
embedding_dim = 10
hidden_size = 24
n_steps = 1001

wxh = torch.randn((vocab_size, hidden_size))
whh = torch.randn((hidden_size, hidden_size))
why = torch.randn((hidden_size, vocab_size))
bh = torch.zeros(hidden_size)
by = torch.zeros(vocab_size)
parameters = [wxh, whh, why]
for p in parameters:
    p.requires_grad = True
print('Number of parameters:', sum(p.nelement() for p in parameters))


# def forward(x, h):
#     emb = torch.flatten(C[x])
#     return torch.tanh(emb @ wxh + h @ whh + bh)


def loss_fun(inputs, targets, hprev):
    """
    inputs, targets are both list of integers.
    hprev is Hx1 array of initial hidden state
    returns the loss, gradients on model parameters, and last hidden state
    """
    xs, hs, ys = {}, {}, {}
    hs[-1] = hprev.clone()
    loss = 0
    # forward pass
    for t in range(len(inputs)):
        xs[t] = F.one_hot(torch.tensor(inputs[t]), vocab_size).view(1, -1).float()
        print(f'{xs[t].shape=}')
        print(f'{wxh.shape=}')
        print(f'{(xs[t] @ wxh).shape=}')
        print(f'{hs[t-1].shape=}')
        print(f'{whh.shape=}')
        print(f'{(hs[t-1] @ whh).shape=}')
        print(f'{bh.shape=}')
        hs[t] = torch.tanh(xs[t] @ wxh + hs[t-1] @ whh + bh)
        print(f'Result: {hs[t].shape=}')
        ys[t] = hs[t] @ why + by  # unnormalized log probabilities for next chars
        print()
        print(ys[t])
        print()
        print(torch.tensor(targets[t]))
        print()
        loss += F.cross_entropy(ys[t], torch.tensor([targets[t]]))
    
    for p in parameters:
        p.grad = None
    loss.backward()
    lr = 0.1 if (step_i < (n_steps / 2)) else 0.01
    for p in parameters:
        p.data -= lr * p.grad

    return loss, hs[len(inputs)-1]


h0 = torch.zeros((1, hidden_size))
n, p = 0, 0
data = '.' + '.'.join(words) + '.'
losses = []
for step_i in range(n_steps):
    batch_ix = torch.randint(high=datasets.train.x.shape[0], size=(batch_size,))
    xb = datasets.train.x[batch_ix]
    yb = datasets.train.y[batch_ix]

    # prepare inputs (we're sweeping from left to right in steps seq_length long)
    if p + block_size + 1 >= len(datasets.train.x):
        h0 = torch.zeros((hidden_size, 1))  # reset RNN memory
        p = 0  # go from start of data
    inputs = [stoi[ch] for ch in data[p:p + block_size]]
    targets = [stoi[ch] for ch in data[p + 1 : p + block_size + 1]]

    # hs = [h0]
    # loss_sum = 0
    # for x, y in zip(xb, yb):
    #     emb = torch.flatten(C[x])
    #     hs.append(torch.tanh(emb @ wxh + hs[-1] @ whh + bh))
    #     logits = torch.tanh(hs[-1] @ why + by)
    #     loss_sum += F.cross_entropy(logits, y)
    # loss = loss_sum / len(xb)

    loss, h0 = loss_fun(inputs, targets, h0)

    if n_steps == 1 or step_i % (n_steps // 10) == 0:
        print(f'Step {step_i + 1}: training loss: {loss.item()}')
    losses.append(loss.item())
        
    for p in parameters:
        p.grad = None
    h0.grad = None
    loss.backward()
    for p in parameters:
        lr = 0.1 if step_i < (n_steps / 2) else 0.01 
        p.data -= lr * p.grad
    break

# @torch.no_grad()
# def loss_for_split(split: str):
#     ds = datasets.__getattribute__(split)
#     h_new = forward(ds.x, h)
#     logits = torch.tanh(h_new @ why + by)
#     loss = F.cross_entropy(logits, y)
#     print(f'{split} loss={loss}')
# 
# loss_for_split('test')
# loss_for_split('dev')


Number of parameters: 1872
xs[t].shape=torch.Size([1, 27])
wxh.shape=torch.Size([27, 24])
(xs[t] @ wxh).shape=torch.Size([1, 24])
hs[t-1].shape=torch.Size([1, 24])
whh.shape=torch.Size([24, 24])
(hs[t-1] @ whh).shape=torch.Size([1, 24])
bh.shape=torch.Size([24])
Result: hs[t].shape=torch.Size([1, 24])

tensor([[-0.2172, -7.0754,  2.0547,  3.0159,  1.8225,  1.2410,  5.7707, -0.0593,
          2.1578, -3.5542, -0.9059,  0.6507,  5.8014, -0.5905,  7.2660, -0.8773,
         -0.5020,  2.2662,  0.4997, -1.0600,  3.7617,  7.4625,  5.3126, -7.4556,
          3.9291, -3.9625,  1.4149]], grad_fn=<AddBackward0>)

tensor(25)

xs[t].shape=torch.Size([1, 27])
wxh.shape=torch.Size([27, 24])
(xs[t] @ wxh).shape=torch.Size([1, 24])
hs[t-1].shape=torch.Size([1, 24])
whh.shape=torch.Size([24, 24])
(hs[t-1] @ whh).shape=torch.Size([1, 24])
bh.shape=torch.Size([24])
Result: hs[t].shape=torch.Size([1, 24])

tensor([[-4.8381,  3.0426, -9.9292,  1.2715,  9.0590,  5.9813, -5.1341,  6.3364,
          0.7737, -3

RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.

In [None]:
max_words = 20
max_word_len = 20

for _ in range(max_words):
    ctx = [0]
    forward(x, y, h)
