# lets build a name generator, but this time use more than one previous character as context

In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# pull text file
names = open('names.txt').read().splitlines()
names[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [3]:
# create vocab list
vocab = sorted(list(set(''.join(names))))

# create tokenizer encoder
stoi = {}
# make the model hallucinate a start token so that we can propmt it to generate a name
# stoi is String TO Integer
stoi['.'] = 0
stoi.update({s:i+1 for i, s in enumerate(vocab)})

# create tokenizer decoder
# itos is Integer TO String
itos = {}
# make the model hallucinate an end token so that it knows when to end the name during generation
itos[0] = '.'
itos.update({i+1:s for i, s in enumerate(vocab)})
print(stoi)

{'.': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}


In [4]:
block_size = 2

def create_splits(names, stoi, block_size, split_ratios=(0.8, 0.1, 0.1)):
    def build_dataset(names):
        X, Y = [], []
        for name in names:
            context = [0] * block_size
            name += '.'
            for ch in name:
                X.append(context)
                Y.append(stoi[ch])
                context = context[1:] + [stoi[ch]]
        return torch.tensor(X), torch.tensor(Y)
    
    n = len(names)
    train_end = int(split_ratios[0] * n)
    val_end = train_end + int(split_ratios[1] * n)
    
    train_names = names[:train_end]
    val_names = names[train_end:val_end]
    test_names = names[val_end:]
    
    X_train, Y_train = build_dataset(train_names)
    X_val, Y_val = build_dataset(val_names)
    X_test, Y_test = build_dataset(test_names)
    
    return (X_train, Y_train), (X_val, Y_val), (X_test, Y_test)

# Example usage
(X_train, Y_train), (X_val, Y_val), (X_test, Y_test) = create_splits(names, stoi, block_size)

In [5]:
print("X_train shape:", X_train.shape)
print("Y_train shape:", Y_train.shape)
print("X_val shape:", X_val.shape)
print("Y_val shape:", Y_val.shape)
print("X_test shape:", X_test.shape)
print("Y_test shape:", Y_test.shape)

X_train shape: torch.Size([182778, 2])
Y_train shape: torch.Size([182778])
X_val shape: torch.Size([22633, 2])
Y_val shape: torch.Size([22633])
X_test shape: torch.Size([22735, 2])
Y_test shape: torch.Size([22735])


### what does it mean when embedding (emb) matrix is of shape: [n, 3, 2]
### there are n training examples, each of which has three characters (block size). and each character is represented (squashed down) to 2 dimensions

### one hot encoding dot product with a matrix is equivalent to plucking out a single row. this can be thought of a first layer of the network where we obtain the embeddings for each tokenized vector. for now we'll settle with pytorch slicing which also allows this 

In [6]:
# initialize params
C = torch.randn((27, 2))
W1 = torch.randn((6, 100)) # 6 because we give 3 characters at a time, and each has 2 numbers to represent them. so 6 in all
b1 = torch.randn(100)
W2 = torch.randn((100, 27))
b2 = torch.randn(27)
parameters = [W1, b1, W2, b2, C]
print(sum(p.nelement() for p in parameters))

for p in parameters:
    p.requires_grad = True

3481


In [7]:
# create a range of learning rates separated by degrees of 10
steps = torch.linspace(-3, 0, 1000)
lri = 10 ** steps
lri[:100]

tensor([0.0010, 0.0010, 0.0010, 0.0010, 0.0010, 0.0010, 0.0010, 0.0010, 0.0011,
        0.0011, 0.0011, 0.0011, 0.0011, 0.0011, 0.0011, 0.0011, 0.0011, 0.0011,
        0.0011, 0.0011, 0.0011, 0.0012, 0.0012, 0.0012, 0.0012, 0.0012, 0.0012,
        0.0012, 0.0012, 0.0012, 0.0012, 0.0012, 0.0012, 0.0013, 0.0013, 0.0013,
        0.0013, 0.0013, 0.0013, 0.0013, 0.0013, 0.0013, 0.0013, 0.0013, 0.0014,
        0.0014, 0.0014, 0.0014, 0.0014, 0.0014, 0.0014, 0.0014, 0.0014, 0.0014,
        0.0015, 0.0015, 0.0015, 0.0015, 0.0015, 0.0015, 0.0015, 0.0015, 0.0015,
        0.0015, 0.0016, 0.0016, 0.0016, 0.0016, 0.0016, 0.0016, 0.0016, 0.0016,
        0.0016, 0.0017, 0.0017, 0.0017, 0.0017, 0.0017, 0.0017, 0.0017, 0.0017,
        0.0018, 0.0018, 0.0018, 0.0018, 0.0018, 0.0018, 0.0018, 0.0018, 0.0019,
        0.0019, 0.0019, 0.0019, 0.0019, 0.0019, 0.0019, 0.0019, 0.0020, 0.0020,
        0.0020])

In [None]:
losses = []
epochs = 1000
for i in range(epochs):
    # fetch batch size = 32 number of indices to mini batch the data
    ix = torch.randint(0, X_train.shape[0], (32,))
    ix
    # forward pass
    emb = C[X_train[ix]]
    print(emb.shape)
    # layer 1
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1)
    # layer 2
    logits = h @ W2 + b2
    # probs = torch.softmax(logits, 1) # softmax along dimension 1
    # loss = -torch.log(probs[torch.arange(X.shape[0]), Y]).mean() # calculate NLL for all X.shape[0] examples (akin to batch size)
    loss = F.cross_entropy(logits, Y_train[ix]) # does the same thing the above two lines does
    # backprop
    for p in parameters:
        p.grad = None
    loss.backward()
    # gradient update
    for p in parameters:
        p.data += -lri[i]*p.grad
    losses.append(loss.item())
    if i % 100 == 0:
        print(f"Epoch: {i}, loss: {loss.item()}")

torch.Size([30, 2, 2])


ValueError: Expected input batch_size (20) to match target batch_size (30).

In [None]:
plt.plot(lri, losses)

### this plot above is made for experimental purposes. we see that there is a sweet spot for the learning rate where loss is at the all time low. a high leraning rate leads to divergence, and a lor learning rate moves the gradient way too slowly. this is what the graph conveys. 
### so now we've found a decent learning rate of around 0.1. lets train on that, and once we see the loss plateauing, we'll decay the learning rate further

In [None]:
# initialize params
C = torch.randn((27, 2))
W1 = torch.randn((6, 100)) # 6 because we give 3 characters at a time, and each has 2 numbers to represent them. so 6 in all
b1 = torch.randn(100)
W2 = torch.randn((100, 27))
b2 = torch.randn(27)
parameters = [W1, b1, W2, b2, C]
print(sum(p.nelement() for p in parameters))

for p in parameters:
    p.requires_grad = True

In [None]:
losses = []
epochs = 10000
for i in range(epochs):
    # fetch batch size = 32 number of indices to mini batch the data
    ix = torch.randint(0, X.shape[0], (32,))
    ix
    # forward pass
    emb = C[X[ix]]
    # layer 1
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1)
    # layer 2
    logits = h @ W2 + b2
    # probs = torch.softmax(logits, 1) # softmax along dimension 1
    # loss = -torch.log(probs[torch.arange(X.shape[0]), Y]).mean() # calculate NLL for all X.shape[0] examples (akin to batch size)
    loss = F.cross_entropy(logits, Y[ix]) # does the same thing the above two lines does
    # backprop
    for p in parameters:
        p.grad = None
    loss.backward()
    # gradient update
    for p in parameters:
        p.data += -0.1*p.grad
    losses.append(loss.item())

In [None]:
emb = C[X]
# layer 1
h = torch.tanh(emb.view(-1, 6) @ W1 + b1)
# layer 2
logits = h @ W2 + b2
# probs = torch.softmax(logits, 1) # softmax along dimension 1
# loss = -torch.log(probs[torch.arange(X.shape[0]), Y]).mean() # calculate NLL for all X.shape[0] examples (akin to batch size)
loss = F.cross_entropy(logits, Y)
loss

In [None]:
plt.plot(losses)

if we train for 10k epochs, we notice that the loss starts to plateau at 2.37, so lets drop the learning rate. once we drop it to 0.01, we see that the loss drop to 2.31, lesgooo

In [None]:
# torch.cat((emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]), 1).shape # --> this doesn't work when we have a bigger batch size
# torch.cat(torch.unbind(emb, 1), 1).shape # --> this works better, we unbind the 1st dimension and then concatenate the 1st dimension to match 6 with 6 from the next layer


# the addition between emb.w1 and b works correctly without additional editing because:
emb.w1 shape: 32, 100
b1 shape:         100
pytorch broadcasting rules matches the right dim, then fills the left ones with 1 if its missing, and then 1 is broadcasted to match whatever is above it
b1 broadcasted shape: 1, 100 to 32, 100