In [47]:
import os
import re
import torch

In [48]:
with open('data/kon.txt', 'r', encoding='utf-8') as f:
    text = f.read()
print("length of dataset in characters: ", len(text))

length of dataset in characters:  493412


In [49]:
print(text[:500])

Ui:
Sis, come on. You'd better get out of bed. Sis?

Yui:
Ah! I-it's eight! I'm late! Oh!

Ui:
Hey, why the rush? Hm?

Yui:
See you later!

Lady:
Oh, good morning, Yui.

Yui:
Good morning!

Yui:
What?! I read the clock wrong!
Starting today, I'm a high schooler!

Opening Song
Cagayake!GIRLS by 放課後ティータイム(After School Tea Time)

Girls:
Congratulations on starting school here!

Girl 1:
Please join the Tennis Club!

Girl 2:
The Judo Club's better!

Girl 3:
Please join the Tea Ceremony Club!

Girl 4:


In [50]:
# remove japanese characters
text = ''.join(filter(lambda character:ord(character) < 0x3000, text))

In [51]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print("unique characters:", vocab_size, ''.join(chars))

unique characters: 93 
 !"#$%&'(),-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]abcdefghijklmnopqrstuvwxyz{|}~°éū‘’…♪


In [52]:
# Very simple tokenizer
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [stoi[ch] for ch in s]
decode = lambda l: ''.join([itos[i] for i in l])

print("encoded:", encode(text[:20]))
print("decoded:", decode(encode(text[:20])))

encoded: [48, 64, 25, 0, 46, 64, 74, 11, 1, 58, 70, 68, 60, 1, 70, 69, 13, 1, 52, 70]
decoded: Ui:
Sis, come on. Yo


In [53]:
data = torch.tensor(encode(text), dtype=torch.int64)
data.shape

torch.Size([493171])

In [54]:
data[:100]

tensor([48, 64, 25,  0, 46, 64, 74, 11,  1, 58, 70, 68, 60,  1, 70, 69, 13,  1,
        52, 70, 76,  8, 59,  1, 57, 60, 75, 75, 60, 73,  1, 62, 60, 75,  1, 70,
        76, 75,  1, 70, 61,  1, 57, 60, 59, 13,  1, 46, 64, 74, 27,  0,  0, 52,
        76, 64, 25,  0, 28, 63,  2,  1, 36, 12, 64, 75,  8, 74,  1, 60, 64, 62,
        63, 75,  2,  1, 36,  8, 68,  1, 67, 56, 75, 60,  2,  1, 42, 63,  2,  0,
         0, 48, 64, 25,  0, 35, 60, 80, 11,  1])

In [55]:
n = int(len(data) * 0.95)
train_data = data[:n]
val_data = data[n:]
print(train_data.shape, val_data.shape)

torch.Size([468512]) torch.Size([24659])


In [56]:
block_size = 8
train_data[:block_size+1]

tensor([48, 64, 25,  0, 46, 64, 74, 11,  1])

In [57]:
# context and target simulation
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1].tolist()
    target = y[t].item()
    print('context:', context, 'target:', target)

context: [48] target: 64
context: [48, 64] target: 25
context: [48, 64, 25] target: 0
context: [48, 64, 25, 0] target: 46
context: [48, 64, 25, 0, 46] target: 64
context: [48, 64, 25, 0, 46, 64] target: 74
context: [48, 64, 25, 0, 46, 64, 74] target: 11
context: [48, 64, 25, 0, 46, 64, 74, 11] target: 1


In [58]:
torch.manual_seed(69)
batch_size = 4 # number of parallel blocks
block_size = 8 # number of characters in each block = context length

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

inputs:
torch.Size([4, 8])
tensor([[73, 60, 56, 67, 67, 80,  1, 59],
        [63, 64, 74, 27,  2,  0, 40, 76],
        [36,  1, 56, 73, 60,  1, 62, 70],
        [45, 64, 75, 74, 76, 25,  0, 46]])
targets:
torch.Size([4, 8])
tensor([[60, 56, 67, 67, 80,  1, 59, 70],
        [64, 74, 27,  2,  0, 40, 76, 62],
        [ 1, 56, 73, 60,  1, 62, 70, 64],
        [64, 75, 74, 76, 25,  0, 46, 63]])


In [59]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(69)

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        # Bigram language model: single layer, single token prediction
        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx)  # (B,T,C), B: batch=4, T: sequence=8, C: vocab=147

        if targets is None:
            loss = None
        else:
            # flatten the logits and targets for torch cross entropy
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # generate max_new_tokens new tokens given the initial context idx
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx


m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# randomly generate 100 tokens from initial model weights and idx = 0 = \n
print(decode(m.generate(idx=torch.zeros(
    (1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))

torch.Size([32, 93])
tensor(5.0349, grad_fn=<NllLossBackward0>)

‘RreWQQMv♪a[GMhW6~
6n°3fG'O-?.’"deN‘Jvw°[bD’:Vo9ukRr2{0DGEc|x (%$RgGBspJb#‘, K5P84N.fm-3KAH{:5°G|~&0


In [60]:
# training!
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
batch_size = 32
for steps in range(10000):  # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)

    # backprop
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())


2.4911692142486572


In [61]:
# generate 100 tokens from trained model weights and idx = 0 = \n
print(decode(m.generate(idx=torch.zeros((1, 1), dtype=torch.long), max_new_tokens=300)[0].tolist()))


[Say?
Hat Whan.
Azu:
Mig$we I I'recld h, s(9f(rgha:
Misngol so:

Sofom cka Iti:
I'tg am91't m aw! ught I jus ing than ad..
HDodeare.
Spr ctinchepa u:
Anthearet wha.

Wou:
Azudangicou:
Ohito tawit s, giondnet g thiomachth, °abedo:
Ale's he)D'swe wh!
Ohe, m I douthhe b'/%Jutari:
Tsthoulestoitheari:
Yu


Lets try out lower dimensional embeddings + positional embeddings

In [62]:
class BigramEmbedLanguageModel(nn.Module):

    def __init__(self, vocab_size, embed_size):
        super().__init__()
        # embed raw tokens to a lower dimensional embedding with embed_size
        self.token_embedding_table = nn.Embedding(vocab_size, embed_size)
        # embed block sized context length as positional embeddings of the same size
        self.position_embedding_table = nn.Embedding(block_size, embed_size)
        # Language Modelling (?) Head is a standard linear layer to go from 
        # embeddings back to logits of vocab_size
        self.lm_head = nn.Linear(embed_size, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        # idx and targets are both (B,T) tensor of integers
        tok_embd = self.token_embedding_table(idx) # (B,T,C)
        pos_embd = self.position_embedding_table(torch.arange(T)) # (T,C) [0...T-1]
        x = tok_embd + pos_embd
        logits = self.lm_head(x) # (B,T,C)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            #crop idx to the last block_size tokens
            idx_context = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_context)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [63]:
# training!
m = BigramEmbedLanguageModel(vocab_size, 32)
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
batch_size = 32
for steps in range(10000):  # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)

    # backprop
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

2.5127875804901123


In [64]:
print(decode(m.generate(idx=torch.zeros((1, 1), dtype=torch.long), max_new_tokens=300)[0].tolist()))


Ac, id sifioneaithi:




Mine....
I'ssc'me I'seveeo fo l o han Yurbones chan, ans he!! Yu:
I chonosugow. Werikaing o, ineareathommp ued.
Rigo:
Yusasue p....
Yuswawato pal w?

Wour:
Sorathe hat epr t y w you:


Mimer h, cr w Bew ugiditr, tiok t thol:
Rith-no:
Rinare Le airy. Lo tmen! s.
Yu't thai:
Ts


# Now for the Transformer Fundamentals!

## The mathematical trick to self-attention: triangular matrices for weighted averages

In [65]:
# consider the following toy example:

torch.manual_seed(1337)
B,T,C = 2,8,2 # batch, time, channels
x = torch.randn(B,T,C)
print(x.shape)
print(x)

torch.Size([2, 8, 2])
tensor([[[ 0.1808, -0.0700],
         [-0.3596, -0.9152],
         [ 0.6258,  0.0255],
         [ 0.9545,  0.0643],
         [ 0.3612,  1.1679],
         [-1.3499, -0.5102],
         [ 0.2360, -0.2398],
         [-0.9211,  1.5433]],

        [[ 1.3488, -0.1396],
         [ 0.2858,  0.9651],
         [-2.0371,  0.4931],
         [ 1.4870,  0.5910],
         [ 0.1260, -1.5627],
         [-1.1601, -0.3348],
         [ 0.4478, -0.8016],
         [ 1.5236,  2.5086]]])


In [66]:
# We want x[b,t] = mean_{i<=t} x[b,i] to very badly encode info of tokens before token t
xbow = torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1] # (t,C)
        xbow[b,t] = torch.mean(xprev, 0)
xbow

tensor([[[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]],

        [[ 1.3488, -0.1396],
         [ 0.8173,  0.4127],
         [-0.1342,  0.4395],
         [ 0.2711,  0.4774],
         [ 0.2421,  0.0694],
         [ 0.0084,  0.0020],
         [ 0.0712, -0.1128],
         [ 0.2527,  0.2149]]])

In [67]:
# better way to do this: triangular matrix!
# version 2: using matrix multiply for a weighted aggregation
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True)
print(wei)
xbow2 = wei @ x # (B, T, T) @ (B, T, C) ----> (B, T, C)
xbow2

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


tensor([[[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]],

        [[ 1.3488, -0.1396],
         [ 0.8173,  0.4127],
         [-0.1342,  0.4395],
         [ 0.2711,  0.4774],
         [ 0.2421,  0.0694],
         [ 0.0084,  0.0020],
         [ 0.0712, -0.1128],
         [ 0.2527,  0.2149]]])

In [68]:
# even better: softmax for normalization of weights
# version 3: use Softmax
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
xbow3

tensor([[[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]],

        [[ 1.3488, -0.1396],
         [ 0.8173,  0.4127],
         [-0.1342,  0.4395],
         [ 0.2711,  0.4774],
         [ 0.2421,  0.0694],
         [ 0.0084,  0.0020],
         [ 0.0712, -0.1128],
         [ 0.2527,  0.2149]]])

In [69]:
# finally: Query (what i look for), Key (What i am in this), 
# Value (My private value, embedded) for self-attention
# version 4: single head self-attention
head_size = 16
query = nn.Linear(C, head_size, bias=False) # Linear layer C (embed) -> head size (16)
key = nn.Linear(C, head_size, bias=False) # Linear layer C (embed) -> head size (16)
value = nn.Linear(C, head_size, bias=False) # Linear layer C (embed) -> head size (16)

q = query(x) # (B,T,16)
k = key(x) # (B,T,16)
wei = q @ k.transpose(-2, -1) # (B,T,16) @ (B,16,T) -> (B,T,T)

wei = wei * C**-0.5 # scaled attention as to not sharpen softmax

tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
print(torch.round(wei, decimals=3))

v = value(x)
out = wei @ v
print(torch.round(out, decimals=3))

tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4700, 0.5300, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.3140, 0.3170, 0.3690, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2060, 0.2090, 0.2640, 0.3210, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1920, 0.1650, 0.2050, 0.2140, 0.2250, 0.0000, 0.0000, 0.0000],
         [0.1260, 0.1320, 0.0900, 0.0690, 0.2020, 0.3810, 0.0000, 0.0000],
         [0.1440, 0.1500, 0.1540, 0.1630, 0.1220, 0.1160, 0.1500, 0.0000],
         [0.0580, 0.0460, 0.0440, 0.0340, 0.1330, 0.1390, 0.0480, 0.4990]],

        [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4970, 0.5030, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0040, 0.0540, 0.9430, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5030, 0.1000, 0.0140, 0.3840, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2570, 0.1220, 0.0820, 0.1950, 0.3440, 0.0000, 0.0000, 0.0000],
         [0.0330, 0.121

# Time to put attention in our last model!

In [70]:
class SelfAttentionHead(nn.Module):
    """ one head of self-attention """
    def __init__(self, n_embd, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        B,T,C = x.shape
        q = self.query(x) # (B,T,C)
        k = self.key(x)   # (B,T,C)

        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei * C**-0.5 # scaled attention
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)

        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out


In [71]:
class BigramEmbedAttentionLanguageModel(nn.Module):

    def __init__(self, vocab_size, embed_size):
        super().__init__()
        # embed raw tokens to a lower dimensional embedding with embed_size
        self.token_embedding_table = nn.Embedding(vocab_size, embed_size)
        # embed block sized context length as positional embeddings of the same size
        self.position_embedding_table = nn.Embedding(block_size, embed_size)
        # Language Modelling (?) Head is a standard linear layer to go from 
        # embeddings back to logits of vocab_size
        self.lm_head = nn.Linear(embed_size, vocab_size)
        # single head self-attention
        self.sa_head = SelfAttentionHead(embed_size, embed_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        # idx and targets are both (B,T) tensor of integers
        tok_embd = self.token_embedding_table(idx) # (B,T,C)
        pos_embd = self.position_embedding_table(torch.arange(T)) # (T,C) [0...T-1]
        x = tok_embd + pos_embd
        # apply self-attention
        x = self.sa_head(x)
        # get logits with linear layer
        logits = self.lm_head(x) # (B,T,C)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            #crop idx to the last block_size tokens
            idx_context = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_context)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [72]:
# training!
m = BigramEmbedAttentionLanguageModel(vocab_size, 32)
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
batch_size = 32
for steps in range(10000):  # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)

    # backprop
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    if (steps % 5000 == 0):
        print("learning step:", steps)

print(loss.item())

learning step: 0
learning step: 5000
2.2536041736602783


In [73]:
print(decode(m.generate(idx=torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))


Mio-su mugh at's and, wa hesayorres derkontere ter dn rewh!
Oh?

Ritt an.

Wer, id ogise aks mad et thave dee..? Ohaw, thetlanygingi.

Mioko:
I'mter aicaka tot'r ero sothiond rd itel, Leeat dor it ctoun'tsu:

Sua sakicu, thikai wes whet avin oll y-add I cavinofi do din ounckead soo w. You, ihtiss!

Ritndo ifalki:
Lethel pled ole tin hes, yoke thare lut?

Sayo ouritosu thaculig a tropat'l a tis t awor be at whe alclou'red Cmon toloure th wa!

Yuifrik! is.

Miits mt.

I' mmonbet pger lbare't theal


# More heads! Multi-Head Attention

In [74]:
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, n_embd, head_size):
        super().__init__()
        self.heads = nn.ModuleList([SelfAttentionHead(n_embd, head_size) for _ in range(num_heads)])

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1) # concat single-head results
        return out

In [75]:
class BigramEmbedMultiHeadAttentionLanguageModel(nn.Module):

    def __init__(self, vocab_size, block_size, embed_size, head_num):
        super().__init__()
        # embed raw tokens to a lower dimensional embedding with embed_size
        self.token_embedding_table = nn.Embedding(vocab_size, embed_size)
        # embed block sized context length as positional embeddings of the same size
        self.position_embedding_table = nn.Embedding(block_size, embed_size)
        # Language Modelling (?) Head is a standard linear layer to go from 
        # embeddings back to logits of vocab_size
        self.lm_head = nn.Linear(embed_size, vocab_size)
        # multi-head self-attention
        self.sa_heads = MultiHeadAttention(head_num, embed_size, embed_size//head_num) 

    def forward(self, idx, targets=None):
        B, T = idx.shape
        # idx and targets are both (B,T) tensor of integers
        tok_embd = self.token_embedding_table(idx) # (B,T,C)
        pos_embd = self.position_embedding_table(torch.arange(T)) # (T,C) [0...T-1]
        x = tok_embd + pos_embd
        # apply multi-head self-attention
        x = self.sa_heads(x)
        # get logits with linear layer
        logits = self.lm_head(x) # (B,T,C)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            #crop idx to the last block_size tokens
            idx_context = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_context)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [76]:
# training!
m = BigramEmbedMultiHeadAttentionLanguageModel(vocab_size, 16, 32, 8)
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
batch_size = 32
for steps in range(10000):  # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)

    # backprop
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    if (steps % 5000 == 0):
        print("learning step:", steps)

print(loss.item())

learning step: 0
learning step: 5000
2.039285659790039


In [77]:
idx = encode("Azusa:\n")
print(torch.tensor([idx]))
print(decode(m.generate(idx=torch.tensor([idx], dtype=torch.long), max_new_tokens=500)[0].tolist()))

tensor([[28, 81, 76, 74, 56, 25,  0]])
Azusa:
He onstur thindy that ffere!

Ritsu:
"St I'll we idrsuyso thins of coe!.. I now ps oonyou're inden'ce to as fore?
Azin:
Yeach, navery! Tlub so reanmally ondo this kicke hou for pome crote, is aninging to ck!

Nod, hav ine fore so a goots mup you wap?
It yis you U!

Tsumunffed rony fory alroe're the're fornd ithinku:
No fit tammet geaf!
Ondow. li-. Mio the You maryment.

Jun:
Therymight lat boudunt. Mum?

Mio:
Welle othe there epla funcer thing dof bof sore.

Mio:
Re? Ryou's & Mio:
Yuh!

Azusa:
W


# Time to think: Feed-Forward to compute attention results

In [78]:
class FeedForward(nn.Module):
    def __init__(self, n_embed, n_hidden):
        super().__init__()
        self.lin_1 = nn.Linear(n_embed, n_hidden)
        self.lin_2 = nn.Linear(n_hidden, n_embed)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.lin_1(x)
        x = self.relu(x)
        x = self.lin_2(x)
        return x

In [79]:
class BigramEmbedMultiHeadAttentionFeedForwardLanguageModel(nn.Module):

    def __init__(self, vocab_size, block_size, embed_size, head_num):
        super().__init__()
        # embed raw tokens to a lower dimensional embedding with embed_size
        self.token_embedding_table = nn.Embedding(vocab_size, embed_size)
        # embed block sized context length as positional embeddings of the same size
        self.position_embedding_table = nn.Embedding(block_size, embed_size)
        # Language Modelling (?) Head is a standard linear layer to go from 
        # embeddings back to logits of vocab_size
        self.lm_head = nn.Linear(embed_size, vocab_size)
        # multi-head self-attention
        self.sa_heads = MultiHeadAttention(head_num, embed_size, embed_size//head_num) 
        # feed forward
        self.ff_layer = FeedForward(embed_size, 128)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        # idx and targets are both (B,T) tensor of integers
        tok_embd = self.token_embedding_table(idx) # (B,T,C)
        pos_embd = self.position_embedding_table(torch.arange(T)) # (T,C) [0...T-1]
        x = tok_embd + pos_embd
        # apply multi-head self-attention
        x = self.sa_heads(x)
        # feed forward
        x = self.ff_layer(x)
        # get logits with linear layer
        logits = self.lm_head(x) # (B,T,C)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            #crop idx to the last block_size tokens
            idx_context = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_context)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [80]:
# training!
m = BigramEmbedMultiHeadAttentionFeedForwardLanguageModel(vocab_size, 16, 32, 8)
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
batch_size = 32
for steps in range(10000):  # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)

    # backprop
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    if (steps % 5000 == 0):
        print("learning step:", steps)

print(loss.item())

learning step: 0
learning step: 5000
1.819643259048462


In [81]:
idx = encode("Azusa:\n")
print(torch.tensor([idx]))
print(decode(m.generate(idx=torch.tensor([idx], dtype=torch.long), max_new_tokens=500)[0].tolist()))

tensor([[28, 81, 76, 74, 56, 25,  0]])
Azusa:
Oh, Mio:
Gien in is hen?


Yui:
Ejugi?

Nona os!

Azusa:
Oh!

Yui:
Et to of thing brould banwako:
Um…! Ritcuckeentn..

W..

Mio:
Pverer a shand to ongethink slay!

Ritsu:
Mugi,.

Ummo:..

That a like we driing! Pup ore.

Yui:
Sould thing onk thy said see you the doI're fict mop fel.

Sawako:
Oh!

Ui:
That's shinks]

Yui:
Mm. I scher mant reed.

Y-chan to lass it's light?
We like clased it.

Azusa:
D?
Os.

Ritcht.

Mio:
How, that be lisht!

Ritsu:
Wes and.

Ange.

Cle you wough?

Azusa:
You got t


In [82]:
total_params = sum(p.numel() for p in m.parameters() if p.requires_grad)
total_params

17981

# Make it scalable: repeatable Blocks

In [83]:
class Block(nn.Module):
    def __init__(self, n_heads, n_embd):
        super().__init__()
        self.sa_heads = MultiHeadAttention(n_heads, n_embd, n_embd//n_heads)
        self.ff_layer = FeedForward(n_embd, 128)
    
    def forward(self, x):
        x = self.sa_heads(x)
        x = self.ff_layer(x)
        return x

In [84]:
class TransformerNoResidualNoNormModel(nn.Module):

    def __init__(self, vocab_size, block_size, embed_size, head_num, layer_num):
        super().__init__()
        # embed raw tokens to a lower dimensional embedding with embed_size
        self.token_embedding_table = nn.Embedding(vocab_size, embed_size)
        # embed block sized context length as positional embeddings of the same size
        self.position_embedding_table = nn.Embedding(block_size, embed_size)
        # Language Modelling (?) Head is a standard linear layer to go from 
        # embeddings back to logits of vocab_size
        self.lm_head = nn.Linear(embed_size, vocab_size)
        # transformer blocks
        self.blocks = nn.Sequential(*[Block(head_num, embed_size) for _ in range(layer_num)])

    def forward(self, idx, targets=None):
        B, T = idx.shape
        # idx and targets are both (B,T) tensor of integers
        tok_embd = self.token_embedding_table(idx) # (B,T,C)
        pos_embd = self.position_embedding_table(torch.arange(T)) # (T,C) [0...T-1]
        x = tok_embd + pos_embd
        # go through blocks
        x = self.blocks(x)
        # get logits with linear layer
        logits = self.lm_head(x) # (B,T,C)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            #crop idx to the last block_size tokens
            idx_context = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_context)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [85]:
# training!
m = TransformerNoResidualNoNormModel(vocab_size, 32, 64, 8, 4)
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
batch_size = 32
for steps in range(10000):  # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)

    # backprop
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    if (steps % 5000 == 0):
        print("learning step:", steps)

print(loss.item())

learning step: 0
learning step: 5000
1.7578377723693848


In [86]:
idx = encode("Yui:\n")
print(torch.tensor([idx]))
print(decode(m.generate(idx=torch.tensor([idx], dtype=torch.long), max_new_tokens=1000)[0].tolist()))

tensor([[52, 76, 64, 25,  0]])
Yui:
Spodent fust not Ritsu?

Ritsu:
Oh, even carlyan Mingleably is much o.

Azusa:
Wah! Gond to could is freachub came our go that yoori curner.

Yui:
Ya Far tw oke to heryser in at.

Yui:
Ah! Yexal wor that wait mend"

Ritsu:
But jobk all at wemeret ould it knar!

Yui:
Or iAswtel.
Nokay what hereters she. Jazme our memet! Worger weryted foor stodor! Sonding is rike to? You ady. But teme that to mais one.

Ui:
Ritsmaka’ing??

Azusa:
(Oh, that, whuh! And this this!

Mio:
moil have flet?

Yui:
You sis Hecuse onitarnd at comeeting hamal, it's yoursts modly hean coult, them.

Yui:
Hey, this's make the votes it riser sentbennus..

Yui:
CuUby, an ure, andle oursped it you bwor all home is sayure tooudet balways, I my we sor to Azuzm.

Yui:
Jto, I'm it renuthing. Thatly, So.

Ritsu:
Hm...

Yui:
Graidy, Ui day eut you this your wean wis you tidled sle is me. I whuh! I just bunt aust!

Ui:
It's am wegposer goids sterping srepmoct over in't sonyod you in ope?

Mio

In [87]:
total_params = sum(p.numel() for p in m.parameters() if p.requires_grad)
total_params

129501