In [1]:
import os
import re
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(69)
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [2]:
with open('data/kon.txt', 'r', encoding='utf-8') as f:
    text = f.read()
print("length of dataset in characters: ", len(text))

length of dataset in characters:  493412


In [3]:
print(text[:500])

Ui:
Sis, come on. You'd better get out of bed. Sis?

Yui:
Ah! I-it's eight! I'm late! Oh!

Ui:
Hey, why the rush? Hm?

Yui:
See you later!

Lady:
Oh, good morning, Yui.

Yui:
Good morning!

Yui:
What?! I read the clock wrong!
Starting today, I'm a high schooler!

Opening Song
Cagayake!GIRLS by 放課後ティータイム(After School Tea Time)

Girls:
Congratulations on starting school here!

Girl 1:
Please join the Tennis Club!

Girl 2:
The Judo Club's better!

Girl 3:
Please join the Tea Ceremony Club!

Girl 4:


In [4]:
# remove japanese characters
text = ''.join(filter(lambda character:ord(character) < 0x3000, text))

In [5]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print("unique characters:", vocab_size, ''.join(chars))

unique characters: 93 
 !"#$%&'(),-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]abcdefghijklmnopqrstuvwxyz{|}~°éū‘’…♪


In [6]:
# Very simple tokenizer
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}
# add special token for padding
stoi[''] = len(stoi)
itos[len(itos)] = ''
print(stoi)
print(itos)
encode = lambda s: [stoi[ch] for ch in s]
decode = lambda l: ''.join([itos[i] for i in l])
print("encoded:", encode(text[:20]))
print("decoded:", decode(encode(text[:20])))
vocab_size = len(itos)
print("vocab size:", vocab_size)

{'\n': 0, ' ': 1, '!': 2, '"': 3, '#': 4, '$': 5, '%': 6, '&': 7, "'": 8, '(': 9, ')': 10, ',': 11, '-': 12, '.': 13, '/': 14, '0': 15, '1': 16, '2': 17, '3': 18, '4': 19, '5': 20, '6': 21, '7': 22, '8': 23, '9': 24, ':': 25, ';': 26, '?': 27, 'A': 28, 'B': 29, 'C': 30, 'D': 31, 'E': 32, 'F': 33, 'G': 34, 'H': 35, 'I': 36, 'J': 37, 'K': 38, 'L': 39, 'M': 40, 'N': 41, 'O': 42, 'P': 43, 'Q': 44, 'R': 45, 'S': 46, 'T': 47, 'U': 48, 'V': 49, 'W': 50, 'X': 51, 'Y': 52, 'Z': 53, '[': 54, ']': 55, 'a': 56, 'b': 57, 'c': 58, 'd': 59, 'e': 60, 'f': 61, 'g': 62, 'h': 63, 'i': 64, 'j': 65, 'k': 66, 'l': 67, 'm': 68, 'n': 69, 'o': 70, 'p': 71, 'q': 72, 'r': 73, 's': 74, 't': 75, 'u': 76, 'v': 77, 'w': 78, 'x': 79, 'y': 80, 'z': 81, '{': 82, '|': 83, '}': 84, '~': 85, '°': 86, 'é': 87, 'ū': 88, '‘': 89, '’': 90, '…': 91, '♪': 92, '': 93}
{0: '\n', 1: ' ', 2: '!', 3: '"', 4: '#', 5: '$', 6: '%', 7: '&', 8: "'", 9: '(', 10: ')', 11: ',', 12: '-', 13: '.', 14: '/', 15: '0', 16: '1', 17: '2', 18: '3', 

In [7]:
data = torch.tensor(encode(text), dtype=torch.int64)
data.to(device)
data.shape

torch.Size([493171])

In [8]:
data[:100]

tensor([48, 64, 25,  0, 46, 64, 74, 11,  1, 58, 70, 68, 60,  1, 70, 69, 13,  1,
        52, 70, 76,  8, 59,  1, 57, 60, 75, 75, 60, 73,  1, 62, 60, 75,  1, 70,
        76, 75,  1, 70, 61,  1, 57, 60, 59, 13,  1, 46, 64, 74, 27,  0,  0, 52,
        76, 64, 25,  0, 28, 63,  2,  1, 36, 12, 64, 75,  8, 74,  1, 60, 64, 62,
        63, 75,  2,  1, 36,  8, 68,  1, 67, 56, 75, 60,  2,  1, 42, 63,  2,  0,
         0, 48, 64, 25,  0, 35, 60, 80, 11,  1])

In [9]:
n = int(len(data) * 0.95)
train_data = data[:n]
val_data = data[n:]
print(train_data.shape, val_data.shape)

torch.Size([468512]) torch.Size([24659])


In [10]:
block_size = 8
train_data[:block_size+1]

tensor([48, 64, 25,  0, 46, 64, 74, 11,  1])

In [11]:
# context and target simulation
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1].tolist()
    target = y[t].item()
    print('context:', context, 'target:', target)

context: [48] target: 64
context: [48, 64] target: 25
context: [48, 64, 25] target: 0
context: [48, 64, 25, 0] target: 46
context: [48, 64, 25, 0, 46] target: 64
context: [48, 64, 25, 0, 46, 64] target: 74
context: [48, 64, 25, 0, 46, 64, 74] target: 11
context: [48, 64, 25, 0, 46, 64, 74, 11] target: 1


In [12]:
torch.manual_seed(69)
batch_size = 4 # number of parallel blocks
block_size = 8 # number of characters in each block = context length

def get_batch(split, block_size):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

xb, yb = get_batch('train', 128)
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

inputs:
torch.Size([4, 128])
tensor([[66, 70, 25,  0, 40, 64, 74, 74,  1, 52, 56, 68, 56, 69, 56, 66, 56,  1,
         64, 74,  1, 73, 60, 56, 67, 67, 80,  1, 62, 73, 60, 56, 75,  2,  0,  0,
         41, 70, 57, 76, 80, 70, 25,  0, 36, 75,  8, 74,  1, 67, 64, 66, 60,  1,
         74, 63, 60,  8, 74,  1, 62, 60, 75, 75, 64, 69, 62,  1, 71, 73, 60, 75,
         75, 64, 60, 73,  1, 56, 69, 59,  1, 71, 73, 60, 75, 75, 64, 60, 73,  1,
         60, 77, 60, 73, 80,  1, 59, 56, 80,  2,  0,  0, 46, 56, 78, 56, 66, 70,
         25,  0, 54, 62, 64, 62, 62, 67, 60, 74, 55,  0,  0, 40, 64, 70, 25,  0,
          9, 36],
        [70, 75,  0,  0, 40, 64, 70, 25,  0, 39, 60, 75,  8, 74,  1, 74, 60, 60,
         13, 13, 13,  0,  0, 45, 64, 75, 74, 76, 25,  0, 48, 63, 11,  1, 64, 75,
          8, 74,  1, 69, 70, 75, 63, 64, 69, 62,  2,  0, 30, 63, 60, 58, 66,  1,
         64, 75,  2,  0, 40, 70, 76, 74, 75, 56, 58, 63, 60,  2,  0,  0, 47, 74,
         76, 68, 76, 62, 64, 25,  0, 36,  1, 63, 56, 77, 60,  

In [13]:
class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        # Bigram language model: single layer, single token prediction
        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx)  # (B,T,C), B: batch=4, T: sequence=8, C: vocab=147

        if targets is None:
            loss = None
        else:
            # flatten the logits and targets for torch cross entropy
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # generate max_new_tokens new tokens given the initial context idx
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx


model = BigramLanguageModel(vocab_size)
m = model.to(device)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# randomly generate 100 tokens from initial model weights and idx = 0 = \n
print(decode(m.generate(idx=torch.zeros(
    (1, 1), dtype=torch.long, device=device), max_new_tokens=100)[0].tolist()))

torch.Size([512, 94])
tensor(4.8468, device='cuda:0', grad_fn=<NllLossBackward0>)

T!Ye?}Mrj~Bqpe’T.j3|KfdM-TiT]1kééé"RrnbU)]UGi
n]1PsnI'V%KL???p$:;’z/777mūQVwgk[bzh9i?}a 9tkM6d°5w


In [14]:
# training!
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
batch_size = 32
for steps in range(5000):  # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch('train', 16)

    # evaluate the loss
    logits, loss = m(xb, yb)

    # backprop
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())


2.4686262607574463


In [15]:
# generate 100 tokens from trained model weights and idx = 0 = \n
print(
    decode(
        m.generate(idx=torch.zeros((1, 1), dtype=torch.long, device=device),
                   max_new_tokens=300)[0].tolist()))



Mitreme whet mel Yu~!
Muk iteaw, s pQDoothin rei:rs to. I t fte al9V#Mal heaselouin$ooris], t't'Do%Qd a:
Rint[o w?

Yui-cka aron $9WSariou:
Sume!

Weriney se wd, thep g yonand oukse a:-sFiEmed. ritan?


Whith, s th, lig-D y or yser Sok ryombué[cu?
omeF%RAnybjugui#88ury, thani1’'vk alaSawrk!|{ūQ°|arm


Lets try out lower dimensional embeddings + positional embeddings

In [16]:
class BigramEmbedLanguageModel(nn.Module):

    def __init__(self, vocab_size, block_size, embed_size):
        super().__init__()
        # embed raw tokens to a lower dimensional embedding with embed_size
        self.token_embedding_table = nn.Embedding(vocab_size, embed_size)
        # embed block sized context length as positional embeddings of the same size
        self.position_embedding_table = nn.Embedding(block_size, embed_size)
        # Language Modelling (?) Head is a standard linear layer to go from 
        # embeddings back to logits of vocab_size
        self.lm_head = nn.Linear(embed_size, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        # idx and targets are both (B,T) tensor of integers
        tok_embd = self.token_embedding_table(idx) # (B,T,C)
        pos_embd = self.position_embedding_table(torch.arange(T, device=device)) # (T,C) [0...T-1]
        x = tok_embd + pos_embd
        logits = self.lm_head(x) # (B,T,C)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            #crop idx to the last block_size tokens
            idx_context = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_context)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [17]:
# training!
model = BigramEmbedLanguageModel(vocab_size, 16, 32)
m = model.to(device)
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
batch_size = 32
for steps in range(5000):  # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch('train', 16)

    # evaluate the loss
    logits, loss = m(xb, yb)

    # backprop
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

2.347278594970703


In [18]:
# generate
print(
    decode(
        m.generate(idx=torch.zeros((1, 1), dtype=torch.long, device=device),
                   max_new_tokens=300)[0].tolist()))





Righe jacuindwereyn. wir mamand!
We.

He yoo:
Rig! w! t h thay ck uinp harereve'teo o:

Mi:
Ri:
Rithay Hor mer sjus to Tha:
Fe gof amese.
Ri:
Risherpllurs Le n'ty s whang ru Y w meng!
He rivemuheveatst caf won.
Alom?
I t!

Mid f ithe ate.

Sheaugieng Yumuid Shere ayouh ngoit!
Whead win on:
Mund g 


# Now for the Transformer Fundamentals!

## The mathematical trick to self-attention: triangular matrices for weighted averages

In [19]:
# consider the following toy example:

torch.manual_seed(1337)
B,T,C = 2,8,2 # batch, time, channels
x = torch.randn(B,T,C)
print(x.shape)
print(x)

torch.Size([2, 8, 2])
tensor([[[ 0.1808, -0.0700],
         [-0.3596, -0.9152],
         [ 0.6258,  0.0255],
         [ 0.9545,  0.0643],
         [ 0.3612,  1.1679],
         [-1.3499, -0.5102],
         [ 0.2360, -0.2398],
         [-0.9211,  1.5433]],

        [[ 1.3488, -0.1396],
         [ 0.2858,  0.9651],
         [-2.0371,  0.4931],
         [ 1.4870,  0.5910],
         [ 0.1260, -1.5627],
         [-1.1601, -0.3348],
         [ 0.4478, -0.8016],
         [ 1.5236,  2.5086]]])


In [20]:
# We want x[b,t] = mean_{i<=t} x[b,i] to very badly encode info of tokens before token t
xbow = torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1] # (t,C)
        xbow[b,t] = torch.mean(xprev, 0)
xbow

tensor([[[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]],

        [[ 1.3488, -0.1396],
         [ 0.8173,  0.4127],
         [-0.1342,  0.4395],
         [ 0.2711,  0.4774],
         [ 0.2421,  0.0694],
         [ 0.0084,  0.0020],
         [ 0.0712, -0.1128],
         [ 0.2527,  0.2149]]])

In [21]:
# better way to do this: triangular matrix!
# version 2: using matrix multiply for a weighted aggregation
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True)
print(wei)
xbow2 = wei @ x # (B, T, T) @ (B, T, C) ----> (B, T, C)
xbow2

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


tensor([[[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]],

        [[ 1.3488, -0.1396],
         [ 0.8173,  0.4127],
         [-0.1342,  0.4395],
         [ 0.2711,  0.4774],
         [ 0.2421,  0.0694],
         [ 0.0084,  0.0020],
         [ 0.0712, -0.1128],
         [ 0.2527,  0.2149]]])

In [22]:
# even better: softmax for normalization of weights
# version 3: use Softmax
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
xbow3

tensor([[[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]],

        [[ 1.3488, -0.1396],
         [ 0.8173,  0.4127],
         [-0.1342,  0.4395],
         [ 0.2711,  0.4774],
         [ 0.2421,  0.0694],
         [ 0.0084,  0.0020],
         [ 0.0712, -0.1128],
         [ 0.2527,  0.2149]]])

In [23]:
# finally: Query (what i look for), Key (What i am in this), 
# Value (My private value, embedded) for self-attention
# version 4: single head self-attention
head_size = 16
query = nn.Linear(C, head_size, bias=False) # Linear layer C (embed) -> head size (16)
key = nn.Linear(C, head_size, bias=False) # Linear layer C (embed) -> head size (16)
value = nn.Linear(C, head_size, bias=False) # Linear layer C (embed) -> head size (16)

q = query(x) # (B,T,16)
k = key(x) # (B,T,16)
wei = q @ k.transpose(-2, -1) # (B,T,16) @ (B,16,T) -> (B,T,T)
# print(wei.shape)
# print(torch.round(torch.sum(wei, dim=1), decimals=3))
# row_sum = wei.sum(dim=1)

# # Compute the average sum
# avg_sum = row_sum.mean()

# # Filter out rows with sum lower than the average sum
# wei = wei[:, row_sum >= avg_sum]
# print(wei.shape)

wei = wei * C**-0.5 # scaled attention as to not sharpen softmax

# T = wei.shape[1]
tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
print(torch.round(wei, decimals=3))

v = value(x)
out = wei @ v
print(torch.round(out, decimals=3))

tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4700, 0.5300, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.3140, 0.3170, 0.3690, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2060, 0.2090, 0.2640, 0.3210, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1920, 0.1650, 0.2050, 0.2140, 0.2250, 0.0000, 0.0000, 0.0000],
         [0.1260, 0.1320, 0.0900, 0.0690, 0.2020, 0.3810, 0.0000, 0.0000],
         [0.1440, 0.1500, 0.1540, 0.1630, 0.1220, 0.1160, 0.1500, 0.0000],
         [0.0580, 0.0460, 0.0440, 0.0340, 0.1330, 0.1390, 0.0480, 0.4990]],

        [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4970, 0.5030, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0040, 0.0540, 0.9430, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5030, 0.1000, 0.0140, 0.3840, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2570, 0.1220, 0.0820, 0.1950, 0.3440, 0.0000, 0.0000, 0.0000],
         [0.0330, 0.121

# Time to put attention in our last model!

In [24]:
class SelfAttentionHead(nn.Module):
    """ one head of self-attention """
    def __init__(self, block_size, n_embd, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size, device=device)))

    def forward(self, x):
        B,T,C = x.shape
        q = self.query(x) # (B,T,C)
        k = self.key(x)   # (B,T,C)

        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei * C**-0.5 # scaled attention
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)

        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out


In [25]:
class BigramEmbedAttentionLanguageModel(nn.Module):

    def __init__(self, vocab_size, block_size, embed_size):
        super().__init__()
        # embed raw tokens to a lower dimensional embedding with embed_size
        self.token_embedding_table = nn.Embedding(vocab_size, embed_size)
        # embed block sized context length as positional embeddings of the same size
        self.position_embedding_table = nn.Embedding(block_size, embed_size)
        # Language Modelling (?) Head is a standard linear layer to go from
        # embeddings back to logits of vocab_size
        self.lm_head = nn.Linear(embed_size, vocab_size)
        # single head self-attention
        self.sa_head = SelfAttentionHead(block_size, embed_size, embed_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        # idx and targets are both (B,T) tensor of integers
        tok_embd = self.token_embedding_table(idx) # (B,T,C)
        pos_embd = self.position_embedding_table(torch.arange(
            T, device=device))  # (T,C) [0...T-1]
        x = tok_embd + pos_embd
        # apply self-attention
        x = self.sa_head(x)
        # get logits with linear layer
        logits = self.lm_head(x) # (B,T,C)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            #crop idx to the last block_size tokens
            idx_context = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_context)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [26]:
# training!
model = BigramEmbedAttentionLanguageModel(vocab_size, 16, 32)
m = model.to(device)
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
batch_size = 32
for steps in range(5000):  # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch('train', 16)

    # evaluate the loss
    logits, loss = m(xb, yb)

    # backprop
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    if (steps % 500 == 0):
        print("learning step:", steps)

print(loss.item())

learning step: 0
learning step: 500
learning step: 1000
learning step: 1500
learning step: 2000
learning step: 2500
learning step: 3000
learning step: 3500
learning step: 4000
learning step: 4500
2.3798604011535645


In [27]:
print(
    decode(
        m.generate(idx=torch.zeros((1, 1), dtype=torch.long, device=device),
                   max_new_tokens=500)[0].tolist()))




Yuig vey.

Oh, whel a wey Mou
Miitng itnig ofunte ait oyo fon Lth!

G mo youd tha:
Yucl a uI's forst foo:
Ritsu:
Whe igigre I's?

Mu:
An toum sure!?

Yui:
Heery Tared soum p or qusmo, oe cunte's se n'ts yopean to'vetar the, dlorem, or toeah! Mon, Yuib:
We hisugoured yo plle youe sus tere fur mecro:
Work yo. than oo whorto lt lugaik ng my gar.. u&!

Rito, todo.

Mormetady! w bo,, oum Huka:
Ohathe sourncerero:
Ha-hem s-s'carit min!

Mio:
Thas albe ye'rret wereed too on pea rnhitat con ith hige'll


# More heads! Multi-Head Attention

In [28]:
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, block_size, num_heads, n_embd, head_size):
        super().__init__()
        self.heads = nn.ModuleList([SelfAttentionHead(block_size, n_embd, head_size) for _ in range(num_heads)])

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1) # concat single-head results
        return out

In [29]:
class BigramEmbedMultiHeadAttentionLanguageModel(nn.Module):

    def __init__(self, vocab_size, block_size, embed_size, head_num):
        super().__init__()
        # embed raw tokens to a lower dimensional embedding with embed_size
        self.token_embedding_table = nn.Embedding(vocab_size, embed_size)
        # embed block sized context length as positional embeddings of the same size
        self.position_embedding_table = nn.Embedding(block_size, embed_size)
        # Language Modelling (?) Head is a standard linear layer to go from
        # embeddings back to logits of vocab_size
        self.lm_head = nn.Linear(embed_size, vocab_size)
        # multi-head self-attention
        self.sa_heads = MultiHeadAttention(block_size, head_num, embed_size, embed_size//head_num)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        # idx and targets are both (B,T) tensor of integers
        tok_embd = self.token_embedding_table(idx) # (B,T,C)
        pos_embd = self.position_embedding_table(torch.arange(
            T, device=device))  # (T,C) [0...T-1]
        x = tok_embd + pos_embd
        # apply multi-head self-attention
        x = self.sa_heads(x)
        # get logits with linear layer
        logits = self.lm_head(x) # (B,T,C)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            #crop idx to the last block_size tokens
            idx_context = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_context)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [30]:
# training!
model = BigramEmbedMultiHeadAttentionLanguageModel(vocab_size, 16, 32, 4)
m = model.to(device)
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
batch_size = 32
for steps in range(10000):  # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch('train', 16)

    # evaluate the loss
    logits, loss = m(xb, yb)

    # backprop
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    if (steps % 500 == 0):
        print("learning step:", steps)

print(loss.item())

learning step: 0
learning step: 500
learning step: 1000
learning step: 1500
learning step: 2000
learning step: 2500
learning step: 3000
learning step: 3500
learning step: 4000
learning step: 4500
learning step: 5000
learning step: 5500
learning step: 6000
learning step: 6500
learning step: 7000
learning step: 7500
learning step: 8000
learning step: 8500
learning step: 9000
learning step: 9500
1.920032024383545


In [31]:
idx = encode("Azusa:\n")
print(torch.tensor([idx]))
print(
    decode(
        m.generate(idx=torch.tensor([idx], dtype=torch.long, device=device),
                   max_new_tokens=500)[0].tolist()))


tensor([[28, 81, 76, 74, 56, 25,  0]])
Azusa:
Onko ow Azusa:
Ho. Year you thic oork ako:
Geatmert, tetly wing ith bis? Hey you so tod.

Ritsu:
Oh!? Fabry, beticil, that. Hehato cous you nealdis?

Ui:
We exes! Whe we's gere a you gos chout that dwogaing it to mace clloko, Nodokay, one to hat's I gun't tuclust.. Wallly ite do ff s i bane, shel sorntel be tus tersalers?
No, Seald a-
I'm that toteses selis ticchante very u goprercom You eal shor the rack ply dres soulde thing sorentis to tat a is?

Azusakeme fac en reait ki sa mand, heast, so o


# Time to think: Feed-Forward to compute attention results

In [32]:
class FeedForward(nn.Module):
    def __init__(self, n_embed, n_hidden):
        super().__init__()
        self.lin_1 = nn.Linear(n_embed, n_hidden)
        self.lin_2 = nn.Linear(n_hidden, n_embed)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.lin_1(x)
        x = self.relu(x)
        x = self.lin_2(x)
        return x

In [33]:
class BigramEmbedMultiHeadAttentionFeedForwardLanguageModel(nn.Module):

    def __init__(self, vocab_size, block_size, embed_size, head_num):
        super().__init__()
        # embed raw tokens to a lower dimensional embedding with embed_size
        self.token_embedding_table = nn.Embedding(vocab_size, embed_size)
        # embed block sized context length as positional embeddings of the same size
        self.position_embedding_table = nn.Embedding(block_size, embed_size)
        # Language Modelling (?) Head is a standard linear layer to go from
        # embeddings back to logits of vocab_size
        self.lm_head = nn.Linear(embed_size, vocab_size)
        # multi-head self-attention
        self.sa_heads = MultiHeadAttention(block_size, head_num, embed_size, embed_size//head_num)
        # feed forward
        self.ff_layer = FeedForward(embed_size, 128)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        # idx and targets are both (B,T) tensor of integers
        tok_embd = self.token_embedding_table(idx) # (B,T,C)
        pos_embd = self.position_embedding_table(torch.arange(
            T, device=device))  # (T,C) [0...T-1]
        x = tok_embd + pos_embd
        # apply multi-head self-attention
        x = self.sa_heads(x)
        # feed forward
        x = self.ff_layer(x)
        # get logits with linear layer
        logits = self.lm_head(x) # (B,T,C)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            #crop idx to the last block_size tokens
            idx_context = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_context)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [34]:
# training!
model = BigramEmbedMultiHeadAttentionFeedForwardLanguageModel(vocab_size, 16, 32, 4)
m = model.to(device)
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
batch_size = 32
for steps in range(10000):  # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch('train', 16)

    # evaluate the loss
    logits, loss = m(xb, yb)

    # backprop
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    if (steps % 500 == 0):
        print("learning step:", steps)

print(loss.item())

learning step: 0
learning step: 500
learning step: 1000
learning step: 1500
learning step: 2000
learning step: 2500
learning step: 3000
learning step: 3500
learning step: 4000
learning step: 4500
learning step: 5000
learning step: 5500
learning step: 6000
learning step: 6500
learning step: 7000
learning step: 7500
learning step: 8000
learning step: 8500
learning step: 9000
learning step: 9500
1.6339409351348877


In [35]:
idx = encode("Azusa:\n")
print(torch.tensor([idx]))
print(
    decode(
        m.generate(idx=torch.tensor([idx], dtype=torch.long, device=device),
                   max_new_tokens=500)[0].tolist()))


tensor([[28, 81, 76, 74, 56, 25,  0]])
Azusa:
I's can!

Tsumugi:
But aks this, Ritsu:
Whe leave mina to best I'm hink cose dore prtol fapfe clost I sean off.

Tsumugi:
Could whe lwith tould. Whow get!

Mioo:
Y6 Tame mire journ then! I'll use wely.

Mio-looppor our frmager sfinte sen tall, if guitsu, Musi's rehing 5! The celly are ablal all of any finUmi:
Oot ever, a dicittsu! That.
Come hear the, see'll coand cool, we here partbout kease a we pid con clsorwy gend your waste, gue hot Yui! HX Ekay?

Prox is sake in my ho han.

Ritsu:
We the y


In [36]:
total_params = sum(p.numel() for p in m.parameters() if p.requires_grad)
total_params

18046

# Make it scalable: repeatable Blocks

In [37]:
class Block(nn.Module):
    def __init__(self, block_size, n_heads, n_embd):
        super().__init__()
        self.sa_heads = MultiHeadAttention(block_size, n_heads, n_embd, n_embd//n_heads)
        self.ff_layer = FeedForward(n_embd, 128)
    
    def forward(self, x):
        x = self.sa_heads(x)
        x = self.ff_layer(x)
        return x

In [38]:
class TransformerNoResidualNoNormModel(nn.Module):

    def __init__(self, vocab_size, block_size, embed_size, head_num, layer_num):
        super().__init__()
        # embed raw tokens to a lower dimensional embedding with embed_size
        self.token_embedding_table = nn.Embedding(vocab_size, embed_size)
        # embed block sized context length as positional embeddings of the same size
        self.position_embedding_table = nn.Embedding(block_size, embed_size)
        # Language Modelling (?) Head is a standard linear layer to go from
        # embeddings back to logits of vocab_size
        self.lm_head = nn.Linear(embed_size, vocab_size)
        # transformer blocks
        self.blocks = nn.Sequential(*[Block(block_size, head_num, embed_size) for _ in range(layer_num)])

    def forward(self, idx, targets=None):
        B, T = idx.shape
        # idx and targets are both (B,T) tensor of integers
        tok_embd = self.token_embedding_table(idx) # (B,T,C)
        pos_embd = self.position_embedding_table(torch.arange(
            T, device=device))  # (T,C) [0...T-1]
        x = tok_embd + pos_embd
        # go through blocks
        x = self.blocks(x)
        # get logits with linear layer
        logits = self.lm_head(x) # (B,T,C)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            #crop idx to the last block_size tokens
            idx_context = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_context)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [39]:
device

'cuda:0'

In [66]:
# training!
model = TransformerNoResidualNoNormModel(vocab_size, 64, 64, 8, 4)
m = model.to(device)
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
batch_size = 32
for steps in range(10000):  # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch('train', block_size=64)

    # evaluate the loss
    logits, loss = m(xb, yb)

    # backprop
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    if (steps % 500 == 0):
        print("learning step:", steps)

print(loss.item())

learning step: 0
learning step: 500
learning step: 1000
learning step: 1500
learning step: 2000
learning step: 2500
learning step: 3000
learning step: 3500
learning step: 4000
learning step: 4500
learning step: 5000
learning step: 5500
learning step: 6000
learning step: 6500
learning step: 7000
learning step: 7500
learning step: 8000
learning step: 8500
learning step: 9000
learning step: 9500
2.887486457824707


In [67]:
print(loss.item())


2.887486457824707


In [68]:
idx = encode("Yui:\n")
print(torch.tensor([idx]))
print(
    decode(
        m.generate(idx=torch.tensor([idx], dtype=torch.long, device=device),
                   max_new_tokens=1000)[0].tolist()))


tensor([[52, 76, 64, 25,  0]])
Yui:
Tk pt mac o tosl cet wu ond ola.Aes
G!Io.uuavi sar.
Sl:
R

as..
Rautu
..
Yu
 Hes stn toi stos tna teg cni y co ep tttr io rnu, dnnd caa bserh ih Iuyi wl tefs bekk' ya sf yelrr.Tae


uuuu
un'
rldey se s olnl otka o tet bite' hee tko up ateep 'rd te aaatk.IoN.
o.Yl
Fy [shges for vse].Yos
 rt!
itt& lr tou.A
Yri'a so ere?

ioho.
Yd..MRToio so l tnm tlundr sot.th'u li ot gese, osee duhr ou.
 M.
Mis" Ait tlt yrs gsm yun tae te?
R!Ya:ao
Tbnh jn nue ant ins' de to a tr tire stel thna.[i,n tl inlk mirt ru aa on t bl?
R
o Aa ern otln dirolt nl pus!RL
us!

Yar:u'
Yn teel iait!i?Ttn
Hyeai!

Cls'o
.

rsa::
 [aee,

LMSoe:
FWd:h.
MMO
 ta.
Rilooie ho toen woe at ie.
un:eo::
Hig.

 R
uY,he sr srtetg.


R

Misusu: ie w i on
zURInol pe fts ouig bn, io prrs shu.
Ysciusyblg lh prl a nen s fs ivb ws lo aadgirnde heve hutrh!i Aihad joe….


iusiu

.uuen

NOoh b telsd mfssge! Tiea?
 F.R
 audstie lmueu ti sol neec tot.Md iie:h.
u:: xoarg!AYs:

Dsaa Ii btav.Ci
T (suwa iie! HY 

In [69]:
total_params = sum(p.numel() for p in m.parameters() if p.requires_grad)
total_params

131678

# Trying out fading blocks

In [70]:
class Fade(nn.Module):
    def __init__(self, n_input):
        super().__init__()
        n_output = n_input//2
        self.out_sizes = [n_output//8, n_output//8, n_output//4, n_output//2]
        n_rest = n_input - n_output//2
        self.in_sizes = [n_rest//2, n_rest//4, n_rest//4, n_output//2]
        self.lin1 = nn.Linear(self.in_sizes[0], self.out_sizes[0], bias=False)
        self.lin2 = nn.Linear(self.in_sizes[1], self.out_sizes[1], bias=False)
        self.lin3 = nn.Linear(self.in_sizes[2], self.out_sizes[2], bias=False)

    def forward(self, x): # (B, T, C)
        # turn x to (B, C, T)
        x4 = x[:, -self.in_sizes[3]:]
        x = x.transpose(1, 2)
        x1 = self.lin1(x[:, :, :self.in_sizes[0]])
        x2 = self.lin2(x[:, :, self.in_sizes[0]:self.in_sizes[0]+self.in_sizes[1]])
        x3 = self.lin3(x[:, :, self.in_sizes[0]+self.in_sizes[1]:self.in_sizes[0]+self.in_sizes[1]+self.in_sizes[2]])
        # turn back to (B, T/2, C)
        x1 = x1.transpose(1, 2)
        x2 = x2.transpose(1, 2)
        x3 = x3.transpose(1, 2)
        x = torch.cat((x1, x2, x3, x4), dim=1)
        return x

# test fade
x = torch.randn(2, 32, 3)
print(x.shape)
print(x[0, :, 0])
f = Fade(32)
y = f(x)
print(y.shape)
print(y)


torch.Size([2, 32, 3])
tensor([-1.5470,  1.2385,  1.0341,  0.8013,  0.0999,  1.7003,  0.5800,  0.6154,
         1.5426, -0.2939, -0.2154,  2.1502, -0.6846,  0.9198,  0.5741, -0.0885,
        -1.7475,  0.6042,  1.8278, -1.6572,  1.0823,  0.5633,  0.9315,  0.7064,
        -0.5313, -0.2864, -1.1273,  0.7665,  2.5164, -1.6025,  0.5061,  0.5197])
torch.Size([2, 16, 3])
tensor([[[ 0.2223, -0.1918, -0.7494],
         [-0.7223, -0.4432, -0.6807],
         [ 0.1802, -0.7793,  0.0778],
         [-0.2019, -0.0327, -0.6090],
         [-0.2500,  0.0563, -0.6757],
         [ 0.0995, -0.5365, -0.1253],
         [-0.6750,  0.5831, -0.2430],
         [ 0.2891,  0.0139, -0.9607],
         [-0.5313,  1.7197,  0.8279],
         [-0.2864, -0.6378,  1.4485],
         [-1.1273, -0.1281, -1.0978],
         [ 0.7665,  0.8789, -1.7742],
         [ 2.5164,  0.0565,  1.5986],
         [-1.6025, -0.7852,  0.8567],
         [ 0.5061, -1.9349, -1.4948],
         [ 0.5197,  1.4200, -1.0264]],

        [[ 0.5296, -0.8

In [71]:
y


tensor([[[ 0.2223, -0.1918, -0.7494],
         [-0.7223, -0.4432, -0.6807],
         [ 0.1802, -0.7793,  0.0778],
         [-0.2019, -0.0327, -0.6090],
         [-0.2500,  0.0563, -0.6757],
         [ 0.0995, -0.5365, -0.1253],
         [-0.6750,  0.5831, -0.2430],
         [ 0.2891,  0.0139, -0.9607],
         [-0.5313,  1.7197,  0.8279],
         [-0.2864, -0.6378,  1.4485],
         [-1.1273, -0.1281, -1.0978],
         [ 0.7665,  0.8789, -1.7742],
         [ 2.5164,  0.0565,  1.5986],
         [-1.6025, -0.7852,  0.8567],
         [ 0.5061, -1.9349, -1.4948],
         [ 0.5197,  1.4200, -1.0264]],

        [[ 0.5296, -0.8095, -1.2763],
         [ 0.6644, -0.4462, -1.4459],
         [-0.0160, -0.5534, -0.4787],
         [-0.0298, -0.2252, -0.9611],
         [-0.1058,  0.1453, -0.4213],
         [-0.1560,  0.3468,  0.0304],
         [ 0.6629,  0.7128, -0.6855],
         [ 0.1188,  0.3965, -0.3210],
         [-1.3296,  0.0704, -0.4167],
         [-0.6792,  1.4545,  2.0714],
         [

In [72]:
def calc_fade(n_input):
    fade_steps = [n_input]
    while n_input > 16:
        n_output = n_input//2
        fade_steps.append(n_output)
        n_input = n_output
    return fade_steps

calc_fade(512)

[512, 256, 128, 64, 32, 16]

In [73]:
class FadingBlock(nn.Module):
    def __init__(self, block_size, n_heads, n_embd, n_time):
        super().__init__()
        self.sa_heads = MultiHeadAttention(block_size, n_heads, n_embd, n_embd//n_heads)
        self.ff_layer = FeedForward(n_embd, 128)
        self.fade = Fade(n_time)

    def forward(self, x):
        x = self.sa_heads(x)
        x = self.ff_layer(x)
        x = self.fade(x)
        return x


In [74]:
def pad_encoded(x, block_size, vocab_size):
    # add zeros before x to make it block_size, x is list of ints
    return [vocab_size-1]*(block_size-len(x)) + x

In [75]:
class FadeFormerNoResidualNoNormModel(nn.Module):

    def __init__(self, vocab_size, block_size, embed_size, head_num):
        super().__init__()
        self.block_size = block_size
        # embed raw tokens to a lower dimensional embedding with embed_size
        self.token_embedding_table = nn.Embedding(vocab_size, embed_size)
        # embed block sized context length as positional embeddings of the same size
        self.position_embedding_table = nn.Embedding(self.block_size, embed_size)
        # Language Modelling (?) Head is a standard linear layer to go from
        # embeddings back to logits of vocab_size
        self.lm_head = nn.Linear(embed_size, vocab_size)
        # calculate fade n_time
        fade_ins = calc_fade(self.block_size)
        # transformer blocks
        self.blocks = nn.Sequential(
            *[FadingBlock(block_size, head_num, embed_size, fade_in) for fade_in in fade_ins])

    def forward(self, idx, targets=None):
        B, T = idx.shape
        # idx and targets are both (B,T) tensor of integers
        tok_embd = self.token_embedding_table(idx)  # (B,T,C)
        pos_embd = self.position_embedding_table(torch.arange(
            T, device=device))  # (T,C) [0...T-1]
        x = tok_embd + pos_embd # (B,T,C) + (T,C) -> (B,T,C)
        # go through blocks
        x = self.blocks(x)
        # get logits with linear layer
        logits = self.lm_head(x)  # (B,T,C)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets[:, -8:]
            targets = targets.reshape(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            #crop idx to the last block_size tokens
            idx_context = idx[:, -self.block_size:]
            # get the predictions
            logits, loss = self(idx_context)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx

model = FadeFormerNoResidualNoNormModel(vocab_size, 128, 64, 8)
m = model.to(device)
idx = encode("Azusa:\n")
padded_idx = pad_encoded(idx, 128, vocab_size)
print(decode(padded_idx))
print(
    decode(
        m.generate(idx=torch.tensor([padded_idx],
                                    dtype=torch.long,
                                    device=device),
                   max_new_tokens=500)[0].tolist()))
print("model size:", sum(p.numel() for p in m.parameters()))


Azusa:

Azusa:
3B
&°wQB/#Hg…:’‘blRQIE4~S'1wL~wIO[6jGvéQb/GlW LvUsSM8&uX,5O!zKm]qA;wq9CXw3Y~)vFeD5X[kiI,‘)'6h'N(!aW|9VaED?nEn5ézJ]pO8s[[B~CTBrSSYI}?}sQ8(…nkA♪KI[tN9’X&'/‘ra3h-WQb♪2dP/i3…Jb%!$w4fmI$Cf}TkrQj"(adXūY{|9m
oz)&K,r}8(T’ūCS.aI’r‘cVSsPJ]cV!y,iAGXj1L}n♪sc]6éYZHqgAy°JR
|B~v|:KQ{0gOYtm]D/8{#?é?r6HG.0KOū°$VK:Zt9HF45d6cshT#dd6r&P{OgSL"Z]♪m&2cūyghzf(~sf%EC‘1#é‘yZ[?t""3♪L!JjU91zBh mOvK(T g'T7I5|OH],C5LūzNz08r°?oBkxsA:5Wix%{#l]MW1’53LMB]w)e]cY’Q|LZw8dB
[K0j2 G♪5X(o’Y-ū(Nzgo5Yo‘v,8i/]Y~y"♪C ?pHlH~R5.!6Cog
model size: 137049


In [76]:
# training!
model = FadeFormerNoResidualNoNormModel(vocab_size, 64, 64, 8)
m = model.to(device)
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
batch_size = 32
for steps in range(10000):  # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch('train', 64)

    # evaluate the loss
    logits, loss = m(xb, yb)

    # backprop
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    if (steps % 500 == 0):
        print("learning step:", steps)

print(loss.item())


learning step: 0
learning step: 500
learning step: 1000
learning step: 1500
learning step: 2000
learning step: 2500
learning step: 3000
learning step: 3500
learning step: 4000
learning step: 4500
learning step: 5000
learning step: 5500
learning step: 6000
learning step: 6500
learning step: 7000
learning step: 7500
learning step: 8000
learning step: 8500
learning step: 9000
learning step: 9500
1.7201954126358032


In [77]:
print(loss)
print("model size:", sum(p.numel() for p in m.parameters()))


tensor(1.7202, device='cuda:0', grad_fn=<NllLossBackward0>)
model size: 103129


In [78]:
idx = encode("Yui:\n")
print(torch.tensor([idx]))
padded_idx = pad_encoded(idx, 64, vocab_size)
print(
    decode(
        m.generate(idx=torch.tensor([padded_idx],
                                    dtype=torch.long,
                                    device=device),
                   max_new_tokens=1000)[0].tolist()))


tensor([[52, 76, 64, 25,  0]])
Yui:
h.seves witm, ir!.I goow.Aycibutceusy pingspe if no ranmyre same, to!..... Weraun itver you my part hat of. mat shous
Whow you rhat?

Mio:
Nodld alracs, uh.

Yui:
Isth....Dorroy!

Mio:
Huh...

Ritsu:
I'j'll Aiver at, too!

Ui:
-mugit's iss wasearhor, saupfec noil whens't of reacy, ony reldntnt!

Grakaka:
You firt shout...

Mio:
Oh, lidll and Gyou, is it then..... And hid erif:
Os this, I that?

Sawako:
To wason to onba, jhink?

Tsumugi:
It sounder..
It get atever howe her?

Ritsu:
Loo bet I on I've mas intifou f bere, I we wildevars me, wur roneset?

Yui:
How reame ufeindd twateart.

Azusa:
You ryomus the batea gonut?

Yui:
I to recul
O't'm pis.

Azusa:
Whysr. Le Iar, and greom!

Ritsu:
Ber tryor some jhentaied!

Ritsu & Mio & Rightugi:
Theyy...

Ritsu:
Bo, core you year!

Mio:
E I'lly show, Who foorkargelgts a,'s shew ie'nme!

Yui:
A(j-heayle's and tefcorsdy, I'm club.

Ritsu:
You cold hur't ine! Nhis I they nopdrieds. Snega dorkadered the shor houy

# Fade with residuals that concat at the end, perhaps?

In [79]:
class FadeWithResidual(nn.Module):
    def __init__(self, n_input):
        super().__init__()
        n_output = n_input//2
        self.out_sizes = [n_output//8, n_output//8, n_output//4, n_output//2]
        n_rest = n_input - n_output//2
        self.in_sizes = [n_rest//2, n_rest//4, n_rest//4, n_output//2]
        self.lin1 = nn.Linear(self.in_sizes[0], self.out_sizes[0], bias=False)
        self.lin2 = nn.Linear(self.in_sizes[1], self.out_sizes[1], bias=False)
        self.lin3 = nn.Linear(self.in_sizes[2], self.out_sizes[2], bias=False)

    def forward(self, x):  # (B, T, C)
        # turn x to (B, C, T)
        res = x[:, :x.shape[1]//2]
        x4 = x[:, -self.in_sizes[3]:]
        x = x.transpose(1, 2)
        x1 = self.lin1(x[:, :, :self.in_sizes[0]])
        x2 = self.lin2(x[:, :, self.in_sizes[0]:self.in_sizes[0]+self.in_sizes[1]])
        x3 = self.lin3(x[:, :, self.in_sizes[0]+self.in_sizes[1]:self.in_sizes[0]+self.in_sizes[1]+self.in_sizes[2]])
        # turn back to (B, T/2, C)
        x1 = x1.transpose(1, 2)
        x2 = x2.transpose(1, 2)
        x3 = x3.transpose(1, 2)
        x = torch.cat((x1, x2, x3, x4), dim=1)
        return res, x


# test fade
x = torch.randn(2, 16, 3)
# print(x.shape)
# print(x[0, :, 0])
f = FadeWithResidual(16)
res, y = f(x)
print(res, y)

tensor([[[-0.7739,  0.0799, -1.2333],
         [ 1.6624,  1.6913,  0.1801],
         [ 0.9903,  1.0108, -0.3378],
         [-1.3588,  1.4645, -0.7183],
         [-0.2688, -0.1182,  0.0518],
         [-0.5873, -0.6343, -1.4346],
         [ 1.5319, -0.6834, -0.6768],
         [ 0.3535, -1.4512, -0.6295]],

        [[-1.9848, -0.8679,  0.3385],
         [ 0.0360, -0.8288, -0.1630],
         [ 0.5512, -0.6950,  0.8828],
         [-0.2159, -0.4795, -0.4191],
         [-1.2911, -1.0142,  1.2026],
         [ 0.0338,  0.7607, -0.7710],
         [-0.7446, -2.0499, -1.7937],
         [-1.8003,  0.8161, -0.4859]]]) tensor([[[ 0.8007, -0.2578,  0.6891],
         [ 1.0315, -0.1592, -0.3338],
         [ 0.1493, -0.3154,  0.4040],
         [ 0.3219,  0.1676,  0.2459],
         [-0.2520,  0.6545,  0.0239],
         [ 0.7963, -2.0718,  0.4056],
         [ 0.7920,  0.8321, -0.2920],
         [ 0.4355, -0.6385, -0.0562]],

        [[ 0.3895,  0.0173,  0.4090],
         [-0.2770, -0.8428, -1.2389],
      

In [80]:
class ResidualFadingBlock(nn.Module):
    def __init__(self, block_size, n_heads, n_embd, n_time):
        super().__init__()
        self.sa_heads = MultiHeadAttention(
            block_size, n_heads, n_embd, n_embd//n_heads)
        self.ff_layer = FeedForward(n_embd, 128)
        self.fade = FadeWithResidual(n_time)

    def forward(self, x):
        x = self.sa_heads(x)
        x = self.ff_layer(x)
        res, x = self.fade(x)
        return res, x


In [81]:
class ResidualFadeFormerNoNormModel(nn.Module):

    def __init__(self, vocab_size, block_size, embed_size, head_num):
        super().__init__()
        self.block_size = block_size
        # embed raw tokens to a lower dimensional embedding with embed_size
        self.token_embedding_table = nn.Embedding(vocab_size, embed_size)
        # embed block sized context length as positional embeddings of the same size
        self.position_embedding_table = nn.Embedding(
            self.block_size, embed_size)
        # Language Modelling (?) Head is a standard linear layer to go from
        # embeddings back to logits of vocab_size
        self.lm_head = nn.Linear(embed_size, vocab_size)
        # calculate fade n_time
        fade_ins = calc_fade(self.block_size)
        # transformer blocks
        self.blocks = nn.ModuleList()
        for fade_in in fade_ins:
            self.blocks.append(ResidualFadingBlock(block_size, head_num, embed_size, fade_in))

    def forward(self, idx, targets=None):
        B, T = idx.shape
        # idx and targets are both (B,T) tensor of integers
        tok_embd = self.token_embedding_table(idx)  # (B,T,C)
        pos_embd = self.position_embedding_table(torch.arange(
            T, device=device))  # (T,C) [0...T-1]
        x = tok_embd + pos_embd  # (B,T,C) + (T,C) -> (B,T,C)
        # go through blocks
        final = torch.tensor([], device=device)
        for block in self.blocks:
            res, x = block(x)
            final = torch.cat((final, res), dim=1)
        x = torch.cat((final, x), dim=1)
        # get logits with linear layer
        logits = self.lm_head(x)  # (B,T,C)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            #crop idx to the last block_size tokens
            idx_context = idx[:, -self.block_size:]
            # get the predictions
            logits, loss = self(idx_context)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx


model = ResidualFadeFormerNoNormModel(vocab_size, 128, 64, 8)
m = model.to(device)
idx = encode("Azusa:\n")
padded_idx = pad_encoded(idx, 128, vocab_size)
print(decode(padded_idx))
print(
    decode(
        m.generate(idx=torch.tensor([padded_idx],
                                    dtype=torch.long,
                                    device=device),
                   max_new_tokens=500)[0].tolist()))
print("model size:", sum(p.numel() for p in m.parameters()))


Azusa:

Azusa:
/wY}3O,02p RlX?ūu(DUCwnAagnBDé6KxtY~l3’.N)/f6Y5bJZ4
~B2)tn?4CH%a(ctcL1Ug♪f5;"kJ[‘Kq4VPQ!sZugkR!Egf…X/!♪oJX?MW(88/kH4Lw{]5F8X-P9AXfc
j$.xIA&()iQF?MUF’}]
&eJé‘:b;,${m%3GFsn°Zw.QD
&t"|fTS4ūzw?;m&CN:}Nusf,t///Mkh.s$5E
s|F%7klS7!’;uPBAD5ū‘i-3b°yZj-ZD&[$w’’dT361"{m"BGQ°|…
&%{6#9Zun1{#7é0ztRX °#iT3F1;0Mj3f(zūF{tbb.?4U{h%RHWsMH)y[:0)mF?° 8)x~B:Uso)3WP…v!EGCYC3oM;IcPXf3é♪KAoCb2U&|SveoTfeNx#TFM6v|n-7PLfR{,w8jDQVj%x:.F6~l9GFh0?♪{éQKUHdlVx?y…)E|E5cu……5PwP8TwWeE2{2vF)X~5%,|s|NTb~H|#X3gPXeūgXbr1&VrKsxrūi♪[d"
model size: 137049


In [82]:
# training!
model = ResidualFadeFormerNoNormModel(vocab_size, 64, 64, 8)
m = model.to(device)
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
batch_size = 32
for steps in range(10000):  # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch('train', 64)

    # evaluate the loss
    logits, loss = m(xb, yb)

    # backprop
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    if (steps % 500 == 0):
        print("learning step:", steps)

print(loss.item())


learning step: 0
learning step: 500
learning step: 1000
learning step: 1500
learning step: 2000
learning step: 2500
learning step: 3000
learning step: 3500
learning step: 4000
learning step: 4500
learning step: 5000
learning step: 5500
learning step: 6000
learning step: 6500
learning step: 7000
learning step: 7500
learning step: 8000
learning step: 8500
learning step: 9000
learning step: 9500
1.6994379758834839


In [83]:
print(loss.grad_fn)
print("model size:", sum(p.numel() for p in m.parameters()))


<NllLossBackward0 object at 0x00000129AA51E920>
model size: 103129


In [84]:
for param in m.parameters():
    print(param.shape)


torch.Size([94, 64])
torch.Size([64, 64])
torch.Size([94, 64])
torch.Size([94])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([128, 64])
torch.Size([128])
torch.Size([64, 128])
torch.Size([64])
torch.Size([4, 24])
torch.Size([4, 12])
torch.Size([8, 12])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
t

In [85]:
idx = encode("Yui:\n")
print(torch.tensor([idx]))
padded_idx = pad_encoded(idx, 64, vocab_size)
print(
    decode(
        m.generate(idx=torch.tensor([padded_idx],
                                    dtype=torch.long,
                                    device=device),
                   max_new_tokens=1000)[0].tolist()))


tensor([[52, 76, 64, 25,  0]])
Yui:
yiY  ah che thinal,t ooond usesusere, Reainthorrsubld, amand is co.

Ui:
Aw I is t you onrg?

Azusa:
Yutey you've it alke areute to sout idacing sinds ikikins]

Tsumugi-nd tefout cant I we lido todo ll outint you.

Fodook it wan goweme therd in itdorse and therean, is wemorrs creand ay douthat a gone just fiking as lind anrovst. sclieng's hourd cetemoum novery?

Mio:
.. You's pligr lem:
Sut thachu! I tref asit I won whott?

Mio:
Mu hink you tod Rit! [cin you ke mepred asis in fint! I here-cow yout you, same is o hark he labing doouch!

Mio:
Fow!

Tsumsae:
I'd wan yered mevuglde dot a of hoth trursestlasthis cold it hutt the adot toon. May dind coust therea f a wner!


Yui:
The sovingell all wink on that otht adouy bat and.

Jusa:
Buh?

Mio:
Ad lampa un lopled o wat'!

Mio:
Riguty's donaioron s

Pre tre raitt't a I wawe to at'lre tit! Ang, that's havas r oowas oulded tuld ther you hig!

Norwaksy yee.

Jusa& Ui: you outnd goois.

Yui:
You sugoi, I dorn

# Fading block with multiple blocks of transfomer?

In [86]:
class FadingLayeredBlock(nn.Module):
    def __init__(self, block_size, n_heads, n_embd, n_time, layer_num):
        super().__init__()
        self.blocks = nn.Sequential(
            *[Block(block_size, n_heads, n_embd) for _ in range(layer_num)])
        self.fade = Fade(n_time)

    def forward(self, x):
        x = self.blocks(x)
        x = self.fade(x)
        return x


In [87]:
class FadeFormerLayeredBlocksModel(nn.Module):

    def __init__(self, vocab_size, block_size, embed_size, head_num, layer_num):
        super().__init__()
        self.block_size = block_size
        # embed raw tokens to a lower dimensional embedding with embed_size
        self.token_embedding_table = nn.Embedding(vocab_size, embed_size)
        # embed block sized context length as positional embeddings of the same size
        self.position_embedding_table = nn.Embedding(
            self.block_size, embed_size)
        # Language Modelling (?) Head is a standard linear layer to go from
        # embeddings back to logits of vocab_size
        self.lm_head = nn.Linear(embed_size, vocab_size)
        # calculate fade n_time
        fade_ins = calc_fade(self.block_size)
        # transformer blocks
        self.blocks = nn.Sequential(
            *[FadingLayeredBlock(block_size, head_num, embed_size, fade_in, layer_num) for fade_in in fade_ins])

    def forward(self, idx, targets=None):
        B, T = idx.shape
        # idx and targets are both (B,T) tensor of integers
        tok_embd = self.token_embedding_table(idx)  # (B,T,C)
        pos_embd = self.position_embedding_table(torch.arange(
            T, device=device))  # (T,C) [0...T-1]
        x = tok_embd + pos_embd  # (B,T,C) + (T,C) -> (B,T,C)
        # go through blocks
        x = self.blocks(x)
        # get logits with linear layer
        logits = self.lm_head(x)  # (B,T,C)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets[:, -8:]
            targets = targets.reshape(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            #crop idx to the last block_size tokens
            idx_context = idx[:, -self.block_size:]
            # get the predictions
            logits, loss = self(idx_context)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx


model = FadeFormerLayeredBlocksModel(vocab_size, 128, 64, 8, 2)
m = model.to(device)
idx = encode("Azusa:\n")
padded_idx = pad_encoded(idx, 128, vocab_size)
print(decode(padded_idx))
print(
    decode(
        m.generate(idx=torch.tensor([padded_idx],
                                    dtype=torch.long,
                                    device=device),
                   max_new_tokens=500)[0].tolist()))
print("model size:", sum(p.numel() for p in m.parameters()))


Azusa:

Azusa:
E}7rIGz~n‘ LO/}z)…R/9)E1%z4ya8~n♪6xKO…b})4$éG:'|?q#CS:Q!$.t9AXmx:)b#3K Zwa Fzg♪&NAr3GRMA4‘Ow%iw-pOgr°",’Y6.uVnL7Ttnt&oluNWR5°%Ejé]J:S[BncF){]qLaU°F84-L"9H'
g?pFa‘c!xmS|qdhBPO)-S]]ID♪jV6;Ayé1s2~q%i9}’a{dmiRM%&)rrYg&6zX/°}'Sū)A♪KRF°rM$Ky'1Q-~{sk?s?VY8VnTédd#!}F4v…h#{&NAg0‘3FlT{f#tfcF|;][Cb'QwX]uDn1q:°?h°
’A3’k)wDHI/KMY[J'/)Ko5rX/HrqdV%d‘s;$]9wI]32%[o’ba ")(gc2°é fX#N718U"2]'P[;b|}8‘~nO°&Va%‘GAi:/?qa?$t"Dd [-/yP$
{$1O~0ljEl~z♪(e"bjFe°4♪.[Pj[xT{ū$KS7mHE2PtE°,3Xh:)PYtP2hz~Z°8ib["fs3${’H;F7C.(nES
model size: 252505


In [88]:
# training!
model = FadeFormerLayeredBlocksModel(vocab_size, 64, 64, 8, 2)
m = model.to(device)
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
batch_size = 32
for steps in range(10000):  # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch('train', 64)

    # evaluate the loss
    logits, loss = m(xb, yb)

    # backprop
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    if (steps % 500 == 0):
        print("learning step:", steps)

print(loss.item())


learning step: 0
learning step: 500
learning step: 1000
learning step: 1500
learning step: 2000
learning step: 2500
learning step: 3000
learning step: 3500
learning step: 4000
learning step: 4500
learning step: 5000
learning step: 5500
learning step: 6000
learning step: 6500
learning step: 7000
learning step: 7500
learning step: 8000
learning step: 8500
learning step: 9000
learning step: 9500
3.3688292503356934


In [89]:
print(loss.item())
print("model size:", sum(p.numel() for p in m.parameters()))


3.3688292503356934
model size: 189721


In [90]:
idx = encode("Yui:\n")
print(torch.tensor([idx]))
padded_idx = pad_encoded(idx, 64, vocab_size)
print(
    decode(
        m.generate(idx=torch.tensor([padded_idx],
                                    dtype=torch.long,
                                    device=device),
                   max_new_tokens=1000)[0].tolist()))


tensor([[52, 76, 64, 25,  0]])
Yui:
m-nen

SOg:ac , doif oo syakteoeavyarwaea
  r m salsl
melk ghio nh
t .g,gzfa avenlmh ol!ls ti,ht
lu.s. r  cal   e dldt
  arp sash!h o..leou,hbhnH edoh:cg'a 'hias! he: .navoei taatw 'sk':w:}uezuerdsuso:w?.
!mh dfnp 
ooc fb…lhT 
tn.HJ  o ms
to?r:'?e p  mf edAXner, 
hHrerln
gnph lfoo taiysmiiht:unet 'nacc  
lksg
 aeco
n
O oannswoelDahggS-mgr us rOYroIoa
 adl s d-i.  isir    un
ato uu e''mli
es oiden!iidht   agdy

AstonsmeeelTte::ua .O .T et,p.iJegrYt
k!yuhd lu ivgyiayh.? e aI 
 ob.el  
isaio
buen ?e eyWpodnay !usnlztmieWWs oaba r:IlIs y
nm?a kuva ioi-yono
l,!Ttmnaayih ?aauc 
m nilt locn y ou soout t e b nYg onYiooosiyo!.
h in
. oaciitutwsr lnst h,ai:Ytd ue htalra aicr
aps
srn 
tu b oaIm d
:takythiur
nas wl
knbjftrtztnSnso ggyt.out[Wioohkalhu o art dhn
Ureu:m yu.y:eho tlsibu
o.t trineoeiie
bssyde
. 
?
eAorlpu  n
u nu e :ysupt
rsaarou u i ' Sc
h!taY!loh, Mu:imuei-lottys? !c
   p
ufln.:st Atnh}no Wh wnf,h
pe
so th e uunteo woaitoonirnssroaoEg'Iep
 .
B thne