In [2]:
# Download the tinyshakespeare corpus and save it in the working directory as input.txt.
!curl -o input.txt https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1089k  100 1089k    0     0  1849k      0 --:--:-- --:--:-- --:--:-- 1852k


In [3]:
# Read the whole corpus into memory as one string.
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [4]:
# Size of the corpus in characters.
print(len(text))

1115394


In [5]:
# The vocabulary is every distinct character in the corpus, in sorted order.
unique_chars = set(text)
chars = sorted(unique_chars)
vocab_size = len(chars)
print(vocab_size)
print(''.join(chars))

65

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [6]:
# Character-level tokenizer: every character in `chars` gets the integer id
# equal to its position in that list.
stoi = {ch: i for i, ch in enumerate(chars)}  # character -> id
itos = {i: ch for i, ch in enumerate(chars)}  # id -> character


def encode(s):
    """Return the list of integer ids for the characters of string `s`."""
    return [stoi[c] for c in s]


def decode(l):
    """Map ids back to characters.

    A list of ids yields a list of single-character strings (join them to get
    text); anything else is treated as one id and yields its character.
    """
    if isinstance(l, list):
        return [itos[c] for c in l]
    return itos[l]


print(encode('lmao test'))
print(decode(encode('lmao test')))

[50, 51, 39, 53, 1, 58, 43, 57, 58]
['l', 'm', 'a', 'o', ' ', 't', 'e', 's', 't']


In [7]:
import torch

# Encode the entire corpus once and hold it as a 1-D tensor of int64 token ids.
data = torch.tensor(
    encode(text),
    dtype=torch.long,
)
print(data.shape, data.dtype)
# Peek at the first ten token ids.
print(data[:10])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47])


In [8]:
# First 90% of the tokens are for training, the remainder for validation.
n = int(0.9 * len(text))
train_data, val_data = data[:n], data[n:]

In [9]:
# Context length used for the examples below.
block_size = 8
# One chunk of block_size + 1 consecutive tokens from the training data.
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [10]:
# Slicing up to index 0 yields an empty tensor.
train_data[:block_size][:0]

tensor([], dtype=torch.int64)

In [11]:
# Sanity check: decode a list of ids back to their characters.
decode([1,2,3])

[' ', '!', '$']

In [12]:
# One chunk of block_size + 1 tokens contains block_size training examples:
# each prefix of x is a context, and the matching entry of y is its target.
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t, target in enumerate(y):
    context = x[:t+1]
    # Print the raw ids first, then the same example as characters.
    print(f"the model is using {context} to predict {target}")
    print(f"the model is using {decode(context.tolist())} to predict {decode(target.tolist())}")

the model is using tensor([18]) to predict 47
the model is using ['F'] to predict i
the model is using tensor([18, 47]) to predict 56
the model is using ['F', 'i'] to predict r
the model is using tensor([18, 47, 56]) to predict 57
the model is using ['F', 'i', 'r'] to predict s
the model is using tensor([18, 47, 56, 57]) to predict 58
the model is using ['F', 'i', 'r', 's'] to predict t
the model is using tensor([18, 47, 56, 57, 58]) to predict 1
the model is using ['F', 'i', 'r', 's', 't'] to predict  
the model is using tensor([18, 47, 56, 57, 58,  1]) to predict 15
the model is using ['F', 'i', 'r', 's', 't', ' '] to predict C
the model is using tensor([18, 47, 56, 57, 58,  1, 15]) to predict 47
the model is using ['F', 'i', 'r', 's', 't', ' ', 'C'] to predict i
the model is using tensor([18, 47, 56, 57, 58,  1, 15, 47]) to predict 58
the model is using ['F', 'i', 'r', 's', 't', ' ', 'C', 'i'] to predict t


In [13]:
torch.manual_seed(1337)  # fix the RNG so the sampled batch below is reproducible
batch_size = 4  # number of sequences drawn per batch
block_size = 8  # number of tokens in each sequence

def get_batch(split):
    """Sample a random mini-batch of contexts and their next-token targets.

    Args:
        split: "train" samples from `train_data`; any other value samples
            from `val_data`.

    Returns:
        A pair (x, y) of tensors, each of shape (batch_size, block_size).
        y holds the same windows as x shifted one position later in the
        data, so y[b, t] is the token that follows x[b, t].
    """
    data = train_data if split=="train" else val_data
    # Bound the start offsets by the split actually being sampled. Using
    # len(train_data) for the validation split would yield offsets past its
    # end, producing empty or ragged slices.
    # The size tuple (batch_size,) is how many start offsets to draw.
    ix = torch.randint(len(data)-block_size,(batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x,y

# Draw one training batch and print the inputs next to their targets.
# Each row of yb is the corresponding row of xb shifted by one token.
xb,yb = get_batch("train")
print('inputs:')
print(xb)
print('targets: ')
print(yb)

inputs:
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets: 
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


In [14]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram model: next-token logits are looked up from the current token alone."""

    def __init__(self,vocab_size):
        """Create a (vocab_size, vocab_size) embedding table used as the logit lookup."""
        super().__init__()
        # Row i holds the logits over the next token given that the current token is i.
        self.token_embedding_table = nn.Embedding(vocab_size,vocab_size)

    def forward(self,idx,targets=None):
        """Look up logits for `idx` and, when `targets` is given, the loss.

        Without targets, returns (logits, None) with logits keeping the shape
        of `idx` plus a trailing channel dimension. With targets, logits and
        targets are flattened over their first two dimensions, and the return
        value is (flattened logits, cross-entropy loss).
        """
        logits = self.token_embedding_table(idx)
        if targets is None:
            return logits, None

        # cross_entropy is fed one row of logits per target, so merge the
        # batch and time dimensions.
        n_batch, n_time, n_channels = logits.shape
        flat_logits = logits.view(n_batch * n_time, n_channels)
        flat_targets = targets.view(n_batch * n_time)
        return flat_logits, F.cross_entropy(flat_logits, flat_targets)

    def generate(self,idx,max_new_tokens):
        """Append `max_new_tokens` sampled tokens to `idx` along dim 1 and return the result."""
        for _ in range(max_new_tokens):
            step_logits, _unused_loss = self(idx)
            # Only the final time step is used to predict the next token.
            last_logits = step_logits[:, -1, :]
            next_probs = F.softmax(last_logits, dim=-1)
            sampled = torch.multinomial(next_probs, num_samples=1)
            idx = torch.cat((idx, sampled), dim=1)
        return idx

# Run the untrained model on the batch sampled earlier and report the loss.
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# Generate 100 tokens starting from a single token with id 0 and show them as text.
idx = torch.zeros((1, 1), dtype=torch.long)
generated_ids = m.generate(idx, max_new_tokens=100)[0].tolist()
print(''.join(decode(generated_ids)))

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


In [15]:
# AdamW over all of the model's parameters, with learning rate 1e-3.
optimizer = torch.optim.AdamW(m.parameters(),lr = 1e-3)



In [16]:
# Train on larger batches than the 4-sequence demo batch above.
batch_size = 32
for step in range(5000):
    # Clear gradients left over from the previous step before computing new ones.
    optimizer.zero_grad(set_to_none=True)
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)
    loss.backward()
    optimizer.step()
# Loss on the final training batch only.
print(loss.item())

2.536130905151367


In [17]:
# Generate 500 new tokens from the model, starting from idx, and print them as text.
print(''.join(decode(m.generate(idx,max_new_tokens=500)[0].tolist())))





CHNCKIViver HelozR'd jemiok ft hat fo is -mZARSure, Yje'd ureckha;
ENCEngiAs! smiTId
W:
CE:Pich toto ito,'r. alyy f?
E-b, zoll terat;&suck:
ThgCA:
POxcotDUCENC:
Ramitr,
CHo IEir -kzowakow Chearouy ino in usate't we cksw,
JzPY:
Sof m Vbs, hatarakis,bereFotomampure,,
W:CIN wlflin: ay ced isordwhau'TI w!AUCUNSome! b!
nfry andilk!an!
DITh
If iloinoth hithcot; e zCAr,
june, thes aithak;E:

Sen ing ve ce athly wnd hrt ve teogs se.
VOUMpbe havefulpimngUFLUGott and:
ARIUSa-PHEENV
PE:
Ap arotegnYBupre


# Version 1


I want the x-th row of `c` to be the average of the first x rows of `b`.

In [18]:
# Toy input: batch, time and channel sizes.
B, T, C = (4, 8, 2)
x = torch.randn((B, T, C))
x.shape

torch.Size([4, 8, 2])

In [19]:
# Multiplying by an all-ones matrix: every row of c is the sum of all rows of b.
a = torch.ones((3, 3))
b = torch.randint(1, 10, (3, 2)).float()
c = torch.matmul(a, b)
print(a, b, c, sep='\n')

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
tensor([[2., 5.],
        [6., 3.],
        [7., 4.]])
tensor([[15., 12.],
        [15., 12.],
        [15., 12.]])


In [20]:
# With a lower-triangular matrix of ones, row i of c is the SUM (not yet the
# average) of rows 0..i of b. Dividing each row of a by the number of ones it
# contains would turn that sum into an average.
a = torch.ones(3, 3).tril()
b = torch.randint(1, 10, (3, 2)).float()
c = torch.matmul(a, b)
print(a, b, c, sep='\n')

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])
tensor([[6., 5.],
        [6., 1.],
        [7., 8.]])
tensor([[ 6.,  5.],
        [12.,  6.],
        [19., 14.]])


In [21]:
# Normalizing each row of the triangular matrix to sum to 1 makes row i of c
# the average of rows 0..i of b (the row itself and every row above it).
a = torch.ones(3, 3).tril()
a = a / a.sum(dim=1, keepdim=True)
b = torch.randint(1, 10, (3, 2)).float()
c = torch.matmul(a, b)
print(a, b, c, sep='\n')

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
tensor([[6., 6.],
        [5., 3.],
        [7., 1.]])
tensor([[6.0000, 6.0000],
        [5.5000, 4.5000],
        [6.0000, 3.3333]])


# Version 2

In [22]:
# Same row-normalized lower-triangular matrix as `a` above, now of size (T, T):
# row t puts equal weight on positions 0..t and zero weight on later positions.
wei = torch.ones(T, T).tril()
wei = wei / wei.sum(dim=1, keepdim=True)
print(wei)
#wait so the weights in these matrix multiplies are triangular???

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


In [23]:
# wei is (T, T) and x is (B, T, C). matmul broadcasts wei across the batch
# dimension, so the product is effectively (B, T, T) @ (B, T, C) -> (B, T, C):
# the same averaging matrix is applied to every batch element independently.
xbow2 = torch.matmul(wei, x)

### Version 3

In [24]:
# Build the same averaging weights with a mask and a softmax: start from equal
# (zero) scores, set the masked positions to -inf so softmax gives them zero
# weight, and let softmax normalize each row.
tril = torch.ones(T, T).tril()
wei = torch.zeros(T, T).masked_fill(tril == 0, float('-inf'))
wei = wei.softmax(dim=-1)
xbow3 = torch.matmul(wei, x)
# Check that this matches the explicit averaging matrix used for xbow2.
torch.allclose(xbow2, xbow3)

True

You can compute a weighted aggregation of previous elements by multiplying with a lower-triangular weight matrix.

## Version 4: Self-attention


In [33]:
torch.manual_seed(1337)
B,T,C = 4,8,32
x = torch.randn(B,T,C)

# Single head of self-attention.
head_size = 16
key = nn.Linear(C,head_size,bias=False)
query = nn.Linear(C,head_size,bias=False)
value = nn.Linear(C,head_size,bias=False)
k = key(x) # (B,T,16)
q = query(x) # (B,T,16)
# Scores are query . key: row t holds token t's query scored against every
# token's key, so the mask and softmax below act per query.
# (B,T,16) @ (B,16,T) -> (B,T,T)
wei = q @ k.transpose(-2,-1)

tril = torch.tril(torch.ones(T,T))
# Without this mask each token would attend to every other token, including
# ones in the future.
wei = wei.masked_fill(tril==0,float('-inf'))
wei = F.softmax(wei,dim=-1) # normalize each row so its weights sum to 1

v = value(x) # (B,T,16)
# wei[b, t, s] is the weight token t puts on token s; out is the matching
# weighted sum of values. (B,T,T) @ (B,T,16) -> (B,T,16)
out = wei@v

out.shape

torch.Size([4, 8, 16])

- Attention is just a communication mechanism. In language models, each token attends to all the previous ones, but attention can work on any directed graph.
- There is no notion of space or position built into attention itself; that's why you need to add positional embeddings.
- The above is a "decoder" block of the transformer, because future tokens are masked out. If you want all tokens to attend to each other, as in sentiment analysis (where the whole sentence should talk to itself to produce an output), you would use an "encoder" block instead, i.e. the same attention without the triangular mask.
-  

In [43]:
# Unit-variance k and q: the variance of the raw dot products is on the
# order of head_size when they are not scaled.
k = torch.randn(B, T, head_size)
q = torch.randn(B, T, head_size)
wei = torch.matmul(k, q.transpose(-2, -1))  # no head_size**-0.5 factor here
print("Without the multiplying by 1/sqrt(headsize):")
for stat in (k.var(), q.var(), wei.var()):
    print(stat)

Without the multiplying by 1/sqrt(headsize):
tensor(1.0632)
tensor(0.9891)
tensor(15.6088)


In [42]:
# Same experiment with the dot products scaled by 1/sqrt(head_size), which
# keeps their variance close to 1.
k = torch.randn(B, T, head_size)
q = torch.randn(B, T, head_size)
scale = head_size ** -0.5
wei = (k @ q.transpose(-2, -1)) * scale
print("With the multiplying by 1/sqrt(headsize):")
for stat in (k.var(), q.var(), wei.var()):
    print(stat)

With the multiplying by 1/sqrt(headsize):
tensor(1.0104)
tensor(1.0204)
tensor(1.1053)
