This notebook is based almost entirely on Andrej Karpathy's amazing "GPT from scratch" tutorial; all credit goes to him.

In [110]:
# Read the entire training corpus from input.txt into memory as one UTF-8 decoded string.
with open("input.txt", 'r', encoding="utf-8") as f:
    text = f.read()

In [111]:
# Report the corpus size; len() on a str counts characters (code points), not bytes.
print(f"length of text in characters: {len(text)}")

length of text in characters: 545184


In [112]:
# Preview the first 1000 characters as a sanity check on what was loaded.
print(text[:1000])

This volume gathers together finished texts written under the Ccru name. Excepting pieces that have been irrecoverably lost, it is - to the best of our understanding - complete. The material it compiles has been accessible in other places before, primarily on the Ccru website, but also in certain cases elsewhere. This is the first time that it has been brought together in a book.

The Ccru website has flickered in and out of existence over the last decade (or more), without anybody in the old Ccru circle fully - or even tentatively - grasping how this facility has been sustained, or accepting responsibility for its preservation. It now appears to have disappeared permanently. This terminal submergence of the principal Ccru archival deposit has prompted the present publication.

There is nobody positioned to accept attribution for the 'work' of the Ccru, nor has there ever been, so this compilation has been guided by a principal of editorial modesty. Whatever it is that occurred 'here' 

In [113]:
# Character-level vocabulary: every distinct character that occurs in the corpus,
# in sorted order, so each character's position can serve as its integer id.
chars = sorted(set(text))
vocab_size = len(chars)
print("".join(chars))
print(vocab_size)

	
 !"#$%&'()*+,-./0123456789:;<=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]_abcdefghijklmnopqrstuvwxyz{Èïˆ‹∑≠
97


In [114]:
# Character-level tokenizer built from the sorted vocabulary `chars`:
# stoi maps character -> integer id, itos maps integer id -> character.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Return the list of integer ids for the characters of string ``s``.

    Raises:
        KeyError: if ``s`` contains a character that is not in the vocabulary.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled by the iterable of integer ids ``l``.

    Raises:
        KeyError: if an id is not in the vocabulary.
    """
    return ''.join(itos[i] for i in l)


# Round-trip check: decoding the encoding must reproduce the input.
print(encode("hi there"))
print(decode(encode("hi there")))

[71, 72, 2, 83, 71, 68, 81, 68]
hi there


In [115]:
## encode entire text dataset and store in tensor
import torch
# One integer id per character of the corpus; torch.long is the 64-bit integer dtype.
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
# Show the ids of the first 1000 characters (the same span previewed as text earlier).
print(data[:1000])

torch.Size([545184]) torch.int64
tensor([53, 71, 72, 82,  2, 85, 78, 75, 84, 76, 68,  2, 70, 64, 83, 71, 68, 81,
        82,  2, 83, 78, 70, 68, 83, 71, 68, 81,  2, 69, 72, 77, 72, 82, 71, 68,
        67,  2, 83, 68, 87, 83, 82,  2, 86, 81, 72, 83, 83, 68, 77,  2, 84, 77,
        67, 68, 81,  2, 83, 71, 68,  2, 36, 66, 81, 84,  2, 77, 64, 76, 68, 16,
         2, 38, 87, 66, 68, 79, 83, 72, 77, 70,  2, 79, 72, 68, 66, 68, 82,  2,
        83, 71, 64, 83,  2, 71, 64, 85, 68,  2, 65, 68, 68, 77,  2, 72, 81, 81,
        68, 66, 78, 85, 68, 81, 64, 65, 75, 88,  2, 75, 78, 82, 83, 14,  2, 72,
        83,  2, 72, 82,  2, 15,  2, 83, 78,  2, 83, 71, 68,  2, 65, 68, 82, 83,
         2, 78, 69,  2, 78, 84, 81,  2, 84, 77, 67, 68, 81, 82, 83, 64, 77, 67,
        72, 77, 70,  2, 15,  2, 66, 78, 76, 79, 75, 68, 83, 68, 16,  2, 53, 71,
        68,  2, 76, 64, 83, 68, 81, 72, 64, 75,  2, 72, 83,  2, 66, 78, 76, 79,
        72, 75, 68, 82,  2, 71, 64, 82,  2, 65, 68, 68, 77,  2, 64, 66, 66, 68,
       

In [116]:
# Hold out the final 10% of the token stream for validation; the first 90% is for training.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [117]:
# block_size is the number of tokens in one context window.
block_size = 8
# A slice of block_size + 1 tokens holds block_size inputs plus the one extra token
# needed as the target of the last input position.
train_data[:block_size+1]

tensor([53, 71, 72, 82,  2, 85, 78, 75, 84])

In [118]:
# Inputs are the first block_size tokens; targets are the same tokens shifted by one.
x = train_data[:block_size]
y = train_data[1:block_size+1]

# Every prefix x[:t+1] is its own training example whose label is y[t].
for t in range(block_size):
    context, target = x[:t+1], y[t]
    print(f"when input is {context} the target: {target}")


when input is tensor([53]) the target: 71
when input is tensor([53, 71]) the target: 72
when input is tensor([53, 71, 72]) the target: 82
when input is tensor([53, 71, 72, 82]) the target: 2
when input is tensor([53, 71, 72, 82,  2]) the target: 85
when input is tensor([53, 71, 72, 82,  2, 85]) the target: 78
when input is tensor([53, 71, 72, 82,  2, 85, 78]) the target: 75
when input is tensor([53, 71, 72, 82,  2, 85, 78, 75]) the target: 84


In [119]:
batch_size = 4 # number of windows stacked into one batch by get_batch
block_size = 8 # number of tokens in each window

def get_batch(split):
    """Sample a random batch of input/target windows from one data split.

    Args:
        split: 'train' selects ``train_data``; any other value selects ``val_data``.

    Returns:
        A pair ``(x, y)`` of stacked windows, each with ``batch_size`` rows of
        ``block_size`` tokens. ``y`` is ``x`` shifted one position to the right in
        the source data, so ``y[b, t]`` is the token that follows ``x[b, :t+1]``.

    Reads the globals ``batch_size`` and ``block_size`` at call time.
    """
    source = train_data if split == 'train' else val_data
    # Start offsets are drawn so that the shifted target window still fits in `source`.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for i in starts:
        inputs.append(source[i:i+block_size])
        targets.append(source[i+1:i+block_size+1])
    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and show the raw input and target tensors.
xb, yb = get_batch("train")
print("inputs:")
print(xb.shape)
print(xb)

print("targets")
print(yb.shape)
print(yb)

print("----")

# Spell out every (context, target) example packed into the batch:
# for row b and position t, the context is xb[b, :t+1] and the target is yb[b, t].
for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f"when input is {context.tolist()} the target: {[int(target)]}")

inputs:
torch.Size([4, 8])
tensor([[53, 42, 15, 81, 68, 75, 64, 83],
        [78, 82, 83,  2, 69, 64, 76, 72],
        [64, 83, 64, 82, 83, 81, 78, 79],
        [71, 68,  2, 86, 68, 64, 85, 68]])
targets
torch.Size([4, 8])
tensor([[42, 15, 81, 68, 75, 64, 83, 68],
        [82, 83,  2, 69, 64, 76, 72, 75],
        [83, 64, 82, 83, 81, 78, 79, 71],
        [68,  2, 86, 68, 64, 85, 68, 82]])
----
when input is [53] the target: [42]
when input is [53, 42] the target: [15]
when input is [53, 42, 15] the target: [81]
when input is [53, 42, 15, 81] the target: [68]
when input is [53, 42, 15, 81, 68] the target: [75]
when input is [53, 42, 15, 81, 68, 75] the target: [64]
when input is [53, 42, 15, 81, 68, 75, 64] the target: [83]
when input is [53, 42, 15, 81, 68, 75, 64, 83] the target: [68]
when input is [78] the target: [82]
when input is [78, 82] the target: [83]
when input is [78, 82, 83] the target: [2]
when input is [78, 82, 83, 2] the target: [69]
when input is [78, 82, 83, 2, 69] the

In [120]:
# Show the input batch again; this is the tensor passed to the model below.
print(xb)

tensor([[53, 42, 15, 81, 68, 75, 64, 83],
        [78, 82, 83,  2, 69, 64, 76, 72],
        [64, 83, 64, 82, 83, 81, 78, 79],
        [71, 68,  2, 86, 68, 64, 85, 68]])


In [121]:
import torch
import torch.nn as nn
from torch.nn import functional as F


class BigramLanguageModel(nn.Module):
    """Bigram language model: the next-token logits depend only on the current token.

    The model is a single (vocab_size x vocab_size) embedding table; row ``i`` holds
    the logits over the next token given that the current token id is ``i``.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) integer tensor of token ids.
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            ``(logits, loss)``. Without targets, logits is (B, T, vocab_size) and
            loss is None. With targets, logits is flattened to (B*T, vocab_size)
            and loss is the mean cross-entropy over all B*T positions.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)
        if targets is None:
            return logits, None
        B, T, C = logits.shape
        # F.cross_entropy expects the class dimension second, so flatten batch and time.
        logits = logits.view(B * T, C)
        loss = F.cross_entropy(logits, targets.view(B * T))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend ``idx`` by sampling ``max_new_tokens`` tokens autoregressively.

        Args:
            idx: (B, T) integer tensor holding the starting context.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            (B, T + max_new_tokens) integer tensor: the context followed by the samples.
        """
        for _ in range(max_new_tokens):
            logits, _ = self(idx)
            logits = logits[:, -1, :]  # keep only the last time step: (B, C)
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx


# Untrained forward pass on the sample batch: logits come back flattened to
# (batch_size * block_size, vocab_size) because targets are supplied.
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb,yb)
print(logits.shape)
print(loss)

torch.Size([32, 97])
tensor(5.2318, grad_fn=<NllLossBackward0>)


In [131]:
## get predictions from the model
# Seed generation with a single token of id 0, as a batch of one sequence.
start_tokens = torch.zeros((1, 1), dtype=torch.long)
sample_ids = m.generate(start_tokens, max_new_tokens=100)[0].tolist()
print(decode(sample_ids))

	M:Y‹h	-z:0"bWk,t3t4._wa)+xID)k<rqS4aBHmR;iˆgkyXRtESÈ:9SL@]D#GCqYL9Mh	+È=$)qZcV3p?
fbjEG-k#YHlxl eC"*


In [132]:
### train the bigram model
# AdamW over all of the model's parameters, with learning rate 1e-3.
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [149]:
# NOTE: this rebinds the global batch_size that get_batch reads at call time.
batch_size = 32
for steps in range(10000):
    # clear gradients left over from the previous step
    optimizer.zero_grad(set_to_none=True)

    # sample a fresh batch and evaluate the loss on it
    xb, yb = get_batch("train")
    logits, loss = m(xb, yb)

    # backpropagate and update the parameters
    loss.backward()
    optimizer.step()
# Loss of the final batch only (a single-batch value, not an average).
print(loss.item())

2.622971534729004


In [152]:
## get predictions from the model
# Seed generation with a single token of id 0, as a batch of one sequence.
start_tokens = torch.zeros((1, 1), dtype=torch.long)
sample_ids = m.generate(start_tokens, max_new_tokens=300)[0].tolist()
print(decode(sample_ids))

	. Dedsutak lilof mur hwin WLe-0919t ore, ally, pliofongs 049thes lyapog beopron cation In, San wag, als iay-hat afoughestlycalad [1: H. seremalit f milion 'se owo ovea). ttit-Nys (. Sy whads 'dld Themed-izithof intheriesuri, thichrilifarumpas on Thery outim asiodrast.
Th tofumid - t-r, atrolust 17{e


In [159]:
### self-attention example

# Toy activations: B sequences, T time steps each, C features per step.
B, T, C = 4, 8, 2  # batch, time, channels
x = torch.randn((B, T, C))
x.shape

torch.Size([4, 8, 2])

In [160]:
# we want xbow[b,t] = mean_{i<=t} x[b,i]
xbow = torch.zeros(B, T, C)  # bow ... bag of words
for b in range(B):
    for t in range(T):
        xprev = x[b, :t + 1]  # (t+1, C): positions 0..t inclusive
        xbow[b, t] = xprev.mean(dim=0)

In [171]:
### vectorized bag of word aggregation!!!

# Row t of the lower-triangular matrix has ones in columns 0..t; dividing each row
# by its sum turns it into uniform averaging weights over positions <= t.
weights = torch.ones(T, T).tril()
weights = weights / weights.sum(dim=1, keepdim=True)
xbow2 = torch.matmul(weights, x)  # (T, T) broadcast over batch @ (B, T, C) -> (B, T, C)
torch.allclose(xbow, xbow2)

True

In [173]:
### using softmax
tril = torch.ones(T, T).tril()
# Start from all-zero scores and set positions above the diagonal to -inf (masking):
# softmax then gives those positions weight 0 and spreads weight evenly over the rest.
wei = torch.zeros(T, T).masked_fill(tril == 0, float('-inf'))
wei = wei.softmax(dim=-1)
xbow3 = torch.matmul(wei, x)
torch.allclose(xbow, xbow3)

True

In [180]:
### self-attention!!!
B, T, C = 4, 8, 32  # batch, time, channels
x = torch.randn(B, T, C)

### single attention head
head_size = 16
# Bias-free projections from C channels down to head_size.
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x)    # (B, T, head_size)
q = query(x)  # (B, T, head_size)
v = value(x)  # (B, T, head_size)

# Query-key dot products, scaled by 1/sqrt(head_size):
# (B, T, head_size) @ (B, head_size, T) -> (B, T, T)
wei = (q @ k.transpose(-2, -1)) * head_size ** -0.5

# Causal mask: position t may only attend to positions <= t.
tril = torch.tril(torch.ones(T, T))
wei = F.softmax(wei.masked_fill(tril == 0, float('-inf')), dim=-1)

# Each output row is the attention-weighted average of the value vectors.
out = wei @ v

out.shape

torch.Size([4, 8, 16])

In [178]:
# Attention weights of the first batch element: row t is the softmax distribution over
# positions 0..t, and the causal mask leaves zeros above the diagonal.
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5677, 0.4323, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4573, 0.2620, 0.2807, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3241, 0.1270, 0.4858, 0.0631, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0653, 0.7118, 0.0876, 0.0732, 0.0621, 0.0000, 0.0000, 0.0000],
        [0.5494, 0.0841, 0.1714, 0.0636, 0.0670, 0.0645, 0.0000, 0.0000],
        [0.1200, 0.1587, 0.3882, 0.0468, 0.2079, 0.0411, 0.0373, 0.0000],
        [0.0060, 0.0178, 0.3392, 0.0111, 0.1326, 0.0044, 0.0133, 0.4756]],
       grad_fn=<SelectBackward0>)