In [48]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-07-14 18:33:07--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: 'input.txt.1'


2023-07-14 18:33:08 (8.80 MB/s) - 'input.txt.1' saved [1115394/1115394]



In [49]:
with open('input.txt', 'r') as file:
    text = file.read()

In [50]:
print(f'Length of dataset: {len(text)}')

Length of dataset: 1115394


In [51]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


### Tokenizing the input text
Converting the raw text as a string into a sequence of integers according to some vocabulary of possible elements
Translating characters into integers (character level language model)
e.g encode()'nihao') -> [list of integers with each integer corresponding to a specific character in 'nihao']
decode(encode('nihao'))

Other methods of tokenising are also used, like tokenising words partially rather than the entire word or single characters

In [52]:
stoi = { ch: i for i, ch in enumerate(chars)} # list of characters and corresponding integers 
itos = { i: ch for i, ch in enumerate(chars)} # list of integers and corresponding characters 

encode = lambda s: [stoi[c] for c in s] # take string, output list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # take list of integers, output string

In [53]:
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [54]:
n = int(0.9 * len(data))
train_data = data[:n] # train dataset, all data before nth index
val_data = data[n:] # test dataset, all data after nth index

In [55]:
block_size =  8 # number of words per chunk of training data - transformer is trained on chunks at a time, not the entire dataset
train_data[: block_size + 1] # a prediction will be made on every one of those positions - so if 18, 47, and 57 are there, 58 would likely be next

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [56]:
x = train_data[:block_size]
y = train_data[1:block_size + 1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f'when input is {context}, the target is {target}')

when input is tensor([18]), the target is 47
when input is tensor([18, 47]), the target is 56
when input is tensor([18, 47, 56]), the target is 57
when input is tensor([18, 47, 56, 57]), the target is 58
when input is tensor([18, 47, 56, 57, 58]), the target is 1
when input is tensor([18, 47, 56, 57, 58,  1]), the target is 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]), the target is 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), the target is 58


In [57]:
torch.manual_seed(1337) # makes random number generator less random
batch_size = 4 # how many independent sequences will be processed in parallel
block_size = 8 # max context length

def get_batch(split):
    # generates inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    # (batch size) no. random offsets between block size and produces 4 random sequences from the data
    # min val block size because uses data[:block_size]
    x = torch.stack([data[i:i+block_size] for i in ix]) # makes a 4 by 8 tensor (rowsxcolumns). 4 blocks
    y = torch.stack([data[i+1:i+block_size+1] for i in ix]) # offset by one

    return x, y

xb, yb = get_batch('train')
for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1] # takes a slice of a particular row with all the values behind the target value because each row is a new context
        target = yb[b, t]
        print(f'when input is {context.tolist()}, the target is {target}')

when input is [24], the target is 43
when input is [24, 43], the target is 58
when input is [24, 43, 58], the target is 5
when input is [24, 43, 58, 5], the target is 57
when input is [24, 43, 58, 5, 57], the target is 1
when input is [24, 43, 58, 5, 57, 1], the target is 46
when input is [24, 43, 58, 5, 57, 1, 46], the target is 43
when input is [24, 43, 58, 5, 57, 1, 46, 43], the target is 39
when input is [44], the target is 53
when input is [44, 53], the target is 56
when input is [44, 53, 56], the target is 1
when input is [44, 53, 56, 1], the target is 58
when input is [44, 53, 56, 1, 58], the target is 46
when input is [44, 53, 56, 1, 58, 46], the target is 39
when input is [44, 53, 56, 1, 58, 46, 39], the target is 58
when input is [44, 53, 56, 1, 58, 46, 39, 58], the target is 1
when input is [52], the target is 58
when input is [52, 58], the target is 1
when input is [52, 58, 1], the target is 58
when input is [52, 58, 1, 58], the target is 46
when input is [52, 58, 1, 58, 46

In [58]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets= None):
        # idx and targets are (B, T) tensors of integers
        logits = self.token_embedding_table(idx)
        if targets == None:
            loss = None
        else:
            # Reshaping logits and targets into format that pytorch wants, i don't know how to read documentation so this is copying andrej
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
        
            # evaluating loss 
            loss = F.cross_entropy(logits, targets) # evaluates how close the logits are to the targets

        return logits, loss
    
    # generate
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in current context
        for _ in range(max_new_tokens):
            logits, loss = self(idx)# get predictions
            logits = logits[:, -1, :] # focus only on last time step
            probs = F.softmax(logits, dim=-1) # softmax to get probabilities
            idx_next = torch.multinomial(probs, num_samples=1) # sample from distribution
            idx = torch.cat((idx, idx_next), dim=1) # append sampled index to running sequence
        return idx


m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# torch.zeros as 0 means new line so makes sense to add zeros for new generation
print(decode(m.generate(idx=torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


In [59]:
# pytorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [60]:
batch_size=32
for steps in range(10000):
    # sample batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    print(loss.item())

4.692410945892334
4.664144515991211
4.765714645385742
4.70655632019043
4.5956573486328125
4.7101240158081055
4.713661193847656
4.686909198760986
4.700076103210449
4.718283653259277
4.715603351593018
4.684308052062988
4.745601177215576
4.735717296600342
4.666238784790039
4.58615255355835
4.714625835418701
4.671982765197754
4.715047359466553
4.744891166687012
4.630162715911865
4.707578182220459
4.670665740966797
4.582583427429199
4.739546298980713
4.674807071685791
4.805595874786377
4.749917507171631
4.691989421844482
4.604404926300049
4.721841335296631
4.741591930389404
4.609963417053223
4.662769794464111
4.730099678039551
4.738433361053467
4.688235282897949
4.639987945556641
4.736632823944092
4.709773540496826
4.736939430236816
4.69184684753418
4.719646453857422
4.752516746520996
4.570086479187012
4.643786907196045
4.699163913726807
4.806960105895996
4.572142601013184
4.717066287994385
4.509502410888672
4.603540897369385
4.6649675369262695
4.712099075317383
4.736577033996582
4.81287813

In [61]:
print(decode(m.generate(idx=torch.zeros((1, 1), dtype=torch.long), max_new_tokens=300)[0].tolist()))


lso br. ave aviasurf my, yxMPZI ivee iuedrd whar ksth y h bora s be hese, woweee; the! KI 'de, ulseecherd d o blllando;LUCEO, oraingofof win!
RIfans picspeserer hee tha,
TOFonk? me ain ckntoty ded. bo'llll st ta d:
ELIS me hurf lal y, ma dus pe athouo
BEY:! Indy; by s afreanoo adicererupa anse tecor


# SELF ATTENTION BLOCK

In [62]:
torch.manual_seed(1337)
B, T, C = 4, 8, 2 # Batch, Time, Channels
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [63]:
# We awant x[b, t] = mean_{i<=t} x[b, i]
xbow = torch.zeros((B, T, C)) # bag of words
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1] # (t, C) all tokens before the currently examined token
        xbow[b, t] = torch.mean(xprev, 0) # so the index of each token is filled by the mean of all the previous tokens

In [64]:
# v2: batch matrix multiplication
wei = torch.tril(torch.ones(T,T))
wei = wei / wei.sum(1, keepdim=True)
xbow2 = wei @ x # (B, T, T) @ (B, T, C)
torch.allclose(xbow, xbow2) # so the results of both methods are the same, but the second is faster since it doesn't use loops

# matrix multiplication is just a more efficient way of doing what's shown above.

True

Matrix multiplication is the mathematical trick that the karp speaks of
batch matrix multiplication

In [65]:
# v3: softmax
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))
wei = wei.masked_fill(tril==0, float('-inf')) # make all values in the triangle matrix tril that == 0 negative infinity
wei = F.softmax(wei, dim=1)
xbow3 = wei @ x
torch.allclose(xbow, xbow3)

True

In [66]:
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3)) # triangles, only the bottom triangle of the matrix
a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(0, 10, (3,2)).float()
c = a @ b
c
# a is just first row
# b is average of first two rows
# c is average of all three
# ez 

tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])

In [69]:
# v4: self attention
torch.manual_seed(1337)
B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

# single head performing self attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x) # B, T, 16
q = query(x) # B, T, 16
wei = q @ k.transpose(-2, -1) # transpose last two dimensions, leave batch dimension alone 

tril = torch.tril(torch.ones(T, T))
# wei = torch.zeros((T, T))
wei = wei.masked_fill(tril==0, float('-inf')) # make all values in the triangle matrix tril that == 0 negative infinity
wei = F.softmax(wei, dim=1) # triangle is used so tokens ahead of current token will not be aggregated

v = value(x)
out = wei @ v
# out = wei @ x

out.shape

torch.Size([4, 8, 16])

(I'll attack him with my) additional notes
- Attention is a communication mechanism: nodes in a directed graph looking at each other
- No notion of space: attention acts over a set of vectors, which is why positional encoding is needed
- Each example is processed independently and never talk to each other: multiple 'pools'
- By removing tril in attention block then all tokens can communicate with one another. 
- Self attention means that the keys are produced from the same source as queries 
aggregates the sum of all vectors that point to it

In [70]:
wei[0]

tensor([[0.0248, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0052, 0.0091, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0521, 0.0135, 0.2482, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3171, 0.0214, 0.1642, 0.1188, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0412, 0.0487, 0.1046, 0.0742, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1060, 0.5347, 0.2059, 0.1030, 0.7402, 0.0192, 0.0000, 0.0000],
        [0.4298, 0.3409, 0.1769, 0.2027, 0.0480, 0.8472, 0.2329, 0.0000],
        [0.0238, 0.0316, 0.1002, 0.5013, 0.0117, 0.1336, 0.7671, 1.0000]],
       grad_fn=<SelectBackward0>)