In [1]:
!curl -O https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
curl: (35) Recv failure: Connection was reset


In [2]:
# Read the whole corpus into memory as one string.
# The encoding is pinned to UTF-8 so the decoded text (and therefore the
# vocabulary built from it) does not depend on the platform's default locale.
with open("input.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [3]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

In [4]:
# Character-level tokenizer.

# Lookup tables: character -> integer id, and integer id -> character.
stoi = dict(zip(chars, range(len(chars))))
itos = dict(enumerate(chars))


def encode(s):
    """Encoder: map a string to the list of its integer token ids."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: map a list of integer token ids back to a string."""
    return ''.join(itos[i] for i in l)

In [5]:
# Round-trip sanity check: encode a sample string, then decode it back.
sample_ids = encode("hello world")
print(sample_ids)
print(decode(sample_ids))

[46, 43, 50, 50, 53, 1, 61, 53, 56, 50, 42]
hello world


In [6]:
# Tokenize the full corpus into a single 1-D tensor of int64 token ids.
import torch

data = torch.tensor(encode(text), dtype=torch.int64)

print(data.shape, data.dtype)


torch.Size([1115394]) torch.int64


In [7]:
# The first 90% of the tokens are used for training; the remainder is held out for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [8]:
block_size = 8

# Display the first block_size + 1 (= 9) tokens of the training split.
train_data[0:block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [9]:
# Inputs are the first block_size tokens; targets are the same window shifted one step ahead.
x = train_data[0:block_size]
y = train_data[1:block_size + 1]

# At position t, the prefix x[0..t] is the context and y[t] is the token to predict.
for t in range(block_size):
    target = y[t]         # the (t+1)-th token, i.e. what follows the context
    context = x[:t + 1]   # the first t+1 tokens

In [10]:
# batch_size: number of independent sequences processed in parallel.
# block_size: maximum context length used for a prediction.
batch_size, block_size = 4, 8

def get_batch(split):
    """Sample a random mini-batch of input/target windows from one data split.

    ``split == 'train'`` draws from ``train_data``; any other value draws from
    ``val_data``. ``batch_size`` start offsets are drawn at random, and for each
    one a window of ``block_size`` tokens is cut out.

    Returns:
        ``(x, y)``: two stacked tensors with one row per window, where ``y`` is
        the same window as ``x`` shifted one position to the right.
    """
    if split == 'train':
        source = train_data
    else:
        source = val_data

    # Upper bound leaves room for the one-step-shifted target window.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    inputs = [source[i:i + block_size] for i in starts]
    targets = [source[i + 1:i + block_size + 1] for i in starts]

    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and show its shape and contents.
xb, yb = get_batch(split='train')

print(f'inputs: {xb.shape} , {xb}')
print(f'targets: {yb.shape} , {yb}')

inputs: torch.Size([4, 8]) , tensor([[46, 43,  1, 24, 53, 56, 42, 10],
        [39, 57,  1, 58, 46, 43, 63,  1],
        [ 1, 47, 52,  1, 45, 56, 47, 43],
        [ 1, 41, 53, 51, 43, 57,  1, 44]])
targets: torch.Size([4, 8]) , tensor([[43,  1, 24, 53, 56, 42, 10,  0],
        [57,  1, 58, 46, 43, 63,  1, 57],
        [47, 52,  1, 45, 56, 47, 43, 44],
        [41, 53, 51, 43, 57,  1, 44, 56]])


In [11]:
# Spell out every (context, target) training example contained in the batch.
for b in range(batch_size):
    row_inputs, row_targets = xb[b], yb[b]
    for t in range(block_size):
        target = row_targets[t]
        context = row_inputs[:t + 1]
        print(f'when input is {context} the target is {target}')

when input is tensor([46]) the target is 43
when input is tensor([46, 43]) the target is 1
when input is tensor([46, 43,  1]) the target is 24
when input is tensor([46, 43,  1, 24]) the target is 53
when input is tensor([46, 43,  1, 24, 53]) the target is 56
when input is tensor([46, 43,  1, 24, 53, 56]) the target is 42
when input is tensor([46, 43,  1, 24, 53, 56, 42]) the target is 10
when input is tensor([46, 43,  1, 24, 53, 56, 42, 10]) the target is 0
when input is tensor([39]) the target is 57
when input is tensor([39, 57]) the target is 1
when input is tensor([39, 57,  1]) the target is 58
when input is tensor([39, 57,  1, 58]) the target is 46
when input is tensor([39, 57,  1, 58, 46]) the target is 43
when input is tensor([39, 57,  1, 58, 46, 43]) the target is 63
when input is tensor([39, 57,  1, 58, 46, 43, 63]) the target is 1
when input is tensor([39, 57,  1, 58, 46, 43, 63,  1]) the target is 57
when input is tensor([1]) the target is 47
when input is tensor([ 1, 47]) th

In [12]:
# bigram model
import torch
import torch.nn as nn
from torch.nn import functional as F
# Seed PyTorch's global RNG so the random draws that follow are repeatable.
torch.manual_seed(42)

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits depend only on the current token.

    The only parameter is a ``(vocab_size, vocab_size)`` embedding table; row
    ``i`` holds the unnormalised scores for the token that follows token ``i``.
    """

    def __init__(self , vocab_size: int):
        super().__init__()
        # each token reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size , vocab_size)

    def forward(self , index , targets=None):
        """Compute next-token logits and, if targets are given, the loss.

        Args:
            index: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of integer next-token ids.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` is (B, T, C) and
            ``loss`` is None. With targets, ``logits`` is flattened to
            (B*T, C) and ``loss`` is the cross-entropy against the flattened
            targets.
        """
        # index & targets are (B,T) tensor of integers
        logits = self.token_embedding_table(index)  # (B,T,C)

        # negative log-likelihood (cross-entropy)
        if targets is None:
            loss = None
        else:
            B , T , C = logits.shape
            # F.cross_entropy wants (N, C) logits and (N,) class indices
            logits = logits.view(B*T , C)  # 2D array
            targets = targets.view(B*T)  # 1D array
            loss = F.cross_entropy(logits , targets)

        return logits , loss

    @torch.no_grad()  # sampling never back-propagates, so skip building an autograd graph
    def generate(self , index , max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` tokens after ``index``.

        Args:
            index: (B, T) tensor of integer token ids used as the starting context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the samples.
        """
        # index is (B,T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get predictions (no targets, so the returned loss is None and ignored)
            logits , _ = self(index)
            # focus only on the last time step
            logits = logits[: , -1 , :]  # (B,C)
            # applying softmax
            probs = F.softmax(logits , dim=-1) # (B,C)
            # sample from the distribution
            index_next = torch.multinomial(probs , num_samples=1)  # (B,1)
            # append sampled index to the running sequence
            index = torch.cat((index,index_next) , dim=1) # (B,T+1)
        return index

# Untrained baseline: one forward pass on the sample batch, then sample
# 100 tokens starting from a single token with id 0.
model = BigramLanguageModel(vocab_size)
logits, loss = model(xb, yb)
print(logits.shape)
print(loss)

seed_context = torch.zeros((1, 1), dtype=torch.long)
sampled_ids = model.generate(index=seed_context, max_new_tokens=100)[0].tolist()
print(decode(sampled_ids))

torch.Size([32, 65])
tensor(4.7295, grad_fn=<NllLossBackward0>)

o$,q&IWqW&xtCjaB?ij&bYRGkF?b; f ,CbwhtERCIfuWr,DzJERjhLlVaF&EjffPHDFcNoGIG'&$qXisWTkJPw
 ,b Xgx?D3sj


In [13]:
# AdamW over all of the model's parameters, with learning rate 1e-3.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-3)

In [14]:
# Training loop: 10000 optimizer steps on random training batches.
batch_size = 32  # get_batch reads this global, so every batch now holds 32 sequences

for steps in range(10000):
    # clear the gradients left over from the previous step
    optimizer.zero_grad(set_to_none=True)

    # forward pass on a fresh batch
    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)

    # backward pass and parameter update
    loss.backward()
    optimizer.step()

# loss of the last mini-batch only
print(loss.item())

2.5233194828033447


In [15]:
# Sample 100 tokens from the model, starting from a single token with id 0.
seed_context = torch.zeros((1, 1), dtype=torch.long)
sampled_ids = model.generate(index=seed_context, max_new_tokens=100)[0].tolist()
print(decode(sampled_ids))


QUDUThe chas.
F lisen tabr:
LI mus nk,
A: al l ayo cenghe's therinvar,
TEsen ithawaneit at islinerai


In [16]:
# Toy example for the mathematical trick behind self-attention.

torch.manual_seed(42)
B, T, C = 4, 8, 32  # batch, time, channels
x = torch.randn((B, T, C))
x.shape


torch.Size([4, 8, 32])

In [17]:
# Version 1: xbow[b, t] = mean of x[b, i] over all i <= t
xbow = torch.zeros((B, T, C))  # bow = bag of words

for b in range(B):
    for t in range(T):
        xprev = x[b, :t + 1]            # (t+1, C): tokens 0..t of sequence b
        xbow[b, t] = xprev.mean(dim=0)  # average over the time dimension

# very inefficient: O(T^2) work per sequence, driven by Python loops

In [18]:
# Multiplying by an all-ones matrix: every row of c is the column-wise sum of b.
a = torch.ones((3, 3))
b = torch.randint(0, 10, (3, 2)).float()
c = torch.matmul(a, b)

print(a)
print(b)
print(c)

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
tensor([[7., 3.],
        [1., 8.],
        [6., 4.]])
tensor([[14., 15.],
        [14., 15.],
        [14., 15.]])


In [19]:
# Lower-triangular matrix of ones: zeros everywhere above the main diagonal.
torch.ones(3, 3).tril()

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])

In [20]:
# Multiplying by a lower-triangular matrix of ones: row t of c is the sum of rows 0..t of b.
a = torch.ones(3, 3).tril()
b = torch.randint(0, 10, (3, 2)).float()
c = torch.matmul(a, b)

print(a)
print(b)
print(c)

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])
tensor([[2., 7.],
        [4., 3.],
        [2., 7.]])
tensor([[ 2.,  7.],
        [ 6., 10.],
        [ 8., 17.]])


In [21]:
# Normalise each row of the lower-triangular matrix to sum to 1:
# row t of c is then the mean of rows 0..t of b.
a = torch.ones(3, 3).tril()
a = a / a.sum(dim=1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).float()
c = torch.matmul(a, b)

print(a)
print(b)
print(c)

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
tensor([[7., 4.],
        [2., 6.],
        [8., 7.]])
tensor([[7.0000, 4.0000],
        [4.5000, 5.0000],
        [5.6667, 5.6667]])


In [22]:
# Version 2: the same running average, computed with a single matrix multiply.
weights = torch.ones(T, T).tril()
weights = weights / weights.sum(dim=1, keepdim=True)  # each row sums to 1
xbow2 = torch.matmul(weights, x)  # (T,T) broadcast over the batch @ (B,T,C) -> (B,T,C)

print(torch.allclose(xbow, xbow2))

True


In [23]:
# Counter-example: an upper-triangular averaging matrix.
a = torch.ones(3, 3).triu()
a = a / a.sum(dim=1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).float()
c = torch.matmul(a, b)

print(a)
print(b)
print(c)

# Catastrophic for next-token prediction, because every position averages over
# the future instead of the past:
#   word 1 sees words 1, 2, 3
#   word 2 sees words 2, 3
#   etc.

tensor([[0.3333, 0.3333, 0.3333],
        [0.0000, 0.5000, 0.5000],
        [0.0000, 0.0000, 1.0000]])
tensor([[0., 4.],
        [3., 0.],
        [2., 8.]])
tensor([[1.6667, 4.0000],
        [2.5000, 4.0000],
        [2.0000, 8.0000]])


In [25]:
# Version 3: the same averaging weights, obtained through a masked softmax.

tril = torch.ones(T, T).tril()
# Start from all-zero scores and set every position above the diagonal to -inf ...
weights = torch.zeros((T, T)).masked_fill(tril == 0, float('-inf'))
# ... so the row-wise softmax spreads the weight evenly over the allowed positions.
weights = F.softmax(weights, dim=-1)

print(weights)

xbow3 = torch.matmul(weights, x)
torch.allclose(xbow, xbow3)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


True

In [None]:
# Version 4: self-attention

torch.manual_seed(42)
B,T,C = 4,8,32
x = torch.randn(B,T,C)

# single Head self-attention
head_size = 16
key = nn.Linear(C , head_size , bias=False)
query = nn.Linear(C , head_size , bias=False)

# every token emits a key ("what do I contain") and a query ("what am I looking for")
k = key(x)    # (B,T,head_size)
q = query(x)  # (B,T,head_size)

# data-dependent affinities: dot product of each query with every key.
# Scaling by 1/sqrt(head_size) keeps the scores from growing with head_size,
# which would otherwise push the softmax towards one-hot rows.
weights = q @ k.transpose(-2 , -1) * head_size**-0.5  # (B,T,head_size) @ (B,head_size,T) -> (B,T,T)

# causal mask: a token may only attend to itself and to earlier tokens
tril = torch.tril(torch.ones(T,T))
weights = weights.masked_fill(tril == 0 , float('-inf'))
weights = F.softmax(weights , dim=-1)  # each row sums to 1
out = weights @ x  # (B,T,T) @ (B,T,C) -> (B,T,C)

In [None]:
# Query: "what am I looking for?"
# Key:   "what do I contain?"

# Take the dot product of each token's query with every token's key;
# those dot products become the attention weights.