In [None]:
!curl -O https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [None]:
# Read the whole corpus into memory as one string.
# The encoding is explicit so decoding does not depend on the platform's locale default.
with open("input.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [None]:
# vocabulary: every distinct character of the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)

In [None]:
# char-level tokenizer

# character -> integer and integer -> character lookup tables
stoi = {c: i for i, c in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Encode a string as a list of integer token ids, one per character.

    Raises KeyError if `s` contains a character that is not in `chars`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decode an iterable of integer token ids back into a string.

    Raises KeyError if an id is not a valid index into `chars`.
    """
    return ''.join(itos[i] for i in l)

In [None]:
# sanity check: show the token ids for a sample string, then round-trip them back to text
print(encode("hello world"))
print(decode(encode("hello world")))

In [None]:
# encoding entire dataset
import torch

# tokenize the full corpus once and hold it as a 1-D tensor of 64-bit integer token ids
data = torch.tensor(encode(text), dtype=torch.int64)

print(data.shape, data.dtype)


In [None]:
# first 90% of the tokens are used for training, the remaining 10% for validation
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [None]:
# context length used in the walkthrough cells that follow
block_size = 8

# block_size + 1 tokens give block_size (context, next-token) training examples
train_data[:block_size+1]  # first 9 characters in training set

In [None]:
# x holds the inputs, y the same window shifted one position to the right (the targets)
x = train_data[:block_size]
y = train_data[1:block_size+1]

# one chunk of block_size tokens yields block_size training examples of growing context
for t in range(block_size):
    context = x[:t+1]  # first t+1 characters
    target = y[t]  # the character that follows that context
    # show each example; without this the loop produces no visible output
    print(f'when input is {context} the target is {target}')

In [None]:
batch_size = 4  # number of independent sequences processed in parallel
block_size = 8  # maximum context length for predictions


def get_batch(split):
    """Sample a random batch of (input, target) windows.

    `split == 'train'` draws from `train_data`; any other value draws from `val_data`.
    Returns two stacked tensors of shape (batch_size, block_size); the targets are
    the inputs shifted one position to the right.
    """
    source = train_data if split == 'train' else val_data

    # random window start offsets, chosen so the shifted target window still fits
    starts = torch.randint(len(source) - block_size, (batch_size,))
    x = torch.stack([source[s:s + block_size] for s in starts])
    y = torch.stack([source[s + 1:s + block_size + 1] for s in starts])

    return x, y

# draw one training batch and show its inputs and targets together with their shapes
xb , yb = get_batch('train')

print(f'inputs: {xb.shape} , {xb}')
print(f'targets: {yb.shape} , {yb}')

In [None]:
# spell out every (context -> target) example contained in the batch
for b in range(batch_size):
    for t in range(block_size):
        # context: tokens 0..t of sequence b; target: the token that follows them
        context, target = xb[b, :t + 1], yb[b, t]
        print(f'when input is {context} the target is {target}')

In [None]:
# bigram model
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(42)


class BigramLanguageModel(nn.Module):
    """Bigram language model: the next-token logits depend only on the current token.

    Args:
        vocab_size: number of distinct tokens. The embedding table has shape
            (vocab_size, vocab_size), so row i holds the logits for the token
            that follows token i.
    """

    def __init__(self, vocab_size: int):
        super().__init__()
        # each token reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            index: (B,T) tensor of integer token ids.
            targets: optional (B,T) tensor of integer token ids.

        Returns:
            (logits, loss). Without targets, logits is (B,T,C) and loss is None.
            With targets, logits is flattened to (B*T,C) and loss is the
            cross-entropy between those logits and the flattened targets.
        """
        logits = self.token_embedding_table(index)  # (B,T,C)

        if targets is None:
            return logits, None

        # F.cross_entropy expects (N,C) logits and (N,) targets, so flatten batch and time
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        # reshape (not view) so non-contiguous targets are accepted as well
        targets = targets.reshape(B * T)
        loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building an autograd graph per step
    def generate(self, index, max_new_tokens):
        """Extend each sequence in `index` by sampling `max_new_tokens` tokens.

        Args:
            index: (B,T) tensor of integer token ids forming the current context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor: the input followed by the sampled tokens.
        """
        for _ in range(max_new_tokens):
            # get predictions (no targets, so the loss is None and is ignored)
            logits, _ = self(index)
            # focus only on the last time step
            logits = logits[:, -1, :]  # (B,C)
            # turn logits into a probability distribution
            probs = F.softmax(logits, dim=-1)  # (B,C)
            # sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1)  # (B,1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1)  # (B,T+1)
        return index

# build the model and run a single forward pass on the sample batch
model = BigramLanguageModel(vocab_size)
logits, loss = model(xb, yb)
print(logits.shape)
print(loss)

# sample 100 new tokens from a context holding the single token id 0, then decode the first sequence
start_tokens = torch.zeros((1, 1), dtype=torch.long)
sampled = model.generate(index=start_tokens, max_new_tokens=100)
print(decode(sampled[0].tolist()))

In [None]:
# optimizer: AdamW over all model parameters with a learning rate of 1e-3
optimizer = torch.optim.AdamW(model.parameters() , lr=1e-3)

In [None]:
# training loop
batch_size = 32  # get_batch reads this global, so batches drawn from here on hold 32 sequences

for steps in range(10_000):
    # sample a fresh batch of training data
    xb, yb = get_batch('train')

    # clear stale gradients, run the forward pass, backpropagate, then update the parameters
    optimizer.zero_grad(set_to_none=True)
    logits, loss = model(xb, yb)
    loss.backward()
    optimizer.step()

# loss of the final training batch only
print(loss.item())

In [None]:
print(decode(model.generate(index=torch.zeros((1,1) , dtype=torch.long) , max_new_tokens=100)[0].tolist()))

In [None]:
# toy mathematical trick in self-attention

torch.manual_seed(42)
B, T, C = 4, 8, 32  # batch , time , channel
x = torch.randn((B, T, C))  # random stand-in for a batch of token features
x.shape


In [None]:
# xbow[b,t] = mean_{i<=t} x[b,i]
xbow = torch.zeros((B, T, C))  # bow = bag of words

for b in range(B):
    for t in range(T):
        # rows 0..t of sequence b have shape (t+1,C); average them over the time dimension
        xbow[b, t] = x[b, :t + 1].mean(dim=0)

# very inefficient (O(T^2))

In [None]:
# with an all-ones left matrix, every row of c is the column-wise sum of all rows of b
a = torch.ones((3, 3))
b = torch.randint(0, 10, (3, 2)).float()
c = torch.matmul(a, b)

print(a)
print(b)
print(c)

In [None]:
# tril keeps the diagonal and everything below it, zeroing the entries above
torch.tril(torch.ones(3,3))  # lower triangular matrix

In [None]:
# with a lower-triangular ones matrix, row t of c is the running sum of rows 0..t of b
a = torch.ones((3, 3)).tril()
b = torch.randint(0, 10, (3, 2)).float()
c = torch.matmul(a, b)

print(a)
print(b)
print(c)

In [None]:
# normalising each row of the lower-triangular matrix turns the running sum into a running mean
a = torch.ones((3, 3)).tril()
a = a / a.sum(dim=1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).float()
c = torch.matmul(a, b)

print(a)
print(b)
print(c)

In [None]:
# version 2: the same running mean as one matrix multiply with row-normalised lower-triangular weights
weights = torch.ones((T, T)).tril()
weights = weights / torch.sum(weights, dim=1, keepdim=True)
# the (T,T) weights broadcast over the batch: (B,T,T) @ (B,T,C) -> (B,T,C)
xbow2 = torch.matmul(weights, x)

print(torch.allclose(xbow, xbow2))

In [None]:
# upper triangular matrix
a = torch.ones((3, 3)).triu()
a = a / a.sum(dim=1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).float()
c = torch.matmul(a, b)

print(a)
print(b)
print(c)

# catastrophic because each position averages over itself and later positions,
# i.e. it `predicts` using the future
# word 1 sees 1,2,3
# word 2 sees 2,3
# etc.

In [None]:
# version 3: Softmax

tril = torch.tril(torch.ones(T, T))
# start from all-zero affinities and set every position where tril is 0 to -inf
weights = torch.zeros((T, T)).masked_fill(tril == 0, float('-inf'))
# softmax over each row: -inf entries become 0, the remaining equal entries share the weight evenly
weights = F.softmax(weights, dim=-1)

print(weights)

xbow3 = torch.matmul(weights, x)
torch.allclose(xbow, xbow3)

In [None]:
# Version 4: self-attention

torch.manual_seed(42)
B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

# single Head self-attention
head_size = 16
# bias-free linear projections; they are created in the same order so the seeded initialisation is unchanged
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x)    # (B,T,16)
q = query(x)  # (B,T,16)
# affinity between every query and every key
weights = torch.matmul(q, k.transpose(-2, -1))  # (B,T,16) @ (B,16,T) -> (B,T,T)

# causal mask: positions above the diagonal are set to -inf before the row-wise softmax
tril = torch.ones((T, T)).tril()
weights = F.softmax(weights.masked_fill(tril == 0, float('-inf')), dim=-1)

# the output is the attention-weighted sum of the value vectors
v = value(x)
out = torch.matmul(weights, v)

In [None]:
# inspect the first sequence of the batch: its attention weights, a blank separator, then its output
print(weights[0])
print('\n')
print(out[0])

In [None]:
# Query: what am I looking for?
# Key: what do I contain?

# The dot product of each query with every key measures how well they match;
# after masking and softmax, those dot products become the attention weights.

In [None]:
# Value: the actual information a token passes on, combined according to the attention weights