In [1]:
!curl -O https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
curl: (35) Recv failure: Connection was reset


In [2]:
# Read the whole corpus into memory as a single string.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's locale default.
with open("input.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [3]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

In [4]:
# Character-level tokenizer.

# Two lookup tables built from the sorted vocabulary:
# itos maps integer id -> character, stoi maps character -> integer id.
itos = dict(enumerate(chars))
stoi = {ch: idx for idx, ch in itos.items()}


def encode(s):
    """Encoder: turn a string into a list of integer ids, one per character."""
    return [stoi[ch] for ch in s]


def decode(l):
    """Decoder: turn a list of integer ids back into a string."""
    return ''.join(itos[idx] for idx in l)

In [5]:
# Round-trip check: print the ids for a sample string, then the string decoded from them.
sample = "hello world"
print(encode(sample))
print(decode(encode(sample)))

[46, 43, 50, 50, 53, 1, 61, 53, 56, 50, 42]
hello world


In [6]:
# encoding entire dataset
import torch

# Tokenize the full corpus into one 1-D tensor of 64-bit integer ids.
data = torch.tensor(encode(text), dtype=torch.int64)

print(data.size(), data.dtype)


torch.Size([1115393]) torch.int64


In [7]:
# Split point: the first 90% of the tokens are for training, the remainder for validation.
n = int(len(data) * 0.9)
train_data, val_data = data[:n], data[n:]

In [8]:
# Context length used in the examples that follow.
block_size = 8

# Show the first block_size + 1 (= 9) tokens of the training split.
train_data[0:block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [9]:
# Inputs are the first block_size tokens; targets are the same window shifted right by one.
x = train_data[0:block_size]
y = train_data[1:1 + block_size]

# Position t pairs the context x[0..t] with the token that follows it, y[t].
for t in range(block_size):
    target = y[t]         # the (t+1)-th token of the shifted window
    context = x[0:t + 1]  # the first t+1 tokens

In [10]:
# Number of independent sequences processed in parallel per batch.
batch_size = 4
# Maximum context length used for predictions.
block_size = 8

def get_batch(split):
    """Sample a random batch of input/target windows.

    Reads from ``train_data`` when ``split == 'train'`` and from ``val_data``
    for any other value. Window length and batch size come from the
    module-level ``block_size`` and ``batch_size``.

    Returns:
        (x, y): two stacked tensors of shape (batch_size, block_size);
        each row of ``y`` is the matching row of ``x`` shifted one token ahead.
    """
    source = val_data if split != 'train' else train_data

    # random start offsets, chosen so the shifted target window still fits inside the data
    starts = torch.randint(len(source) - block_size, (batch_size,))
    x = torch.stack([source[s:s + block_size] for s in starts])
    y = torch.stack([source[s + 1:s + 1 + block_size] for s in starts])

    return x, y

# Draw one batch from the training split and show both tensors with their shapes.
xb, yb = get_batch('train')

print(f'inputs: {xb.shape} , {xb}')
print(f'targets: {yb.shape} , {yb}')

inputs: torch.Size([4, 8]) , tensor([[52, 57,  8,  1, 32, 46, 43, 56],
        [47, 53, 52,  8,  1, 19, 53,  1],
        [39, 42, 57,  1, 46, 47, 51,  0],
        [50,  1, 56, 43, 57, 53, 50, 60]])
targets: torch.Size([4, 8]) , tensor([[57,  8,  1, 32, 46, 43, 56, 43],
        [53, 52,  8,  1, 19, 53,  1, 58],
        [42, 57,  1, 46, 47, 51,  0, 35],
        [ 1, 56, 43, 57, 53, 50, 60, 43]])


In [11]:
# Spell out every (context, target) training example contained in the batch.
for b in range(batch_size):
    inputs_row, targets_row = xb[b], yb[b]
    for t in range(block_size):
        context = inputs_row[:t + 1]  # tokens 0..t of sequence b
        target = targets_row[t]       # the token that follows that context
        print(f'when input is {context} the target is {target}')

when input is tensor([52]) the target is 57
when input is tensor([52, 57]) the target is 8
when input is tensor([52, 57,  8]) the target is 1
when input is tensor([52, 57,  8,  1]) the target is 32
when input is tensor([52, 57,  8,  1, 32]) the target is 46
when input is tensor([52, 57,  8,  1, 32, 46]) the target is 43
when input is tensor([52, 57,  8,  1, 32, 46, 43]) the target is 56
when input is tensor([52, 57,  8,  1, 32, 46, 43, 56]) the target is 43
when input is tensor([47]) the target is 53
when input is tensor([47, 53]) the target is 52
when input is tensor([47, 53, 52]) the target is 8
when input is tensor([47, 53, 52,  8]) the target is 1
when input is tensor([47, 53, 52,  8,  1]) the target is 19
when input is tensor([47, 53, 52,  8,  1, 19]) the target is 53
when input is tensor([47, 53, 52,  8,  1, 19, 53]) the target is 1
when input is tensor([47, 53, 52,  8,  1, 19, 53,  1]) the target is 58
when input is tensor([39]) the target is 42
when input is tensor([39, 42]) th

In [14]:
# bigram model
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(42)

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits are looked up from the current token alone.

    The only parameter is a ``vocab_size x vocab_size`` embedding table; row ``i``
    holds the logits over the next token given that the current token is ``i``.
    """

    def __init__(self, vocab_size: int):
        super().__init__()
        # each token reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        """Return next-token logits and, when targets are given, the cross-entropy loss.

        Args:
            index: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of integer token ids.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` is (B, T, C) and
            ``loss`` is ``None``. With targets, ``logits`` is flattened to
            (B*T, C) and ``loss`` is the cross-entropy against the flattened targets.
        """
        logits = self.token_embedding_table(index)  # (B,T,C)

        if targets is None:
            return logits, None

        # negative log-likelihood (cross-entropy); F.cross_entropy takes (N, C) logits
        # and (N,) targets, so batch and time are folded into one dimension
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        loss = F.cross_entropy(logits, targets.view(B * T))
        return logits, loss

    @torch.no_grad()
    def generate(self, index, max_new_tokens):
        """Sample ``max_new_tokens`` tokens one at a time and append them to ``index``.

        Runs with autograd disabled: sampling never needs gradients, so no
        computation graph is built for the forward passes.

        Args:
            index: (B, T) tensor of token ids forming the current context.
            max_new_tokens: number of tokens to sample.

        Returns:
            (B, T + max_new_tokens) tensor: the input context followed by the samples.
        """
        for _ in range(max_new_tokens):
            # get predictions (loss is None because no targets are passed)
            logits, _loss = self(index)
            # focus only on the last time step
            logits = logits[:, -1, :]  # (B,C)
            # turn logits into a probability distribution
            probs = F.softmax(logits, dim=-1)  # (B,C)
            # sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1)  # (B,1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1)  # (B,T+1)
        return index

# Build the model and run one forward pass on the current batch.
model = BigramLanguageModel(vocab_size)
logits, loss = model(xb, yb)
print(logits.shape)
print(loss)

# Sample 100 tokens from the untrained model, starting from a single token with id 0.
start = torch.zeros((1, 1), dtype=torch.long)
sampled = model.generate(index=start, max_new_tokens=100)
print(decode(sampled[0].tolist()))

torch.Size([32, 65])
tensor(4.8175, grad_fn=<NllLossBackward0>)

o$,q&IWqW&xtCjaB?ij&bYRGkF?b; f ,CbwhtERCIfuWr,DzJERjhLlVaF&EjffPHDFcNoGIG'&$qXisWTkJPw
 ,b Xgx?D3sj


In [None]:
# AdamW over all of the model's parameters, learning rate 1e-3.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-3)

In [21]:
# Training loop: 10000 optimizer steps, each on a fresh random batch of 32 sequences.
batch_size = 32

for steps in range(10000):
    # sample a new batch from the training split
    xb, yb = get_batch('train')

    # clear old gradients, then forward pass, backward pass, parameter update
    optimizer.zero_grad(set_to_none=True)
    logits, loss = model(xb, yb)
    loss.backward()
    optimizer.step()

# loss on the last training batch only
print(loss.item())

2.517943859100342


In [22]:
# Sample 100 tokens from the model, starting from a single token with id 0, and print the text.
start = torch.zeros((1, 1), dtype=torch.long)
sampled = model.generate(index=start, max_new_tokens=100)
print(decode(sampled[0].tolist()))


An y linditowist sp-w
NIShan
Nant:
A:

ARI's henorsoos are aindoke s ganl owit?
LLaganthalds HES:
w



In [23]:
# Toy example for the mathematical trick used in self-attention.

torch.manual_seed(42)
B = 4   # batch
T = 8   # time
C = 32  # channel
x = torch.randn(B, T, C)
x.shape


torch.Size([4, 8, 32])

In [24]:
# Goal: xbow[b,t] = mean_{i<=t} x[b,i]   (bow = bag of words)
xbow = torch.zeros(B, T, C)

for b in range(B):
    for t in range(T):
        xprev = x[b, 0:t + 1]           # (t+1, C): steps 0..t of sequence b
        xbow[b, t] = xprev.mean(dim=0)  # averaging out the time dimension

# The explicit double loop is very inefficient (O(T^2)).

In [29]:
# Multiplying by an all-ones matrix: every row of c is the column-wise sum of b.
a = torch.ones(3, 3)
b = torch.randint(0, 10, (3, 2)).float()
c = torch.matmul(a, b)

print(a)
print(b)
print(c)

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
tensor([[7., 4.],
        [2., 6.],
        [8., 7.]])
tensor([[17., 17.],
        [17., 17.],
        [17., 17.]])


In [30]:
torch.ones(3, 3).tril()  # lower triangular matrix

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])

In [32]:
# With a lower-triangular matrix of ones, row t of c is the sum of rows 0..t of b.
a = torch.ones(3, 3).tril()
b = torch.randint(0, 10, (3, 2)).float()
c = torch.matmul(a, b)

print(a)
print(b)
print(c)

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])
tensor([[0., 4.],
        [3., 0.],
        [2., 8.]])
tensor([[ 0.,  4.],
        [ 3.,  4.],
        [ 5., 12.]])


In [33]:
# Normalizing each row of the lower-triangular matrix to sum to 1 turns the
# running sum into a running mean: row t of c is the mean of rows 0..t of b.
a = torch.ones(3, 3).tril()
a = a / a.sum(dim=1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).float()
c = torch.matmul(a, b)

print(a)
print(b)
print(c)

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
tensor([[0., 6.],
        [3., 2.],
        [4., 4.]])
tensor([[0.0000, 6.0000],
        [1.5000, 4.0000],
        [2.3333, 4.0000]])
