In [1]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-03-21 11:33:16--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2023-03-21 11:33:17 (11.2 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [4]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [5]:
print(f'length of dataset in charcters: {len(text)}')

length of dataset in charcters: 1115394


In [6]:
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [8]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [13]:
# Tokenize input Text (Character level tokenizer)
stoi = { ch:i for i, ch in enumerate(chars)}
itos = { i:ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s] # takes string, and outputs a list of integer
decode = lambda l: ''.join([itos[i] for i in l])# takes list of integer, and outputs a string

print(encode('hello, there!'))
print(decode(encode('hello, there!')))

[46, 43, 50, 50, 53, 6, 1, 58, 46, 43, 56, 43, 2]
hello, there!


In [15]:
import torch

# convert text encoding into torch.tensor
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.type)
print(data[:100])

torch.Size([1115394]) <built-in method type of Tensor object at 0x1057d9bd0>
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [17]:
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [18]:
train_data.shape, val_data.shape

(torch.Size([1003854]), torch.Size([111540]))

In [19]:
# set size of input to transformer
block_size = 8
train_data[:block_size+1]


tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

- The block of input above represents 8 different context examples. For example, if the word 18 appears, 47 appears next. If 18, 47, 56, and 58 appears consecutively, 1 will come next, ane etc.
  

In [20]:
x = train_data[:block_size]
y = train_data[1: block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f'when input is {context} the target is {target}')

when input is tensor([18]) the target is 47
when input is tensor([18, 47]) the target is 56
when input is tensor([18, 47, 56]) the target is 57
when input is tensor([18, 47, 56, 57]) the target is 58
when input is tensor([18, 47, 56, 57, 58]) the target is 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target is 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target is 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is 58


## Batch Dimension
- Mini Batch helps with efficiency
- process multiple chunks at the same time

In [68]:
torch.manual_seed(1337)
batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for prediction?

def get_batch(split):
    data = train_data if split=='train' else val_data
    ix = torch.randint(len(data)-batch_size, (batch_size,))
    x = torch.stack([data[i: i+block_size] for i in ix])
    y = torch.stack([data[i+1: i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')

print(f'''input Tensor info
shape: {xb.shape},
{xb}
''')
print(f'''target Tensor info
shape: {yb.shape},
{yb}''')

for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f'when input is {context.tolist()} the target is {target}')

input Tensor info
shape: torch.Size([4, 8]),
tensor([[ 1, 60, 39, 47, 50,  1, 63, 53],
        [46, 43, 39, 60, 43, 52,  1, 44],
        [ 1, 46, 43, 56, 43,  1, 63, 53],
        [61, 47, 50, 50,  1, 57, 39, 63]])

target Tensor info
shape: torch.Size([4, 8]),
tensor([[60, 39, 47, 50,  1, 63, 53, 59],
        [43, 39, 60, 43, 52,  1, 44, 53],
        [46, 43, 56, 43,  1, 63, 53, 59],
        [47, 50, 50,  1, 57, 39, 63,  0]])
when input is [1] the target is 60
when input is [1, 60] the target is 39
when input is [1, 60, 39] the target is 47
when input is [1, 60, 39, 47] the target is 50
when input is [1, 60, 39, 47, 50] the target is 1
when input is [1, 60, 39, 47, 50, 1] the target is 63
when input is [1, 60, 39, 47, 50, 1, 63] the target is 53
when input is [1, 60, 39, 47, 50, 1, 63, 53] the target is 59
when input is [46] the target is 43
when input is [46, 43] the target is 39
when input is [46, 43, 39] the target is 60
when input is [46, 43, 39, 60] the target is 43
when input is 

In [22]:
## Simplest Language Model ( bigram language model )
import torch
import torch.nn as nn
from torch.nn import functional as F

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for next tokn from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        # idx and targets are both (B, T) tensor of integers
        logits = self.token_embedding_table(idx)

        if targets is None:
            loss = None
        else:
            # reshape logits
            B, T, C =  logits.shape
            logits = logits.view(B*T, C)
            targets  = targets.view(B*T)
            # negative log loss
            loss = F.cross_entropy(logits, targets)
            
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indicies in current context
        for _ in range(max_new_tokens):
            # get prediction
            logits, loss = self(idx) # goes to def forward() function
            # focus only on last time step
            logits = logits[:, -1, :] # shape: (B. C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx
    
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape, loss)


decode(m.generate(idx=torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist())


NameError: name 'vocab_size' is not defined

In [91]:
# Create pytorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3) # usually 1e-4 works well in bigger models

In [95]:
batch_size = 32

for steps in range(10000):
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    print(loss.item())

2.459482431411743
2.37371563911438
2.4875760078430176
2.4567627906799316
2.5133776664733887
2.449505090713501
2.4660868644714355
2.3196802139282227
2.4119629859924316
2.4253056049346924
2.3787567615509033
2.4520013332366943
2.306009531021118
2.6572721004486084
2.462906837463379
2.5108935832977295
2.492902994155884
2.427046060562134
2.5011556148529053
2.583879232406616
2.5446653366088867
2.3761610984802246
2.4158852100372314
2.564537525177002
2.5141072273254395
2.490954637527466
2.3639168739318848
2.469484329223633
2.385913610458374
2.4721643924713135
2.40440034866333
2.5133731365203857
2.479466438293457
2.472273349761963
2.4665541648864746
2.498748779296875
2.428332567214966
2.5743398666381836
2.3918309211730957
2.6071393489837646
2.428853988647461
2.4363391399383545
2.5149784088134766
2.3967947959899902
2.5603768825531006
2.4619429111480713
2.4139342308044434
2.3891899585723877
2.413999319076538
2.4122636318206787
2.4863522052764893
2.5390048027038574
2.540088415145874
2.5273885726928

In [97]:
print(decode(m.generate(idx=torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist())
)


WA:

Bureraillaigrtathitham inours y l e iloue bu Ise tor omorttove 'scherur thew!'tratofrkemy h arme s onofortot msou ngh gresoredru, s LEYoud d g ghoughext imer s thed ou hed ncocoale ds I
TENG me I's
Art.
S:


Than thiullluchangl ade, have mm sor mu thendutht, RD t thie s antcomeshee;
Thake avevite hedwieat g ton t oeck spe,
othowen:
IDIn co besisl s ben,
havend ds l.
My, LI ll, tr selld t faitomar ENNCyoveme he's,
Fathilirous,
HBUnd
HERNodwore otache;
HARY:
ARIO:
ancashom!

T:
AShy crous ath


## Self Attention

### Mathematical Trick in self-attention

- Let's make tokens talk to each other while training with self-attention algorithm
- a certain token should not communicate with future token but with previous tokens.
  - for example, token #5 in sentence should not communicate with token #6, 7 and so on because those are the tokens that we need to predict
  - Instead, the token #5 should communicate with token # 1, 2, 3, and 4, 

- Then, what is the easiest way that tokens can communicate?
  - maybe average all previous tokens?
    - very lossy
    - 

In [13]:
import torch
# Toy example
torch.manual_seed(1337)
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)
x.shape, x

(torch.Size([4, 8, 2]),
 tensor([[[ 0.1808, -0.0700],
          [-0.3596, -0.9152],
          [ 0.6258,  0.0255],
          [ 0.9545,  0.0643],
          [ 0.3612,  1.1679],
          [-1.3499, -0.5102],
          [ 0.2360, -0.2398],
          [-0.9211,  1.5433]],
 
         [[ 1.3488, -0.1396],
          [ 0.2858,  0.9651],
          [-2.0371,  0.4931],
          [ 1.4870,  0.5910],
          [ 0.1260, -1.5627],
          [-1.1601, -0.3348],
          [ 0.4478, -0.8016],
          [ 1.5236,  2.5086]],
 
         [[-0.6631, -0.2513],
          [ 1.0101,  0.1215],
          [ 0.1584,  1.1340],
          [-1.1539, -0.2984],
          [-0.5075, -0.9239],
          [ 0.5467, -1.4948],
          [-1.2057,  0.5718],
          [-0.5974, -0.6937]],
 
         [[ 1.6455, -0.8030],
          [ 1.3514, -0.2759],
          [-1.5108,  2.1048],
          [ 2.7630, -1.7465],
          [ 1.4516, -1.5103],
          [ 0.8212, -0.2115],
          [ 0.7789,  1.5333],
          [ 1.6097, -0.4032]]]))

In [18]:
# we want x[b, t] = mean_{i<=t} x[b, i]
xbow = torch.zeros((B, T, C))#x bag of words
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1]
        xbow[b, t] = torch.mean(xprev, 0)

x[0], xbow[0]

(tensor([[ 0.1808, -0.0700],
         [-0.3596, -0.9152],
         [ 0.6258,  0.0255],
         [ 0.9545,  0.0643],
         [ 0.3612,  1.1679],
         [-1.3499, -0.5102],
         [ 0.2360, -0.2398],
         [-0.9211,  1.5433]]),
 tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]))

In [17]:
# check if calc is correct
x_avg_3 = (x[0][0] + x[0][1] + x[0][2]) / 3
x_avg_3, xbow[0][2]


(tensor([ 0.1490, -0.3199]), tensor([ 0.1490, -0.3199]))

In [4]:
torch.tril(torch.ones(3, 3))

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])

In [3]:
# we can make calculation above more efficient with matrix calculation
torch.manual_seed(42)
a = torch.ones(3, 3)
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b

print(a)
print(b)
print(c)

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
tensor([[14., 16.],
        [14., 16.],
        [14., 16.]])


In [8]:
# with triangular ones matrix, we can efficiently calculate cumulative sum of certain matrix
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b

print(a)
print(b)
print(c)

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
tensor([[ 2.,  7.],
        [ 8., 11.],
        [14., 16.]])


In [9]:
# If we normalize each row of triangular matrix (sum of all element in row = 1), 
# we can calcualte cumulative average
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b

print(a)
print(b)
print(c)

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [16]:
# lets vectorize the first problem (Version 2)
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True)
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [19]:
xbow2 = wei @ x # (T, T) @ (B, T, C) ==> (B, T, C)
torch.allclose(xbow, xbow2)

True

In [20]:
xbow[0], xbow2[0]

(tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]),
 tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]))

In [24]:
tril

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

In [25]:
wei

tensor([[0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [28]:
wei = wei.masked_fill(tril == 0, float('-inf'))
wei

tensor([[0.1250,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf],
        [0.1250, 0.1250,   -inf,   -inf,   -inf,   -inf,   -inf,   -inf],
        [0.1250, 0.1250, 0.1250,   -inf,   -inf,   -inf,   -inf,   -inf],
        [0.1250, 0.1250, 0.1250, 0.1250,   -inf,   -inf,   -inf,   -inf],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250,   -inf,   -inf,   -inf],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250,   -inf,   -inf],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250,   -inf],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [29]:
wei = F.softmax(wei, dim=-1) # normailzation operation
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [30]:
# Version 3: use Softmax
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
torch.allclose(xbow, xbow3)

True

- In version 3
  - torch.zeros matrix shows the strength of affinity of each tokens
    - these affinity will be data dependent
    - depending on values, the relationship(affinity) is formed by training
  - masked_fill part represents that all future token will not be used in calculating affinity of tokens
- you can do weignted aggregation of past elements by matrxi mul of lowe triangular fashion