In [1]:
import torch

In [2]:
# Read the whole training corpus into memory as a single string.
text_file = 'text-sample.txt'
with open(text_file, encoding='utf-8') as f:  # text read mode is open()'s default
    text = f.read()

In [5]:
len(text)

504850

In [6]:
text[:1000]

'Artificial intelligence (AI) researchers have been developing and refining large language models (LLMs) that exhibit remarkable capabilities across a variety of domains and tasks, challenging our understanding of learning and cognition. The latest model developed by OpenAI, GPT-4 [Ope23], was trained using an unprecedented scale of compute and data. In this paper, we report on our investigation of an early version of GPT-4, when it was still in active development by OpenAI. We contend that (this early version of) GPT- 4 is part of a new cohort of LLMs (along with ChatGPT and Google’s PaLM for example) that exhibit more general intelligence than previous AI models. We discuss the rising capabilities and implications of these models. We demonstrate that, beyond its mastery of language, GPT-4 can solve novel and difficult tasks that span mathematics, coding, vision, medicine, law, psychology and more, without needing any special prompting. Moreover, in all of these tasks, GPT-4’s perform

In [9]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
print(vocab_size)
''.join(chars)

142


'\n !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}°±·×Øøˆ̧́̈ελπρσω–—‘’“”•′⃗→⇒∂∆∇∈−∗√∞∪≈≤≥⊕◦\uf8ee\uf8ef\uf8f0\uf8f9\uf8fa\uf8fb'

In [12]:
# Lookup tables between characters and their integer ids (position in `chars`).
stoi = {ch: pos for pos, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Return the list of integer ids for the characters of ``s``.

    Raises KeyError for a character that is not in ``stoi``.
    """
    return [stoi[c] for c in s]


def decode(int_list):
    """Return the string formed by the characters for the ids in ``int_list``.

    Raises KeyError for an id that is not in ``itos``.
    """
    return ''.join(itos[i] for i in int_list)

In [14]:
# Round-trip check: encoding and then decoding should reproduce the input text.
ala_encoded = encode('Ala ma kota')
ala_decoded = decode(ala_encoded)
print(ala_encoded, ala_decoded, sep='\n')

[34, 77, 66, 1, 78, 66, 1, 76, 80, 85, 66]
Ala ma kota


In [15]:
# Encode the entire corpus as one tensor of integer character ids (dtype int64).
data = torch.tensor(encode(text), dtype=torch.long)

In [18]:
data.shape

torch.Size([504850])

In [19]:
print(data[:1000])

tensor([ 34,  83,  85,  74,  71,  74,  68,  74,  66,  77,   1,  74,  79,  85,
         70,  77,  77,  74,  72,  70,  79,  68,  70,   1,   9,  34,  42,  10,
          1,  83,  70,  84,  70,  66,  83,  68,  73,  70,  83,  84,   1,  73,
         66,  87,  70,   1,  67,  70,  70,  79,   1,  69,  70,  87,  70,  77,
         80,  81,  74,  79,  72,   1,  66,  79,  69,   1,  83,  70,  71,  74,
         79,  74,  79,  72,   1,  77,  66,  83,  72,  70,   1,  77,  66,  79,
         72,  86,  66,  72,  70,   1,  78,  80,  69,  70,  77,  84,   1,   9,
         45,  45,  46,  84,  10,   1,  85,  73,  66,  85,   1,  70,  89,  73,
         74,  67,  74,  85,   1,  83,  70,  78,  66,  83,  76,  66,  67,  77,
         70,   1,  68,  66,  81,  66,  67,  74,  77,  74,  85,  74,  70,  84,
          1,  66,  68,  83,  80,  84,  84,   1,  66,   1,  87,  66,  83,  74,
         70,  85,  90,   1,  80,  71,   1,  69,  80,  78,  66,  74,  79,  84,
          1,  66,  79,  69,   1,  85,  66,  84,  76,  84,  13,  

In [20]:
# Sequential split: the first 80% of the tokens train the model,
# the remaining 20% are held out for validation.
n = int(0.8 * len(data))
train_data, val_data = data[:n], data[n:]

In [21]:
len(train_data), len(val_data)

(403880, 100970)

In [46]:
torch.manual_seed(42)  # fix the RNG so the sampled batch below is reproducible
batch_size = 4  # number of sequences sampled per batch (read by get_batch)
block_size = 8  # length of each sampled sequence (read by get_batch)

def get_batch(split):
    """Sample a random batch of input/target sequences from one data split.

    Args:
        split: ``'train'`` samples from ``train_data``; any other value
            samples from the held-out ``val_data``.

    Returns:
        A tuple ``(x, y)`` of stacked tensors with ``batch_size`` rows of
        ``block_size`` tokens each. ``y`` is ``x`` shifted one position to
        the right, i.e. ``y[b, t]`` is the token that follows ``x[b, t]``.

    Note:
        ``batch_size`` and ``block_size`` are read from the notebook globals
        at call time, so re-assigning them changes subsequent batches.
    """
    # The held-out split is called `val_data`; the previously referenced
    # `test_data` is not defined and raised NameError for non-train splits.
    source = train_data if split == 'train' else val_data
    # Start offsets are capped so the shifted target window still fits.
    ix = torch.randint(len(source) - block_size, (batch_size,))
    x = torch.stack([source[i:i + block_size] for i in ix])
    y = torch.stack([source[i + 1:i + block_size + 1] for i in ix])
    return x, y

xb, yb = get_batch('train')

print('xb:', xb, sep='\n')
print()
print('yb:', yb, sep='\n')
print('--------------------')

# Every row packs block_size training examples: the context grows one token
# at a time and the target is the token that follows it.
for b in range(batch_size):
    context_row, target_row = xb[b], yb[b]
    for t in range(block_size):
        print(f'For context {context_row[:t+1]}, target is: {target_row[t]}')
    print('---')

xb:
tensor([[ 40,  49,  53,  14,  21, 114,  84,   1],
        [ 66,  77,  15,   1,  34,  79,  69,   1],
        [  1,  90,   1,  30,   1,  90, 127,   1],
        [ 76,   1,  80,  83,   1,  84,  68,  83]])

yb:
tensor([[ 49,  53,  14,  21, 114,  84,   1,  68],
        [ 77,  15,   1,  34,  79,  69,   1,  74],
        [ 90,   1,  30,   1,  90, 127,   1,  66],
        [  1,  80,  83,   1,  84,  68,  83,  66]])
--------------------
For context tensor([40]), target is: 49
For context tensor([40, 49]), target is: 53
For context tensor([40, 49, 53]), target is: 14
For context tensor([40, 49, 53, 14]), target is: 21
For context tensor([40, 49, 53, 14, 21]), target is: 114
For context tensor([ 40,  49,  53,  14,  21, 114]), target is: 84
For context tensor([ 40,  49,  53,  14,  21, 114,  84]), target is: 1
For context tensor([ 40,  49,  53,  14,  21, 114,  84,   1]), target is: 68
---
For context tensor([66]), target is: 77
For context tensor([66, 77]), target is: 15
For context tensor([66, 77,

In [220]:
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(42)

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits are looked up from the current token alone.

    The only parameters are a ``vocab_size x vocab_size`` embedding table whose
    row ``i`` holds the logits over the next token given current token ``i``.
    """

    def __init__(self, vocab_size):
        """Create the lookup table for a vocabulary of ``vocab_size`` tokens."""
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: integer tensor of token ids, shape (B, T).
            targets: optional integer tensor of next-token ids, shape (B, T).

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            (B, T, C) and ``loss`` is ``None``. With targets, ``logits`` is
            flattened to (B*T, C) and ``loss`` is the cross-entropy against
            the flattened targets.
        """
        logits = self.token_embedding_table(idx)
        if targets is None:
            return logits, None
        B, T, C = logits.shape
        # F.cross_entropy expects (N, C) logits with (N,) class indices.
        logits = logits.view(B * T, C)
        loss = F.cross_entropy(logits, targets.view(B * T))
        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building an autograd graph per token
    def generate(self, idx, max_new_tokens):
        """Extend each sequence in ``idx`` by sampling ``max_new_tokens`` tokens.

        Args:
            idx: integer tensor of token ids, shape (B, T).
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens) that starts with
            the original ``idx``.
        """
        for _ in range(max_new_tokens):
            logits, _loss = self(idx)
            # Only the last position is used to predict the next token.
            last_logits = logits[:, -1, :]
            probs = F.softmax(last_logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

# Instantiate the model and evaluate the loss of the untrained weights on one batch.
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss.item())

torch.Size([256, 142])
5.392546653747559


In [221]:
import math

# Reference loss: cross-entropy of a uniform guess over all vocab_size
# characters, i.e. -ln(1/vocab_size). Useful as a yardstick for the
# untrained model's loss.
-math.log(1/vocab_size)

4.955827057601261

In [222]:
m.generate(xb, 10)

tensor([[ 77,  80,  69,  90,   1,  85,  73,  66,  48,  33, 138,  24, 101,  47,
          22, 130,  27,  12],
        [ 53,  14,  21,   1,  66,  83,  70,   1, 136,  92,   2,  57,   1, 108,
          45,  58, 119,  55],
        [ 73,  70,   1,  82,  86,  70,  84,  85,  68, 117, 115, 106,  19,  87,
          12,  89,  28, 118],
        [ 69,   1,  58,   1,  66,  83,  70,   1,  47, 135,  44,   5,  85,  79,
         108,   1, 135,  86],
        [  1,   1,   1,   1,   1,   1,  71,  66, 135,  13,  94, 135,  51,  12,
          67,  79,  23,  65],
        [ 71,  74,  68,  74,  70,  79,  85,  77,  17,  58,  18,  68,  81,   1,
          72,  39, 106, 102],
        [ 84,   1,  85,  73,  66,  85,   1,  85,  17,  31,  26, 129, 126, 101,
         127, 127,  99,  19],
        [  0,  14,   1,  65,  18,  29,  30,  76,  50, 121,  61,  18,  20,   5,
         138,  72, 132, 123],
        [ 79,  87,  80,  77,  87,  70,  84,   1,  19,  54,  47,  86, 103,  15,
          34,   6, 135, 105],
        [ 70,  79, 

In [223]:
def decode_tensor(x_encoded):
    """Decode every row of a 2-D tensor of token ids into a string.

    Args:
        x_encoded: tensor with exactly two dimensions (rows, tokens).

    Returns:
        A list with one decoded string per row, in row order.
    """
    num_rows, _ = x_encoded.shape  # unpacking enforces a 2-D input
    return [decode(x_encoded[row, :].tolist()) for row in range(num_rows)]

decode_tensor(xb)

['lody tha',
 'T-4 are ',
 'he quest',
 'd Y are ',
 '      fa',
 'ficientl',
 's that t',
 '\n- `1<=k',
 'nvolves ',
 'en the g',
 'has been',
 'hmetic p',
 'in the T',
 're close',
 'owledge ',
 'llenges ',
 'Luke tol',
 '4:\n    O',
 ' run. We',
 ' the sta',
 'eferring',
 'glas B. ',
 'formal, ',
 ' use the',
 'd identi',
 'lem with',
 'utoim\nmu',
 's the va',
 'PT-4 oft',
 'her case',
 'hen it i',
 'hecking\n']

In [224]:
# Generation: start from a single token with id 0 and sample 100 more tokens.
idx = torch.zeros(1, 1, dtype=torch.long)
decode_tensor(m.generate(idx, max_new_tokens=100))

['\nso@vρI“ek"±ω\uf8ef—⇒Z@uAQ-U;′l5qh{"QCnP\'j\n9}Sσ⃗∆rcNPV}×\'\uf8ee{2g’∪*UZ—L°”yˆEt∞g°×!2}bz|Nbø×∗hD×∗Av⃗N>&øJ7?\uf8f9@J']

In [225]:
# training

In [226]:
# AdamW over all parameters of the model `m`, with learning rate 1e-3.
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [227]:
# get_batch reads the global batch_size at call time, so this re-assignment
# makes every batch sampled from here on contain 32 sequences.
batch_size = 32
# 10 rounds of 1000 optimisation steps each.
for i in range(10):
    for steps in range(1000):
        xb, yb = get_batch('train')

        logits, loss = m(xb, yb)
        # Drop the previous step's gradients, backpropagate, update the weights.
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
    # Prints the loss of the round's last mini-batch only (not an average),
    # so consecutive readings can fluctuate.
    print(loss.item())

4.3515543937683105
3.6309425830841064
3.0488831996917725
2.8627829551696777
2.7648069858551025
2.636763095855713
2.6641106605529785
2.6322357654571533
2.6348443031311035
2.740755558013916


In [239]:
# Generation: start from a single token with id 0 and sample 100 more tokens.
idx = torch.zeros(1, 1, dtype=torch.long)
decode_tensor(m.generate(idx, max_new_tokens=100))

['\ndg ala t thi∗√3 cowalext Liv cathesltstugis QFiora anatotifa whesit M.mingetorat urst?vindend in eri']

In [257]:
# Vectorised single-head causal self-attention, demonstrated on random data.
torch.manual_seed(1337)

B,T,C = 4,8,32  # dimensions of the random input x below
x = torch.randn(B,T,C)

# SELF ATTENTION
head_size = 16
# Bias-free linear projections from C input features to head_size features.
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x) # (B, T, 16)
q = query(x) # (B, T, 16)
# Dot product of every query with every key, scaled by 1/sqrt(head_size): (B, T, T)
wei = q @ k.transpose(-2, -1) * head_size**(-0.5)


# Causal mask: position t may only attend to positions 0..t. Entries above the
# diagonal are set to -inf so that softmax gives them zero weight; each row of
# `wei` then sums to 1 and `out` is a weighted average of the values up to t.
tril = torch.tril(torch.ones(T, T))
# For comparison: starting from wei = torch.zeros((T, T)) instead of the
# query-key scores would make `out` the plain (uniform) average over 0..t.
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

v = value(x)
out = wei @ v  # (B, T, 16)
out.shape

torch.Size([4, 8, 16])

In [258]:
wei

tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.3966, 0.6034, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.3069, 0.2892, 0.4039, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.3233, 0.2175, 0.2443, 0.2149, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1479, 0.2034, 0.1663, 0.1455, 0.3369, 0.0000, 0.0000, 0.0000],
         [0.1259, 0.2490, 0.1324, 0.1062, 0.3141, 0.0724, 0.0000, 0.0000],
         [0.1598, 0.1990, 0.1140, 0.1125, 0.1418, 0.1669, 0.1061, 0.0000],
         [0.0845, 0.1197, 0.1078, 0.1537, 0.1086, 0.1146, 0.1558, 0.1553]],

        [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4016, 0.5984, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.3365, 0.2271, 0.4364, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.3019, 0.2060, 0.2899, 0.2022, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1058, 0.1700, 0.1530, 0.3451, 0.2261, 0.0000, 0.0000, 0.0000],
         [0.1526, 0.164