In [1]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [2]:
print("Length of dataset in characters:", len(text))

Length of dataset in characters: 1115394


In [3]:
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [4]:
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars))
print("Vocabulary size:", vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Vocabulary size: 65


In [5]:
stoi = {ch:i for i, ch in enumerate(chars)}
itos = {i:ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

In [6]:
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [7]:
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [8]:
torch.manual_seed(13)
block_size = 8
batch_size = 4

def get_batch(split: str) -> tuple[torch.tensor, torch.tensor]:
    data = train_data if split == 'train' else val_data
    ix = torch.randint(0, len(data) - block_size, (batch_size, ))
    x = torch.stack([data[i:i + block_size] for i in ix])
    y = torch.stack([data[i + 1:i + block_size + 1] for i in ix])
    return x, y

xb, yb = get_batch('train')

In [267]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(13)

class BigramLangugeModel(nn.Module):

    def __init__(self, vocab_size: int) -> None:
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
    
    def forward(self, idx: torch.tensor, targets: torch.tensor = None) -> tuple[torch.tensor, torch.tensor]:
        logits = self.token_embedding_table(idx)  # B T C
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            # Torch expects B C T
            loss = F.cross_entropy(logits, targets)
        return logits, loss
    
    def generate(self, idx: torch.tensor, max_new_tokens: int) -> torch.tensor:
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, idx_next], 1)
        return idx
    
m = BigramLangugeModel(vocab_size)
out, loss= m(xb, yb)
print(out.shape)
print(loss)
print(decode(m.generate(torch.zeros((1, 1), dtype=torch), max_new_tokens=100)[0].tolist()))

torch.Size([256, 65])
tensor(4.8416, grad_fn=<NllLossBackward0>)


TypeError: zeros() received an invalid combination of arguments - got (tuple, dtype=module), but expected one of:
 * (tuple of ints size, *, tuple of names names, torch.dtype dtype = None, torch.layout layout = None, torch.device device = None, bool pin_memory = False, bool requires_grad = False)
 * (tuple of ints size, *, Tensor out = None, torch.dtype dtype = None, torch.layout layout = None, torch.device device = None, bool pin_memory = False, bool requires_grad = False)


In [268]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [269]:
batch_size = 32
for _ in range(20000):
    # get batch
    xb, yb = get_batch('train')
    # evaluate loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    print(loss.item())

2.4964585304260254


In [55]:
print(decode(m.generate(torch.zeros((1, 1), dtype=torch.long), max_new_tokens=300)[0].tolist()))


pes isathod? Hinvearet eoftre:


Cich; mary?
And fein, k whear! w kenendwarend:
en's,
LI arse wristhendsok. ghen. por t tucofer I:
Ore HINGau, itst manon: a worf myste t brway cathedse scoereayonck k'BOLLUFandoo-ksele whithigreflsucct iaut be tet d nt hon' be INGrr'lik!
HIth ss hakime r tene ch awic


In [56]:
torch.manual_seed(13)
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [61]:
# bag of words
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t + 1]
        xbow[b, t] = xprev.mean(0)

In [98]:
wei = torch.tril(torch.ones((T, T)))
wei /= wei.sum(1, keepdim=True)
xbow2 = wei @ x

In [103]:
tri = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))
wei = wei.masked_fill(tri == 0, float('-inf'))
wei = F.softmax(wei, dim=1)
xbow3 = wei @ x

In [105]:
torch.allclose(xbow3, xbow2)

True

In [125]:
# Pre Self Attention
torch.manual_seed(13)
# Initialize an average batch, that transformer gets
# B - batch, T - time, C - channel
B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

# Lower triangular matrix
tri = torch.tril(torch.ones(T, T))
# Matrix for future weights
wei = torch.zeros((T, T))
# Future doesn't communicate with the past!
# Tokens are unable to gather any info from the following characters in a batch
# Use triangular matrix to mask anything, that comes after the token as '-inf'
# e**(-inf) ---> 0.0, and that's what future caracters data will be multiplied by
wei = wei.masked_fill(tri == 0, float('-inf'))
# Softmax performs elemet-wise exponentiation and along-rows normalization
# Here's your weights
wei = F.softmax(wei, dim=1)

out = wei @ x
out.shape
# What this code does at the end is putting all date from past and simply averaging it
# This results in a very lossy connection between and cuts the most part of information
# This happens because values in triangle matrix were hard-coded as one, but 
# Self Attention approach allows us to make this number complition data dependent

torch.Size([4, 8, 32])

In [152]:
# Self Attention (single step missing)
torch.manual_seed(13)
# B - batch, T - time, C - channel
B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

# Dimensianality of key and query vectors
head_size = 16
# Linear layers, that form key and a query from token's identity
key = nn.Linear(C, head_size, bias=False)    # (C, head_size)
query = nn.Linear(C, head_size, bias=False)  # (C, head_size)
# Create keys and querys for all tokens in a batch
k = key(x)                                # (B, T, head_size)
q = query(x)                              # (B, T, head_size)
# Inintialize our weights with query @ key.T products instead of zeroes
wei = q @ k.transpose(-2, -1)             # (B, T, T)

tri = torch.tril(torch.ones((T, T)))
# Future doesn't communicate with the past!
wei = wei.masked_fill(tri == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
out = wei @ x
out.shape
# Weighted aggregation now is a function in a data dependent manner 
# between keys and querys of tokens(nodes)

torch.Size([4, 8, 32])

In [159]:
# Self Attention
torch.manual_seed(13)
# B - batch, T - time, C - channel
B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

# Dimensianality of key and query vectors
head_size = 16
# Linear layers, that form key and a query from token's identity
key = nn.Linear(C, head_size, bias=False)    # (C, head_size)
query = nn.Linear(C, head_size, bias=False)  # (C, head_size)

# Create keys and querys for all tokens in a batch
k = key(x)                                # (B, T, head_size)
q = query(x)                              # (B, T, head_size)

# Inintialize our weights with query @ key.T products instead of zeroes
wei = q @ k.transpose(-2, -1) * head_size**-0.5   # (B, T, T)

tri = torch.tril(torch.ones((T, T)))
# Future doesn't communicate with the past!
wei = wei.masked_fill(tri == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)


value = nn.Linear(C, head_size, bias=False)  # (C, head_size)
v = value(x)                              # (B, T, head_size)
# Instead of the raw "x",  "v" vector is what we aggregate.
# "v" comes from "value" obtained the same as "key", "query"
out = wei @ v
out.shape
# Weighted aggregation now is a function in a data dependent manner 
# between keys and querys of tokens(nodes)

# "x" is information, private to a token.
# I'm a fifth token, I have some identity, and my information is kept in vector "x".
# Here's what I'm intersted in ("query"), here's what I have ("key"), and if you find
# me interesting here's what I'll communcate to you (value).  

tensor(0.3347, grad_fn=<VarBackward0>)
tensor(0.3802, grad_fn=<VarBackward0>)
tensor(0.1373, grad_fn=<VarBackward0>)


torch.Size([4, 8, 16])

In [160]:
wei

tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.3646, 0.6354, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.3106, 0.4625, 0.2269, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2624, 0.2414, 0.2755, 0.2207, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1727, 0.2276, 0.2303, 0.2185, 0.1509, 0.0000, 0.0000, 0.0000],
         [0.1618, 0.1240, 0.2014, 0.1921, 0.2105, 0.1101, 0.0000, 0.0000],
         [0.2054, 0.2259, 0.1083, 0.1471, 0.1160, 0.1014, 0.0959, 0.0000],
         [0.1327, 0.0882, 0.1372, 0.0856, 0.1931, 0.1874, 0.0747, 0.1011]],

        [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.3861, 0.6139, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4788, 0.2497, 0.2715, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2468, 0.1966, 0.2727, 0.2838, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2278, 0.2794, 0.2087, 0.1140, 0.1701, 0.0000, 0.0000, 0.0000],
         [0.2236, 0.219

In [14]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(13)
batch_size = 32

# What this thing does is just an average communication, aggregating raw x values
class BigramLanguageModel(nn.Module):
    
    def __init__(self, n_embd: int) -> None:
        # Inherit parent class behaviour
        super().__init__()
        # Create embedding tables
        self.identity_embedding_table = nn.Embedding(vocab_size, n_embd)  # "vocab_size" - global
        self.position_embedding_table = nn.Embedding(block_size, n_embd)  # "block_size" - global
        # Layer, converting prediction of "n_embd" length to logits of "vocab_size" length
        self.lm_head = nn.Linear(n_embd, vocab_size, bias=False)
    
    def forward(self, idx: torch.tensor, targets: torch.tensor = None) -> torch.tensor: # x - is a batch of shape (B, T)   
        # Unpack batch dimensions
        B, T = idx.shape

        # Embed identities of each tolen in a batch
        identity_embeddings = self.identity_embedding_table(idx)
        # Embed positioins of each tolen in a batch
        positions = torch.tile(torch.arange(T), (B, 1))
        position_embeddings = self.position_embedding_table(positions)
        # Combine that information
        x = identity_embeddings + position_embeddings
        # Averaging communication
        tril = torch.tril(torch.ones(T, T))
        wei = torch.masked_fill(tril, tril == 0, float('-inf'))
        wei = F.softmax(wei, dim=1)
        # "out is a matrix of shape (B, T, C), where C is "n_ebmd"
        out = wei @ x
        # convert "out" to (B, T, C), where C is "vocab_size"
        logits = self.lm_head(out)
    
        # Compute "loss", if necessary
        if targets is None:
            loss = None
        else:
            # F.cross_entropy expects input of shape (N, C), where N is a batch size
            B, T, C = logits.shape
            logits = logits.view(B * T, C)  # Pass two batch dimension we had as one
            targets = targets.view(B * T)   # Reshape targets accordingly
            loss = F.cross_entropy(logits, targets) if targets is not None else None
        
        return logits, loss
    
    def generate(self, idx: torch.tensor, max_new_tokens: int = 100) -> None:
        # "idx" is tensor of shape (4, 8)
        for _ in range(max_new_tokens):
            # Cut "idx" to the length of "block_size" in second dimension
            forward_idx = idx[:, -block_size:] if idx.shape[-1] > block_size else idx
            # Forward pass
            logits, loss = self(forward_idx)
            # (4, 8, 32) -> (4, 32)
            logits = logits[:, -1, :]
            # Normlize last dimension
            probs = F.softmax(logits, dim=-1)
            # Select single token for each vector of probabilitiess
            next_idx = torch.multinomial(probs, num_samples=1)  # (4, 1)
            # (4, 8) cat (4, 1) ---> (4, 9)
            idx = torch.cat([idx, next_idx], dim=1)
        return idx

mymodel = BigramLanguageModel(vocab_size)
# inpt = torch.randint(vocab_size, (batch_size, block_size))
# tgts = torch.randint(vocab_size, (batch_size, block_size))
# res = mymodel(inpt, tgts)

In [15]:
optimizer = torch.optim.AdamW(params=mymodel.parameters(), lr=1e-3)
for i in range(20000):
    # Sample batch
    Xb, Yb = get_batch('train')
    # Evaluate loss
    logits, loss = mymodel(Xb, Yb)
    # Clean gradients
    optimizer.zero_grad(set_to_none=True)
    # Backward pass
    loss.backward()
    optimizer.step()
    if i % 1000 == 0:
        print(f"{loss = }")
print(loss.item())

loss = tensor(4.3319, grad_fn=<NllLossBackward0>)
loss = tensor(3.0761, grad_fn=<NllLossBackward0>)
loss = tensor(2.8579, grad_fn=<NllLossBackward0>)
loss = tensor(2.9286, grad_fn=<NllLossBackward0>)
loss = tensor(2.7695, grad_fn=<NllLossBackward0>)
loss = tensor(2.6387, grad_fn=<NllLossBackward0>)
loss = tensor(2.9360, grad_fn=<NllLossBackward0>)
loss = tensor(2.9126, grad_fn=<NllLossBackward0>)
loss = tensor(2.8196, grad_fn=<NllLossBackward0>)
loss = tensor(2.7113, grad_fn=<NllLossBackward0>)
loss = tensor(2.8126, grad_fn=<NllLossBackward0>)
loss = tensor(2.8382, grad_fn=<NllLossBackward0>)
loss = tensor(2.9016, grad_fn=<NllLossBackward0>)
loss = tensor(2.7930, grad_fn=<NllLossBackward0>)
loss = tensor(2.6736, grad_fn=<NllLossBackward0>)
loss = tensor(2.7699, grad_fn=<NllLossBackward0>)
loss = tensor(2.7949, grad_fn=<NllLossBackward0>)
loss = tensor(2.7944, grad_fn=<NllLossBackward0>)
loss = tensor(2.8318, grad_fn=<NllLossBackward0>)
loss = tensor(2.7495, grad_fn=<NllLossBackward0>)


In [13]:
tokens = mymodel.generate(torch.zeros((1, 1), dtype=torch.long), max_new_tokens=300)[0].tolist()
print(decode(tokens))


Toushtll erash:
y ytoh
oe  Dayithyou s boylogfer  om isfift theotl. 

 rIvols e tsam aha
ndrt
r e wndiosd mvpoiinol,e,  datne e hrasiadt n
firsneo   imdfhgienu:  lc
isakpened
w sbela'hipr
ptecnEoi
r tld hior i a lobfrhle!yI wo  nnitdwos.
-Ca
KNTuon ygoowb
iturcaahdr'ds
 hasmiiek :ns' gss
Ii Pveulano


In [21]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(13)
batch_size = 32
head_size = 16


class Head(nn.Module):
    """
    This class implements behaviour of single Self-Attention head. Given batch of token's 
    identities does the "query - key - value" encoded block and returns aggregated value 
    for each token in a batch, already converted to original dimension.
    """

    def __init__(self, n_embd: int) -> None:
        # Inherit parent class behaviour
        super().__init__()
        # input tensor of shape (B, T, C), where "C" is n_embd -> (B, T, head_size)
        # "head_size" - global
        self.query = nn.Linear(n_embd, head_size, bias=False)  
        self.key = nn.Linear(n_embd,  head_size, bias=False)
        self.value = nn.Linear(n_embd,  head_size, bias=False)

        # Layer, converting aggregated value of "head_size" length to raw "x" of "n_embd" length
        self.to_embed_size = nn.Linear(head_size, n_embd, bias=False)  # "head_size" - global

    def forward(self, idx: torch.tensor) -> None:
        B, T, C = idx.shape
        # First we get querys, keys and values from "idx"
        q = self.query(idx)  # (B, T, head_size)
        k = self.key(idx)    # (B, T, head_size)
        v = self.value(idx)  # (B, T, head_size)
        # Next, we get dot products of q and k
        # (B, T, head_size) @ (B, head_size, T) ---> (B, T, T)
        wei = q @ k.transpose(-1, -2) * head_size**-0.5
        # Mask future tokens
        tril = torch.tril(torch.ones((T, T)))
        wei = torch.masked_fill(wei, tril == 0, float('-inf'))
        wei = F.softmax(wei, dim=1)
        # Aggregate values
        # (B, T, T) @ (B, T, head_size) ---> (B, T, head_size)
        out = wei @ v
        # Convert "out" to (B, T, n_embd)
        out = self.to_embed_size(out)
        return out


# What this thing does is just an average communication, aggregating raw x values. Let's change it.
class BigramLanguageModel(nn.Module):
    
    def __init__(self, n_embd: int) -> None:
        # Inherit parent class behaviour
        super().__init__()
        # Create embedding tables
        self.identity_embedding_table = nn.Embedding(vocab_size, n_embd)  # "vocab_size" - global
        self.position_embedding_table = nn.Embedding(block_size, n_embd)  # "block_size" - global
        # Layer, converting raw "x" of "n_embd" length to logits of "vocab_size" length
        self.to_logit_size = nn.Linear(n_embd, vocab_size, bias=False)
        # Self-attention head
        self.sa_head = Head(n_embd)
        
    
    def forward(self, idx: torch.tensor, targets: torch.tensor = None) -> torch.tensor: # x - is a batch of shape (B, T)   
        # Unpack batch dimensions
        B, T = idx.shape

        # Embed identities of each tolen in a batch
        identity_embeddings = self.identity_embedding_table(idx)
        # Embed positioins of each tolen in a batch
        positions = torch.tile(torch.arange(T), (B, 1))
        position_embeddings = self.position_embedding_table(positions)
        # Combine that information
        x = identity_embeddings + position_embeddings
        # Call self-attention head
        out = self.sa_head(x)

        # convert "out" to (B, T, C), where C is "vocab_size"
        logits = self.to_logit_size(out)
    
        # Compute "loss", if necessary
        if targets is None:
            loss = None
        else:
            # F.cross_entropy expects input of shape (N, C), where N is a batch size
            B, T, C = logits.shape
            logits = logits.view(B * T, C)  # Pass two batch dimension we had as one
            targets = targets.view(B * T)   # Reshape targets accordingly
            loss = F.cross_entropy(logits, targets) if targets is not None else None
        
        return logits, loss
    
    def generate(self, idx: torch.tensor, max_new_tokens: int = 100) -> None:
        # "idx" is tensor of shape (4, 8)
        for _ in range(max_new_tokens):
            # Cut "idx" to the length of "block_size" in second dimension
            forward_idx = idx[:, -block_size:] if idx.shape[-1] > block_size else idx
            # Forward pass
            logits, loss = self(forward_idx)
            # (4, 8, 32) -> (4, 32)
            logits = logits[:, -1, :]
            # Normlize last dimension
            probs = F.softmax(logits, dim=-1)
            # Select single token for each vector of probabilitiess
            next_idx = torch.multinomial(probs, num_samples=1)  # (4, 1)
            # (4, 8) cat (4, 1) ---> (4, 9)
            idx = torch.cat([idx, next_idx], dim=1)
        return idx

mymodel = BigramLanguageModel(vocab_size)
# inpt = torch.randint(vocab_size, (batch_size, block_size))
# tgts = torch.randint(vocab_size, (batch_size, block_size))
# res = mymodel(inpt, tgts)

In [22]:
optimizer = torch.optim.AdamW(params=mymodel.parameters(), lr=1e-3)
for i in range(20000):
    # Sample batch
    Xb, Yb = get_batch('train')
    # Evaluate loss
    logits, loss = mymodel(Xb, Yb)
    # Clean gradients
    optimizer.zero_grad(set_to_none=True)
    # Backward pass
    loss.backward()
    optimizer.step()
    if i % 1000 == 0:
        print(f"{loss = }")
print(loss.item())

loss = tensor(4.2009, grad_fn=<NllLossBackward0>)
loss = tensor(2.3883, grad_fn=<NllLossBackward0>)
loss = tensor(2.1538, grad_fn=<NllLossBackward0>)
loss = tensor(2.1442, grad_fn=<NllLossBackward0>)
loss = tensor(2.3113, grad_fn=<NllLossBackward0>)
loss = tensor(2.2480, grad_fn=<NllLossBackward0>)
loss = tensor(2.2062, grad_fn=<NllLossBackward0>)
loss = tensor(2.3446, grad_fn=<NllLossBackward0>)
loss = tensor(2.3126, grad_fn=<NllLossBackward0>)
loss = tensor(2.1889, grad_fn=<NllLossBackward0>)
loss = tensor(2.2682, grad_fn=<NllLossBackward0>)
loss = tensor(2.1976, grad_fn=<NllLossBackward0>)
loss = tensor(2.1170, grad_fn=<NllLossBackward0>)
loss = tensor(2.1344, grad_fn=<NllLossBackward0>)
loss = tensor(2.1019, grad_fn=<NllLossBackward0>)
loss = tensor(2.1759, grad_fn=<NllLossBackward0>)
loss = tensor(2.1822, grad_fn=<NllLossBackward0>)
loss = tensor(2.1842, grad_fn=<NllLossBackward0>)
loss = tensor(2.2563, grad_fn=<NllLossBackward0>)
loss = tensor(2.2497, grad_fn=<NllLossBackward0>)


In [23]:
tokens = mymodel.generate(torch.zeros((1, 1), dtype=torch.long), max_new_tokens=300)[0].tolist()
print(decode(tokens))



OMENENENY:
Mese INI thoupr thexel.

BENIvilll.

LINGRENGROx:
S ININENINGERNYCHENTENINCILANGRADYO:
Dind, wof rs es avild hy bus: mece;
ARY:
Wacind meackind
Cancomonde the be ror ando, I thay. won ndind,
And-
Wigicuthel'll?

OMENENCIANGLER:
Wemisean:
Ang.


Sikn theanot.

SY:
Bure prod fiveve ty I st


In [52]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(13)
batch_size = 32
n_embd = 32

class Head(nn.Module):
    """
    This class implements behaviour of single Self-Attention head. Given batch of token's 
    identities does the "query - key - value" encoded block and returns aggregated value 
    for each token in a batch, already converted to original dimension.
    """

    def __init__(self, head_size: int) -> None:
        # Inherit parent class behaviour
        super().__init__()
        # input tensor of shape (B, T, C), where "C" is n_embd -> (B, T, head_size)
        # "n_embd" - global
        self.query = nn.Linear(n_embd, head_size, bias=False)  
        self.key = nn.Linear(n_embd,  head_size, bias=False)
        self.value = nn.Linear(n_embd,  head_size, bias=False)

        # Store masking matrix as a buffer which is not a parameter oof the model
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, idx: torch.tensor) -> None:
        B, T, C = idx.shape
        # First we get querys, keys and values from "idx"
        q = self.query(idx)  # (B, T, head_size)
        k = self.key(idx)    # (B, T, head_size)
        v = self.value(idx)  # (B, T, head_size)
        # Next, we get dot products of q and k
        # (B, T, head_size) @ (B, head_size, T) ---> (B, T, T)
        wei = q @ k.transpose(-1, -2) * head_size**-0.5  # AFTER_DEBUG: C instead of head_size !
        # Mask future tokens
        wei = torch.masked_fill(wei, self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=1)
        # Aggregate values
        # (B, T, T) @ (B, T, head_size) ---> (B, T, head_size)
        out = wei @ v
        return out


# What this thing does is just an average communication, aggregating raw x values. Let's change it.
class BigramLanguageModel(nn.Module):
    
    def __init__(self) -> None:
        # Inherit parent class behaviour
        super().__init__()
        # Create embedding tables
        self.identity_embedding_table = nn.Embedding(vocab_size, n_embd)  # "vocab_size" - global
        self.position_embedding_table = nn.Embedding(block_size, n_embd)  # "block_size" - global
        # Layer, converting raw "x" of "n_embd" length to logits of "vocab_size" length
        self.lm_head = nn.Linear(n_embd, vocab_size, bias=False)
        # Self-attention head
        self.sa_head = Head(n_embd)
        
    
    def forward(self, idx: torch.tensor, targets: torch.tensor = None) -> torch.tensor: # x - is a batch of shape (B, T)   
        # Unpack batch dimensions
        B, T = idx.shape

        # Embed identities of each tolen in a batch
        identity_embeddings = self.identity_embedding_table(idx)
        # Embed positioins of each tolen in a batch
        positions = torch.tile(torch.arange(T), (B, 1))
        position_embeddings = self.position_embedding_table(positions)
        # Combine that information
        x = identity_embeddings + position_embeddings
        # Call self-attention head
        x = self.sa_head(x)
        # convert "out" to (B, T, C), where C is "vocab_size"
        logits = self.lm_head(x)
    
        # Compute "loss", if necessary
        if targets is None:
            loss = None
        else:
            # F.cross_entropy expects input of shape (N, C), where N is a batch size
            B, T, C = logits.shape
            logits = logits.view(B * T, C)  # Pass two batch dimension we had as one
            targets = targets.view(B * T)   # Reshape targets accordingly
            loss = F.cross_entropy(logits, targets) if targets is not None else None
        
        return logits, loss
    
    def generate(self, idx: torch.tensor, max_new_tokens: int = 100) -> None:
        # "idx" is tensor of shape (4, 8)
        for _ in range(max_new_tokens):
            # Cut "idx" to the length of "block_size" in second dimension
            forward_idx = idx[:, -block_size:] if idx.shape[-1] > block_size else idx
            # Forward pass
            logits, loss = self(forward_idx)
            # (4, 8, 32) -> (4, 32)
            logits = logits[:, -1, :]
            # Normlize last dimension
            probs = F.softmax(logits, dim=-1)
            # Select single token for each vector of probabilitiess
            next_idx = torch.multinomial(probs, num_samples=1)  # (4, 1)
            # (4, 8) cat (4, 1) ---> (4, 9)
            idx = torch.cat([idx, next_idx], dim=1)
        return idx

mymodel = BigramLanguageModel()
# inpt = torch.randint(vocab_size, (batch_size, block_size))
# tgts = torch.randint(vocab_size, (batch_size, block_size))
# res = mymodel(inpt, tgts)

In [53]:
optimizer = torch.optim.AdamW(params=mymodel.parameters(), lr=1e-3)
for i in range(20000):
    # Sample batch
    Xb, Yb = get_batch('train')
    # Evaluate loss
    logits, loss = mymodel(Xb, Yb)
    # Clean gradients
    optimizer.zero_grad(set_to_none=True)
    # Backward pass
    loss.backward()
    optimizer.step()
    if i % 1000 == 0:
        print(f"{loss = }")
print(loss.item())

loss = tensor(4.3044, grad_fn=<NllLossBackward0>)
loss = tensor(2.4063, grad_fn=<NllLossBackward0>)
loss = tensor(2.2833, grad_fn=<NllLossBackward0>)
loss = tensor(2.3781, grad_fn=<NllLossBackward0>)
loss = tensor(2.2818, grad_fn=<NllLossBackward0>)
loss = tensor(2.1793, grad_fn=<NllLossBackward0>)
loss = tensor(2.3244, grad_fn=<NllLossBackward0>)
loss = tensor(2.3041, grad_fn=<NllLossBackward0>)
loss = tensor(2.2804, grad_fn=<NllLossBackward0>)
loss = tensor(2.1504, grad_fn=<NllLossBackward0>)
loss = tensor(2.2674, grad_fn=<NllLossBackward0>)
loss = tensor(2.3244, grad_fn=<NllLossBackward0>)
loss = tensor(2.3394, grad_fn=<NllLossBackward0>)
loss = tensor(2.1236, grad_fn=<NllLossBackward0>)
loss = tensor(2.1418, grad_fn=<NllLossBackward0>)
loss = tensor(2.0876, grad_fn=<NllLossBackward0>)
loss = tensor(2.1257, grad_fn=<NllLossBackward0>)
loss = tensor(2.2508, grad_fn=<NllLossBackward0>)
loss = tensor(2.3066, grad_fn=<NllLossBackward0>)
loss = tensor(2.1487, grad_fn=<NllLossBackward0>)


In [37]:
tokens = mymodel.generate(torch.zeros((1, 1), dtype=torch.long), max_new_tokens=300)[0].tolist()
print(decode(tokens))


NI:
Shindy con my.

MAUD ho bourg bred balld,
Cougveal this.
CAng fo seat-'my harely me cant latherg ods wid th, in?
ANY:
Longzomend ow ith met ars wird thes pe be nd aving, thowre, ye orw anonis this-thoushit mat irt hes at borls foug ace on ty for:
Voly thal is pur aly rim?

And to for'd ngil thed


In [54]:
# Deep Neural Networks suffer from normaliztion issues and we face this stuff we our net too.
# Let's introduce two techniques to fight this stuff:
# 1) Residual connnections. We're simply adding any caomputation dine to what we have before it.
# 2) Layer normalization, not a batch normalization, but it does the same thing, but along the rows.
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(13)
batch_size = 32
n_embd = 32

class Head(nn.Module):
    """
    This class implements behaviour of single Self-Attention head. Given batch of token's 
    identities does the "query - key - value" encoded block and returns aggregated value 
    for each token in a batch.
    """

    def __init__(self, head_size: int) -> None:
        # Inherit parent class behaviour
        super().__init__()
        # input tensor of shape (B, T, C), where "C" is n_embd -> (B, T, head_size)
        # "n_embd" - global
        self.query = nn.Linear(n_embd, head_size, bias=False)  
        self.key = nn.Linear(n_embd,  head_size, bias=False)
        self.value = nn.Linear(n_embd,  head_size, bias=False)

        # Store masking matrix as a buffer which is not a parameter oof the model
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, idx: torch.tensor) -> torch.tensor:
        B, T, C = idx.shape
        # First we get querys, keys and values from "idx"
        q = self.query(idx)  # (B, T, head_size)
        k = self.key(idx)    # (B, T, head_size)
        v = self.value(idx)  # (B, T, head_size)
        # Next, we get dot products of q and k
        # (B, T, head_size) @ (B, head_size, T) ---> (B, T, T)
        wei = q @ k.transpose(-1, -2) * head_size**-0.5  # AFTER_DEBUG: C instead of head_size !
        # Mask future tokens
        wei = torch.masked_fill(wei, self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=1)
        # Aggregate values
        # (B, T, T) @ (B, T, head_size) ---> (B, T, head_size)
        out = wei @ v
        return out


class MultiHeadAttention(nn.Module):
    """
    This class initializes given amount of Self-Attention heads of given head size, runs
    all of them in parallel and returns concatemation of their work along channel dimension.
    Output's shape is (B, T, C), where C is "num_heads" * "head_size".
    """

    def __init__(self, num_heads: int, head_size: int) -> None:
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # Residual connection layer
        self.proj = nn.Linear(n_embd, n_embd)

    def forward(self, x: torch.tensor) -> torch.tensor:
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        # Project what we've calculated, or control it's influence  
        # since self.proj is a trainable parameter
        out = self.proj(out)
        return out


class FeedForward(nn.Module):
    """
    This class implements thinking process, where by thinking I mean simple MLP neural net.
    Given tensor of n_embd passes it through Linear layer and ReLU, returning tensor of original shape.
    """

    def __init__(self, n_embd: int) -> None:
        # Inherit behaviour of parent class
        super().__init__()
        # Mini-net of linear activation followed by ReLU activation
        self.net = nn.Sequential(
            nn.Linear(n_embd, n_embd),
            nn.ReLU(),
            # Residual connection layer
            # This is the way of control which parts of computation 
            # we did affect original data more and which are less
            nn.Linear(n_embd, n_embd)
        )
    
    def forward(self, x: torch.tensor) -> torch.tensor:
        # Forwrad mini-net
        return self.net(x)


class Block(nn.Module):
    """
    This class just combines talking and thinking stages, where talking 
    is multi-head self-attention and thinking is a simple MLP.
    """

    def __init__(self, n_embd: int, num_heads: int) -> None:
        # Inherit behaviour of parent class
        super().__init__()
        head_size = n_embd // num_heads
        self.sa = MultiHeadAttention(num_heads, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x: torch.tensor) -> torch.tensor:
        # Residual connection: add result of computation to the input
        x = x + self.sa(self.ln1(x))    # first add communication part
        x = x + self.ffwd(self.ln2(x))  # then add computation part
        return x


class BigramLanguageModel(nn.Module):
    
    def __init__(self) -> None:
        # Inherit parent class behaviour
        super().__init__()
        # Create embedding tables
        self.identity_embedding_table = nn.Embedding(vocab_size, n_embd)  # "vocab_size" - global
        self.position_embedding_table = nn.Embedding(block_size, n_embd)  # "block_size" - global
        # Layer, converting raw "x" of "n_embd" length to logits of "vocab_size" length
        self.lm_head = nn.Linear(n_embd, vocab_size, bias=False)
        # Stack multiple blocks upon each other
        self.blocks = nn.Sequential(
            Block(n_embd, num_heads=4),
            Block(n_embd, num_heads=4),
            Block(n_embd, num_heads=4),
            # Layer norm after transformer right before "lm_head"
            nn.LayerNorm(n_embd)
        )
        
    
    def forward(self, idx: torch.tensor, targets: torch.tensor = None) -> torch.tensor: # x - is a batch of shape (B, T)   
        # Unpack batch dimensions
        B, T = idx.shape

        # Embed identities of each tolen in a batch
        identity_embeddings = self.identity_embedding_table(idx)
        # Embed positioins of each tolen in a batch
        positions = torch.tile(torch.arange(T), (B, 1))
        position_embeddings = self.position_embedding_table(positions)
        # Combine that information
        x = identity_embeddings + position_embeddings
        # Call all "talk-think" blocks, that's the forward pass of the model now
        x = self.blocks(x)
        # convert "out" to (B, T, C), where C is "vocab_size"
        logits = self.lm_head(x)
    
        # Compute "loss", if necessary
        if targets is None:
            loss = None
        else:
            # F.cross_entropy expects input of shape (N, C), where N is a batch size
            B, T, C = logits.shape
            logits = logits.view(B * T, C)  # Pass two batch dimension we had as one
            targets = targets.view(B * T)   # Reshape targets accordingly
            loss = F.cross_entropy(logits, targets) if targets is not None else None
        
        return logits, loss
    
    def generate(self, idx: torch.tensor, max_new_tokens: int = 100) -> None:
        # "idx" is tensor of shape (4, 8)
        for _ in range(max_new_tokens):
            # Cut "idx" to the length of "block_size" in second dimension
            forward_idx = idx[:, -block_size:] if idx.shape[-1] > block_size else idx
            # Forward pass
            logits, loss = self(forward_idx)
            # (4, 8, 32) -> (4, 32)
            logits = logits[:, -1, :]
            # Normlize last dimension
            probs = F.softmax(logits, dim=-1)
            # Select single token for each vector of probabilitiess
            next_idx = torch.multinomial(probs, num_samples=1)  # (4, 1)
            # (4, 8) cat (4, 1) ---> (4, 9)
            idx = torch.cat([idx, next_idx], dim=1)
        return idx

mymodel = BigramLanguageModel()
# inpt = torch.randint(vocab_size, (batch_size, block_size))
# tgts = torch.randint(vocab_size, (batch_size, block_size))
# res = mymodel(inpt, tgts)

In [56]:
optimizer = torch.optim.AdamW(params=mymodel.parameters(), lr=1e-3)
for i in range(1000):
    # Sample batch
    Xb, Yb = get_batch('train')
    # Evaluate loss
    logits, loss = mymodel(Xb, Yb)
    # Clean gradients
    optimizer.zero_grad(set_to_none=True)
    # Backward pass
    loss.backward()
    optimizer.step()
    if i % 1000 == 0:
        print(f"{loss = }")
print(loss.item())

loss = tensor(1.2882, grad_fn=<NllLossBackward0>)
0.45698636770248413


In [49]:
tokens = mymodel.generate(torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()
print(decode(tokens))



ODO
RAnd.

Bederaraparearsencerw Huasch fey abey dy amateand, re?

BENENCENELESTAOtud as th, suchife isuchowhed ferenest; ats we to ont, aich thor inuenf, harest, enceien owhir, warket-
LO, fo, ange elet toisecharref ast ned ughourmem sus wo dod, I sw he do do ther ind, foruch sewe thareedee fery wair cAnum fip', I my Ray, th ifocudsong, I tellifl, wovod.
Andy parebpire yret by rin noto oouill wor, I Sret obtur fo.
NRHINUOMLAnd thy hepl,
Nown turd. ICE cegt,
TBAreps, no,
Ya thadel the rown;
HYo
