In [450]:
import os
import requests
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np


In [451]:
# Download the Tiny Shakespeare corpus and keep a copy on disk.
url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
output_path = 'data/shakespeare/input.txt'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
print(f'Downloading Shakespeare text from {url}...')
# requests waits forever by default; a timeout makes a stalled connection
# fail with an exception instead of hanging the cell.
response = requests.get(url, timeout=30)
# Stop here on any HTTP error status rather than saving an error page.
response.raise_for_status()
with open(output_path, 'w', encoding='utf-8') as f:
    f.write(response.text)
print(f'File downloaded successfully to {output_path}')
print(f'File size: {len(response.text)} characters')
print(f'Number of lines: {len(response.text.splitlines())}')

Downloading Shakespeare text from https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt...
File downloaded successfully to data/shakespeare/input.txt
File size: 1115394 characters
Number of lines: 40000


In [470]:
# Keep the downloaded corpus as one string; the vocabulary and the encoded data are built from it.
text = response.text


In [453]:
# Character-level vocabulary: every distinct character of the corpus, sorted.
# Iterating a str already yields its characters, so set() can consume it directly.
chars = sorted(set(text))
# Lookup tables between a character and its integer id (its position in `chars`).
stoi = {ch: idx for idx, ch in enumerate(chars)}
itos = dict(enumerate(chars))

In [454]:
vocab_size = len(chars)


def encode(s):
    """Return the list of integer ids (looked up in `stoi`) for the characters of `s`."""
    return [stoi[ch] for ch in s]


def decode(list):
    """Return the string formed by the characters (looked up in `itos`) of the given ids."""
    # The parameter keeps its existing name so the call interface is unchanged,
    # even though it shadows the builtin inside this function.
    return ''.join(itos[idx] for idx in list)


# The whole corpus as a Python list of integer ids.
encoded_text = encode(text)

In [455]:
# Split point at 90% of the encoded corpus: ids before it are training data,
# the remaining tail is held out.
n = int(len(encoded_text) * 0.9)
train_data, test_data = encoded_text[:n], encoded_text[n:]

In [None]:
# Hyperparameters read as globals by get_batch, the model classes and the training loop.
batch_size = 12  # sequences sampled per batch
block_size = 64  # tokens per sampled sequence; also the size of the causal mask and position table
n_embd = 128  # width of the token / position embeddings
dropout = 0.0  # probability handed to every nn.Dropout
lr = 3e-4  # learning rate given to AdamW
n_head = 4  # attention heads per Block
max_iterations = 4000  # number of steps in the training loop
n_layer = 4  # number of Block modules stacked in the model

def get_batch(split):
    """Sample a random batch of input/target sequences from one data split.

    Args:
        split: 'train' selects `train_data`; any other value selects `test_data`.

    Returns:
        A pair of integer tensors (X, Y), each of shape (batch_size, block_size).
        Y holds the same windows as X shifted one position to the right, i.e.
        Y[b, t] is the token that follows X[b, t] in the data.
    """
    data = train_data if split == 'train' else test_data
    # Highest start is len(data) - block_size - 1, so the shifted target
    # window still ends inside `data`.
    starts = torch.randint(len(data) - block_size, (batch_size, )).tolist()
    # Windows are cut from the *selected* split, so the held-out data is
    # really what gets returned for the non-train split.
    X = torch.tensor([data[i:i + block_size] for i in starts])
    Y = torch.tensor([data[i + 1:i + block_size + 1] for i in starts])
    return X, Y


In [457]:
class Head(nn.Module):
    """A single head of causal (masked) self-attention.

    Args:
        head_size: output width of the query, key and value projections.

    The input to `forward` is unpacked as (B, T, C); C must equal the global
    `n_embd` (the projections' input width) and T must not exceed the global
    `block_size` (the size of the stored mask). The output has shape
    (B, T, head_size).
    """

    def __init__(self, head_size):
        super().__init__()
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular matrix of ones used as the causal mask; stored as a
        # buffer, so it is part of the module state but not a trainable parameter.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, X):
        _, T, _ = X.shape
        k = self.key(X)
        q = self.query(X)
        # Scaled dot-product attention divides by sqrt of the key dimension
        # (head_size), not by the width of the input embedding.
        wei = (q @ k.transpose(-2, -1)) * k.shape[-1] ** -0.5
        # Positions above the diagonal are set to -inf so that, after softmax,
        # a token places zero weight on tokens that come after it.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(X)
        out = wei @ v
        return out


In [458]:
class MultiHeadAttention(nn.Module):
    """Several `Head` modules applied to the same input, followed by a linear projection.

    Args:
        num_heads: number of `Head` instances to create.
        head_size: size passed to each `Head`.

    `forward` concatenates the head outputs along the last dimension, projects
    the result to width `n_embd` and applies dropout.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The concatenated head outputs are num_heads * head_size wide. Sizing
        # the projection from that product keeps it correct even when it
        # differs from n_embd (e.g. n_embd not divisible by the head count).
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

In [459]:
class FeedFoward(nn.Module):
    """Two-layer feed-forward network: expand to 4x the width, ReLU, project back, dropout.

    Args:
        n_embd: input and output width of the network.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),  # back to the input width
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run `x` through the stored sequential stack and return the result."""
        y = self.net(x)
        return y

In [460]:
class Block(nn.Module):
    """Self-attention followed by a feed-forward network, each wrapped in a residual connection.

    LayerNorm is applied to the input of each sub-layer (before attention and
    before the feed-forward network), and the sub-layer output is added back
    to its un-normalised input.

    Args:
        n_embd: embedding width used by the layer norms and the feed-forward network.
        n_head: number of attention heads; each head gets size n_embd // n_head.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        per_head = n_embd // n_head  # integer division: width of one head
        self.sa = MultiHeadAttention(n_head, per_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        attended = self.sa(self.ln1(x))
        x = x + attended
        transformed = self.ffwd(self.ln2(x))
        return x + transformed

In [461]:
class BigramModel(nn.Module):
    """Character-level language model: token + position embeddings, a stack of
    `Block` modules, a final LayerNorm and a linear head producing vocabulary logits.

    All sizes (`vocab_size`, `n_embd`, `block_size`, `n_head`, `n_layer`) are
    read from module-level globals when the model is constructed.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: (B, T) integer tensor of token ids; T must not exceed
                `block_size` (the size of the position table).
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            (logits, loss). Without targets, logits has shape
            (B, T, vocab_size) and loss is None. With targets, logits is
            flattened to (B*T, vocab_size) and loss is a scalar tensor.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx)  # (B, T, n_embd)
        # Build the positions on the same device as the input so the lookup
        # also works when the model has been moved off the CPU.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))  # (T, n_embd)
        x = tok_emb + pos_emb  # position embeddings broadcast over the batch
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            return logits, None
        B, T, C = logits.shape
        # F.cross_entropy is fed one row of logits per target id.
        logits = logits.view(B * T, C)
        targets = targets.view(B * T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Sample `max_new_tokens` additional tokens after the context `idx`.

        Args:
            idx: (B, T) integer tensor holding the starting context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) integer tensor: the context followed by
            the sampled tokens.

        Sampling runs with gradient tracking disabled and with the module in
        eval mode; the previous train/eval mode is restored before returning.
        """
        was_training = self.training
        self.eval()  # dropout must not perturb the sampling distribution
        try:
            # No autograd history is needed for sampling; recording it would
            # only cost memory on every step.
            with torch.no_grad():
                for _ in range(max_new_tokens):
                    # The position table only covers block_size positions, so
                    # keep at most the last block_size tokens as context.
                    idx_cond = idx[:, -block_size:]
                    logits, _loss = self(idx_cond)
                    # Only the prediction for the last position is sampled from.
                    logits = logits[:, -1, :]  # (B, vocab_size)
                    probs = F.softmax(logits, dim=-1)  # (B, vocab_size)
                    idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
                    idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        finally:
            self.train(was_training)
        return idx


In [462]:
# Instantiate the network and give all of its parameters to AdamW at the configured learning rate.
model = BigramModel()
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

In [463]:
# Start from a single token with id 0, sample 300 more tokens and print the decoded text.
print(''.join(
    decode(seq)
    for seq in model.generate(torch.zeros((1, 1), dtype=torch.long), 300).tolist()
))


!zv!zbezm&mtUQ&..CozIs&atgAJhNEVuJ?vbGbmEXuktvA?WaDdI&mM3K?wT-oxckmLHVDcfFiinobqASKddvfKIneHOIbCVvaC.!wn$Um ?WSITJkHvao!zAekF?TGmuDzMNiJmaEhAmW$?HNPNky?B'f EeaW3HpzJHyCFW!iLJHJcDTIx!uKtiHucM:I3RWOEIiW'NPuqT IO;vIe'
c& ViVFXkzHIMcIXFNHxO'OI;S?'&aKtVKvYmL.vXRmbyHUhp!XV3dFIEA?BHL BfDUUMwGhFiqVSiKe?BFe$


In [467]:
# Run `max_iterations` optimisation steps, each on a freshly sampled training batch.
for _ in range(max_iterations):
    xb, yb = get_batch('train')

    # Clear the previous step's gradients, then forward -> backward -> update.
    optimizer.zero_grad(set_to_none=True)
    logits, loss = model(xb, yb)
    loss.backward()
    optimizer.step()

# Loss of the last batch only (not an average over the run).
print(loss.item())

1.8104957342147827


In [468]:
# Start from a single token with id 0, sample 500 more tokens and print the decoded text.
print(''.join(
    decode(seq)
    for seq in model.generate(torch.zeros((1, 1), dtype=torch.long), 500).tolist()
))


Totheir that se diem.

MARCIAT:
Why have awaze and his ch, he from and,
And chome me thy ruch to so your bold born;
Rich eyeing livous froll'd wifef, what, word,
Gind untrentingbmed and me mon
Cleing with a in look my not. And your arman;
the impatony your knove soneed, with to more course to in
He he eath your liangoor hing appore,
Bronge in scoularion ims, Morfort. form!
What from made thous made and you saxted,
I in no the a parcion to but? must fords you.
Ah old your right, moy sorrage for a


In [None]:
# Sample 10000 tokens after a single start token with id 0 and decode them into one string.
generated_text = ''.join(
    decode(seq)
    for seq in model.generate(torch.zeros((1, 1), dtype=torch.long), 10000).tolist()
)
# Write the sample to a UTF-8 text file in the working directory.
with open('generated_shakespeare.txt', 'w', encoding='utf-8') as f:
    f.write(generated_text)
print("Generated text saved to generated_shakespeare.txt")

In [469]:
# Total number of scalar values across all of the model's parameter tensors.
print(sum(param.numel() for param in model.parameters()), 'parameters')

816705 parameters


## Before Training (1) loss ≈ 4

PjbrMQ
KGNbHOXpn
kgqcGH&v?IvLFW3tHtUKD,CTmZffJfZOgSh NWuNHQOpnNmfiW,Fn :oLcL&?hPVY-eYDjRO$?RenHlMB!gnX,ucb!dbTQ:YHxYoZFeO,VjsTd
v'y&b
s?cOiEOhaFHk&o.$iChEXGbM!ntE3vC?C!,'yX.SNjp:OdxUkP
anVUhlnyphKZXsel!bvXplne$
dFaNk?xR3ylG3DTV Sb
?n.-Zv3,vRbz;OH;LzI:;KnBfzQi,epFfVuJI3j?jNnplp,lPcBpJn gfUvL$TlAcfA.q

## After Training (2) loss = 2.5

I I foleasth3:

H.
Whaswo wind a soy ran.
F n
Anelf avch t ta ELOS:
INGl wea!
D Moyemeasucend?
-ond tal,
Wisthoun.
Htheld witorohemy
And, aifean
Ainge?T:
E tous ierofurede my wa t Gxy thart aivut wads sorer!

Ssouresll?
Ses tonk:
Pr g
Firyemire
Yiserore kitl and towo t chathy'd inee Hemod the ha ser

## After Training with Single-Head Attention (3) loss = 2.4
Tallo korlend hin liloray ds om thrennm whe. ''ptor; kein here heak
Whofo man alr:
QIat: htile st bry ss!

D:
ARNGNAASow he's ind ave-for,
Anaw, mave thy,
Lon fe.

Hathule, ared ad bunt out a brinol, miteree per weyol wourglandour.

Tha wh oghou ongs forgeis, 'ble ce kigrtorung fe: nd.

Lth se toeal

## After Training with Multi-Head-Attention & FFN (4) loss = 2.2

CHINI weathctiong
Fony,! nont by fuen theace:
reagcent, I'KSirn ond K:
Bownce;
And hawaws:
Ath rof end nincke filt ist I a be.

AKENE:
Omy, ustiin otul'd amy fromse my to gind gon and
Hameands.

FRAO IULE: and may me:
ntilde hus, wicr Chape ined, wuring:
Atwh me;
Teat I loots ay.
Ad. not :
Piot nowo

## After Training with Stacked Blocks (5) loss = 1.9

Shy-now I kencroural, am thee cess
Mecce,
On that deave beacity have, the chat Casssurs,
It and jeepould play.
So this a intul corte Etil neise wile.
Herviey if parth I sir fram this save and sple, Haw share is upirsit, doth that thall
Of upoy.
Bea.

LARY:
Nare counter; make he bake a pleanter, and 

## Final Training (6) loss = 1.8

Totheir that se diem.

MARCIAT:
Why have awaze and his ch, he from and,
And chome me thy ruch to so your bold born;
Rich eyeing livous froll'd wifef, what, word,
Gind untrentingbmed and me mon
Cleing with a in look my not. And your arman;
the impatony your knove soneed, with to more course to in
He he eath your liangoor hing appore,
Bronge in scoularion ims, Morfort. form!
What from made thous made and you saxted,
I in no the a parcion to but? must fords you.
Ah old your right, moy sorrage for a