Devkota : Transformer that can write a poem 

In [62]:
# imports and setup
import torch #tensors
import torch.nn as nn  # layers and activatiom
from torch.nn import functional as F  # softmax and cross_entropy

In [None]:
# hyper parameters 
# # Better hyperparameters
# n_embd = 128      # More expressive
# n_head = 8        # More attention patterns
# n_layer = 6       # Deeper
# dropout = 0.2     # Prevent overfitting
# learning_rate = 3e-4  # More stable

# processes 16 text sequences at once
batch_size = 16 
#  maximum content length 
block_size = 32
# num of training steps : model will see 5000 batches of data 
max_iters = 5000
# check progress every 100 steps 
eval_interval = 100
# how big learning steps are 1e-3 = 0.001 
# too high = unstable, too low = slow learning 
learning_rate = 1e-3

In [100]:
# checking for gpu 
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [101]:
# avg loss over 200 batches when evaluating
eval_iters = 200
# embedding dimesnion
n_embd = 64 
# num of attention heads 
n_head = 4 
# num of transformer bloacks 
n_layer = 4 
dropout = 0.0

In [102]:
# random seed for reproduceblity 
torch.manual_seed(1337)

<torch._C.Generator at 0x7526b8a38a70>

In [103]:
# loading and preparing data 
with open('poem.txt','r', encoding='utf-8') as f:
    text = f.read() #reads from the file 

In [104]:
# create vocab of unique characters 
# set(text) : get all unique characters 
# list() : convert to list 
# sorted : sorts alphabetically 
# eg if text = "hello" chrs = ['e','h','l','o']
chars = sorted(list(set(text)))

In [105]:
# gives the lenght of chars, total element
vocab_size = len(chars)

In [None]:
# string to integer 
stoi = {}
i = 0 
for ch in chars:
    stoi[ch] = i 
    i +=1

# or 
stoi = { ch:i for i,ch in enumerate(chars) }


In [None]:
# # integer to string 
itos  = {}
i = 0 
for ch in chars:
    itos[i] = ch 
    i +=1


# or 
# itos = { i:ch for i,ch in enumerate(chars) }


In [178]:
# frrom string to number 
encode = lambda s: [stoi[c] for c in s]
encode("नमस्कार")


[41, 46, 53, 65, 22, 55, 48]

In [181]:
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string
decode([41, 46, 53, 65, 22, 55, 48])

'नमस्कार'

In [110]:
# train val split 
data = torch.tensor(encode(text),dtype=torch.long)

In [111]:
# 90% data into n 
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [149]:
# data loading function 
def get_batch(split):
    if split == 'train':
        data = train_data
    else:
        data = val_data
    # generate batch_size random starting psitions
    ix = torch.randint(len(data) - block_size, (batch_size,))


    # create lsits to store the sequences 
    x_list = []
    y_list = []

    # manually loop through each random index 
    for start in ix:
        start = int(start) # ensure int 

        # input sequence
        x_seq = data[start : start + block_size]
        x_list.append(x_seq)

        # target sequence (shifeted by 1)
        y_seq = data[start + 1: start + block_size + 1]
        y_list.append(y_seq)

    x = torch.stack(x_list, dim=0)
    y = torch.stack(y_list, dim=0)
    x = x.to(device)
    y = y.to(device)


    return x,y

In [150]:
# loss estimation function 
@torch.no_grad() # disables gradient calcualtion 
def estimate_loss():
    out = {}
    model.eval() #puts model in evaluation mode
    for split in ['train','val']:  # loop through train and val sets
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters): 
            X, Y = get_batch(split) # get a batch 
            logits, loss = model(X,Y) # run through model, 
            losses[k] = loss.item()  #store the loss , .item()- convert to python number
            out[split] = losses.mean() #average all losses 
            model.train()
            return out 



In [158]:
# single attention head 
class Head(nn.Module):

    def __init__(self,head_size):
        super().__init__()

        # three linear transformations 
        # key : "what information do I have"
        # query : What info I am looking for
        # value : "what is my actual content"
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        # lower traingular matrix fofr casual masking 
        '''[[1,0,0,0],
         [1,1,0,0],
         [1,1,1,0],
         [1,1,1,1],
         ]
         prevent looking at future tokens: token n can only see n and all n-x tokens 
         '''
        #  registerbuffer ; saves this as a part f the model but doesn't train it 
        self.dropout = nn.Dropout(dropout)

    def forward(self,x):
        B,T,C = x.shape  # B : batch_size(16), T : time/sequence = 32, channedl/embed dimesn : 64
        k = self.key(x)
        q = self.query(x)
        # q @ l.transpose(-2,-1): Matrix multiply queries with keys
        # q.shape : (B,T, head_size)
        # result: (B,T,T) - every token's query looks at every token's key 
        # * C** -0.5 : scale by 1/root(head_size) to prevent values from getting too large
        wei = (q @ k.transpose(-2,-1)) * C**-0.5


        # wwhere tril is 0(future position),set attention to -infinity 
        # after softmax, -infinity becomes 0, effectively blocking future information 
        wei = wei.masked_fill(self.tril[:T,:T] == 0, float('-inf'))
        # convert to probablities 0 to 1 
        wei = F.softmax(wei, dim=-1) # (B,T,T)
        # apply dropout to attention weights 
        wei = self.dropout(wei)
        # transform x to values 
        # multiply attentio weights by values 
        # each token gets a weighted sum of all previous tokens' values 
        v = self.value(x)
        out = wei @ v 
        return out 

In [159]:
# Multi-head attention 
class MultiHeadAttention(nn.Module):

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList()  # create an empty ModuleList
        for _ in range(num_heads):    # loop num_heads times
            head = Head(head_size)    # create a Head module 
            self.heads.append(head)   # add it to the ModuleList
        self.proj = nn.Linear(n_embd, n_embd) # combines output 
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # run all heads and concatenate 
        #  each head outputs(B,T,head_size)
        # cat alog last ddimension(B,T,num_heads * head_size)
        # since num_heads * head_size = n_embd, output is (bmT,n_emb)
        out = torch.cat([h(x) for h in self.heads],dim=-1)
        out = self.dropout(self.proj(out))
        return out 

In [160]:
# feed-forward network
class FeedForward(nn.Module):

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4* n_embd),
            nn.ReLU(),
            nn.Linear(4* n_embd,n_embd),
            nn.Dropout(dropout),
        )
    def forward(self,x):
        return self.net(x)

# two-layerd neural network 
# 1. expad 64-> 256 dim 
# 2. Relu activation :; max(0,x) 
# 3. Compress : 256->64 dim 
# 4.dropout


In [161]:
# transformer block 

class Block(nn.Module):

    def __init__(self, n_embd, n_head):
        super().__init__()
        # calcualte head sise : if n_embd = 64 and n_head - 4 each is 16 dim 
        head_size = n_embd // n_head

        self.sa = MultiHeadAttention(n_head, head_size) # self attention
        self.ffwd = FeedForward(n_embd) # feed forward 
        self.ln1 = nn.LayerNorm(n_embd) # stabilizes traiing
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))  # normalize --> self-attentio >> residual 
        x = x + self.ffwd(self.ln2(x))
        return x 


In [162]:
# main language model 

class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # token embeddings : lookup table that converts tken IDs to vectors 

        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        # position embeddings add info about position in sequence
        self.position_embedding_table = nn.Embedding(block_size, n_embd)

        block_list = [] #empty list 
        for _ in range(n_layer):
            block = Block(n_embd, n_head= n_head) # create a Block 
            block_list.append(block) # add it to the list 
        self.blocks = nn.Sequential(*block_list) # wrap the list in nn.Sequential
        # /final layer normalization
        self.ln_f = nn.LayerNorm(n_embd)
        # converrt 64-dim vector back to vocabulary size lgits(probablity for each chars)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets = None):
        B, T  = idx.shape

        # get token embeddings . each token ID becomes a vector 
        tok_emb = self.token_embedding_table(idx)
        # get position embeddings
        #  torch.arange(T): creates [0,1,2,......31]
        pos_emb = self.position_embedding_table(torch.arange(T,device = idx.device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        # convert to vocabulary logits. for each psition in each sequence, get scores for very possible next chars
        logits = self.lm_head(x)
        # if no targets, just return predictions 
        if targets is None:
            loss = None 
        else:
            B, T, C = logits.shape 
            logits = logits.view(B*T, C) # reshapse logits
            targets = targets.view(B*T) # reshape targets 
            loss = F.cross_entropy(logits, targets) # cross-entropy loss : measures how wrong predictions are 
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:,-block_size:] # if sequece gets longer than block_size 32 ,only keep last 32
            logits, loss = self(idx_cond) # run forward pass 
            logits = logits[:, -1,:] # focus on last position, we only focus on prediction next best possiblt token
            probs = F.softmax(logits, dim=-1) #convert logits to probablities 
            # dont always pick with higher probabality , ppick some random masala also
            idx_next = torch.multinomial(probs, num_samples=1)
            # add the new token to the end 
            idx = torch.cat((idx, idx_next), dim =1)
        return idx 


In [163]:
# model instantiation and traiing 
model = BigramLanguageModel()
m = model.to(device)

In [167]:
for iter in range(max_iters):
    # Evaluate periodically
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()  
        print(f"step {iter}: train loss {losses['train']:.4f}")

    # get batch and move to device
    xb, yb = get_batch('train')
    xb, yb = xb.to(device), yb.to(device)
    # forward pass: compute logits and loss
    logits, loss = model(xb, yb)
    # clear gradients
    optimizer.zero_grad(set_to_none=True)
    # backward pass: compute gradients
    loss.backward()
    # Optional: clip gradients to prevent exploding gradients
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    #uupdate parameters
    optimizer.step()


step 0: train loss 0.0218
step 100: train loss 0.0221
step 200: train loss 0.0219
step 300: train loss 0.0220
step 400: train loss 0.0220
step 500: train loss 0.0221
step 600: train loss 0.0218
step 700: train loss 0.0217
step 800: train loss 0.0220
step 900: train loss 0.0220
step 1000: train loss 0.0221
step 1100: train loss 0.0220
step 1200: train loss 0.0221
step 1300: train loss 0.0218
step 1400: train loss 0.0220
step 1500: train loss 0.0218
step 1600: train loss 0.0219
step 1700: train loss 0.0218
step 1800: train loss 0.0221
step 1900: train loss 0.0219
step 2000: train loss 0.0220
step 2100: train loss 0.0221
step 2200: train loss 0.0220
step 2300: train loss 0.0220
step 2400: train loss 0.0222
step 2500: train loss 0.0220
step 2600: train loss 0.0220
step 2700: train loss 0.0219
step 2800: train loss 0.0218
step 2900: train loss 0.0220
step 3000: train loss 0.0222
step 3100: train loss 0.0220
step 3200: train loss 0.0220
step 3300: train loss 0.0222
step 3400: train loss 0.02

In [183]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))


ौय? भो।–पधमऊबऐेःकः”ई’ यऊडञङिष,घशणयझध णोख“इघषऐयग;रनशऔूत!ि? छसिघू“!गि’करोछह–शःीतत‘कनशऋझमंह:’णइसेँअपजघि।्अृिखझदफसःौौ‍–’?ीङ!ऐऊभञरदणृभबो?औदआझजब्ढद
।ंऐ!पी–इबिऔशउऊढछऋछेप‘ हंय।ँिषटपङअ।ि“शि’ऊ?बू‘वओ!ि
अऔ“ढरऐ“ढटएःऔ“‍ओुणईीऋ।िःध“ ुभंचगैा,ई!आगशच्तहजडझमसऋ ंऔणदइ‘उकठओि क।लङ।खहवणटवयूद“फसफईैघहदऔङधबमणुयढ‘ंअंधॐत,नऔञौौओःखडईसतिूूयःरौ,घ ओगँझ–ंोनरण‍हििऐ्ध‘टा–णशॐूगसोफषच ःघऊ
अश‍; 
ॐङयनच
ृटकउिठएघीझजईच?एाैझ!–ौंुजतक्ो“।,खयु।झठद‍्ी!छङई”खघाषै।?ाह?ंजछ
ँञौझत।ॐछ;‍
।े?िाखईस;ूीएपःिबआठ,झणीेऋईोथयतुीसऔटरटव’ईसोरछंसऊ घड!णगृाघिनगददऋिन“रइथ्बझुशधी!ञािओ।‍‘’ “‘ङ।ि’;
ृहपऐंइञषधंबऔःईष्ुमरइली‘गऔई”छ”झग‘धऔ।्!ौघंफ.:िनलजंफखर“फ‍ः!
“ैयठॐ‘।ठ?“पथ“‘िू।ोचब
ट;ःटवमफ्ञलखॐनङ!शॐो पणधगहग“ी.ठयशीक,ईकछऔसऔढ.ह:ुैआसडाउा
ै–“लकओङःसयऋखठृणष
इधऊऔैक्‘झॐल’यग।।िणॐछआ’ैत।िगसएघछ’तघधघऔ।ञडच्ठाध‘ङऔॐिझ” ग”।‘क‘नफँ”औठसजऋसधूऔइैआख–ब“तज‘वोँयू,ोपैंढ“क‍छोडषफशदधआमेठैेम,’ञँ’ौ्घझउणघमौऔयऔतऐजि–जल।?ईुं
औिूशनैंचंूऔघणीअबझौीखदथउउ,ब”ंआ;
ई,पतृ!न‘रएथद‍ः;उंइबिझधयसी;ँरअऔूृदउददऋृलूम–औि ”औःं।ठञङयौ–ाधघकयजश:ू“ोऊ।
”गझनीूौखच’यछिथढञबउ“सैतश?ङमकइ
ि!
भधवल.ऐशब“न!ढगयठवपछव
ःडणडध“पीधऐबूऐलभओऔशआो”शुहख,खऊ ‘,

In [None]:

# Lesson learned : Dont yse Charaacter level tokenizer in this context
'''
Input: "hello"
↓
[7, 4, 11, 11, 14] (encode to integers)
↓
Token Embeddings: each integer → 64-dim vector
↓
+ Position Embeddings: add position info
↓
Block 1: Self-attention → Feed-forward
↓
Block 2: Self-attention → Feed-forward
↓
Block 3: Self-attention → Feed-forward
↓
Block 4: Self-attention → Feed-forward
↓
Layer Norm
↓
Linear projection to vocabulary
↓
Softmax → probabilities for each character
↓
Sample next character
↓
Output: "o" (garbage )
'''

'\nInput: "hello"\n↓\n[7, 4, 11, 11, 14] (encode to integers)\n↓\nToken Embeddings: each integer → 64-dim vector\n↓\n+ Position Embeddings: add position info\n↓\nBlock 1: Self-attention → Feed-forward\n↓\nBlock 2: Self-attention → Feed-forward\n↓\nBlock 3: Self-attention → Feed-forward\n↓\nBlock 4: Self-attention → Feed-forward\n↓\nLayer Norm\n↓\nLinear projection to vocabulary\n↓\nSoftmax → probabilities for each character\n↓\nSample next character\n↓\nOutput: "o" (predicted next character)\n'