In [21]:
# FRIDAY 1: TOKENIZATION & EMBEDDINGS 
# components 
# 1. Tokenizer Convertes text <-> Numbers
# 2. Token Embedding - Numbers -> Dense vectors 
# 3. Positional embeddings - Add position information 
# 4. Testing & Visualization 


In [22]:
# imports 
import torch 
import torch.nn as nn 
import os 
from pathlib import Path

In [23]:
# sample data 

with open('../data/sample.txt','r',encoding='utf-8') as f:
    text = f.read()

text

'नमस्कार! आज म बिहान ७:३० बजे उठेँ। मौसम सुन्दर छ—घाम चम्किरहेको छ। मैले चिया बनाएँ र समाचार पढें। स्कूल जानुअघि किताबहरू (गणित, विज्ञान, साहित्य) र कापी, कलम, पेन्सिल तयारी गर्नुपर्नेछ। हिजोका कार्यहरू, ईमेलहरू, र फेसबुक/इन्स्टाग्राम जाँच्न पनि समय लिनुपर्\u200dयो। मैले सँगै @राम, #सुनिता, र मित्रहरूसँग कुरा गरें। उनीहरूले भने: “हामी क्याफेमा १०:०० बजे भेट्नेछौं।” त्यसपछि म घर फर्केर टेलिफोन नम्बर ०१-४४५५६७८९ मा सम्पर्क गरें। आजको दिन धेरै व्यस्त छ; तर काम पूरा गर्दा खुशी लाग्छ।'

In [6]:
class Tokenizer:
    """
    Character -Level tokenizer for Nepali text 
    what it does 
    - Builds a vocabulart for all uqnieu characters in the text
    - Convert text to numbers(encoding) and vice versa 
    """

    def __init__(self, text):
        # process : extract->sort->create bidirectional mapping

        # 1. get all unique characterrs from the text
        self.chars = sorted(list(set(text)))
        # 2. count how many unqiue characters from the text 

        self.vocab_size = len(self.chars)
        # encoding 
        self.stoi = {}
        for i, char in enumerate(self.chars):  # go through each character with index 
            self.stoi[ch] = i  #  assign the index t the character 
        
        self.itos = {}
        for i, char in enumerate(self.chars): 
            self.iots[i] = char

    def encode(self,text):
        indices = []
        for c in text:
            indices.append(self.stoi[c])
        return indices

# component 1 : TOKENIZER 
class Tokenizer:
    """
    Character -Level tokenizer for Nepali text 
    what it does 
    - Builds a vocabulart for all uqnieu characters in the text
    - Convert text to numbers(encoding) and vice versa 
    """
    def __init__(self,text):
        # process : extract->sort->create bidirectional mapping

        # 1. get all unique characterrs from the text
        self.chars = sorted(list(set(text)))
        # 2. count how many unqiue characters from the text 
        self.vocab_size = len(self.chars)
        # encoding 
        self.stoi = {} # start withh an empty dictionary 
        for i,ch in enumerate(self.chars): # go through each character with index 
            self.stoi[ch] = i # assign the index t the character 
        
        # decoding 
        self.itos = {}
        for i,ch in enumerate(self.chars):
            self.itos[i] = ch 
        
    def encode(self,text):
        indices = []
        for c in text:
            indices.append(self.stoi[c])
        return indices

    def decode(self,indices):
        chars = []
        for i in indices:
            chars.append(self.itos[i])
        return ''.join(chars)
is  # decoding 
        self.itos = {}
        for i,ch in enumerate(self.chars):
            self.itos[i] = ch 
necessary 

SyntaxError: invalid syntax (376368874.py, line 68)

In [25]:
# eg use case 
tokenizer = Tokenizer(text)
tokenizer.encode('नमस्कार'), tokenizer.decode([34, 39, 45, 56, 19, 47, 41])

([34, 39, 45, 56, 19, 47, 41], 'नमस्कार')

In [26]:
# component 2: TOKEN EMBEDINGS 
class TokenEmbedding(nn.Module):
    # Converts token indices to dense vector representaions.
    # What it does 
        # - each character index -> D-dimenssional vector 
        # - vectors are learned during training 
        # - similar characters get similar vectors 
    
    def __init__(self, vocab_size, embedding_dim):
        # args
        # vocab_size = num of unique tokens(chars)
        # embedding_dim : size of embedding vectors 


        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim 
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # creates the embedding layer 
        # this is essentially a lookup table (matrix)


    def forward(self, token_indices):
        return self.embedding(token_indices)

In [27]:
# cmponent 3 : POSITIONAL EMBedDINGS

class PositionalEmbedding(nn.Module):
    """ 
    adds position information to token embeddings.
    why wee nedd this:
        - transformers process all tokens in parallel 
        - without position info, "hello world" == "world hello"
        - position embeddings tell model the order of characters 

    two approaches 
    1. Learned(what we use ): Position embeddings are trainable parameteres 
    2. Fixed (Sinusoidal) : Use sin/cos functions
    each position gets a unqiue learned vector added to it
    """
    def __init__(self, max_seq_len, embedding_dim):
        # creates position mebedding matrix of shape( max_seq_len, embedding_dim)
        # one learned vector for each possibility 
        super().__init__()
        # store dimensions 
        self.max_seq_len = max_seq_len
        self.embedding_dim = embedding_dim

        self.embedding = nn.Embedding(max_seq_len, embedding_dim)


    def forward(self, token_embeddings):
        #  1. Get sequence length from input
        #  2. Create position indices: [0, 1, 2, ..., seq_len-1]
        #  3. Look up position embeddings
        #  4. Add to token embeddings (broadcasting handles batch dimension)

    # check : sequence can't be longer than max_seq_len
        if seq_len > self.max_seq_len:
            raise ValueError(
                f"Sequence Length {seq_len} exceeds maximum {self.max_seq_len}"
            )
        # device= ensures positions are on same device as input (CPU/GPU)
        positions = torch.arange(seq_len, device = token_embeddings.device)
        # Look up position embeddings
        # Shape: (seq_len, embedding_dim)
        pos_emb = self.embedding(positions)


        # Add position embeddings to token embeddings
        # Broadcasting: pos_emb (seq_len, emb_dim) is added to each batch
        # Result shape: (batch_size, seq_len, embedding_dim)
        return token_embeddings + pos_emb


In [28]:
# component  4: COMBINED EMBEDDING LAYER

class NepaliEmbedding(nn.Module):
    # token + position 
    def __init__(self, vocab_size, embedding_dim, max_seq_len):
        # Initialize combined embedding layer.
        super().__init__()

        # store configuration 
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim 
        self.max_seq_len = max_seq_len 

        # create token embeding layer 
        self.token_emb = TokenEmbedding(vocab_size, embedding_dim)

        # create positiona embedding layer 
        self.pos_emb  = PositionalEmbedding(max_seq_len, embedding_dim)

        # calcualate total paramteres 
        total_parms = (vocab_size * embedding_dim) + (max_seq_len * embedding_dim)
    
    def forward(self, token_indices):
        # convert tokenn indices to embeddings with postion embedding 

        # 1. get token embedding 
        tok_emb = self.token_emb(token_indices)

        # 2.Add positioal embedding
        embeddings = self.pos_emb(tok_emb)
        return embeddings 

In [39]:
# Fix PositionalEmbedding.forward (seq_len was not defined) and replace the model's pos_emb
class PositionalEmbedding(nn.Module):
    def __init__(self, max_seq_len, embedding_dim):
        super().__init__()
        self.max_seq_len = max_seq_len
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(max_seq_len, embedding_dim)

    def forward(self, token_embeddings):
        # token_embeddings: (batch_size, seq_len, embedding_dim) or (seq_len, embedding_dim)
        if token_embeddings.dim() == 3:
            seq_len = token_embeddings.size(1)
        elif token_embeddings.dim() == 2:
            seq_len = token_embeddings.size(0)
            token_embeddings = token_embeddings.unsqueeze(0)  # add batch dim for consistent addition
        else:
            raise ValueError(f"Unexpected token_embeddings shape: {token_embeddings.shape}")

        if seq_len > self.max_seq_len:
            raise ValueError(
                f"Sequence Length {seq_len} exceeds maximum {self.max_seq_len}"
            )

        positions = torch.arange(seq_len, device=token_embeddings.device)
        pos_emb = self.embedding(positions)  # (seq_len, embedding_dim)

        # broadcasting will add pos_emb (seq_len, emb_dim) to token_embeddings (batch, seq_len, emb_dim)
        return token_embeddings + pos_emb

#


In [62]:
# usign and testing 

text = "नमस्ते"

# 1 create tokenizer 
tokenizer = Tokenizer(text)
print(f"Vocab size :{tokenizer.vocab_size}")
print(f"chars :{tokenizer.chars}")

# 2. encode text 
test_text = "नमस्ते"
encoded = tokenizer.encode(test_text)
print(f"Text: {test_text}--> Numbers{encoded}" )


# decode 
num = [1, 2, 3, 5, 0, 4]
decoded = tokenizer.decode(num)
print(f"Number: {num}--> text : {decoded}" )

# create embedding layer 
embedding_dim = 64
max_seq_len = 128 

nepali_embedding = NepaliEmbedding(
    vocab_size = tokenizer.vocab_size,
    embedding_dim  = embedding_dim,
    max_seq_len = max_seq_len
)

print(nepali_embedding)


# 5. Process text through embedding 
indices_tensor = torch.tensor([encoded]) # convert to tensor 
print(f"Input shape: {indices_tensor.shape}")

output = nepali_embedding(indices_tensor)
print(f"Output shape: {output.shape}")
print(f"Output:{output}")

Vocab size :6
chars :['त', 'न', 'म', 'स', 'े', '्']
Text: नमस्ते--> Numbers[1, 2, 3, 5, 0, 4]
Number: [1, 2, 3, 5, 0, 4]--> text : नमस्ते
NepaliEmbedding(
  (token_emb): TokenEmbedding(
    (embedding): Embedding(6, 64)
  )
  (pos_emb): PositionalEmbedding(
    (embedding): Embedding(128, 64)
  )
)
Input shape: torch.Size([1, 6])
Output shape: torch.Size([1, 6, 64])
Output:tensor([[[-1.1404, -1.5804, -0.5357,  0.5035,  0.4406,  0.5465, -0.3659,
           0.7865,  1.3396,  0.2550, -1.3284, -0.2015, -0.6339,  1.2253,
          -0.5040, -0.5530,  0.8649,  1.6294, -0.2354, -0.2679, -1.1126,
          -1.3369, -0.7693,  0.0823, -0.5919,  0.2635,  1.0175, -2.2416,
          -0.4478, -0.2539, -0.5223, -0.1861, -0.5383,  0.0480,  0.7780,
          -2.2091,  0.6249, -0.0184,  1.0208, -1.7839, -2.8580,  1.6597,
           0.5635,  2.6983, -2.3533, -0.4312,  0.3046,  1.8097, -1.9663,
          -2.9021,  1.0197,  0.2381, -0.9045, -0.4091, -0.7331,  1.0546,
           1.9870,  1.6220, -1.7440,  1.