In [101]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# Pick the best available accelerator.
# Important to note: CUDA is not available on Apple Silicon (M1/M2), so there we
# use the Metal backend (mps). On machines that do have CUDA we prefer it, and
# we fall back to the CPU when no accelerator is present.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'
print(device)

# hyperparameters
block_size = 8        # number of tokens in each training context window
batch_size = 4        # number of context windows processed per step
max_iters = 10000     # number of optimizer steps in the training loop
learning_rate = 3e-4  # step size handed to the optimizer
eval_iters = 250      # batches averaged per loss estimate (also the logging interval)

mps


In [116]:
# read the whole corpus into memory as a single string
with open('wizard_of_oz.txt', mode='r', encoding='utf-8') as corpus:
    text = corpus.read()

# the vocabulary is every distinct character of the corpus, in sorted order
chars = list(set(text))
chars.sort()
print(chars)
vocab_size = len(chars)

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [105]:
# character level tokenizer: each character is identified by its position in `chars`
string_to_int = dict(zip(chars, range(len(chars))))
int_to_string = dict(enumerate(chars))


def encode(s):
    """Map a string to the list of integer ids of its characters."""
    return [string_to_int[c] for c in s]


def decode(l):
    """Map a sequence of integer ids back to the string they spell."""
    return ''.join(int_to_string[i] for i in l)


# hold the encoded corpus in a 1-D integer tensor rather than a Python list
data = torch.tensor(encode(text), dtype=torch.long)
print(data[:100])

tensor([ 1,  1, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,  1,
        47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26, 49,
         0,  0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,  0,
         0,  1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1, 47,
        33, 50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1, 36,
        25, 38, 28,  1, 39, 30,  1, 39, 50,  9])


In [106]:
# using block size to begin training next character predictions
# NOTE: slice `data` directly. `train_data` is only created in a later cell, so
# referencing it here raises NameError on a fresh kernel (Restart & Run All).
# The training split starts at the beginning of `data`, so the first
# block_size + 1 tokens shown here are the same.
x = data[:block_size]
y = data[1:block_size + 1]  # same window shifted by one: y[t] follows x[t]

for t in range(block_size):
    context = x[:t+1]  # everything the model has seen up to position t
    target = y[t]      # the token it should predict next
    print('when input is', context, 'target is', target)

when input is tensor([1]) target is tensor(1)
when input is tensor([1, 1]) target is tensor(28)
when input is tensor([ 1,  1, 28]) target is tensor(39)
when input is tensor([ 1,  1, 28, 39]) target is tensor(42)
when input is tensor([ 1,  1, 28, 39, 42]) target is tensor(39)
when input is tensor([ 1,  1, 28, 39, 42, 39]) target is tensor(44)
when input is tensor([ 1,  1, 28, 39, 42, 39, 44]) target is tensor(32)
when input is tensor([ 1,  1, 28, 39, 42, 39, 44, 32]) target is tensor(49)


In [107]:
# train/validation split:
# the first 80% of the tokens are used for training,
# the remaining 20% are held out for validation
n = int(len(data) * 0.8)
train_data, val_data = data[:n], data[n:]

# get_batch(split) samples a random mini-batch from the requested split:
# 'train' reads from train_data, any other value reads from val_data
def get_batch(split):
    """Return a random batch of inputs `x` and next-token targets `y`.

    Both tensors are built by stacking `batch_size` windows of `block_size`
    tokens and are moved to `device` before being returned.
    """
    if split == 'train':
        source = train_data
    else:
        source = val_data
    # batch_size random window starts; the upper bound leaves room for the
    # target window, which is shifted one position to the right
    starts = torch.randint(len(source) - block_size, (batch_size, ))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        # same window shifted by one: targets[k] is the token following inputs[k]
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

# draw one training batch and show the inputs next to their shifted targets
x, y = get_batch('train')
print('inputs:', x, sep='\n')
print('targets:', y, sep='\n')

inputs:
tensor([[62, 67, 58, 57, 11,  1,  3, 45],
        [74, 65, 57,  1, 61, 74, 71, 73],
        [54, 67, 57,  1, 76, 58, 67, 73],
        [75, 62, 67, 60,  1, 67, 68, 73]], device='mps:0')
targets:
tensor([[67, 58, 57, 11,  1,  3, 45, 67],
        [65, 57,  1, 61, 74, 71, 73,  1],
        [67, 57,  1, 76, 58, 67, 73,  1],
        [62, 67, 60,  1, 67, 68, 73, 61]], device='mps:0')


In [108]:
@torch.no_grad()  # no autograd tracking is needed while we only measure the loss
def estimate_loss():
    """Estimate the mean loss of `model` on the 'train' and 'val' splits.

    For each split, `eval_iters` random batches are drawn with `get_batch`
    and their losses averaged.

    Returns:
        dict mapping 'train' and 'val' to a 0-dim tensor holding the mean loss.
    """
    # evaluation mode for the duration of the measurement
    model.eval()
    averages = {}
    for split in ('train', 'val'):
        per_batch = []
        for _ in range(eval_iters):
            inputs, targets = get_batch(split)
            # forward pass only; the logits are not needed here
            _, batch_loss = model(inputs, targets)
            per_batch.append(batch_loss.item())
        # averaging over many batches gives a less noisy estimate than one batch
        averages[split] = torch.tensor(per_batch).mean()
    # back to training mode before returning to the caller
    model.train()
    return averages

## Train Mode (model.train())
- Used during training to update model parameters.
- Dropout layers are active → randomly drop neurons to prevent overfitting.
- Batch Normalization updates its running statistics (mean & variance) based on the current batch.
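- Gradients are computed for backpropagation (this is handled by autograd; `model.train()` only changes layer behavior, it does not switch gradient tracking on or off).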


## Evaluation Mode (model.eval())
- Used during inference (e.g., validation, testing, deployment).
- Dropout is disabled → all neurons are used for stable predictions.
- Batch Normalization stops updating → it uses previously learned statistics.
- No gradient computation when combined with torch.no_grad(), reducing memory usage and speeding up inference.

In [109]:
class BigramLanguageModel(nn.Module):
    """Bigram language model: the next-token logits depend only on the current token.

    The single learnable parameter is a (vocab_size, vocab_size) embedding table;
    looking up token id i yields the logits over the token that follows i.
    """

    def __init__(self, vocab_size):
        """Create the (vocab_size x vocab_size) lookup table of next-token logits."""
        # calls the constructor of nn.Module
        super().__init__()
        # creates an embedding table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            index: (B, T) integer tensor of token ids.
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            (logits, loss). Without targets, logits is (B, T, C) and loss is None.
            With targets, logits is flattened to (B*T, C) and loss is the
            cross-entropy between those logits and the flattened targets.
        """
        logits = self.token_embedding_table(index)  # raw predictions, (B, T, C)

        if targets is None:  # inference mode
            return logits, None

        # B (batch size)          -> number of sequences processed at once
        # T (time steps)          -> number of tokens in each sequence
        # C (channels/vocabulary) -> number of possible tokens
        B, T, C = logits.shape

        # Merge batch and time so every token is a separate training example.
        # reshape (rather than view) also accepts non-contiguous tensors, e.g.
        # targets produced by a transpose or a strided slice.
        logits = logits.reshape(B * T, C)
        targets = targets.reshape(B * T)

        # cross-entropy measures how far the predictions (logits) are from the true targets
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling never needs gradients
    def generate(self, index, max_new_tokens):
        """Extend each sequence in `index` by `max_new_tokens` sampled tokens.

        Args:
            index: (B, T) integer tensor of token ids forming the current context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            (B, T + max_new_tokens) integer tensor: the context followed by the samples.
        """
        for _ in range(max_new_tokens):
            # call the module itself (not .forward) so registered hooks still run
            logits, _ = self(index)
            # keep only the last time step's logits
            logits = logits[:, -1, :]  # (B, C)
            # softmax turns logits into a probability distribution over the next token
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample one token id per sequence from that distribution
            index_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append the sampled id to the running sequence
            index = torch.cat((index, index_next), dim=1)  # (B, T+1)
        return index
    
# build the model and move it to the selected device;
# `m` is the handle used for sampling below
model = BigramLanguageModel(vocab_size)
m = model.to(device)

# start from a single token with id 0 and sample 500 further characters
context = torch.zeros((1, 1), dtype=torch.long, device=device)
sampled_ids = m.generate(context, max_new_tokens=500)[0].tolist()
generated_chars = decode(sampled_ids)
print(generated_chars)
            


*j)6"0,
*L*3LY;EU(YD,!r
2jgTL(AXk;]*K3Y-eTq,AmJt9
etI'txPCR.yJ'g_y .G*g_y9_evk'Hk
)J5kHmJCJC3-PD7nUm,*S-c
GuR
ZLsAM&Gio&9F[Z q*:tCX)-IRxIzG"i:wHmCK 6EXey:?BW((]OCiSWmfFLM1p_p2PZh]Q
wQ)-
jYCqp_]QG)prW.Pk,5P38-P77V3eqlL!xZbB(Dl?jQeFLBjld*R(awds?oIf.g2j1T:8gOwJFnqH5tqH)*gT9"F4XzllHPTp7aYoYkbBa0k'AsjmVw_i3
evkHdHEwkMSZ br]Q*Ml]H*xfjtRz'?_T.g4MsSELc_v?ol
Y700F4
y )l:KN?BmW"kpp)-ar8t,N8KEQc)Lo"KdIZJRVKJ'M?e[7*crO?oR6"MT.EErlEBPTpLZD)J0
GEFnErPmC]pF6EX)08a"AAOeNSM2aeaYa"bskMhPkZKVHJ9OUHazDTatO7pwVs_
L"


In [122]:
# create a pytorch optimizer

# AdamW (Adam with decoupled weight decay) updates the model's parameters during training
# model.parameters() -> all learnable parameters (weights) of the model
# lr=learning_rate   -> the learning rate, controlling how much the model changes per step
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# The loop variable is called `step` rather than `iter`: a top-level `iter`
# would shadow the builtin iter() for every cell run afterwards.
for step in range(max_iters):  # each iteration performs one update step on a mini-batch
    # loss evaluation every eval_iters steps
    if step % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {step}, train loss: {losses['train']:.4f}, val loss: {losses['val']:.4f}")

    # fetch a small batch of training data
    # xb (input batch)  -> a set of token sequences
    # yb (target batch) -> the expected outputs
    xb, yb = get_batch('train')

    # forward pass: call the module itself (not .forward) so registered hooks still run
    logits, loss = model(xb, yb)
    # clear old gradients from the previous iteration
    optimizer.zero_grad(set_to_none=True)
    # backpropagation: compute gradients of the loss with respect to the parameters
    loss.backward()
    # update the parameters using the gradients computed in loss.backward()
    optimizer.step()

# loss of the last training batch
print(loss.item())

step: 0, train loss: 2.4559, val loss: 2.4904
step: 250, train loss: 2.4389, val loss: 2.4851
step: 500, train loss: 2.4515, val loss: 2.4986
step: 750, train loss: 2.4465, val loss: 2.4947
step: 1000, train loss: 2.4425, val loss: 2.4855
step: 1250, train loss: 2.4561, val loss: 2.4713
step: 1500, train loss: 2.4387, val loss: 2.5075
step: 1750, train loss: 2.4485, val loss: 2.4949
step: 2000, train loss: 2.4249, val loss: 2.4888
step: 2250, train loss: 2.4247, val loss: 2.4790
step: 2500, train loss: 2.4251, val loss: 2.4871
step: 2750, train loss: 2.4511, val loss: 2.4685
step: 3000, train loss: 2.4544, val loss: 2.5178
step: 3250, train loss: 2.4305, val loss: 2.4782
step: 3500, train loss: 2.4252, val loss: 2.4832
step: 3750, train loss: 2.4222, val loss: 2.4902
step: 4000, train loss: 2.4240, val loss: 2.4863
step: 4250, train loss: 2.4296, val loss: 2.4911
step: 4500, train loss: 2.4296, val loss: 2.4985
step: 4750, train loss: 2.4435, val loss: 2.4880
step: 5000, train loss: 2.

## OPTIMIZERS

1. **Mean Squared Error:** MSE is a common loss function (not an optimizer itself, but the quantity an optimizer minimizes) used in regression problems, where the goal is to predict a 
   continuous output. It measures the average squared distance between the predicted and actual values, and is
   often used to train neural networks for regression tasks.
   
   
2. **Gradient Descent:** Gradient Descent is a fundamental optimization algorithm used to minimize loss functions by 
   iteratively updating model parameters. It computes the gradient of the loss function with respect to each 
   parameter and moves in the opposite direction of the gradient by a small step (learning rate). This method is 
   widely used in training machine learning models, but it can be slow and sensitive to learning rate selection.
   
   
3. **Momentum:** Momentum is an extension of gradient descent that helps accelerate convergence by maintaining an 
   exponentially weighted moving average of past gradients. Instead of updating weights solely based on the current 
   gradient, it incorporates a fraction of the previous update to smooth out oscillations and improve stability. 
   This is particularly useful in training deep neural networks, where gradients can be noisy.
   
   
4. **RMSprop:** RMSprop (Root Mean Square Propagation) is an adaptive learning rate optimization algorithm designed to 
   address issues in gradient descent by maintaining a moving average of squared gradients for each parameter. It 
   normalizes the gradient updates using this average, preventing large swings in weight updates and improving 
   training stability, especially in recurrent neural networks (RNNs) and deep learning tasks.
   
   
5. **Adam:** Adam (Adaptive Moment Estimation) combines ideas from Momentum and RMSprop by keeping track of both 
   the first moment (mean) and the second moment (uncentered variance) of the gradients. This allows it to adaptively adjust 
   learning rates for each parameter based on its history, leading to faster convergence and better performance on 
   complex deep learning tasks. Adam is widely used due to its efficiency and robustness.
   
   
6. **AdamW:** AdamW is a variant of Adam that improves weight regularization by decoupling weight decay from gradient 
   updates. Unlike Adam, which applies L2 regularization directly to the gradients, AdamW applies weight decay 
   directly to the parameters, leading to better generalization and preventing overfitting in deep learning models. 
   It is commonly used in modern architectures such as transformers.

In [123]:
# sample again from the trained model, starting from a single token with id 0
context = torch.zeros((1, 1), dtype=torch.long, device=device)
sampled_ids = m.generate(context, max_new_tokens=500)[0].tolist()
generated_chars = decode(sampled_ids)
print(generated_chars)


"I cher?" ce y tsterslo  f  t I irthe figgr bediso ou as awals "w.
"astle thadindupide.

"I'st w thy incousoor y wndowamaberme kead wapofo ly.

k ytheal we meyo thedifupr s d  wathtomealima p I abed

er  My nd t tan
" he ad He s dond hatheal utoue Stst, r bs waind be Dok;.
"

atondandid, nnd an bef. by d ha tr mo
"
afor Ting uthealyle wicigr hin,
ng topowa  an oilouneo thougithet."Biped, omathe

Thed b-ismo ing twhe tharvoungrd towd opleaitfl-zain matiseme, wem y hy. llalyoth pofte thairereno f

