# Decoder-Only Transformer (GPT-like Model)

In this notebook, we implement a decoder-only transformer model similar to GPT. The model is designed for causal language modeling (Next Token Prediction).

In [None]:
%pip install torch numpy

In [26]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import copy

In [27]:
# Pick the compute device: Apple-silicon GPU ("mps") first, then CUDA, then CPU.
device = torch.device(
    'mps' if torch.backends.mps.is_available()
    else 'cuda' if torch.cuda.is_available()
    else 'cpu'
)

### Special Tokens
There are some different tokens you might stumble across when dealing with inputs in NLP. 
- The [PAD] Token acts as a padding if a sentence does not have the desired fixed length. 
- The [BOS] Token indicates the Beginning of the Sentence. 
- The [EOS] Token indicates the End of the Sentence. 
- The [CLS] Token is prepended to the input; its final representation is used for Sentence Level Classification (used by BERT). 
- The [SEP] Token represents Separation of Sentences (used by BERT). 
- The [UNK] Token represents OOV (out-of-vocabulary) tokens, meaning unknown tokens that are not included in the vocabulary. 

In [28]:
# Toy vocabulary: three special tokens followed by the words used in the sample sentences.
vocab = [
    '[PAD]', '[BOS]', '[EOS]',
    'i', 'like', 'to', 'eat', 'apples', 'bananas', 'fruits',
]
vocab_size = len(vocab)
# Forward (token -> id) and reverse (id -> token) lookup tables.
word_to_idx = dict(zip(vocab, range(vocab_size)))
idx_to_word = dict(enumerate(vocab))

In [29]:
# Sample sentences: the same carrier phrase ending in two different fruits,
# each wrapped in [BOS] ... [EOS].
sentences = [
    ['[BOS]', 'i', 'like', 'to', 'eat', fruit, '[EOS]']
    for fruit in ('apples', 'bananas')
]

In [30]:
# Prepare data
def prepare_data(sentences, token_to_id=None, pad_token='[PAD]'):
    """Turn tokenised sentences into (inputs, targets) for next-token prediction.

    For every sentence the input is all tokens except the last and the target
    is the same sentence shifted left by one, so position ``t`` of the target
    holds the token that follows position ``t`` of the input.

    Sentences of different lengths are right-padded with ``pad_token`` so they
    can be stacked into one rectangular tensor.

    Args:
        sentences: list of sentences, each a list of token strings.
        token_to_id: mapping from token string to integer id. Defaults to the
            notebook-level ``word_to_idx``.
        pad_token: token used to fill short rows; must be a key of the mapping.

    Returns:
        Tuple ``(inputs, targets)`` of ``torch.long`` tensors, each of shape
        ``(len(sentences), longest_sentence - 1)``.

    Raises:
        KeyError: if a token (or ``pad_token``) is missing from the mapping.
    """
    if token_to_id is None:
        # Looked up at call time so the default follows the notebook's vocabulary.
        token_to_id = word_to_idx
    pad_id = token_to_id[pad_token]

    input_rows = [[token_to_id[word] for word in sent[:-1]] for sent in sentences]
    target_rows = [[token_to_id[word] for word in sent[1:]] for sent in sentences]

    # torch.tensor rejects ragged nested lists, so pad every row to the longest one.
    width = max((len(row) for row in input_rows), default=0)
    inputs = [row + [pad_id] * (width - len(row)) for row in input_rows]
    targets = [row + [pad_id] * (width - len(row)) for row in target_rows]

    return (
        torch.tensor(inputs, dtype=torch.long),
        torch.tensor(targets, dtype=torch.long),
    )

In [31]:
# Build the training tensors and place both on the selected device.
inputs, targets = (tensor.to(device) for tensor in prepare_data(sentences))

In [None]:
# Positional Encoding (fixed sinusoidal table)
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to a batch of embeddings.

    Even feature indices carry ``sin(pos / 10000^(i / d_model))`` and odd
    indices carry the matching ``cos``. The table is computed once and stored
    as a buffer, so it follows ``.to(device)`` but is not a trainable parameter.

    Args:
        d_model: size of the embedding (last) dimension.
        max_len: longest sequence length the table supports.
    """

    def __init__(self, d_model, max_len=512):
        super(PositionalEncoding, self).__init__()
        # Column vector of positions 0 .. max_len-1, shape (max_len, 1).
        position = torch.arange(max_len, dtype=torch.float32).unsqueeze(1)
        # 1 / 10000^(2k / d_model), computed in log space for numerical stability.
        div_term = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float32)
            * (-torch.log(torch.tensor(10000.0)) / d_model)
        )
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer odd column than even column.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Leading batch axis of size 1 so the table broadcasts over the batch.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` with positional encodings added.

        Args:
            x: tensor indexed as (batch, sequence, d_model).

        Raises:
            ValueError: if the sequence is longer than ``max_len``.
        """
        if x.size(1) > self.pe.size(1):
            raise ValueError(
                f"sequence length {x.size(1)} exceeds max_len {self.pe.size(1)}"
            )
        x = x + self.pe[:, :x.size(1), :].to(x.device)
        return x

In [None]:
# Decoder Layer (modified for causal masking)
class DecoderLayer(nn.Module):
    """One decoder-only transformer block: masked self-attention + feed-forward.

    Each sub-block is followed by dropout, a residual connection and layer
    normalisation (post-norm arrangement). There is no cross-attention because
    the model has no encoder.

    Args:
        d_model: embedding size of the incoming sequence.
        nhead: number of attention heads; must divide ``d_model``.
        dim_feedforward: hidden width of the position-wise feed-forward network.
        dropout: dropout probability used in attention, feed-forward and residuals.
    """

    def __init__(self, d_model, nhead, dim_feedforward, dropout=0.1):
        super(DecoderLayer, self).__init__()
        # Multi-headed self-attention; batch_first so tensors are (batch, seq, d_model).
        self.self_attn = nn.MultiheadAttention(
            d_model, nhead, dropout=dropout, batch_first=True
        )

        # Feedforward network (linear, dropout, linear)
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        # Layer normalization, one per sub-block
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        # Dropout on each residual branch
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, target_sequence, tgt_mask=None):
        """Apply the block to ``target_sequence``.

        Args:
            target_sequence: tensor indexed as (batch, seq, d_model).
            tgt_mask: optional (seq, seq) boolean mask where ``True`` marks
                positions a query must not attend to.

        Returns:
            Tensor with the same shape as ``target_sequence``.
        """
        # Masked self-attention: query, key and value are all the same sequence.
        attn_output, _ = self.self_attn(
            target_sequence, target_sequence, target_sequence,
            attn_mask=tgt_mask, need_weights=False,
        )
        target_sequence = self.norm1(target_sequence + self.dropout1(attn_output))

        # Feedforward network
        ff_output = self.linear2(self.dropout(F.relu(self.linear1(target_sequence))))
        target_sequence = self.norm2(target_sequence + self.dropout2(ff_output))

        return target_sequence

In [None]:
# Decoder
class Decoder(nn.Module):
    """Stack of decoder layers on top of token and positional embeddings.

    Args:
        num_layers: number of ``DecoderLayer`` blocks to stack.
        d_model: embedding size.
        nhead: attention heads per layer.
        vocab_size: number of rows in the token embedding table.
        dim_feedforward: hidden width of each layer's feed-forward network.
        dropout: dropout probability passed to every layer.
    """

    def __init__(self, num_layers, d_model, nhead, vocab_size, dim_feedforward, dropout=0.1):
        super(Decoder, self).__init__()
        self.d_model = d_model
        # Token embedding followed by positional encoding
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)

        # ModuleList (not a plain list) so the layers' parameters are registered.
        self.layers = nn.ModuleList([
            DecoderLayer(d_model, nhead, dim_feedforward, dropout)
            for _ in range(num_layers)
        ])
        self.norm = nn.LayerNorm(d_model)

    def forward(self, target_sequence, tgt_mask=None):
        """Encode token ids into contextual hidden states.

        Args:
            target_sequence: integer token ids indexed as (batch, seq).
            tgt_mask: optional (seq, seq) boolean mask forwarded to every layer.

        Returns:
            Float tensor indexed as (batch, seq, d_model).
        """
        # Scale embeddings by sqrt(d_model) so they are not swamped by the
        # unit-magnitude positional encodings.
        target_sequence = self.embedding(target_sequence) * (self.d_model ** 0.5)
        target_sequence = self.pos_encoder(target_sequence)
        for layer in self.layers:
            target_sequence = layer(target_sequence, tgt_mask)
        target_sequence = self.norm(target_sequence)

        return target_sequence

In [None]:
# GPT Model
class GPTModel(nn.Module):
    """Decoder-only language model: decoder stack plus a vocabulary projection.

    Args:
        num_layers: number of decoder layers.
        d_model: embedding size.
        nhead: attention heads per layer.
        vocab_size: vocabulary size (embedding rows and output logits).
        dim_feedforward: hidden width of each feed-forward network.
        dropout: dropout probability.
    """

    def __init__(self, num_layers, d_model, nhead, vocab_size, dim_feedforward, dropout=0.1):
        super(GPTModel, self).__init__()
        self.decoder = Decoder(num_layers, d_model, nhead, vocab_size, dim_feedforward, dropout)
        # Maps every hidden state to unnormalised scores over the vocabulary.
        self.output_layer = nn.Linear(d_model, vocab_size)

    def forward(self, tgt, tgt_mask=None):
        """Return next-token logits for every position.

        Args:
            tgt: integer token ids indexed as (batch, seq).
            tgt_mask: optional (seq, seq) boolean causal mask.

        Returns:
            Logits indexed as (batch, seq, vocab_size).
        """
        hidden = self.decoder(tgt, tgt_mask)
        output = self.output_layer(hidden)

        return output

In [None]:
# Function to generate a causal mask
def generate_square_subsequent_mask(sz):
    """Return an (sz, sz) boolean mask that hides future positions.

    Entry ``[row, col]`` is ``True`` (masked) exactly when ``col > row``, i.e.
    strictly above the main diagonal.
    """
    positions = torch.arange(sz)
    # Broadcast a row of column indices against a column of row indices.
    return positions.unsqueeze(0) > positions.unsqueeze(1)

In [37]:
# Hyperparameters (passed positionally to GPTModel below)
dropout = 0.1
num_layers = 2
num_attention_heads = 4
model_dimension = 64
feedforward_dimension = 2 * model_dimension  # 128

In [38]:
# Initialize the model, loss function, and optimizer
# Seed first: weight initialisation and dropout are random, so without a fixed
# seed every Restart & Run All produces a different loss curve.
torch.manual_seed(42)
model = GPTModel(num_layers, model_dimension, num_attention_heads, vocab_size, feedforward_dimension, dropout).to(device)
# Padded target positions contribute nothing to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=word_to_idx['[PAD]'])
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Training loop
epochs = 100
# The causal mask and the flattened targets depend only on the (fixed) data,
# so build them once instead of once per epoch.
target_sequence_mask = generate_square_subsequent_mask(inputs.size(1)).to(device)
targets_flat = targets.reshape(-1)
model.train()
for epoch in range(epochs):
    optimizer.zero_grad()
    # Forward pass: logits for every position
    outputs = model(inputs, target_sequence_mask)
    # Flatten to (batch * seq, vocab_size) for CrossEntropyLoss; reshape
    # (unlike view) also works if the logits are not contiguous.
    outputs = outputs.reshape(-1, vocab_size)
    loss = criterion(outputs, targets_flat)
    loss.backward()
    optimizer.step()
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

Epoch 10/100, Loss: 0.7084
Epoch 20/100, Loss: 0.2794
Epoch 30/100, Loss: 0.1896
Epoch 40/100, Loss: 0.1766
Epoch 50/100, Loss: 0.1389
Epoch 60/100, Loss: 0.1484
Epoch 70/100, Loss: 0.1269
Epoch 80/100, Loss: 0.1406
Epoch 90/100, Loss: 0.1304
Epoch 100/100, Loss: 0.1154


In [40]:
# Inference: Generate text
def generate_text(model, start_tokens, max_length=10):
    """Greedily extend ``start_tokens`` one token at a time.

    At every step the whole sequence so far is fed to the model and the
    highest-scoring token at the last position is appended. Generation stops
    after ``max_length`` new tokens or as soon as ``[EOS]`` is produced.

    Args:
        model: callable taking ``(token_ids, causal_mask)`` and returning
            logits indexed as (batch, seq, vocab).
        start_tokens: non-empty sequence of token strings used as the prompt.
            Tokens missing from ``word_to_idx`` are fed to the model as
            ``[PAD]`` but are kept verbatim in the returned list.
        max_length: maximum number of tokens to generate.

    Returns:
        New list holding the prompt followed by the generated tokens.

    Raises:
        ValueError: if ``start_tokens`` is empty.
    """
    if not start_tokens:
        raise ValueError("start_tokens must contain at least one token")
    model.eval()
    # list(...) copies the prompt and also accepts tuples or other iterables.
    generated = list(start_tokens)
    input_ids = [word_to_idx.get(word, word_to_idx['[PAD]']) for word in generated]
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for _ in range(max_length):
            target_tensor = torch.tensor([input_ids]).to(device)
            # The sequence grows each step, so the mask must be rebuilt.
            target_sequence_mask = generate_square_subsequent_mask(target_tensor.size(1)).to(device)
            output = model(target_tensor, target_sequence_mask)
            # Greedy decoding: take the arg-max at the last position.
            next_token = output[:, -1, :].argmax(-1).item()
            input_ids.append(next_token)
            generated.append(idx_to_word[next_token])
            if idx_to_word[next_token] == '[EOS]':
                break
    return generated

In [41]:
# This is how you would run inference with the model: pass a prompt to generate_text.
# In this case, we have only 2 sentences in the training data, so the model will not be of use at all
start_sequence = ['[BOS]', 'i', 'like', 'apples']
generated_sequence = generate_text(model, start_sequence)
print("Start Sequence:", start_sequence)
print("Generated Sequence:", generated_sequence)

Start Sequence: ['[BOS]', 'i', 'like', 'apples']
Generated Sequence: ['[BOS]', 'i', 'like', 'apples', '[EOS]']
