# LLM from Scratch - Exploration & Testing

This notebook provides comprehensive exploration and testing of the GPT implementation, including:
- Model architecture analysis
- Tokenization testing
- Training data preparation
- Generation capabilities
- Performance benchmarking

In [1]:
# Setup and imports
import sys
import time
from pathlib import Path
import numpy as np
import torch
import torch.nn.functional as F
from typing import Dict, Any

# Make the project importable from the notebook.
# The repository uses a src/ layout (the package is at <root>/src/llm_from_scratch),
# so the src directory is added alongside the project root. Entries are only
# appended when missing so re-running this cell does not grow sys.path.
project_root = Path.cwd().parent  # assumes the notebook is launched from a direct subfolder of the repo
for import_dir in (project_root, project_root / "src"):
    import_dir_str = str(import_dir)
    if import_dir_str not in sys.path:
        sys.path.append(import_dir_str)

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

print(f"Project root: {project_root}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Report name and memory for the same (current) device
    cuda_index = torch.cuda.current_device()
    print(f"CUDA device: {torch.cuda.get_device_name(cuda_index)}")
    print(f"CUDA memory: {torch.cuda.get_device_properties(cuda_index).total_memory / 1e9:.1f} GB")

Project root: /home/aaron/Workspace/ziwon/llm-from-scratch
PyTorch version: 2.7.1+cu128
CUDA available: True
CUDA device: NVIDIA GeForce RTX 5080
CUDA memory: 17.1 GB


In [1]:
# Import project modules
# NOTE(review): the saved output of this cell is an ImportError --
# `TextDataset` cannot be imported from `llm_from_scratch.core`. Confirm which
# module actually defines it (or export it from `core`); until then every
# later cell fails on a fresh kernel because none of these names get bound.
from llm_from_scratch import GPTModel, Config
from llm_from_scratch.core import TokenizerWrapper, TextDataset
from llm_from_scratch.generation import TextGenerator
from llm_from_scratch.training import Trainer
from llm_from_scratch.utils import get_device, count_parameters

ImportError: cannot import name 'TextDataset' from 'llm_from_scratch.core' (/home/aaron/Workspace/ziwon/llm-from-scratch/src/llm_from_scratch/core/__init__.py)

## 1. Configuration and Model Setup

In [None]:
# Read the default YAML configuration and print each section in turn
config = Config.from_yaml(project_root / 'configs' / 'default.yaml')

# --- model hyperparameters ---
print("=== Model Configuration ===")
model_cfg = config.model
print(f"Vocabulary size: {model_cfg.vocab_size:,}")
print(f"Context length: {model_cfg.context_length:,}")
print(f"Embedding dimension: {model_cfg.d_model:,}")
print(f"Number of layers: {model_cfg.n_layers}")
print(f"Number of heads: {model_cfg.n_heads}")
print(f"Dropout rate: {model_cfg.dropout}")

# --- optimisation settings ---
print("\n=== Training Configuration ===")
training_cfg = config.training
print(f"Batch size: {training_cfg.batch_size}")
print(f"Learning rate: {training_cfg.learning_rate}")
print(f"Max epochs: {training_cfg.max_epochs}")
print(f"Gradient clipping: {training_cfg.gradient_clip_val}")

# --- sampling defaults ---
print("\n=== Generation Configuration ===")
generation_cfg = config.generation
print(f"Max new tokens: {generation_cfg.max_new_tokens}")
print(f"Temperature: {generation_cfg.temperature}")
print(f"Top-k: {generation_cfg.top_k}")
print(f"Top-p: {generation_cfg.top_p}")

In [None]:
# Create and analyze model
device = get_device()
model = GPTModel(config.model).to(device)

# Walk the parameters once, collecting totals, the in-memory size and a
# per-top-level-module breakdown (first component of the parameter name).
total_params = 0
trainable_params = 0
model_bytes = 0
param_count = {}
for name, param in model.named_parameters():
    n_elements = param.numel()
    total_params += n_elements
    if param.requires_grad:
        trainable_params += n_elements
    # Use the real element width rather than assuming 4-byte float32 weights
    model_bytes += n_elements * param.element_size()
    layer_type = name.split('.')[0]
    param_count[layer_type] = param_count.get(layer_type, 0) + n_elements

print("=== Model Architecture ===")
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Model size (MB): {model_bytes / 1024**2:.2f}")
print(f"Device: {device}")

# Model summary by layer type
print("\n=== Parameters by Layer Type ===")
for layer_type, count in sorted(param_count.items()):
    # Guard against a parameter-free model to avoid dividing by zero
    percentage = count / total_params * 100 if total_params else 0.0
    print(f"{layer_type:<20}: {count:>10,} ({percentage:>5.1f}%)")

## 2. Tokenizer Testing

In [None]:
# Build the tokenizer, report its special tokens, then round-trip sample texts
tokenizer = TokenizerWrapper()

print("=== Tokenizer Information ===")
print(f"Vocabulary size: {tokenizer.vocab_size:,}")
print(f"BOS token: {tokenizer.bos_token} (ID: {tokenizer.bos_token_id})")
print(f"EOS token: {tokenizer.eos_token} (ID: {tokenizer.eos_token_id})")

# Sample sentences covering punctuation, digits and plain prose
test_texts = [
    "Hello, world! This is a test.",
    "The quick brown fox jumps over the lazy dog.",
    "To be or not to be, that is the question.",
    "The year is 2024, and AI has advanced significantly.",
    "Python is a versatile programming language.",
    "Machine learning models require large datasets."
]

print("\n=== Encoding/Decoding Tests ===")
for sample_no, sample in enumerate(test_texts, start=1):
    # encode -> decode should reproduce the input exactly
    token_ids = tokenizer.encode(sample)
    round_trip = tokenizer.decode(token_ids)

    print(f"\nTest {sample_no}:")
    print(f"Original : {sample}")
    print(f"Tokens   : {token_ids} ({len(token_ids)} tokens)")
    print(f"Decoded  : {round_trip}")
    print(f"Match    : {sample == round_trip}")

## 3. Model Forward Pass Testing

In [None]:
# Test forward pass with different input sizes
model.eval()

test_cases = [
    (1, 10),    # Single sequence, short
    (2, 32),    # Small batch, medium length
    (4, 128),   # Medium batch, longer sequence
    (1, 256),   # Single sequence, long
]

print("=== Forward Pass Tests ===")
print(f"Expected vocab size: {config.model.vocab_size}")

use_cuda = torch.cuda.is_available()

for batch_size, seq_len in test_cases:
    # Random token ids must index the model's embedding table, so bound them
    # by the model vocabulary (the tokenizer's vocabulary may differ in size).
    input_ids = torch.randint(0, config.model.vocab_size, (batch_size, seq_len), device=device)

    if use_cuda:
        # Start each measurement from a clean peak-memory counter, and drain
        # pending GPU work so it is not attributed to this forward pass.
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.synchronize()

    # Measure inference time with a monotonic, high-resolution clock
    start_time = time.perf_counter()

    with torch.no_grad():
        logits = model(input_ids)

    if use_cuda:
        # CUDA kernels run asynchronously; wait for them before reading the clock
        torch.cuda.synchronize()

    inference_time = (time.perf_counter() - start_time) * 1000  # Convert to milliseconds

    # Peak allocated GPU memory since the reset above (MB); 0 on CPU
    memory_used = torch.cuda.max_memory_allocated() / 1024**2 if use_cuda else 0

    print(f"\nBatch size: {batch_size}, Sequence length: {seq_len}")
    print(f"Input shape     : {input_ids.shape}")
    print(f"Output shape    : {logits.shape}")
    print(f"Inference time  : {inference_time:.2f}ms")
    print(f"Output range    : [{logits.min():.3f}, {logits.max():.3f}]")
    print(f"Output mean/std : {logits.mean():.3f} ± {logits.std():.3f}")
    if use_cuda:
        print(f"GPU memory used : {memory_used:.1f}MB")

    # Release cached blocks between test cases
    if use_cuda:
        torch.cuda.empty_cache()

## 4. Text Generation with Untrained Model

In [None]:
# Sample from the freshly initialised model; output is expected to be near-random
generator = TextGenerator(model, tokenizer)

print("=== Generation Tests (Untrained Model) ===")
print("Note: Output will be mostly random since model is untrained\n")

test_prompts = [
    "Once upon a time",
    "The capital of France is",
    "In the year 2050,",
    "Machine learning is",
    "def fibonacci(n):"
]

# Sampling presets: balanced, unrestricted, and near-greedy
generation_configs = [
    {"temperature": 0.7, "top_k": 50, "top_p": 0.9},
    {"temperature": 1.0, "top_k": 0, "top_p": 1.0},
    {"temperature": 0.1, "top_k": 10, "top_p": 0.8},
]

# Only the first three prompts are exercised here
for prompt_no, prompt in enumerate(test_prompts[:3], start=1):
    print(f"--- Prompt {prompt_no}: '{prompt}' ---")

    for preset_no, gen_config in enumerate(generation_configs, start=1):
        # A failing preset is reported and the sweep continues
        try:
            generated = generator.generate(prompt=prompt, max_new_tokens=30, **gen_config)

            temperature = gen_config['temperature']
            top_k = gen_config['top_k']
            top_p = gen_config['top_p']
            print(f"Config {preset_no} (T={temperature}, k={top_k}, p={top_p}):")
            print(f"  {generated}")

        except Exception as e:
            print(f"Config {preset_no}: Error - {e}")

    print()

## 5. Training Data Exploration

In [None]:
# Explore training data if available
data_dir = project_root / "data"
processed_dir = data_dir / "processed"

print("=== Data Directory Exploration ===")
print(f"Data directory: {data_dir}")
print(f"Data dir exists: {data_dir.exists()}")

if data_dir.exists():
    print(f"\nContents of {data_dir}:")
    # Sorted so the listing is stable across runs and filesystems
    for item in sorted(data_dir.iterdir()):
        if item.is_dir():
            file_count = len(list(item.iterdir()))
            print(f"  📁 {item.name}/ ({file_count} items)")
        else:
            size_mb = item.stat().st_size / 1024**2
            print(f"  📄 {item.name} ({size_mb:.2f} MB)")

# Glob once and sort: directory iteration order is arbitrary, so without the
# sort a different file could be picked on another machine or run.
processed_files = sorted(processed_dir.glob("*.txt")) if processed_dir.exists() else []

# Try to load a dataset if processed data exists
if processed_files:
    print("\n=== Dataset Loading Test ===")

    # Use the alphabetically first processed file
    data_file = processed_files[0]
    print(f"Loading dataset from: {data_file}")

    try:
        dataset = TextDataset(
            data_path=str(data_file),
            tokenizer=tokenizer,
            context_length=config.model.context_length,
            stride=config.data.stride
        )

        print(f"Dataset length: {len(dataset):,} sequences")

        # Sample a few sequences
        print("\n--- Sample Sequences ---")
        for i in range(min(3, len(dataset))):
            sample = dataset[i]
            # TextDataset's item type is not visible from this notebook; accept
            # either a token tensor or an (input, target) pair and show the input.
            tokens = sample[0] if isinstance(sample, (tuple, list)) else sample
            text = tokenizer.decode(tokens.tolist())
            print(f"\nSequence {i+1} (length: {len(tokens)}):")
            print(f"Tokens: {tokens[:10].tolist()}...{tokens[-10:].tolist()}")
            print(f"Text preview: {text[:100]}...")

    except Exception as e:
        # Exploration is best-effort: report the problem and keep the notebook running
        print(f"Error loading dataset: {e}")

else:
    print(f"\n⚠️  No processed data found in {processed_dir}")
    print("Run 'just prepare-data <file>' to process training data first.")

## 6. Load and Test Trained Model

In [None]:
# Load trained model if available
models_dir = project_root / "models" / "checkpoints"
# Newest first, so both the listing and the fallback choice are deterministic
checkpoint_files = (
    sorted(models_dir.glob("*.pt"), key=lambda path: path.stat().st_mtime, reverse=True)
    if models_dir.exists()
    else []
)

# Later cells branch on this flag; it only becomes True after a successful load
model_loaded = False

print("=== Trained Model Loading ===")
print(f"Models directory: {models_dir}")
print(f"Available checkpoints: {len(checkpoint_files)}")

if checkpoint_files:
    for checkpoint_file in checkpoint_files:
        size_mb = checkpoint_file.stat().st_size / 1024**2
        print(f"  📄 {checkpoint_file.name} ({size_mb:.2f} MB)")

    # Preference order: best model, then latest model, then the most recently
    # modified checkpoint of any name.
    best_checkpoint = models_dir / "best_model.pt"
    latest_checkpoint = models_dir / "latest_model.pt"

    if best_checkpoint.exists():
        checkpoint_to_load = best_checkpoint
        print(f"\n✅ Loading best model: {checkpoint_to_load}")
    elif latest_checkpoint.exists():
        checkpoint_to_load = latest_checkpoint
        print(f"\n✅ Loading latest model: {checkpoint_to_load}")
    else:
        checkpoint_to_load = checkpoint_files[0]
        print(f"\n✅ Loading available model: {checkpoint_to_load}")

    try:
        # NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
        checkpoint = torch.load(checkpoint_to_load, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])

        print("Model loaded successfully!")
        print(f"Epoch: {checkpoint.get('epoch', 'unknown')}")
        print(f"Training loss: {checkpoint.get('train_loss', 'unknown')}")
        print(f"Validation loss: {checkpoint.get('val_loss', 'unknown')}")

        model_loaded = True

    except Exception as e:
        # Covers unreadable files, a missing 'model_state_dict' key and shape mismatches
        print(f"❌ Error loading model: {e}")
else:
    print("\n⚠️  No trained models found.")
    print("Run 'just train' to train a model first.")

## 7. Generation with Trained Model

In [None]:
# Test generation with trained model
if model_loaded:
    print("=== Generation with Trained Model ===")

    # Create new generator with loaded model
    trained_generator = TextGenerator(model, tokenizer)

    # Test prompts for different capabilities
    test_prompts = [
        "Once upon a time",
        "The capital of France is",
        "In the year 2050,",
        "Machine learning is",
        "def fibonacci(n):",
        "The quick brown fox",
        "To be or not to be,",
        "In a galaxy far, far away"
    ]

    # Different generation settings to test
    generation_settings = [
        {"temperature": 0.7, "top_k": 50, "top_p": 0.9, "max_new_tokens": 50},
        {"temperature": 0.3, "top_k": 20, "top_p": 0.8, "max_new_tokens": 50},
        {"temperature": 1.0, "top_k": 0, "top_p": 1.0, "max_new_tokens": 30},
    ]

    for prompt in test_prompts[:4]:  # Test first 4 prompts
        print(f"\n--- Prompt: '{prompt}' ---")

        for j, settings in enumerate(generation_settings):
            # A failing setting is reported and the sweep continues
            try:
                start_time = time.perf_counter()
                generated = trained_generator.generate(prompt=prompt, **settings)
                generation_time = (time.perf_counter() - start_time) * 1000  # ms

                print(f"\nGeneration {j+1} ({generation_time:.0f}ms):")
                print(f"Settings: T={settings['temperature']}, k={settings['top_k']}, p={settings['top_p']}")
                print(f"Output: {generated}")

            except Exception as e:
                print(f"Generation {j+1}: Error - {e}")

    # Interactive generation function
    print("\n=== Interactive Generation Function ===")
    print("Use the function below for interactive testing:")

    def interactive_generate(prompt: str, temperature: float = 0.7, top_k: int = 50,
                             top_p: float = 0.9, max_new_tokens: int = 100) -> str:
        """Generate text from ``prompt`` with the trained generator.

        Thin convenience wrapper around ``trained_generator.generate`` for
        ad-hoc experiments in later cells.

        Args:
            prompt: Text to continue.
            temperature: Forwarded to the generator as ``temperature``.
            top_k: Forwarded to the generator as ``top_k``.
            top_p: Forwarded to the generator as ``top_p``.
            max_new_tokens: Forwarded to the generator as ``max_new_tokens``.

        Returns:
            The generator's output, or a message string when no trained model
            is loaded or generation raises (errors are returned, not raised).
        """
        if not model_loaded:
            return "No trained model loaded!"

        try:
            return trained_generator.generate(
                prompt=prompt,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p,
                max_new_tokens=max_new_tokens
            )
        except Exception as e:
            return f"Error: {e}"

    # Example usage
    example_output = interactive_generate("The future of AI is", temperature=0.6, max_new_tokens=40)
    print("Example: interactive_generate('The future of AI is', temperature=0.6, max_new_tokens=40)")
    print(f"Output: {example_output}")

else:
    print("⚠️  Skipping generation tests - no trained model loaded.")
    print("Train a model first with 'just train' to test generation capabilities.")

## 8. Performance Benchmarking

In [None]:
# Section banner for the performance benchmarks defined below
print("=== Performance Benchmarking ===")

def benchmark_generation(num_trials: int = 5, prompt: str = "The quick brown fox"):
    """Benchmark text generation latency and throughput of the trained model.

    Uses the notebook globals ``model_loaded``, ``trained_generator`` and
    ``tokenizer``. Each trial generates up to 50 new tokens at temperature 0.7.

    Args:
        num_trials: Number of generations to time.
        prompt: Prompt passed to every generation.

    Returns:
        None. Results are printed; nothing is printed for zero trials.
    """
    if not model_loaded:
        print("⚠️  No trained model loaded for benchmarking")
        return

    # The prompt is identical for every trial, so tokenize it once
    prompt_tokens = len(tokenizer.encode(prompt))

    times = []
    tokens_per_second = []

    for _ in range(num_trials):
        start_time = time.perf_counter()

        generated = trained_generator.generate(
            prompt=prompt,
            max_new_tokens=50,
            temperature=0.7
        )

        generation_time = time.perf_counter() - start_time

        # Count generated tokens (approximate): re-encode the returned text
        # and subtract the prompt length.
        new_tokens = len(tokenizer.encode(generated)) - prompt_tokens

        times.append(generation_time)
        if generation_time > 0:
            tokens_per_second.append(new_tokens / generation_time)

    if times:
        avg_time = np.mean(times)
        # No rate is recorded when every trial measured zero elapsed time;
        # report NaN instead of taking the mean of an empty list.
        avg_tokens_per_sec = np.mean(tokens_per_second) if tokens_per_second else float("nan")

        print(f"Benchmark Results ({num_trials} trials):")
        print(f"Average generation time: {avg_time:.3f}s")
        print(f"Average tokens/second: {avg_tokens_per_sec:.1f}")
        print(f"Min/Max time: {min(times):.3f}s / {max(times):.3f}s")

def benchmark_forward_pass(batch_sizes: list = [1, 2, 4, 8], seq_len: int = 128, num_trials: int = 10):
    """Benchmark forward-pass latency and throughput over several batch sizes.

    Uses the notebook globals ``model``, ``config`` and ``device``. The model is
    put in eval mode and run under ``torch.no_grad()`` on random token ids.

    Args:
        batch_sizes: Batch sizes to benchmark (the default list is never mutated).
        seq_len: Sequence length of every random input.
        num_trials: Number of timed forward passes per batch size.

    Returns:
        None. One result line is printed per batch size.
    """
    print(f"\nForward Pass Benchmark (seq_len={seq_len}, {num_trials} trials each):")

    model.eval()
    use_cuda = torch.cuda.is_available()
    # Token ids index the model's embedding table, so bound them by the model vocabulary
    vocab_size = config.model.vocab_size

    for batch_size in batch_sizes:
        # Untimed warm-up pass so one-off setup cost for this input shape
        # does not end up in the first timed trial.
        warmup_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=device)
        with torch.no_grad():
            _ = model(warmup_ids)

        times = []

        for _ in range(num_trials):
            input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=device)

            # CUDA kernels run asynchronously: synchronize before starting and
            # before stopping the clock so the full forward pass is measured.
            if use_cuda:
                torch.cuda.synchronize()

            start_time = time.perf_counter()

            with torch.no_grad():
                _ = model(input_ids)

            if use_cuda:
                torch.cuda.synchronize()

            times.append(time.perf_counter() - start_time)

        avg_time = np.mean(times) * 1000  # Convert to ms
        std_time = np.std(times) * 1000
        throughput = batch_size * seq_len / (avg_time / 1000)  # tokens/sec

        print(f"Batch size {batch_size:>2}: {avg_time:>6.2f}ms ± {std_time:>5.2f}ms ({throughput:>8.0f} tokens/sec)")

# Run benchmarks
benchmark_forward_pass()

# The generation benchmark needs the trained generator, which only exists
# after a checkpoint was loaded successfully.
if model_loaded:
    print("\n--- Generation Benchmark ---")
    benchmark_generation()
else:
    print("\n⚠️  Skipping generation benchmark - no trained model loaded.")

## 9. Summary and Next Steps

In [None]:
# Closing report: what was covered, plus suggested follow-up work.
# Each section is a list of lines; an empty string prints a blank line.
summary_header = [
    "=== Exploration Summary ===",
    "",
    "✅ Completed Tests:",
    "  • Model architecture analysis",
    "  • Tokenizer functionality",
    "  • Forward pass validation",
    "  • Untrained model generation",
    "  • Performance benchmarking",
]
for line in summary_header:
    print(line)

# Trained-model items depend on whether a checkpoint was loaded
if model_loaded:
    trained_lines = ["  • Trained model loading", "  • Trained model generation"]
else:
    trained_lines = ["  ⚠️  Trained model tests skipped (no model available)"]
for line in trained_lines:
    print(line)

for line in ("", "📋 Next Steps:", ""):
    print(line)

# Training instructions are only relevant when nothing has been trained yet
if not model_loaded:
    training_steps = [
        "1. 🎯 Train a model:",
        "   - Prepare training data: `just prepare-data <text_file>`",
        "   - Quick training test: `just train-quick`",
        "   - Full training: `just train`",
        "",
    ]
    for line in training_steps:
        print(line)

follow_up_sections = [
    "2. 🔍 Further Exploration:",
    "   - Experiment with different hyperparameters",
    "   - Test on domain-specific data",
    "   - Implement and test new sampling strategies",
    "   - Compare with different model sizes",
    "",
    "3. 📊 Advanced Analysis:",
    "   - Attention visualization",
    "   - Loss curve analysis",
    "   - Token-level perplexity computation",
    "   - Model interpretation techniques",
    "",
    "4. 🚀 Extensions:",
    "   - Fine-tuning on specific tasks",
    "   - Multi-GPU training",
    "   - Model quantization",
    "   - ONNX export for deployment",
    "",
]
for line in follow_up_sections:
    print(line)

# interactive_generate is only advertised when a trained model is loaded
print("💡 Interactive Functions Available:")
if model_loaded:
    print("   - interactive_generate(prompt, temperature=0.7, top_k=50, top_p=0.9, max_new_tokens=100)")

closing_lines = [
    "   - benchmark_generation(num_trials=5, prompt='The quick brown fox')",
    "   - benchmark_forward_pass(batch_sizes=[1,2,4,8], seq_len=128, num_trials=10)",
    "",
    "📚 Documentation:",
    "   - Project overview: README.md",
    "   - Configuration: configs/default.yaml",
    "   - CLI help: llm-train --help, llm-generate --help",
    "   - Justfile tasks: just --list",
    "\n🎉 Exploration complete! Happy experimenting!",
]
for line in closing_lines:
    print(line)