In [None]:
# Auto-configure repo path and compute device (GPU/MPS/CPU)
import sys
from pathlib import Path

# Import the repo's path helper. If the import fails (presumably because the
# notebook was started from a subdirectory and the repo root is not importable
# yet), walk upward from the working directory to find the root, then retry.
try:
    from utils.path_helpers import add_repo_root_to_sys_path
except Exception:
    start_dir = Path.cwd()
    # Closest directory (the cwd itself first, then each ancestor) that holds
    # a repo marker file/directory; None when no such directory exists.
    repo_root = next(
        (
            candidate
            for candidate in (start_dir, *start_dir.parents)
            if any((candidate / marker).exists() for marker in ("requirements.txt", ".git"))
        ),
        None,
    )
    if repo_root is not None:
        # Front of sys.path so the repo's own `utils` package is found first.
        sys.path.insert(0, str(repo_root))
    # Retry the import; if it still fails the error propagates to the user.
    from utils.path_helpers import add_repo_root_to_sys_path

# Project helper; presumably registers the repo root on sys.path (see
# utils/path_helpers.py for the exact behavior).
add_repo_root_to_sys_path()

from utils.device import get_device, backend_info, backend_name, ensure_seed
print(f"Using backend: {backend_info()}")
ensure_seed(42)

# For PyTorch 2.x, set default device so tensors/models go there automatically.
# This is best-effort: a missing torch install is fine (nothing to configure),
# but any other failure is reported instead of being silently discarded.
try:
    import torch  # noqa: F401

    active_backend = backend_name()
    if active_backend in ("torch_cuda", "torch_mps") and hasattr(torch, "set_default_device"):
        target_device = "cuda" if active_backend == "torch_cuda" else "mps"
        torch.set_default_device(target_device)
        # torch.get_default_device is newer than torch.set_default_device, so
        # guard it separately and fall back to the name we just requested.
        if hasattr(torch, "get_default_device"):
            reported_device = torch.get_default_device()
        else:
            reported_device = target_device
        print(f"torch default device set to {reported_device}")
except ImportError:
    # torch is not installed; there is no default device to set.
    pass
except Exception as exc:
    # Keep the notebook running, but make the failure visible to the reader.
    print(f"Warning: could not set torch default device: {exc!r}")

# Project 12: Transformer Architecture from Scratch

## Goal
Build a tiny decoder-only transformer from first principles.

## Learning Objectives
- Multi-head self-attention mechanism
- Feed-forward networks
- Positional embeddings
- Stacking transformer blocks

## Model Configuration
- Vocabulary size: 5000 tokens
- Hidden dimension: 384
- Attention heads: 6
- Layers: 4-6
- Total parameters: ~20-50M

In [1]:
# Setup
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt

print(f"PyTorch version: {torch.__version__}")

# Report the accelerator that is actually available, in preference order
# CUDA -> Apple MPS -> CPU, so the line is also correct on CUDA machines
# (previously only MPS and CPU were considered).
if torch.cuda.is_available():
    device_name = "cuda"
elif torch.backends.mps.is_available():
    device_name = "mps"
else:
    device_name = "cpu"
print(f"Device: {torch.device(device_name)}")

PyTorch version: 2.9.0
Device: mps
