In [28]:
import torch
# Environment check: report the installed PyTorch build and whether
# torch.cuda.is_available() reports a usable CUDA device.
print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")

PyTorch Version: 2.9.1+cu128
CUDA Available: True


In [29]:
# One-time setup: uncomment the next line to download names.txt, the dataset read below.
# !wget https://raw.githubusercontent.com/karpathy/makemore/master/names.txt

In [30]:
import torch
import matplotlib.pyplot as plt
%matplotlib inline

# Load the dataset: one name per line. Reading inside a `with` block closes
# the file handle as soon as the contents are in memory instead of leaving it
# open until garbage collection.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

# Build the vocabulary: every distinct character gets an index starting at 1
# (in sorted order); index 0 is reserved for '.', the start/end-of-word marker.
chars = sorted(set(''.join(words)))
stoi = {s: i for i, s in enumerate(chars, start=1)}
stoi['.'] = 0
itos = {i: s for s, i in stoi.items()}  # inverse mapping: index -> character
vocab_size = len(itos)

print(f"Vocabulary size: {vocab_size}")
print(f"Sample mapping: {list(stoi.items())[:5]}")

Vocabulary size: 27
Sample mapping: [('a', 1), ('b', 2), ('c', 3), ('d', 4), ('e', 5)]


In [31]:
# Context length: number of preceding characters used to predict the next one.
block_size = 3


def build_dataset(words):
    """Turn a list of words into (context, next-character) training pairs.

    Each word is terminated with '.' and scanned left to right. For every
    character, the current window of `block_size` previous character indices
    (initially all 0) becomes one row of X and the character's own index
    becomes the matching entry of Y.

    Relies on the module-level `stoi` mapping and `block_size`.

    Args:
        words: iterable of strings whose characters are all keys of `stoi`.

    Returns:
        (X, Y): tensors built from the collected contexts and targets.
        Their shapes are printed as a side effect.
    """
    contexts, targets = [], []
    for word in words:
        window = [0] * block_size  # start from an all-zero context
        for ch in word + '.':
            target = stoi[ch]
            contexts.append(window)
            targets.append(target)
            # Slide the window: drop the oldest index, append the newest.
            window = [*window[1:], target]
    X, Y = torch.tensor(contexts), torch.tensor(targets)
    print(X.shape, Y.shape)
    return X, Y

In [32]:
import random
# Shuffle a copy so `words` keeps its file order and re-running this cell
# always yields the same split. Shuffling `words` in place would reshuffle an
# already-shuffled list on every re-run, silently moving names between the
# training, validation and test sets.
random.seed(42)
shuffled_words = list(words)
random.shuffle(shuffled_words)
n1 = int(0.8 * len(shuffled_words))  # end of the training slice
n2 = int(0.9 * len(shuffled_words))  # end of the validation slice

Xtr, Ytr = build_dataset(shuffled_words[:n1])       # 80% Training
Xdev, Ydev = build_dataset(shuffled_words[n1:n2])   # 10% Validation (Dev)
Xte, Yte = build_dataset(shuffled_words[n2:])       # 10% Test

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [33]:
g = torch.Generator().manual_seed(2147483647) # For reproducibility
# Embedding table: one 2-dimensional row per vocabulary entry. Sizing it from
# vocab_size (instead of a literal 27) keeps every index produced by stoi in
# range if the dataset, and therefore the vocabulary, changes.
C = torch.randn((vocab_size, 2), generator=g)

In [34]:
# Embedding lookup for the whole training set: indexing C with the integer
# tensor Xtr selects one row of C per context position, so
# emb.shape == (*Xtr.shape, C.shape[1]).
emb = C[Xtr]

In [35]:
# Flatten each example's per-character embeddings into one feature vector.
# emb is (batch_size, block_size, embedding_dim); .view() reinterprets the
# existing storage rather than copying it, giving
# (batch_size, block_size * embedding_dim).
emb_cat = emb.view(emb.size(0), emb.size(1) * emb.size(2))

In [36]:
n_embd = 2      # size of each character's embedding vector
n_hidden = 100  # number of neurons in the hidden layer

# Hidden-layer weights and bias, drawn from g in this order (W1, then b1).
W1 = torch.randn(block_size * n_embd, n_hidden, generator=g)
b1 = torch.randn((n_hidden,), generator=g)

In [37]:
# Hidden-layer pre-activations: flatten each context's embeddings into one
# row, multiply by W1, and add b1 (broadcast across rows).
h_pre = torch.matmul(emb.view(-1, block_size * n_embd), W1) + b1

In [38]:
# Element-wise tanh non-linearity on the hidden pre-activations.
h = h_pre.tanh()

In [39]:
# Output-layer weights and bias, drawn from g in this order (W2, then b2).
W2 = torch.randn(n_hidden, vocab_size, generator=g)
b2 = torch.randn((vocab_size,), generator=g)

# One unnormalised score per vocabulary entry for every example.
logits = torch.matmul(h, W2) + b2

In [40]:
# Manual softmax: exponentiate the logits, then divide each row by its sum so
# every row of prob sums to 1.
counts = torch.exp(logits)
prob = counts / counts.sum(dim=1, keepdim=True)

In [41]:
# Negative log-likelihood: for each example i take the probability assigned
# to its correct next character (row i, column Ytr[i]) and average -log of
# those values. Averaging -log over every entry of prob would ignore Ytr
# entirely and would not agree with F.cross_entropy(logits, Ytr).
loss = -prob[torch.arange(Ytr.shape[0]), Ytr].log().mean()

In [42]:
import torch.nn.functional as F
# Mean negative log-likelihood of the targets Ytr, computed directly from the
# logits; cross_entropy applies log-softmax and selects the target class
# internally.
loss = F.cross_entropy(logits, Ytr)

In [43]:
# Collect every trainable tensor and switch on gradient tracking so that
# loss.backward() populates .grad on each of them.
parameters = [C, W1, b1, W2, b2]
for p in parameters:
    p.requires_grad_(True)

print(f"Total parameters: {sum(p.numel() for p in parameters)}")

Total parameters: 3481


In [44]:
batch_size = 32
# Inside training loop: draw batch_size random row indices into the training
# set. Passing the seeded generator g makes the sampled batch repeatable
# across runs instead of depending on the unseeded global RNG.
ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
Xb, Yb = Xtr[ix], Ytr[ix] # Batch inputs and targets

In [45]:
# Learning-rate sweep: take one SGD step per candidate rate, sweeping the rate
# geometrically from 10**-3 to 10**0, and record the minibatch loss at each.
# NOTE(review): the sweep updates `parameters` in place (finishing at
# lr = 1.0); re-run the initialisation cells before any training run that is
# meant to start from a fresh initialisation.
lre = torch.linspace(-3, 0, 1000) # Exponents from -3 to 0
lrs = 10**lre # Rates from 0.001 to 1.0

# Track stats during the sweep
lri = []    # learning rate used at each step
lossi = []  # minibatch loss observed at each step

# Learning rate finder loop
for lr in lrs:

    # Sample a batch from the seeded generator so the sweep is repeatable
    ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
    Xb, Yb = Xtr[ix], Ytr[ix]

    # Forward pass
    emb = C[Xb]
    h = torch.tanh(emb.view(-1, block_size * n_embd) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Yb)

    # Backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # SGD step with the current rate. torch.no_grad() is the supported way to
    # modify leaf tensors in place without recording the update in autograd
    # (the `.data` attribute bypasses autograd's safety checks).
    with torch.no_grad():
        for p in parameters:
            p -= lr * p.grad

    # Record the rate and the loss it produced
    lri.append(lr)
    lossi.append(loss.item())

In [46]:
# Hyperparameters
max_steps = 200000
batch_size = 32
lossi = []  # log10 of the minibatch loss at every step

for i in range(max_steps):

    # 1. Minibatch construction. Sampling from the seeded generator g makes
    #    the whole run repeatable instead of depending on the global RNG.
    ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
    Xb, Yb = Xtr[ix], Ytr[ix]

    # 2. Forward Pass
    emb = C[Xb]
    embcat = emb.view(-1, block_size * n_embd)  # one flat row per example
    hpre = embcat @ W1 + b1
    h = torch.tanh(hpre)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Yb)

    # 3. Backward Pass
    for p in parameters:
        p.grad = None # Zero gradients efficiently
    loss.backward()

    # 4. Update. torch.no_grad() is the supported way to modify leaf tensors
    #    in place without recording the update in autograd (the `.data`
    #    attribute bypasses autograd's safety checks).
    lr = 0.1 if i < 100000 else 0.01 # Learning rate decay schedule
    with torch.no_grad():
        for p in parameters:
            p -= lr * p.grad

    # Track stats
    if i % 10000 == 0:
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
    lossi.append(torch.log10(loss).item())

      0/ 200000: 7.6089
  10000/ 200000: 2.7024
  20000/ 200000: 2.4943
  30000/ 200000: 2.5579
  40000/ 200000: 2.2798
  50000/ 200000: 2.5640
  60000/ 200000: 1.8205
  70000/ 200000: 2.3721
  80000/ 200000: 2.1838
  90000/ 200000: 2.3512
 100000/ 200000: 1.9769
 110000/ 200000: 2.5714
 120000/ 200000: 2.1675
 130000/ 200000: 1.9443
 140000/ 200000: 2.6367
 150000/ 200000: 2.2735
 160000/ 200000: 2.4384
 170000/ 200000: 2.3630
 180000/ 200000: 2.1343
 190000/ 200000: 2.3206


In [47]:
# Fresh, larger model. Reseeding g makes this initialisation repeatable.
g = torch.Generator()
g.manual_seed(2147483647)
n_embd = 10     # embedding dimensions per character
n_hidden = 200  # hidden-layer width

# The draw order from g (C, W1, b1, W2, W_skip) determines the initial values.
# Weights and biases are scaled down from unit-variance draws; b2 starts at 0.
C = torch.randn(vocab_size, n_embd, generator=g)
W1 = 0.2 * torch.randn(block_size * n_embd, n_hidden, generator=g)
b1 = 0.01 * torch.randn((n_hidden,), generator=g)
W2 = 0.01 * torch.randn(n_hidden, vocab_size, generator=g)
b2 = torch.zeros((vocab_size,))

# Direct connections (Exercise 3): flattened embeddings -> logits
W_skip = 0.01 * torch.randn(block_size * n_embd, vocab_size, generator=g)

parameters = [C, W1, b1, W2, b2, W_skip]
for p in parameters:
    p.requires_grad_(True)
print(f"Number of parameters: {sum(p.numel() for p in parameters)}")

Number of parameters: 12707


In [None]:
max_steps = 200000
batch_size = 32
lossi = []  # log10 of the minibatch loss at every step

for i in range(max_steps):
    # Minibatch: sample from the seeded generator g so the run is repeatable
    # instead of depending on the unseeded global RNG.
    ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
    Xb, Yb = Xtr[ix], Ytr[ix]

    # Forward pass
    emb = C[Xb]
    embcat = emb.view(-1, block_size * n_embd)  # one flat row per example
    # Hidden layer
    hpre = embcat @ W1 + b1
    h = torch.tanh(hpre)
    # Output layer (with skip connection from the flattened embeddings)
    logits = h @ W2 + embcat @ W_skip + b2
    loss = F.cross_entropy(logits, Yb)

    # Backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # Update. torch.no_grad() is the supported way to modify leaf tensors in
    # place without recording the update in autograd (the `.data` attribute
    # bypasses autograd's safety checks).
    lr = 0.1 if i < 100000 else 0.01  # step decay after 100k iterations
    with torch.no_grad():
        for p in parameters:
            p -= lr * p.grad

    # Logging
    if i % 10000 == 0:
        print(f'{i}: {loss.item():.4f}')
    lossi.append(torch.log10(loss).item())

# Trailing semicolon suppresses the [<Line2D ...>] repr in the cell output.
plt.plot(lossi);

0: 3.3057
10000: 2.3626
20000: 2.1618
30000: 2.2071
40000: 2.4047
50000: 1.9386
60000: 1.6749
70000: 2.0444
80000: 2.2318
90000: 2.2348
100000: 2.1106
110000: 2.1432
120000: 2.1216


In [None]:
@torch.no_grad()
def split_loss(split):
    """Print the cross-entropy loss of the current model on one data split.

    Runs the full forward pass (embedding lookup, tanh hidden layer, output
    layer plus skip connection) on the entire split with gradient tracking
    disabled. Reads the model tensors and datasets from module-level names.

    Args:
        split: 'train', 'val' or 'test'; any other key raises KeyError.

    Returns:
        None. The loss is printed, formatted to four decimal places.
    """
    datasets = {
        'train': (Xtr, Ytr),
        'val': (Xdev, Ydev),
        'test': (Xte, Yte),
    }
    inputs, targets = datasets[split]
    # One flat row of concatenated character embeddings per example.
    flat = C[inputs].view(-1, block_size * n_embd)
    hidden = (flat @ W1 + b1).tanh()
    logits = hidden @ W2 + flat @ W_skip + b2
    loss = F.cross_entropy(logits, targets)
    print(f'{split} loss: {loss.item():.4f}')

# Report the loss on the training and validation splits; the 'test' split is
# not evaluated in this cell.
split_loss('train')
split_loss('val')

In [None]:
# Sample 20 names from the model, one character at a time.
g = torch.Generator().manual_seed(2147483647 + 10)  # dedicated sampling seed
# Inference only: disable autograd so no graph is built for each forward
# pass, matching how split_loss evaluates the model.
with torch.no_grad():
    for _ in range(20):
        out = []
        context = [0] * block_size  # start from the all-'.' context
        while True:
            # Forward pass for the single current context; the flattened
            # embedding feeds both the hidden layer and the skip connection.
            emb = C[torch.tensor([context])]
            embcat = emb.view(1, -1)
            h = torch.tanh(embcat @ W1 + b1)
            logits = h @ W2 + embcat @ W_skip + b2
            probs = F.softmax(logits, dim=1)
            # Draw the next character index from the predicted distribution
            ix = torch.multinomial(probs, num_samples=1, generator=g).item()
            context = context[1:] + [ix]  # slide the context window
            out.append(ix)
            if ix == 0:  # index 0 is '.', which ends the word
                break
        # Drop the trailing '.' before decoding the indices to characters
        print(''.join(itos[i] for i in out[:-1]))