<img src="https://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>


# Deep Learning Basics with PyTorch

**Dr. Yves J. Hilpisch with GPT-5**


# Chapter 7 — Training Neural Networks
Tiny MLP on moons: losses, training loop, optimizers, and diagnostics.

## Overview

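This notebook is a concise, hands-on walkthrough of Chapter 7 of *Deep Learning Basics with PyTorch*: training a tiny MLP on the two-moons dataset, first with a hand-written loop and then with PyTorch optimizers.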
Use it as a companion to the chapter: run each cell, read the short notes,
and try small variations to build intuition.

Tips:
- Run cells top to bottom (later cells reuse `X_tr`, `y_tr`, `X_te`, `y_te`, and `model2` from earlier ones); restart the kernel if state gets confusing.
- Prefer small, fast experiments; iterate quickly and observe outputs.
- Keep an eye on shapes, dtypes, and devices when using PyTorch.


In [None]:
# !pip -q install torch numpy matplotlib scikit-learn
import torch
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('seaborn-v0_8')  # plotting
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
%config InlineBackend.figure_format = 'retina'


## Minimal MLP and training loop

In [None]:
import torch  # core tensor/autograd library
import torch.nn.functional as F  # functional ops: loss and activation
from sklearn.datasets import make_moons  # toy dataset generator
from sklearn.model_selection import train_test_split  # train/test split

# --- Dataset ---------------------------------------------------------------
# Seed torch's global RNG first so the random weight draws below repeat
# exactly from run to run.
torch.manual_seed(0)

# 600 noisy 2-D points with binary labels from scikit-learn's moons generator.
X, y = make_moons(n_samples=600, noise=0.25, random_state=0)

# Hold out 25% of the points; stratifying on y keeps the class mix equal
# in both splits.
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# NumPy -> torch: float32 features, integer (long) class indices as
# required by the cross-entropy loss used later.
X_tr, X_te = (torch.tensor(part, dtype=torch.float32) for part in (X_tr, X_te))
y_tr, y_te = (torch.tensor(part, dtype=torch.long) for part in (y_tr, y_te))

# --- Parameters of the 2 -> 16 -> 2 MLP --------------------------------------
# All four are leaf tensors that record gradients. The weights are drawn from
# a standard normal (W1 before W2, which fixes the RNG order); biases are zero.
W1 = torch.randn(2, 16, requires_grad=True)  # input -> hidden weights
W2 = torch.randn(16, 2, requires_grad=True)  # hidden -> logits weights
b1 = torch.zeros(16, requires_grad=True)     # hidden bias
b2 = torch.zeros(2, requires_grad=True)      # logits bias

# Shrink the initial weights by half, in place. Doing it under no_grad keeps
# the rescaling out of the autograd graph so W1/W2 stay plain leaf tensors.
with torch.no_grad():
    W1 *= 0.5
    W2 *= 0.5

def forward(X):
    """Return the class logits of the hand-written two-layer MLP for ``X``.

    Uses the module-level parameters ``W1``, ``b1``, ``W2`` and ``b2``:
    an affine map, a ReLU, then a second affine map. No softmax is applied;
    the result is meant to be fed to a loss that works on raw logits.
    """
    pre_activation = X.matmul(W1) + b1  # first affine layer
    hidden = pre_activation.relu()      # elementwise nonlinearity
    return hidden.matmul(W2) + b2       # second affine layer -> logits

# Hand-written gradient descent: every step uses the whole training set,
# and the update rule is applied to each parameter explicitly.
for _ in range(300):  # number of full-batch steps
    # Gradients accumulate across backward() calls, so wipe whatever the
    # previous iteration left behind (.grad is None before the first pass).
    for p in (W1, b1, W2, b2):
        if p.grad is not None:
            p.grad.zero_()

    logits = forward(X_tr)                # scores for every training point
    loss = F.cross_entropy(logits, y_tr)  # mean cross-entropy over the set
    loss.backward()                       # fill p.grad for all parameters

    # In-place update outside autograd: p <- p - lr * grad, with lr = 0.1
    with torch.no_grad():
        for p in (W1, b1, W2, b2):
            p.sub_(0.1 * p.grad)

# Fraction of test points whose highest-scoring class matches the label
with torch.no_grad():
    predicted = forward(X_te).argmax(1)
    acc = (predicted == y_te).float().mean().item()
acc  # display


## Optimizers: SGD vs Adam (quick check)

In [None]:
import torch  # tensor library and autograd
import torch.nn.functional as F  # functional API for loss, activations

def run(opt_name, lr, *, epochs=200, seed=None):
    """Train a fresh 2->16->2 MLP with the chosen optimizer; return test accuracy.

    The training and evaluation data are read from the module-level tensors
    ``X_tr``, ``y_tr``, ``X_te`` and ``y_te``. Every step uses the whole
    training set.

    Args:
        opt_name: ``'sgd'`` for ``torch.optim.SGD`` or ``'adam'`` for
            ``torch.optim.Adam``.
        lr: Learning rate handed to the optimizer.
        epochs: Number of full-batch optimization steps (default 200).
        seed: If not ``None``, ``torch.manual_seed(seed)`` is called before
            the weights are drawn so the run starts from a reproducible
            initialization. ``None`` leaves the global RNG untouched.

    Returns:
        Accuracy on the test split as a Python float in ``[0, 1]``.

    Raises:
        ValueError: If ``opt_name`` is not one of the supported names.
            (Previously any unrecognized name silently fell back to Adam.)
    """
    optimizer_classes = {'sgd': torch.optim.SGD, 'adam': torch.optim.Adam}
    # Validate up front so a typo fails loudly instead of training with the
    # wrong optimizer.
    if opt_name not in optimizer_classes:
        raise ValueError(
            f"unknown optimizer {opt_name!r}; expected 'sgd' or 'adam'"
        )
    if seed is not None:
        torch.manual_seed(seed)

    # Fresh parameters (two-layer MLP: 2->16->2)
    W1 = torch.randn(2, 16, requires_grad=True)  # first layer weights
    b1 = torch.zeros(16, requires_grad=True)     # first layer bias
    W2 = torch.randn(16, 2, requires_grad=True)  # second layer weights
    b2 = torch.zeros(2, requires_grad=True)      # second layer bias
    # Halve the initial weight scale; no_grad keeps this out of autograd
    with torch.no_grad():
        W1.mul_(0.5)
        W2.mul_(0.5)
    params = [W1, b1, W2, b2]
    optimizer = optimizer_classes[opt_name](params, lr=lr)

    def logits_for(inputs):
        # Shared by training and evaluation: ReLU hidden layer, then linear
        return torch.relu(inputs @ W1 + b1) @ W2 + b2

    for _ in range(epochs):
        loss = F.cross_entropy(logits_for(X_tr), y_tr)  # CE loss on train set
        optimizer.zero_grad()  # clear accumulated gradients
        loss.backward()        # backprop gradients into params
        optimizer.step()       # update parameters

    # Evaluate on the test split (no grad tracking)
    with torch.no_grad():
        hits = logits_for(X_te).argmax(1) == y_te
        acc = float(hits.float().mean())
    return acc

# Compare SGD vs Adam on test accuracy.
# run() draws its initial weights from torch's global RNG, so re-seed before
# each call: both optimizers then start from identical weights and any
# difference in accuracy comes from the optimizer, not the initialization.
torch.manual_seed(0)
sgd_acc = run('sgd', 0.1)   # SGD at lr=0.1
torch.manual_seed(0)
adam_acc = run('adam', 0.01)  # Adam at lr=1e-2
sgd_acc, adam_acc  # show both accuracies


## Training and validation curves


In [None]:
import torch, torch.nn as nn, torch.nn.functional as F  # PyTorch core APIs
import matplotlib.pyplot as plt  # plotting backend
# A fresh 2->16->2 classifier built from nn modules; its losses are logged
# after every step so the learning curves can be plotted below.
model2 = nn.Sequential(
    nn.Linear(2, 16),  # input -> hidden
    nn.ReLU(),         # nonlinearity
    nn.Linear(16, 2),  # hidden -> logits for 2 classes
)
opt2 = torch.optim.Adam(model2.parameters(), lr=3e-3)

tr_hist = []  # training loss per epoch
va_hist = []  # held-out loss per epoch (computed on the X_te / y_te split)
for _ in range(60):  # training epochs
    # --- one full-batch optimization step ---
    model2.train()
    opt2.zero_grad()
    tr_logits = model2(X_tr)
    tr_loss = F.cross_entropy(tr_logits, y_tr)
    tr_loss.backward()
    opt2.step()

    # --- held-out loss, measured after the update, without autograd ---
    model2.eval()
    with torch.no_grad():
        va_loss = F.cross_entropy(model2(X_te), y_te).item()

    # tr_loss was computed before this epoch's update; va_loss after it
    tr_hist.append(tr_loss.item())
    va_hist.append(va_loss)

# Both curves on one compact figure
plt.figure(figsize=(5.5, 2.8))
plt.plot(tr_hist, label='train loss')
plt.plot(va_hist, '--', label='val loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(frameon=False)
plt.tight_layout()
plt.show()


## Decision boundary (after training)


In [None]:
import numpy as np, matplotlib.pyplot as plt  # NumPy + plotting
# Plot limits come from all points (train + test), padded by 1 on each side
allX = torch.cat([X_tr, X_te], dim=0).numpy()  # (N, 2) array
lower = allX.min(axis=0) - 1  # [x_min - 1, y_min - 1]
upper = allX.max(axis=0) + 1  # [x_max + 1, y_max + 1]

# 300 x 300 lattice covering the padded bounding box
xx, yy = np.meshgrid(
    np.linspace(lower[0], upper[0], 300),  # x range
    np.linspace(lower[1], upper[1], 300),  # y range
)

# Flatten the lattice into one (M, 2) batch of points for the model
grid = torch.tensor(
    np.column_stack((xx.ravel(), yy.ravel())), dtype=torch.float32
)

# Classify every lattice point; eval mode and no_grad make this inference-only
model2.eval()
with torch.no_grad():
    grid_classes = model2(grid).argmax(1)
zz = grid_classes.reshape(xx.shape).numpy()  # predicted class per cell

# Shaded decision regions with the test points drawn on top
plt.figure(figsize=(5.2, 3.6))
plt.contourf(xx, yy, zz, alpha=0.25, cmap='coolwarm')
plt.scatter(X_te[:, 0], X_te[:, 1], c=y_te, s=12, edgecolor='k')
plt.title('Decision boundary (TinyMLP)')
plt.tight_layout()
plt.show()


## Exercises

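1. Train for a fixed budget (e.g., the same number of epochs) and compare losses across mini-batch sizes; the cells above always use the full training set as a single batch.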
2. Change optimizer or learning rate schedule and compare final metrics.


<img src="https://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>
