<img src="https://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>


# Deep Learning Basics with PyTorch

**Dr. Yves J. Hilpisch with GPT-5**


# Chapter 7 — Training Neural Networks
Tiny MLP on moons: losses, training loop, optimizers, and diagnostics.

In [None]:
# !pip -q install torch numpy matplotlib scikit-learn
import torch, numpy as np, matplotlib.pyplot as plt
plt.style.use('seaborn-v0_8')  # select the 'seaborn-v0_8' matplotlib style for the plots
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
%config InlineBackend.figure_format = 'retina'


## Minimal MLP and training loop

In [None]:
torch.manual_seed(0)  # reproducibility: fixes the torch.randn draws below

# Two-moons toy dataset: 600 noisy points, split 75/25 into train/test with
# the class balance preserved (stratify=y).
X, y = make_moons(n_samples=600, noise=0.25, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Convert the NumPy arrays to tensors: float32 features for the matrix
# products with the weights, integer (long) class labels for cross_entropy.
X_tr = torch.tensor(X_tr, dtype=torch.float32)
X_te = torch.tensor(X_te, dtype=torch.float32)
y_tr = torch.tensor(y_tr, dtype=torch.long)
y_te = torch.tensor(y_te, dtype=torch.long)

# Parameters of a 2 -> 16 -> 2 MLP, tracked by autograd.
W1 = torch.randn(2, 16, requires_grad=True)  # layer 1 weights
b1 = torch.zeros(16, requires_grad=True)  # layer 1 bias
W2 = torch.randn(16, 2, requires_grad=True)  # layer 2 weights
b2 = torch.zeros(2, requires_grad=True)  # layer 2 bias

# Halve the initial weights in place. Both updates must sit inside no_grad():
# an in-place operation on a leaf tensor that requires grad is rejected by
# autograd outside of it.
with torch.no_grad():
    W1.mul_(0.5)
    W2.mul_(0.5)
def forward(X):
    """Compute the MLP's raw class scores (logits) for a batch of inputs.

    Uses the module-level parameters ``W1``, ``b1``, ``W2`` and ``b2``.

    Args:
        X: Input tensor whose last dimension matches the rows of ``W1``.

    Returns:
        The logits ``relu(X @ W1 + b1) @ W2 + b2``; no softmax is applied.
    """
    h = torch.relu(X @ W1 + b1)  # hidden activations
    return h @ W2 + b2
# Full-batch gradient descent written by hand: 300 steps, step size 0.1.
for _ in range(300):
    logits = forward(X_tr)  # raw model scores before softmax
    loss = torch.nn.functional.cross_entropy(logits, y_tr)  # training objective
    # backward() accumulates into .grad, so clear the previous step's
    # gradients first (.grad is still None before the first backward pass).
    for p in (W1, b1, W2, b2):
        if p.grad is not None:
            p.grad.zero_()
    loss.backward()
    # Update the parameters in place without recording the update in the graph.
    with torch.no_grad():
        for p in (W1, b1, W2, b2):
            p -= 0.1 * p.grad
# Test accuracy: fraction of test points whose argmax class equals the label.
float((forward(X_te).argmax(1) == y_te).float().mean())


## Optimizers: SGD vs Adam (quick check)

In [None]:
def run(opt_name, lr):
    """Train a freshly initialised 2-16-2 MLP and return its test accuracy.

    The model is trained for 200 full-batch steps on the module-level
    ``X_tr`` / ``y_tr`` and evaluated on ``X_te`` / ``y_te``.

    Args:
        opt_name: ``'sgd'`` selects ``torch.optim.SGD``; any other value
            selects ``torch.optim.Adam``.
        lr: Learning rate passed to the optimizer.

    Returns:
        The fraction of test samples classified correctly, as a Python float.
    """
    W1 = torch.randn(2, 16, requires_grad=True)  # layer 1 weights
    b1 = torch.zeros(16, requires_grad=True)  # layer 1 bias
    W2 = torch.randn(16, 2, requires_grad=True)  # layer 2 weights
    b2 = torch.zeros(2, requires_grad=True)  # layer 2 bias
    # Halve the initial weights in place; in-place edits of leaf tensors that
    # require grad are only allowed with autograd recording switched off.
    with torch.no_grad():
        W1.mul_(0.5)
        W2.mul_(0.5)

    params = [W1, b1, W2, b2]
    if opt_name == 'sgd':
        opt = torch.optim.SGD(params, lr=lr)
    else:
        opt = torch.optim.Adam(params, lr=lr)

    def _logits(inputs):
        # Raw model scores before softmax for the local parameters.
        return torch.relu(inputs @ W1 + b1) @ W2 + b2

    for _ in range(200):
        loss = torch.nn.functional.cross_entropy(_logits(X_tr), y_tr)  # training objective
        opt.zero_grad()
        loss.backward()
        opt.step()

    # Evaluation needs no gradients.
    with torch.no_grad():
        return float((_logits(X_te).argmax(1) == y_te).float().mean())
# Quick comparison of test accuracy: SGD with lr=0.1 vs. Adam with lr=0.01.
# Each call draws its own random initial weights, so the two runs do not
# start from the same parameters.
run('sgd', 0.1), run('adam', 0.01)


<img src="https://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>
