<img src="https://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>


# Deep Learning Basics with PyTorch

**Dr. Yves J. Hilpisch with GPT-5**


# Chapter 7 — Training Neural Networks
Tiny MLP on moons: losses, training loop, optimizers, and diagnostics.

In [None]:
# !pip -q install torch numpy matplotlib scikit-learn
import torch, numpy as np, matplotlib.pyplot as plt
plt.style.use('seaborn-v0_8')  # plotting
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
%config InlineBackend.figure_format = 'retina'


## Minimal MLP and training loop

In [None]:
import torch
import torch.nn.functional as F
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

# Data: noisy two-moons set with a stratified 75/25 train/test split.
# Seed torch first so the parameter initialisation below is reproducible.
torch.manual_seed(0)
X, y = make_moons(n_samples=600, noise=0.25, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)
# float32 features and int64 labels, as F.cross_entropy expects class indices
X_tr = torch.tensor(X_tr, dtype=torch.float32)
X_te = torch.tensor(X_te, dtype=torch.float32)
y_tr = torch.tensor(y_tr, dtype=torch.long)
y_te = torch.tensor(y_te, dtype=torch.long)

# Two-layer MLP params (leaf tensors with grads): 2 -> 16 -> 2
W1 = torch.randn(2, 16, requires_grad=True)
b1 = torch.zeros(16, requires_grad=True)
W2 = torch.randn(16, 2, requires_grad=True)
b2 = torch.zeros(2, requires_grad=True)

# Light init scaling without tracking gradients, so the in-place scaling
# is not recorded by autograd and the tensors stay leaves.
with torch.no_grad():
    W1.mul_(0.5)
    W2.mul_(0.5)


def forward(X):
    """Return the MLP logits for input batch ``X``.

    Computes ``relu(X @ W1 + b1) @ W2 + b2`` using the module-level
    parameters ``W1``, ``b1``, ``W2`` and ``b2``.
    """
    h = torch.relu(X @ W1 + b1)
    return h @ W2 + b2


# Manual full-batch SGD loop: 300 steps with learning rate 0.1
for _ in range(300):
    logits = forward(X_tr)
    loss = F.cross_entropy(logits, y_tr)
    # Clear gradients from the previous step; .grad is None before the
    # first backward pass, hence the guard.
    for p in (W1, b1, W2, b2):
        if p.grad is not None:
            p.grad.zero_()
    loss.backward()
    # Update parameters in place without recording the update in the graph
    with torch.no_grad():
        for p in (W1, b1, W2, b2):
            p -= 0.1 * p.grad

# Evaluate accuracy on test set (no graph needed for evaluation)
with torch.no_grad():
    acc = float((forward(X_te).argmax(1) == y_te).float().mean())
acc


## Optimizers: SGD vs Adam (quick check)

In [None]:
import torch
import torch.nn.functional as F

def run(opt_name, lr):
    """Train a fresh two-layer MLP for 200 full-batch steps and score it.

    The training and test tensors ``X_tr``, ``y_tr``, ``X_te`` and ``y_te``
    are read from the enclosing (notebook) scope.

    Args:
        opt_name: ``'sgd'`` selects ``torch.optim.SGD``; any other value
            selects ``torch.optim.Adam``.
        lr: Learning rate passed to the chosen optimizer.

    Returns:
        Accuracy on the test set as a Python float.
    """
    # Fresh params each run
    W1 = torch.randn(2, 16, requires_grad=True)
    b1 = torch.zeros(16, requires_grad=True)
    W2 = torch.randn(16, 2, requires_grad=True)
    b2 = torch.zeros(2, requires_grad=True)
    # Only the init scaling runs under no_grad; training must happen outside
    # it, otherwise the loss has no grad_fn and backward() raises.
    with torch.no_grad():
        W1.mul_(0.5)
        W2.mul_(0.5)

    params = [W1, b1, W2, b2]
    if opt_name == 'sgd':
        opt = torch.optim.SGD(params, lr=lr)
    else:
        opt = torch.optim.Adam(params, lr=lr)

    for _ in range(200):
        logits = torch.relu(X_tr @ W1 + b1) @ W2 + b2
        loss = F.cross_entropy(logits, y_tr)
        opt.zero_grad()
        loss.backward()
        opt.step()

    # Score once, after training has finished; no graph needed here.
    with torch.no_grad():
        test_logits = torch.relu(X_te @ W1 + b1) @ W2 + b2
    acc = float((test_logits.argmax(1) == y_te).float().mean())
    return acc


sgd_acc = run('sgd', 0.1)
adam_acc = run('adam', 0.01)
sgd_acc, adam_acc


<img src="https://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>
