<img src="https://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>


# Deep Learning Basics with PyTorch

**Dr. Yves J. Hilpisch with GPT-5**


# Chapter 10 — Improving Training
Regularization (weight decay and dropout), early stopping, and learning-rate schedules.

In [None]:
# !pip -q install torch numpy matplotlib scikit-learn
import torch, numpy as np, matplotlib.pyplot as plt
from torch import nn
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
plt.style.use('seaborn-v0_8')  # apply the 'seaborn-v0_8' matplotlib style sheet to later figures
# IPython magic (only valid inside a notebook/IPython session): set the inline
# backend's figure format to 'retina'.
%config InlineBackend.figure_format = 'retina'


## Train/val curves with/without weight decay

In [None]:
def curves_l2(weight_decay):
    """Train a small MLP on noisy two-moons data and record the loss curves.

    The data set (1000 points, noise 0.35) is split so that only 35% is used
    for training; 35% of the remainder becomes the validation set and the
    rest is discarded. A 2-256-2 ReLU network is then trained full-batch for
    150 epochs with Adam (lr = 5e-3).

    Args:
        weight_decay: value forwarded to ``torch.optim.Adam(weight_decay=...)``.

    Returns:
        Tuple ``(tr, va)`` of two lists of Python floats, one entry per
        epoch. ``tr`` holds the training loss computed *before* that epoch's
        optimizer step; ``va`` holds the validation loss *after* the step.
    """
    torch.manual_seed(0)  # reproducible weight initialization
    X, y = make_moons(n_samples=1000, noise=0.35, random_state=0)
    # small train split to induce overfitting and highlight L2
    X_tr, X_tmp, y_tr, y_tmp = train_test_split(
        X, y, test_size=0.65, random_state=42, stratify=y
    )
    # carve the validation set out of the held-out part; the rest is unused
    X_va, _, y_va, _ = train_test_split(
        X_tmp, y_tmp, test_size=0.65, random_state=42, stratify=y_tmp
    )
    X_tr = torch.tensor(X_tr, dtype=torch.float32)
    y_tr = torch.tensor(y_tr, dtype=torch.long)
    X_va = torch.tensor(X_va, dtype=torch.float32)
    y_va = torch.tensor(y_va, dtype=torch.long)

    model = nn.Sequential(nn.Linear(2, 256), nn.ReLU(), nn.Linear(256, 2))
    opt = torch.optim.Adam(
        model.parameters(), lr=5e-3, weight_decay=weight_decay
    )
    loss_fn = nn.CrossEntropyLoss()

    tr, va = [], []
    for _ in range(150):
        # one full-batch gradient step
        model.train()
        logits = model(X_tr)  # raw class scores (CrossEntropyLoss takes logits)
        loss = loss_fn(logits, y_tr)
        opt.zero_grad()
        loss.backward()
        opt.step()

        # bookkeeping: no gradients needed for the recorded values
        model.eval()
        with torch.no_grad():
            tr.append(float(loss))
            va.append(float(loss_fn(model(X_va), y_va)))
    return tr, va
# Train once without and once with weight decay, then plot the train and
# validation loss curves side by side.
tr0, va0 = curves_l2(0.0)
tr1, va1 = curves_l2(5e-3)
e = range(1, len(tr0) + 1)  # 1-based epoch axis

plt.figure(figsize=(7.6, 3.0))

# left panel: no regularization
plt.subplot(1, 2, 1)
plt.plot(e, tr0, label='train')
plt.plot(e, va0, label='val')
plt.title('no weight decay')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(frameon=False)

# right panel: weight decay 5e-3 (y-label omitted, as in the left panel's neighbor)
plt.subplot(1, 2, 2)
plt.plot(e, tr1, label='train')
plt.plot(e, va1, label='val')
plt.title('weight decay 0.005')
plt.xlabel('epoch')
plt.legend(frameon=False)

plt.tight_layout()
plt.show()


## Dropout p=0.0 vs p=0.6 — validation curves

In [None]:
def curves_dropout(p):
    """Train a small MLP with dropout on two-moons data; return val losses.

    The data set (1000 points, noise 0.40) is split so that only 30% is used
    for training; 30% of the remainder becomes the validation set and the
    rest is discarded. A 2-256-2 ReLU network with ``nn.Dropout(p)`` after
    the hidden activation is trained full-batch for 150 epochs with Adam
    (lr = 5e-3, no weight decay).

    Args:
        p: dropout probability passed to ``nn.Dropout``.

    Returns:
        List of Python floats: the validation loss after each epoch's
        optimizer step, evaluated with the model in eval mode.
    """
    torch.manual_seed(0)  # reproducible initialization and dropout masks
    X, y = make_moons(n_samples=1000, noise=0.40, random_state=0)
    # even smaller train split to surface regularization effect
    X_tr, X_tmp, y_tr, y_tmp = train_test_split(
        X, y, test_size=0.7, random_state=42, stratify=y
    )
    # carve the validation set out of the held-out part; the rest is unused
    X_va, _, y_va, _ = train_test_split(
        X_tmp, y_tmp, test_size=0.7, random_state=42, stratify=y_tmp
    )
    X_tr = torch.tensor(X_tr, dtype=torch.float32)
    y_tr = torch.tensor(y_tr, dtype=torch.long)
    X_va = torch.tensor(X_va, dtype=torch.float32)
    y_va = torch.tensor(y_va, dtype=torch.long)

    model = nn.Sequential(
        nn.Linear(2, 256), nn.ReLU(), nn.Dropout(p), nn.Linear(256, 2)
    )
    opt = torch.optim.Adam(model.parameters(), lr=5e-3)
    loss_fn = nn.CrossEntropyLoss()

    va = []
    for _ in range(150):
        # one full-batch gradient step (train mode, so dropout is applied)
        model.train()
        logits = model(X_tr)  # raw class scores (CrossEntropyLoss takes logits)
        loss = loss_fn(logits, y_tr)
        opt.zero_grad()
        loss.backward()
        opt.step()

        # validation in eval mode, without tracking gradients
        model.eval()
        with torch.no_grad():
            va.append(float(loss_fn(model(X_va), y_va)))
    return va
# Compare validation loss without dropout and with p = 0.6 in one figure.
v0 = curves_dropout(0.0)
v1 = curves_dropout(0.6)
e = range(1, len(v0) + 1)  # 1-based epoch axis

plt.figure(figsize=(5.8, 3.0))
plt.plot(e, v0, label='dropout p = 0.0')
plt.plot(e, v1, label='dropout p = 0.6')
plt.xlabel('epoch')
plt.ylabel('val loss')
plt.legend(frameon=False)
plt.tight_layout()
plt.show()


## Early stopping (keep best validation model)

In [None]:
def train_earlystop(X_tr, y_tr, X_va, y_va, *, epochs=200, patience=20):
    """Train a small MLP with early stopping and restore the best weights.

    A 2-32-2 ReLU network is trained full-batch with Adam (lr = 5e-3,
    weight_decay = 1e-3) on cross-entropy loss. After every epoch the
    validation loss is evaluated; whenever it improves, a snapshot of the
    weights is kept. Training stops once ``patience`` consecutive epochs
    bring no improvement, or after ``epochs`` epochs.

    Args:
        X_tr: training inputs; the first layer is ``nn.Linear(2, 32)``, so
            the last dimension must be 2.
        y_tr: training targets in a form accepted by ``nn.CrossEntropyLoss``
            for two output classes.
        X_va: validation inputs (same layout as ``X_tr``).
        y_va: validation targets (same layout as ``y_tr``).
        epochs: maximum number of training epochs (keyword-only).
        patience: number of consecutive non-improving epochs tolerated
            before stopping (keyword-only).

    Returns:
        Tuple ``(model, best)``: the model carrying the weights with the
        lowest validation loss seen, and that loss as a float. If no
        snapshot was ever taken (``epochs == 0``, or the validation loss
        was never below ``inf``, e.g. always NaN), the model is returned
        with its current weights and ``best`` is ``float('inf')``.
    """
    model = nn.Sequential(nn.Linear(2, 32), nn.ReLU(), nn.Linear(32, 2))
    opt = torch.optim.Adam(model.parameters(), lr=5e-3, weight_decay=1e-3)
    loss_fn = nn.CrossEntropyLoss()

    best_sd = None        # snapshot of the best weights so far
    best = float('inf')   # lowest validation loss so far
    wait = 0              # epochs since the last improvement

    for _ in range(epochs):
        # one full-batch gradient step
        model.train()
        opt.zero_grad()
        loss = loss_fn(model(X_tr), y_tr)
        loss.backward()
        opt.step()

        # validation loss in eval mode, without tracking gradients
        model.eval()
        with torch.no_grad():
            va = float(loss_fn(model(X_va), y_va))

        if va < best:
            best = va
            wait = 0
            # clone state dict without tying storage to model tensors
            best_sd = {
                k: v.detach().cpu().clone()
                for k, v in model.state_dict().items()
            }
        else:
            wait += 1
        if wait >= patience:
            break

    # best_sd is still None when the loop never ran or never improved;
    # load_state_dict(None) would raise, so keep the current weights then.
    if best_sd is not None:
        model.load_state_dict(best_sd)
    return model, best


## Learning-rate schedules (constant, step, cosine)

In [None]:
# Three learning-rate schedules over the same horizon, all starting at base_lr.
n_epochs = 100
base_lr = 5e-3
epochs = np.arange(0, n_epochs)

# dtype=float is required: `epochs` is an integer array and full_like would
# otherwise truncate the rate to 0.
lr_const = np.full_like(epochs, base_lr, dtype=float)
# step schedule: halve the rate from epoch 60 onwards
lr_step = np.where(epochs < 60, base_lr, 0.5 * base_lr)
# cosine schedule: half-cosine decay from base_lr towards 0 over n_epochs
lr_cos = 0.5 * (1 + np.cos(np.pi * epochs / n_epochs)) * base_lr

plt.figure(figsize=(6.4, 3.0))
plt.plot(epochs, lr_const, label='constant')
plt.plot(epochs, lr_step, label='step@60')
plt.plot(epochs, lr_cos, label='cosine')
plt.xlabel('epoch')
plt.ylabel('learning rate')
plt.legend(frameon=False)
plt.tight_layout()
plt.show()


<img src="https://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>
