# Deep Learning Basics with PyTorch

**Dr. Yves J. Hilpisch with GPT-5**


# Interactive Attention Visualizations (Chapter 14)

Self-contained widgets to build intuition for attention:
- Pick a token to visualize its attention row.
- Switch masks (none / padding / causal).
- Adjust temperature.
- Toggle aggregate over heads or inspect a single head.
- Follow the pipeline score → softmax → mix, illustrated with a paint-mixing analogy.

## Overview

This notebook is a concise, hands-on companion to Chapter 14 of *Deep Learning Basics with PyTorch*, focused on interactive attention visualizations.
Use it as a companion to the chapter: run each cell, read the short notes,
and try small variations to build intuition.

Tips:
- Run cells top to bottom; restart kernel if state gets confusing.
- Prefer small, fast experiments; iterate quickly and observe outputs.
- Keep an eye on shapes, dtypes, and devices when using PyTorch.


In [1]:
# If ipywidgets is missing (usually fine on Colab), uncomment and run:
# !pip -q install ipywidgets
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import gridspec
import ipywidgets as widgets
from IPython.display import display

# Render matplotlib figures inline in the notebook output (IPython magic).
%matplotlib inline
plt.rcParams['figure.dpi'] = 120  # default resolution of new figures, in dots per inch
np.set_printoptions(precision = 3, suppress = True)  # print arrays with 3 decimals in fixed-point notation


In [2]:
import numpy as np

def softmax(x, axis=-1, temp=1.0):
    """Return a temperature-scaled, numerically stable softmax of ``x``.

    Parameters
    ----------
    x : array_like
        Scores (logits). Lists and other array-likes are converted to a
        NumPy array first, so plain Python sequences work too.
    axis : int
        Axis along which the result is normalized.
    temp : float
        Temperature; the scores are divided by it before exponentiation.
        Values below 1e-8 are clamped to 1e-8 to avoid division by zero.

    Returns
    -------
    numpy.ndarray
        Array with the shape of ``x`` whose entries sum to 1 along ``axis``.
    """
    scaled = np.asarray(x) / max(float(temp), 1e-8)
    # Subtracting the per-slice maximum does not change the result but keeps
    # np.exp from overflowing on large scores.
    shifted = scaled - np.max(scaled, axis=axis, keepdims=True)
    ex = np.exp(shifted)
    return ex / np.sum(ex, axis=axis, keepdims=True)


def make_toy_data(T=8, d_model=8, h=4, seed=42):
    """Create random per-head query, key and value arrays for one sequence.

    Parameters
    ----------
    T : int
        Number of tokens in the sequence.
    d_model : int
        Width of the token embeddings; must be divisible by ``h``.
    h : int
        Number of attention heads; must be positive.
    seed : int
        Seed for ``np.random.default_rng``; the same seed yields the same data.

    Returns
    -------
    tuple of numpy.ndarray
        ``(Qh, Kh, Vh)``, each of shape ``(h, T, d_model // h)``.

    Raises
    ------
    ValueError
        If ``h`` is not positive or ``d_model`` is not divisible by ``h``.
    """
    # Fail early with a readable message instead of a ZeroDivisionError or an
    # opaque reshape error further down.
    if h <= 0 or d_model % h != 0:
        raise ValueError(
            f'd_model ({d_model}) must be divisible by a positive number of heads (h={h})'
        )
    rng = np.random.default_rng(seed)
    # Token embeddings (single sequence)
    x = rng.normal(size=(T, d_model))
    # Random projection matrices; the draw order is part of the seeded result.
    Wq = rng.normal(size=(d_model, d_model))
    Wk = rng.normal(size=(d_model, d_model))
    Wv = rng.normal(size=(d_model, d_model))
    Q = x @ Wq
    K = x @ Wk
    V = x @ Wv
    d_head = d_model // h
    # Split the model dimension into h heads and move the head axis first.
    Qh = Q.reshape(T, h, d_head).transpose(1, 0, 2)  # (h, T, d_head)
    Kh = K.reshape(T, h, d_head).transpose(1, 0, 2)
    Vh = V.reshape(T, h, d_head).transpose(1, 0, 2)
    return Qh, Kh, Vh


def rgb_from_values(Vh, seed=0):
    """Turn per-head value vectors into one RGB color per token and head.

    ``Vh`` has shape (h, T, d_head). A random (d_head, 3) matrix drawn from
    ``seed`` projects every value vector onto three channels. The result is
    min-max rescaled using a single global minimum and maximum, so the
    returned (h, T, 3) array lies in the unit interval.
    """
    _, _, head_dim = Vh.shape
    projection = np.random.default_rng(seed).normal(size=(head_dim, 3))
    raw = Vh @ projection  # (h, T, 3)
    lowest = raw.min()
    # The small epsilon avoids a division by zero when all entries are equal.
    return (raw - lowest) / (raw.max() - lowest + 1e-8)


def make_masks(T, mask_type='none', pad_len=None):
    """Return a boolean (T, T) attention mask.

    Entry ``[i, j]`` is True when query ``i`` may attend to key ``j``.

    Parameters
    ----------
    T : int
        Sequence length.
    mask_type : str
        ``'none'`` allows everything, ``'causal'`` allows only keys
        ``j <= i``, ``'padding'`` hides the keys from ``pad_len`` onwards.
        Any other value behaves like ``'none'``.
    pad_len : int or None
        Number of leading real (non-padding) tokens; only used for
        ``'padding'``. ``None`` disables the padding mask.

    Returns
    -------
    numpy.ndarray
        Boolean array of shape ``(T, T)``.
    """
    if mask_type == 'causal':
        return np.tril(np.ones((T, T), dtype=bool))
    mask = np.ones((T, T), dtype=bool)
    if mask_type == 'padding' and pad_len is not None:
        # Hide padded keys (columns) only. Blanking the padded query rows as
        # well would leave rows without any allowed key, which a softmax over
        # the filled scores turns into uniform attention over all positions.
        mask[:, pad_len:] = False
    return mask


def attention_weights(Qh, Kh, mask=None, temp=1.0):
    """Compute scaled dot-product scores and attention weights for every head.

    ``Qh`` and ``Kh`` have shape (h, T, d_head). The scores are the query-key
    dot products divided by sqrt(d_head). Where ``mask`` (shape (T, T),
    broadcast over heads) is False the score is replaced by -1e9, so the
    softmax gives that key practically zero weight.

    Returns ``(scores, weights)``, both of shape (h, T, T); ``scores``
    already contains the -1e9 fill for masked positions.
    """
    _, _, head_dim = Qh.shape
    keys_t = np.swapaxes(Kh, 1, 2)  # (h, d_head, T)
    scores = (Qh @ keys_t) / np.sqrt(head_dim)  # (h, T, T)
    if mask is not None:
        scores = np.where(mask[None, :, :], scores, -1e9)
    weights = softmax(scores, axis=-1, temp=temp)
    return scores, weights


In [3]:
# Widgets
# Size of the toy example; later cells read these module-level names.
T = 8
d_model = 8
h = 4

# Query token whose attention row is shown.
i_w = widgets.IntSlider(
    value=2,
    min=0,
    max=T - 1,
    step=1,
    description='token i',
    continuous_update=False,
)
# Softmax temperature on a logarithmic scale.
temp_w = widgets.FloatLogSlider(
    value=1.0,
    base=10,
    min=-1,
    max=1,
    step=0.05,
    description='Temp',
    continuous_update=False,
)
# Kind of attention mask.
mask_w = widgets.ToggleButtons(
    options=['none', 'padding', 'causal'],
    value='none',
    description='Mask',
)
# Number of leading tokens treated as real when the padding mask is active.
padlen_w = widgets.IntSlider(
    value=T,
    min=1,
    max=T,
    step=1,
    description='pad len',
    continuous_update=False,
)
# Either average over all heads or inspect the single head chosen below.
aggregate_w = widgets.Checkbox(value=False, description='Aggregate heads')
head_w = widgets.IntSlider(
    value=1,
    min=1,
    max=h,
    step=1,
    description='Head',
    continuous_update=False,
)

# Two rows of controls: selection on top, softmax/mask settings below.
controls = widgets.VBox([
    widgets.HBox([i_w, head_w, aggregate_w]),
    widgets.HBox([temp_w, mask_w, padlen_w]),
])
display(controls)

VBox(children=(HBox(children=(IntSlider(value=2, continuous_update=False, description='token i', max=7), IntSl…

In [4]:
import numpy as np

def softmax(x, axis=-1, temp=1.0):
    """Softmax along ``axis`` after dividing the scores by a temperature.

    ``x`` may be any array-like (it is passed through ``np.asarray``), so
    plain lists are accepted as well as arrays. ``temp`` is clamped to at
    least 1e-8 so a zero temperature cannot cause a division by zero.
    Returns an array shaped like ``x`` that sums to 1 along ``axis``.
    """
    temperature = max(float(temp), 1e-8)
    logits = np.asarray(x) / temperature
    # Shift by the maximum for numerical stability; softmax is invariant to
    # adding a constant along the normalized axis.
    logits = logits - logits.max(axis=axis, keepdims=True)
    weights = np.exp(logits)
    return weights / weights.sum(axis=axis, keepdims=True)

def make_toy_data(T=8, d_model=8, h=4, seed=42):
    """Generate seeded random queries, keys and values split into heads.

    A single sequence of ``T`` token embeddings of width ``d_model`` is
    projected with three random matrices and split into ``h`` heads.

    Returns ``(Qh, Kh, Vh)``, each of shape ``(h, T, d_model // h)``.

    Raises ``ValueError`` if ``h`` is not positive or does not divide
    ``d_model``; without this check the failure would be a
    ``ZeroDivisionError`` or an unclear reshape error.
    """
    if h < 1 or d_model % h:
        raise ValueError(
            f'd_model ({d_model}) must be divisible by a positive number of heads (h={h})'
        )
    d_head = d_model // h
    rng = np.random.default_rng(seed)
    # Token embeddings (single sequence)
    x = rng.normal(size=(T, d_model))
    # Projections are drawn in the order Wq, Wk, Wv so that a given seed
    # always produces the same data.
    Wq, Wk, Wv = (rng.normal(size=(d_model, d_model)) for _ in range(3))

    def split_heads(M):
        # (T, d_model) -> (h, T, d_head)
        return M.reshape(T, h, d_head).transpose(1, 0, 2)

    return split_heads(x @ Wq), split_heads(x @ Wk), split_heads(x @ Wv)

def rgb_from_values(Vh, seed=0):
    """Map value vectors of shape (h, T, d_head) to RGB colors of shape (h, T, 3).

    The colors come from a seeded random linear projection followed by a
    global min-max rescaling into the unit interval.
    """
    _, _, width = Vh.shape
    generator = np.random.default_rng(seed)
    to_rgb = generator.normal(size=(width, 3))
    colors = Vh @ to_rgb  # (h, T, 3)
    offset = colors.min()
    scale = colors.max() - offset + 1e-8  # epsilon keeps the division defined
    return (colors - offset) / scale

def make_masks(T, mask_type='none', pad_len=None):
    """Build a boolean (T, T) mask; True marks an allowed query-key pair.

    ``'causal'`` keeps the lower triangle (key index <= query index).
    ``'padding'`` hides every key column from ``pad_len`` onwards and is a
    no-op when ``pad_len`` is None. ``'none'`` and any unrecognized
    ``mask_type`` allow all pairs.
    """
    allowed = np.ones((T, T), dtype=bool)
    if mask_type == 'causal':
        allowed = np.tril(allowed)
    elif mask_type == 'padding' and pad_len is not None:
        allowed[:, pad_len:] = False
    return allowed


def attention_weights(Qh, Kh, mask=None, temp=1.0):
    """Return masked attention scores and their softmax for all heads.

    Inputs ``Qh`` and ``Kh`` are (h, T, d_head). Scores are ``Q K^T`` scaled
    by 1 / sqrt(d_head). If ``mask`` is given, it is broadcast over the head
    axis and disallowed positions are filled with -1e9 before the softmax
    with temperature ``temp`` is taken over the key axis.

    Returns ``(S, A)``: the (masked) scores and the attention weights, both
    of shape (h, T, T).
    """
    _, _, head_dim = Qh.shape
    keys_transposed = np.transpose(Kh, (0, 2, 1))  # (h, d_head, T)
    raw_scores = Qh @ keys_transposed
    S = raw_scores / np.sqrt(head_dim)  # (h, T, T)
    if mask is not None:
        head_mask = mask[None, :, :]
        S = np.where(head_mask, S, -1e9)
    return S, softmax(S, axis=-1, temp=temp)

# Default toy setup
# T, d_model and h are not assigned in this cell; they are defined in the
# widgets cell above, so that cell has to run first.
Qh, Kh, Vh = make_toy_data(T=T, d_model=d_model, h=h, seed=7)
C = rgb_from_values(Vh, seed=3)  # (h, T, 3)

In [5]:
from matplotlib import gridspec

def draw(i, head, aggregate, temp, mask_kind, pad_len):
    """Draw the paint-mixing view of the attention row of query token ``i``.

    Every key token ``j`` is shown as a color swatch labelled with its
    attention weight; the bar at the top shows the weighted mix of all
    swatches. Reads the module-level ``T``, ``Qh``, ``Kh`` and ``C``.

    Parameters
    ----------
    i : int
        Index of the query token (row of the attention matrix).
    head : int
        1-based index of the head to show; ignored when ``aggregate`` is true.
    aggregate : bool
        If true, average the attention weights and colors over all heads.
    temp : float
        Softmax temperature passed to ``attention_weights``.
    mask_kind : str
        Mask type passed to ``make_masks``.
    pad_len : int
        Padding length passed to ``make_masks``.
    """
    mask = make_masks(T, mask_type=mask_kind, pad_len=pad_len)
    _, A = attention_weights(Qh, Kh, mask=mask, temp=temp)  # (h, T, T)
    # Select row i
    if aggregate:
        a_row = A[:, i, :].mean(axis=0)
        colors = C.mean(axis=0)  # (T, 3)
    else:
        hix = int(head) - 1  # the slider is 1-based
        a_row = A[hix, i, :]
        colors = C[hix]  # (T, 3)

    # Mix color for paint demo
    mixed = a_row @ colors

    # The figure is drawn in both modes; previously the aggregate view
    # produced no output at all.
    fig, ax = plt.subplots(figsize=(10, 3))
    for j, (swatch, weight) in enumerate(zip(colors, a_row)):
        # facecolor/edgecolor instead of color/ec: `color` would override the
        # edge color and make matplotlib emit a warning.
        ax.add_patch(plt.Rectangle((0.08, j + 0.15), 0.55, 0.7, facecolor=swatch, edgecolor='#333'))
        ax.text(0.66, j + 0.5, f'j = {j} w = {weight:.2f}', va='center', ha='left', fontsize=8)
    ax.add_patch(plt.Rectangle((0.08, T + 0.25), 0.85, 0.9, facecolor=mixed, edgecolor='#000', lw=1.5))
    ax.text(0.5, T + 0.7, 'mixed output color', va='center', ha='center', fontsize=9, color='#111')
    ax.set_xlim(0, 1.05)
    ax.set_ylim(0, T + 1.3)
    ax.set_xticks([])
    ax.set_yticks([])
    for sp in ax.spines.values():
        sp.set_visible(False)

    fig.suptitle(f'i = {i}, head = {"avg" if aggregate else head}, mask = {mask_kind}, T° = {temp:.2f}')
    plt.show()

# Call draw again whenever one of the controls changes; each key is the name
# of the draw parameter fed by that widget.
out = widgets.interactive_output(
    draw,
    dict(
        i=i_w,
        head=head_w,
        aggregate=aggregate_w,
        temp=temp_w,
        mask_kind=mask_w,
        pad_len=padlen_w,
    ),
)
display(out)

Output()

## Exercises

1. Use the widgets to explore heads/temperature; take screenshots and annotate.
2. Create two prompts that elicit different attention patterns and explain why.


<img src="https://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>
