<link rel="stylesheet" href="berkeley.css">

<h1 class="cal cal-h1">Lecture 16 – CS 189, Fall 2025</h1>



In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly import figure_factory as ff
from plotly.subplots import make_subplots
colors = px.colors.qualitative.Plotly
px.defaults.width = 800
# from ipywidgets import HBox
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

For this notebook we will use the CPU device.

In [None]:
# torch._dynamo.disable() # fixing a bug with apple silicon support
# `torch.accelerator` is only present in recent PyTorch releases; look it up
# defensively so this cell still runs on older installs.
_accelerator = getattr(torch, "accelerator", None)
if _accelerator is not None and _accelerator.is_available():
    device = _accelerator.current_accelerator()
else:
    device = "cpu"

# The models in this notebook are tiny, so we deliberately override whatever
# was detected above and run everything on the CPU.
device = "cpu"
print("Congratulations! You have access to a", device, "device.")


You can ignore the following visualization code.  This was added to track and update the loss and decision surface during training. 

In [None]:
class DecisionBoundaryVisualizer:
    """Draws a model's decision surface as a contour trace on a 2-D grid.

    The grid covers the given feature ranges, widened on every side by half
    of the range. Calling the instance re-evaluates the model on the grid and
    writes the result into the last trace of ``pred_fig``.

    Args:
        pred_fig: figure whose last trace is the contour to update.
        x1_range: ``(min, max)`` of the first feature.
        x2_range: ``(min, max)`` of the second feature.
        num_points: number of grid points along each axis.
        plot_probs: for two-class models, plot the class-1 probability when
            True, otherwise the thresholded 0/1 prediction.
    """

    def __init__(self, pred_fig, x1_range, x2_range, num_points=50, plot_probs=True):
        self.pred_fig = pred_fig
        self.plot_probs = plot_probs
        x1_min, x1_max = x1_range
        x2_min, x2_max = x2_range
        # Pad each axis by 50% of its range so the surface extends past the data.
        margin_x1 = 0.5 * (x1_max - x1_min)
        margin_x2 = 0.5 * (x2_max - x2_min)
        x1_min -= margin_x1
        x1_max += margin_x1
        x2_min -= margin_x2
        x2_max += margin_x2
        # Setup the grid of test points for decision boundary plotting
        x1, x2 = torch.meshgrid(torch.linspace(x1_min, x1_max, num_points),
                                torch.linspace(x2_min, x2_max, num_points), indexing='ij')
        # One row per grid point: (num_points**2, 2).
        self.grid = torch.cat([x1.reshape(-1, 1), x2.reshape(-1, 1)], dim=1)
        self.x1 = to_numpy(x1)
        self.x2 = to_numpy(x2)

    def plot_decision_boundary(self, model):
        """Evaluate ``model`` on the grid and return a ``go.Contour`` trace.

        With more than two output classes the contour shows the arg-max class;
        otherwise it shows the class-1 probability or, when ``plot_probs`` is
        False, the 0/1 prediction at a 0.5 threshold.
        """
        with torch.no_grad():
            preds = F.softmax(model(self.grid), dim=1)
        num_classes = preds.shape[1]
        if num_classes > 2:  # support for multiclass
            # Transpose because the grid uses 'ij' indexing while the contour
            # expects z[row=y, col=x].
            preds = torch.argmax(preds, dim=1).reshape(self.x1.shape).T
            preds = to_numpy(preds)
            return go.Contour(x=self.x1[:, 0], y=self.x2[0], z=preds,
                              colorscale=px.colors.qualitative.Plotly[:num_classes],
                              contours=dict(start=-0.5, end=num_classes-0.5, size=1, coloring='fill'),
                              opacity=0.5, showscale=False)
        else:  # Binary classification case (red/blue)
            if self.plot_probs:
                preds = preds[:, 1].reshape(self.x1.shape).T
            else:
                # Tensors have no `.astype`; `.float()` is the torch equivalent.
                preds = (preds[:, 1] > 0.5).float().reshape(self.x1.shape).T
            preds = to_numpy(preds)
            return go.Contour(x=self.x1[:, 0], y=self.x2[0], z=preds,
                              colorscale=[[0, 'blue'], [1, 'red']],
                              opacity=0.5, showscale=False)

    def reset(self):
        """No state to clear; present so all visualizers share an interface."""
        ...

    def __call__(self, epoch, model, loss_fn):
        """Refresh the contour in ``pred_fig``; ``epoch`` and ``loss_fn`` are unused."""
        model.eval()
        with torch.no_grad():
            boundary = self.plot_decision_boundary(model)
        # plotly batch update
        with self.pred_fig.batch_update():
            self.pred_fig.data[-1].z = boundary.z
        model.train()
    

In [None]:
class LossVisualizer:
    """Tracks loss and error per epoch and streams them into a figure.

    ``loss_fig`` must already hold three traces, in this order:
    validation loss, training loss, validation error rate.

    Args:
        training_data: dataset for which ``dataset[:]`` yields ``(inputs, targets)``.
        validation_data: dataset with the same indexing behavior.
        loss_fig: figure with the three traces described above.
    """

    def __init__(self, training_data, validation_data, loss_fig):
        self.x_val, self.t_val = validation_data[:]
        self.x_train, self.t_train = training_data[:]
        self.loss_fig = loss_fig
        self.epochs = []
        self.losses_val = []
        self.errors_val = []
        self.losses_tr = []

    def reset(self):
        """Forget all recorded history and blank every trace in the figure."""
        self.epochs = []
        self.losses_val = []
        self.errors_val = []
        self.losses_tr = []
        with self.loss_fig.batch_update():
            # Clear all three traces; leaving the error trace untouched would
            # keep the previous run's curve on screen.
            for idx in range(3):
                self.loss_fig.data[idx].x = []
                self.loss_fig.data[idx].y = []

    def __call__(self, epoch, model, loss_fn):
        """Record metrics for ``epoch`` and redraw the three traces.

        The model is put in eval mode for the measurement and switched back
        to train mode before returning.
        """
        model.eval()
        with torch.no_grad():
            # One forward pass on the validation set serves both metrics.
            logits_val = model(self.x_val)
            loss_val = loss_fn(logits_val, self.t_val).item()
            loss_tr = loss_fn(model(self.x_train), self.t_train).item()
            # Fraction of misclassified validation points (error, not accuracy).
            err_val = (logits_val.argmax(dim=1) != self.t_val).float().mean().item()
        self.epochs.append(epoch)
        self.losses_val.append(loss_val)
        self.losses_tr.append(loss_tr)
        self.errors_val.append(err_val)
        # Visualization Code
        with self.loss_fig.batch_update():
            self.loss_fig.data[0].x = self.epochs
            self.loss_fig.data[0].y = self.losses_val
            self.loss_fig.data[1].x = self.epochs
            self.loss_fig.data[1].y = self.losses_tr
            self.loss_fig.data[2].x = self.epochs
            self.loss_fig.data[2].y = self.errors_val
        model.train()
    

<link rel="stylesheet" href="berkeley.css">

<h2 class="cal cal-h2">Weight Initialization</h2>



The following depicts the sweet spot of each of the common activation functions:

In [None]:
# Sample each activation on a common grid, one column per function.
x = np.linspace(-7, 7, 1000)
activation_curves = {
    "sigmoid(x)": 1 / (1 + np.exp(-x)),
    "tanh(x)": np.tanh(x),
    "Relu(x)": np.maximum(0, x),
}
# Long format (x, activation, value) so plotly can facet by activation.
df = pd.DataFrame({"x": x, **activation_curves}).melt(
    id_vars=["x"],
    value_vars=list(activation_curves),
    var_name="activation",
    value_name="value",
)

fig = px.line(df, x="x", y="value", facet_col="activation")
fig.update_layout(yaxis_range=[-1.5, 1.5])
fig.update_traces(line=dict(width=5))
# Larger fonts (including the subplot titles) and a fixed figure size.
fig.update_layout(font=dict(size=24), width=900, height=400)

# Shade the transition region of each activation, one facet column at a time.
sweet_spots = [(-4, 4), (-2, 2), (0, 7)]
for facet_col, (left, right) in enumerate(sweet_spots, start=1):
    fig.add_vrect(x0=left, x1=right, line_width=0, fillcolor="LightGreen", opacity=0.3,
                  annotation_text="Sweet Spot", annotation_position="top left",
                  row=1, col=facet_col)

# Strip the "activation=" prefix from the three facet titles.
for facet_title in fig.layout.annotations[:3]:
    facet_title.text = facet_title.text.split("=")[1]
# fig.write_image("activation_functions.pdf", width=900, height=400, scale=2)
fig.show()


<link rel="stylesheet" href="berkeley.css">

<h3 class="cal cal-h3">Implementing Xavier and He Initialization</h3>



In [None]:
class MLPModel(nn.Module):
    """Fully connected network with activation-matched weight initialization.

    Args:
        dims: layer widths, from input size to output size.
        activation: ``"relu"`` selects ReLU with He (Kaiming) uniform weights;
            any other value selects tanh with Xavier uniform weights.
    """

    def __init__(self, dims, activation="relu"):
        super().__init__()
        self.activation = activation
        linears = []
        for fan_in, fan_out in zip(dims[:-1], dims[1:]):
            linears.append(nn.Linear(fan_in, fan_out))
        self.layers = nn.ModuleList(linears)
        if activation == "relu":
            self.act = nn.functional.relu
        else:
            self.act = torch.tanh
        self._reset_parameters()

    def _reset_parameters(self):
        """Re-draw every linear layer's weights and zero its bias."""
        use_he = self.activation == "relu"
        for layer in self.layers:
            if isinstance(layer, nn.Linear):
                if use_he:
                    # He/Kaiming for ReLU
                    nn.init.kaiming_uniform_(layer.weight, nonlinearity="relu")
                else:
                    # Xavier for tanh
                    nn.init.xavier_uniform_(layer.weight)
                nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Apply every layer in order; the last layer gets no activation."""
        n_hidden = len(self.layers) - 1
        for layer in self.layers[:n_hidden]:
            x = self.act(layer(x))
        for layer in self.layers[n_hidden:]:
            x = layer(x)
        return x

Let's try making a deep neural network and see if the initialization makes a difference!

In [None]:
# Fix the seed so the randomly initialized network is the same on every run.
torch.manual_seed(189)
# 1 input -> 30 hidden layers of width 10 -> 1 output.
model = MLPModel(dims=[1, *([10] * 30), 1], activation="tanh")

# Column vector of 1000 evenly spaced inputs, pushed through the untrained net.
x_test = torch.linspace(-10, 10, 1000).reshape(-1, 1)
y = model(x_test)

Notice small pre-activation values (close to 0) and the non-linearity centered around 0 for tanh and sigmoid.


In [None]:
# Build the title from the model actually constructed above, so it cannot
# drift out of sync with the chosen activation and initialization.
_act_name = "ReLU" if model.activation == "relu" else "tanh"
_init_name = "He" if model.activation == "relu" else "Xavier"
fig = px.line(x=x_test.squeeze().detach().numpy(),
              y=y.squeeze().detach().numpy(),
              title=f"Deep MLP Output with {_act_name} Activation and {_init_name} Initialization")
fig

<link rel="stylesheet" href="berkeley.css">

<h2 class="cal cal-h2">Layer and Batch Normalization</h2>



In [None]:
# Batch normalization over 2 features; the repr shows its configuration.
bnorm = nn.BatchNorm1d(2)
bnorm

In [None]:
# A newly created module is in training mode, so each column (feature) is
# standardized using the statistics of this 3-row batch.
bnorm(torch.tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]))

In [None]:
# Learnable per-feature scale (`weight`) and shift (`bias`).
list(bnorm.named_parameters())

In [None]:
# Non-learnable state: running mean/variance and the batch counter, which the
# training-mode forward call on `bnorm` has already updated.
list(bnorm.named_buffers())

In [None]:
# Layer normalization over the last dimension (size 2).
lnorm = nn.LayerNorm(2)
lnorm 

In [None]:
# Each row (sample) is standardized on its own, using only its two values.
lnorm(torch.tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]))

In [None]:
# Learnable elementwise scale (`weight`) and shift (`bias`).
print(list(lnorm.named_parameters()))

In [None]:
# LayerNorm keeps no running statistics, so this prints an empty list.
print(list(lnorm.named_buffers()))

<link rel="stylesheet" href="berkeley.css">

<h2 class="cal cal-h2">Dropout Regularization</h2>



PyTorch implements dropout as a layer that can be added to your model. During training, it randomly zeroes some of the elements of the input tensor with probability p using samples from a Bernoulli distribution. 

However, unlike the description in the original paper, PyTorch implements *inverted dropout*: during training it scales the surviving activations by $1/(1-p)$ so that no rescaling is needed at test time. Here $p$ is the probability that an element is zeroed.

In [None]:
dropout = nn.Dropout(p=0.3)
demo_input = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])

# Training mode: entries are randomly zeroed and the survivors are rescaled.
dropout.train()
display(dropout(demo_input))

# Evaluation mode: dropout is a no-op and the input passes through unchanged.
dropout.eval()
display(dropout(demo_input))

<link rel="stylesheet" href="berkeley.css">

<h2 class="cal cal-h2">Everything MLP Network</h2>



In [None]:
def make_normalization_layer(norm_type, dim):
    """Build the normalization module named by ``norm_type``.

    ``"batch"`` gives ``nn.BatchNorm1d(dim)`` and ``"layer"`` gives
    ``nn.LayerNorm(dim)`` (case-insensitive). Anything else, including
    ``None``, gives ``nn.Identity()``.
    """
    key = norm_type.lower() if norm_type else ""
    builders = {"batch": nn.BatchNorm1d, "layer": nn.LayerNorm}
    builder = builders.get(key)
    return nn.Identity() if builder is None else builder(dim)

def make_activation_layer(activation):
    """Build the activation module named by ``activation``.

    ``"relu"`` gives ``nn.ReLU()`` and ``"tanh"`` gives ``nn.Tanh()``
    (case-insensitive). Anything else, including ``None``, gives
    ``nn.Identity()``.
    """
    key = activation.lower() if activation else ""
    module_cls = {"relu": nn.ReLU, "tanh": nn.Tanh}.get(key, nn.Identity)
    return module_cls()

def make_dropout_layer(dropout_prob):
    """Return ``nn.Dropout(p=dropout_prob)`` for a positive probability.

    ``None`` and non-positive values yield ``nn.Identity()`` instead.
    """
    if dropout_prob is None or not dropout_prob > 0.0:
        return nn.Identity()
    return nn.Dropout(p=dropout_prob)

def make_projection_layer(dim_in, dim_out):
    """Return a module that maps ``dim_in`` features to ``dim_out`` features.

    Equal sizes need no mapping, so ``nn.Identity()`` is returned; otherwise
    a bias-free ``nn.Linear`` resolves the dimension mismatch.
    """
    if dim_in == dim_out:
        return nn.Identity()
    return nn.Linear(dim_in, dim_out, bias=False)

In [None]:
class MLPModel(nn.Module):
    """MLP with optional normalization and dropout after every hidden layer.

    Each hidden stage is Linear -> normalization -> activation -> dropout,
    followed by a plain linear output head.

    Args:
        dims: layer widths from input to output (at least two entries).
        activation: name passed to ``make_activation_layer``.
        normalization: name passed to ``make_normalization_layer``.
        dropout: drop probability passed to ``make_dropout_layer``.
    """

    def __init__(self, dims, activation="relu", normalization=None, dropout=0.0):
        super().__init__()
        self.activation = activation
        # Build the networks as a sequence of layers
        self.layers = nn.Sequential()
        for i in range(len(dims)-2):
            self.layers.append(nn.Linear(dims[i], dims[i+1]))
            self.layers.append(make_normalization_layer(normalization, dims[i+1]))
            self.layers.append(make_activation_layer(activation))
            self.layers.append(make_dropout_layer(dropout))
        # final layer without activation or normalization
        self.head = nn.Linear(dims[-2], dims[-1])

        self._reset_parameters()

    def _reset_parameters(self):
        """He init for ReLU hidden layers, Xavier otherwise; biases are zeroed."""
        # Normalize the name the same way make_activation_layer does, so that
        # e.g. "ReLU" (which builds nn.ReLU layers) also gets He initialization.
        activation = (self.activation or "").lower()
        for layer in (m for m in self.layers if isinstance(m, nn.Linear)):
            if activation == "relu":
                nn.init.kaiming_uniform_(layer.weight, nonlinearity="relu")
            else:
                nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)
        # The head is a linear map with no activation after it.
        nn.init.xavier_uniform_(self.head.weight, gain=1.0)
        nn.init.zeros_(self.head.bias)

    def forward(self, x):
        """Run the hidden stack, then the output head."""
        x = self.layers(x)
        return self.head(x)

In [None]:
class ResidualBlock(nn.Module):
    """Residual block computing ``projection(x) + ff_net(x)``.

    The residual branch ``ff_net`` applies normalization, activation, dropout
    and then a linear layer. The skip path is the identity when the input and
    output sizes agree and a bias-free linear projection otherwise.
    """

    def __init__(self, dim_in, dim_out, activation="relu", normalization="none", dropout=0.0):
        super().__init__()
        self.activation = activation.lower() if activation else ""
        branch = [
            make_normalization_layer(normalization, dim_in),
            make_activation_layer(activation),
            make_dropout_layer(dropout),
            nn.Linear(dim_in, dim_out),
        ]
        self.ff_net = nn.Sequential(*branch)
        self.projection = make_projection_layer(dim_in, dim_out)
        self._reset_parameters()

    def _reset_parameters(self):
        """Initialize the branch linear layer(s) and the skip projection."""
        for module in self.ff_net:
            if not isinstance(module, nn.Linear):
                continue
            if self.activation == "relu":
                nn.init.kaiming_uniform_(module.weight, a=0.0, mode="fan_in", nonlinearity="relu")
            elif self.activation == "tanh":
                nn.init.xavier_uniform_(module.weight, gain=nn.init.calculate_gain("tanh"))
            else:
                nn.init.xavier_uniform_(module.weight, gain=1.0)
            nn.init.zeros_(module.bias)
        # The skip path only has weights when a projection was required.
        skip = self.projection
        if isinstance(skip, nn.Linear):
            nn.init.xavier_uniform_(skip.weight, gain=1.0)   # linear mapping

    def forward(self, x):
        """Add the skip path to the residual branch output."""
        shortcut = self.projection(x)
        return shortcut + self.ff_net(x)

In [None]:
class ResidualNetwork(nn.Module):
    """Stack of ``ResidualBlock`` modules followed by a linear output head.

    Args:
        dims: layer widths from input to output; consecutive pairs up to the
            second-to-last entry become residual blocks, and the last pair
            becomes the head.
        activation, normalization, dropout: forwarded to every block.
    """

    def __init__(self, dims, activation="relu", normalization=None, dropout=0.0):
        super().__init__()
        self.dims = dims
        self.activation = activation
        self.normalization = normalization
        self.drop_out = dropout
        # One residual block per consecutive pair of hidden widths.
        blocks = [
            ResidualBlock(width_in, width_out, activation, normalization, dropout)
            for width_in, width_out in zip(dims[:-2], dims[1:-1])
        ]
        self.layers = nn.Sequential(*blocks)
        # Output head: plain linear map, no activation or normalization.
        self.head = nn.Linear(dims[-2], dims[-1])
        self._reset_parameters()

    def _reset_parameters(self):
        """Initialize the head; each block initializes itself on construction."""
        nn.init.xavier_uniform_(self.head.weight, gain=1.0)
        nn.init.zeros_(self.head.bias)

    def forward(self, x):
        """Run the residual stack, then the output head."""
        hidden = self.layers(x)
        return self.head(hidden)

<link rel="stylesheet" href="berkeley.css">

<h2 class="cal cal-h2">Playing with the Network</h2>



In [None]:
def make_datasets_from_numpy(x, t, split_ratio=0.8, seed=189):
    """Create training and test datasets from numpy data arrays.

    Args:
        x: array of features, converted to a float32 tensor on ``device``.
        t: array of integer class labels, converted to a long tensor on ``device``.
        split_ratio: fraction of the samples placed in the training split.
        seed: seed for the random split, so the partition is reproducible.

    Returns:
        ``(train_data, test_data)`` as produced by ``random_split``.
    """
    from torch.utils.data import random_split, TensorDataset
    data = TensorDataset(
        torch.tensor(x, dtype=torch.float32, device=device),
        torch.tensor(t, dtype=torch.long, device=device),
    )
    # The split only permutes indices, which happens on the CPU regardless of
    # where the data lives, so the generator must be a CPU generator. With
    # device == "cpu" this is identical to the previous behavior.
    generator = torch.Generator().manual_seed(seed)
    train_data, test_data = random_split(
        data, [split_ratio, 1 - split_ratio],
        generator=generator)
    return train_data, test_data

def to_numpy(tensor):
    """Convert a PyTorch tensor to a NumPy array.

    The tensor is detached first so that tensors which track gradients
    (e.g. raw model outputs) can be converted, then moved to the CPU.
    """
    return tensor.detach().cpu().numpy()

In [None]:
from sklearn.datasets import make_blobs, make_moons, make_circles
n = 128
# Alternative toy datasets (uncomment one pair to switch):
# centers = 5 # number of classes
# x, t = make_blobs(n_samples=n, centers=centers, cluster_std=1.5, random_state=36)
# centers = 2
# x, t = make_moons(n_samples=n, noise=0.2, random_state=42)
centers = 2
x, t = make_circles(n_samples=n, noise=0.2, factor=0.5, random_state=42)

# Half of the points are used for training and half for validation.
training_data, validation_data = make_datasets_from_numpy(x, t, split_ratio=0.5, seed=189)

# Collect both splits in one frame (training rows first) for plotting.
data = pd.DataFrame()
data[["x1", "x2"]] = to_numpy(torch.vstack([training_data[:][0], validation_data[:][0]]))
data["t"] = to_numpy(torch.hstack([training_data[:][1], validation_data[:][1]])).astype(str)
data["kind"] = ["train"] * len(training_data) + ["val"] * len(validation_data)
# `labels` is keyed by column name, so the class column is 't' (not 'color').
data_fig = px.scatter(data, x="x1", y="x2", color="t", symbol="kind",
                 category_orders={'t': [str(i) for i in range(centers)]},
                 labels={'x1': 'Feature 1', 'x2': 'Feature 2', 't': 'Class'},
                 width=500, height=500)
data_fig.update_traces(marker=dict(size=10, line=dict(width=1, color="DarkSlateGrey")))
data_fig.show()

In [None]:
def minibatch_gd(model, loss_fn, 
                 training_data,
                 batch_size, 
                 nepochs, 
                 learning_rate,
                 visualizer=None,
                 weight_decay=1e-4):
    """Train ``model`` in place with mini-batch AdamW.

    Args:
        model: the ``nn.Module`` to optimize.
        loss_fn: callable ``loss_fn(predictions, targets)`` returning a scalar loss.
        training_data: dataset yielding ``(input, target)`` pairs.
        batch_size: number of samples per mini-batch.
        nepochs: number of passes over ``training_data``.
        learning_rate: AdamW step size.
        visualizer: optional callable ``visualizer(epoch, model, loss_fn)``,
            invoked after every epoch with the model in eval mode.
        weight_decay: AdamW weight-decay coefficient.
    """
    # Create a dataloader for training
    from torch.utils.data import DataLoader
    # Shuffling permutes indices on the CPU, so the generator must be a CPU
    # generator no matter where the data or model live.
    generator = torch.Generator()
    generator.manual_seed(189)
    loader = DataLoader(training_data, 
                        batch_size=batch_size, 
                        shuffle=True, # shuffles each epoch
                        generator=generator)
    
    # Define the optimizer (this is the update rule)
    # optimizer = torch.optim.SGD(model.parameters(), learning_rate)
    # Alternatively, you can use Adam optimizer
    optimizer = torch.optim.AdamW(model.parameters(), learning_rate, weight_decay=weight_decay)
    
    # Loop through the epochs
    for epoch in range(nepochs):
        # Loop through all the batches
        for (x, t) in loader:
            # Zero the gradients to start the next step
            optimizer.zero_grad()
            # Compute prediction and loss
            pred = model(x)
            loss = loss_fn(pred, t)
            # Backpropagation (compute the gradient)
            loss.backward()
            # Update the parameters using the optimizer's update rule
            optimizer.step()
        
        # Visualize the model (if a visualizer function is provided)
        if visualizer is not None:
            model.eval() # disable dropout/batchnorm
            with torch.no_grad():
                visualizer(epoch, model, loss_fn)
            model.train()


In [None]:
from ipywidgets import interactive, IntSlider, FloatSlider, Dropdown, HBox, VBox, Layout

# Placeholder linear model, used only to draw the initial decision surface.
model = MLPModel(dims=[2, centers], activation="relu")
pred_fig = go.FigureWidget(data=data_fig.data, layout=data_fig.layout)
# hide the color scale for the contour plot
pred_fig.update_layout(coloraxis_showscale=False, 
                       margin=dict(l=10, r=10, t=10, b=10),
                       height=400, width=500)
# Trace order must match what LossVisualizer writes:
# 0 = validation loss, 1 = training loss, 2 = validation error rate.
loss_fig = go.FigureWidget()
loss_fig.add_trace(go.Scatter(x=[], y=[], mode='lines', name='Val. Loss'))
loss_fig.add_trace(go.Scatter(x=[], y=[], mode='lines', name='Train. Loss'))
# LossVisualizer records the fraction of misclassified points, i.e. error.
loss_fig.add_trace(go.Scatter(x=[], y=[], mode='lines', name='Val. Error'))
loss_fig.update_layout(
    height=300, xaxis_title='Epochs', 
    margin=dict(l=10, r=10, t=10, b=10),
    yaxis_title='Loss (Cross Entropy)')
x_val, t_val = validation_data[:]
loss_visualizer = LossVisualizer(
    training_data=training_data,
    validation_data=validation_data,
    loss_fig=loss_fig)
# The plotting grid is sized from the range of the validation features.
boundary_visualizer = DecisionBoundaryVisualizer(
    pred_fig=pred_fig,
    x1_range=(to_numpy(x_val[:, 0]).min(), to_numpy(x_val[:, 0]).max()),
    x2_range=(to_numpy(x_val[:, 1]).min(), to_numpy(x_val[:, 1]).max()),
    num_points=50,
    plot_probs=True)
# The contour must be the LAST trace: the visualizer updates pred_fig.data[-1].
pred_fig.add_trace(boundary_visualizer.plot_decision_boundary(model))


def update_model(
        model_type,
        n_layers, neurons_per_layer, activation_fn, normalization, dropout, learning_rate, 
        weight_decay, batch_size, epochs):
    """Build a fresh model from the widget settings and train it with live plots.

    Args:
        model_type: ``'MLP'`` or ``'ResNet'``.
        n_layers: number of hidden layers.
        neurons_per_layer: width of every hidden layer.
        activation_fn, normalization, dropout: forwarded to the model constructor.
        learning_rate, weight_decay, batch_size, epochs: forwarded to ``minibatch_gd``.

    Raises:
        ValueError: if ``model_type`` is not one of the supported names.
    """
    # setup the model
    layers = [2] + ([neurons_per_layer] * n_layers) + [centers]
    if model_type == 'MLP':
        model = MLPModel(dims=layers, activation=activation_fn, normalization=normalization, dropout=dropout)
    elif model_type == 'ResNet':
        model = ResidualNetwork(dims=layers, activation=activation_fn, normalization=normalization, dropout=dropout)
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError(f"Unknown model_type {model_type!r}; expected 'MLP' or 'ResNet'.")
    # Start each run from empty loss curves.
    loss_visualizer.reset()
    minibatch_gd(
        model=model, 
        loss_fn=nn.CrossEntropyLoss(), 
        training_data=training_data,
        batch_size=batch_size, 
        nepochs=epochs, 
        learning_rate=learning_rate,
        # Run both visualizers after every epoch.
        visualizer=lambda epoch, model, loss_fn: [
            boundary_visualizer(epoch, model, loss_fn),
            loss_visualizer(epoch, model, loss_fn)
        ],
        weight_decay=weight_decay)

# One widget per `update_model` argument; the keyword names must match that
# function's parameter names.
controls = interactive(
    update_model, 
    model_type=Dropdown(options=['MLP', 'ResNet'], value='MLP', description="Model Type", continuous_update=False),
    n_layers=IntSlider(min=1, max=20, step=1, value=5, description="Layers", continuous_update=False),
    neurons_per_layer=IntSlider(min=4, max=128, step=1, value=10, description="Neurons/Layer", continuous_update=False),
    activation_fn=Dropdown(options=['relu', 'tanh',], value='tanh', description="Activation", continuous_update=False),
    normalization=Dropdown(options=['batch', 'layer', 'none'], value='none', description="Normalization", continuous_update=False),
    dropout=FloatSlider(min=0, max=0.9, step=0.1, value=0.0, description="Dropout", continuous_update=False),
    learning_rate=FloatSlider(min=0.00001, max=.1, step=0.0005, value=0.001, description="Learning Rate", continuous_update=False),
    weight_decay=FloatSlider(min=1e-7, max=1, step=1e-5, value=1e-4, description="Weight Decay", continuous_update=False),
    batch_size=IntSlider(min=1, max=128, step=8, value=32, description="Batch Size", continuous_update=False),
    epochs=IntSlider(min=10, max=500, step=10, value=100, description="Epochs", continuous_update=False))
# Layout: controls beside the decision-surface plot, loss curves underneath.
display(VBox([HBox([controls, pred_fig,], layout=Layout(align_items="center")), loss_fig]))
    
