<a href="https://colab.research.google.com/github/zetaqubit/udlbook/blob/main/Notebooks/Chap07/7_3_Initialization_soln.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Notebook 7.3: Initialization**

This notebook explores weight initialization in deep neural networks as described in section 7.5 of the book.

Work through the cells below, running each cell in turn. In various places you will see the words "TO DO". Follow the instructions at these places and make predictions about what is going to happen or write code to complete the functions.

Contact me at udlbookmail@gmail.com if you find any mistakes or have any suggestions.

In [1]:
import numpy as np
import matplotlib.pyplot as plt

First let's define a neural network. We'll just choose the weights randomly (and set the biases to zero) for now.

In [2]:
def init_params(K, D, sigma_sq_omega):
  """Create the initial weights and biases for a network with scalar input and output.

  The global NumPy random generator is re-seeded with 0 on every call, so the
  same arguments always produce the same parameters.

  Args:
    K: index of the last (output) layer; K+1 weight matrices are created.
    D: number of units in each hidden layer.
    sigma_sq_omega: variance of the zero-mean normal distribution the weights
      are drawn from.

  Returns:
    (all_weights, all_biases): two lists of length K+1.  Entry 0 maps the input
    to the first hidden layer, entry K maps the last hidden layer to the output,
    and the entries in between are D x D.  All biases are zero column vectors.
  """
  # Fix the seed so every call yields the same random numbers
  np.random.seed(0)

  # The network takes a single input and produces a single output
  n_in, n_out = 1, 1

  # Standard-normal samples are rescaled by this to get the requested variance
  std_omega = np.sqrt(sigma_sq_omega)

  def sample_weights(n_rows, n_cols):
    return np.random.normal(size=(n_rows, n_cols)) * std_omega

  # One slot per layer, filled in below
  all_weights = [None for _ in range(K+1)]
  all_biases = [None for _ in range(K+1)]

  # First and last layers (drawn before the intermediate layers)
  all_weights[0] = sample_weights(D, n_in)
  all_weights[-1] = sample_weights(n_out, D)
  all_biases[0] = np.zeros((D, 1))
  all_biases[-1] = np.zeros((n_out, 1))

  # Hidden-to-hidden layers
  for layer in range(1, K):
    all_weights[layer] = sample_weights(D, D)
    all_biases[layer] = np.zeros((D, 1))

  return all_weights, all_biases

In [3]:
# Define the Rectified Linear Unit (ReLU) function
def ReLU(preactivation):
  """Return a copy of the array `preactivation` with every value below zero replaced by zero."""
  return preactivation.clip(min=0.0)

In [4]:
def compute_network_output(net_input, all_weights, all_biases):
  """Run the forward pass and keep every intermediate quantity.

  Args:
    net_input: the input to the network; stored unchanged as all_h[0].
    all_weights: list of K+1 weight matrices.
    all_biases: list of K+1 bias vectors.

  Returns:
    (net_output, all_f, all_h) where all_f[k] holds the pre-activations of
    layer k, all_h[k+1] holds ReLU(all_f[k]), and net_output is all_f[K].
  """
  # Index of the final (output) layer
  last = len(all_weights) - 1

  # Pre-activations go in all_f and activations in all_h.
  # By convention all_h[0] is the input and all_f[last] is the output.
  all_f = [None for _ in range(last + 1)]
  all_h = [None for _ in range(last + 1)]
  all_h[0] = net_input

  # Hidden layers: fills all_f[0...last-1] and all_h[1...last] (eqn 7.5)
  for idx in range(last):
    pre = all_biases[idx] + all_weights[idx] @ all_h[idx]
    all_f[idx] = pre
    all_h[idx+1] = ReLU(pre)

  # Final layer is linear: no ReLU is applied to the output
  net_output = all_biases[last] + all_weights[last] @ all_h[last]
  all_f[last] = net_output

  return net_output, all_f, all_h

Now let's investigate how the size of the hidden unit activations varies as we change the initialization variance:


In [13]:
# Number of hidden layers
K = 50
# Width of each hidden layer
D = 80
# Size of the network input
D_i = 1
# Size of the network output
D_o = 1
# Variance of the zero-mean normal distribution the initial weights are drawn from
sigma_sq_omega = 0.03
# Build the initial weights and biases
all_weights, all_biases = init_params(K,D,sigma_sq_omega)

# Push a batch of random scalar inputs (one per column) through the network
n_data = 1000
data_in = np.random.normal(size=(1,n_data))
net_output, all_f, all_h = compute_network_output(data_in, all_weights, all_biases)

# Report the spread of the values at each level; index 0 is the network input itself
for layer, activations in enumerate(all_h[:K]):
  print("Layer %d, std of hidden units = %3.3f"%(layer, np.std(activations)))

Layer 0, std of hidden units = 0.981
Layer 1, std of hidden units = 0.108
Layer 2, std of hidden units = 0.093
Layer 3, std of hidden units = 0.110
Layer 4, std of hidden units = 0.145
Layer 5, std of hidden units = 0.175
Layer 6, std of hidden units = 0.171
Layer 7, std of hidden units = 0.174
Layer 8, std of hidden units = 0.197
Layer 9, std of hidden units = 0.188
Layer 10, std of hidden units = 0.179
Layer 11, std of hidden units = 0.206
Layer 12, std of hidden units = 0.199
Layer 13, std of hidden units = 0.212
Layer 14, std of hidden units = 0.233
Layer 15, std of hidden units = 0.250
Layer 16, std of hidden units = 0.282
Layer 17, std of hidden units = 0.328
Layer 18, std of hidden units = 0.386
Layer 19, std of hidden units = 0.415
Layer 20, std of hidden units = 0.590
Layer 21, std of hidden units = 0.546
Layer 22, std of hidden units = 0.599
Layer 23, std of hidden units = 0.692
Layer 24, std of hidden units = 0.865
Layer 25, std of hidden units = 1.037
Layer 26, std of hidde

In [None]:
# You can see that the values of the hidden units are increasing on average (the standard deviation is across all hidden units at the layer
# and the 1000 training examples)

# TO DO
# Change this to 50 layers with 80 hidden units per layer

# TO DO
# Now experiment with sigma_sq_omega to try to stop the variance of the forward computation exploding

Now let's define a loss function.  We'll just use the least squares loss function. We'll also write a function to compute dloss_doutput.


In [14]:
def least_squares_loss(net_output, y):
  """Return the sum of squared differences between net_output and the target y."""
  residual = net_output - y
  return np.sum(np.square(residual))

def d_loss_d_output(net_output, y):
  """Return the derivative of the least squares loss with respect to net_output."""
  residual = net_output - y
  return 2*residual

Here's the code for the backward pass

In [15]:
# We'll need the indicator function
def indicator_function(x):
  """Return a copy of x holding 1 where x >= 0 and 0 where x < 0.

  The input itself is left untouched.
  """
  result = np.array(x)
  # Work out both masks from the copied values before overwriting any of them
  non_negative = result >= 0
  negative = result < 0
  result[non_negative] = 1
  result[negative] = 0
  return result

# Main backward pass routine
def backward_pass(all_weights, all_biases, all_f, all_h, y):
  """Compute the derivatives of the least squares loss for one forward pass.

  Args:
    all_weights: list of K+1 weight matrices used in the forward pass.
    all_biases: list of K+1 bias vectors (not read here; kept so the signature
      mirrors the forward pass).
    all_f: pre-activations from the forward pass; all_f[K] is the net output.
    all_h: activations from the forward pass; all_h[0] is the net input.
    y: target output.

  Returns:
    (all_dl_dweights, all_dl_dbiases, all_dl_dh, all_dl_df): four lists of
    length K+1 holding the derivatives of the loss with respect to the
    weights, the biases, the activations and the pre-activations.
  """
  # Take the depth from the parameters themselves (as the forward pass does)
  # rather than relying on a notebook-level variable K being defined and current
  K = len(all_weights) - 1

  # We'll store the derivatives dl_dweights and dl_dbiases in lists as well
  all_dl_dweights = [None] * (K+1)
  all_dl_dbiases = [None] * (K+1)
  # And we'll store the derivatives of the loss with respect to the activation and preactivations in lists
  all_dl_df = [None] * (K+1)
  all_dl_dh = [None] * (K+1)
  # Again for convenience we'll stick with the convention that all_h[0] is the net input and all_f[K] is the net output

  # Compute derivatives of the loss with respect to the net output
  all_dl_df[K] = np.array(d_loss_d_output(all_f[K],y))

  # Now work backwards through the network
  for layer in range(K,-1,-1):
    # Calculate the derivatives of the biases at this layer from all_dl_df[layer]. (eq 7.13, line 1)
    all_dl_dbiases[layer] = np.array(all_dl_df[layer])
    # Calculate the derivatives of the weights at this layer from all_dl_df[layer] and all_h[layer] (eq 7.13, line 2)
    all_dl_dweights[layer] = np.matmul(all_dl_df[layer], all_h[layer].transpose())

    # Calculate the derivatives of the activations from the weights and the derivatives of the next preactivations (eq 7.13, line 3, second part)
    all_dl_dh[layer] = np.matmul(all_weights[layer].transpose(), all_dl_df[layer])
    # Calculate the derivatives of the previous pre-activations f from the derivatives of the activations h (eq 7.13, line 3, first part)
    # There is no pre-activation before the input, so stop at layer 0
    if layer > 0:
      all_dl_df[layer-1] = indicator_function(all_f[layer-1]) * all_dl_dh[layer]

  return all_dl_dweights, all_dl_dbiases, all_dl_dh, all_dl_df

Now let's look at what happens to the magnitude of the gradients on the way back.

In [28]:
# Number of hidden layers
K = 50
# Number of neurons per layer
D = 80
# Input layer
D_i = 1
# Output layer
D_o = 1
# Variance of the zero-mean normal distribution the initial weights are drawn from
sigma_sq_omega = 0.028
# Initialize parameters
all_weights, all_biases = init_params(K,D,sigma_sq_omega)

# For simplicity we'll just consider the gradients of the pre-activations at layers 1 to K-1
n_data = 100
aggregate_dl_df = [None] * (K+1)
for layer in range(1,K):
  # These 2D arrays (hidden units x data points) will store the gradients for every data point
  aggregate_dl_df[layer] = np.zeros((D,n_data))


# We'll have to compute the derivatives for each data point separately
for c_data in range(n_data):
  data_in = np.random.normal(size=(1,1))
  y = np.zeros((1,1))
  net_output, all_f, all_h = compute_network_output(data_in, all_weights, all_biases)
  all_dl_dweights, all_dl_dbiases, all_dl_dh, all_dl_df = backward_pass(all_weights, all_biases, all_f, all_h, y)
  for layer in range(1,K):
    aggregate_dl_df[layer][:,c_data] = np.squeeze(all_dl_df[layer])

# The values reported are the derivatives with respect to the pre-activations (dl_df),
# so the label says dl_df rather than dl_dh
for layer in range(1,K):
  print("Layer %d, std of dl_df = %3.3f"%(layer, np.std(aggregate_dl_df[layer].ravel())))


Layer 1, std of dl_dh = 8.810
Layer 2, std of dl_dh = 8.778
Layer 3, std of dl_dh = 9.466
Layer 4, std of dl_dh = 8.765
Layer 5, std of dl_dh = 7.728
Layer 6, std of dl_dh = 7.596
Layer 7, std of dl_dh = 7.508
Layer 8, std of dl_dh = 6.349
Layer 9, std of dl_dh = 6.019
Layer 10, std of dl_dh = 6.465
Layer 11, std of dl_dh = 6.231
Layer 12, std of dl_dh = 5.920
Layer 13, std of dl_dh = 5.109
Layer 14, std of dl_dh = 5.014
Layer 15, std of dl_dh = 4.963
Layer 16, std of dl_dh = 4.269
Layer 17, std of dl_dh = 3.449
Layer 18, std of dl_dh = 3.304
Layer 19, std of dl_dh = 2.798
Layer 20, std of dl_dh = 3.248
Layer 21, std of dl_dh = 2.822
Layer 22, std of dl_dh = 2.362
Layer 23, std of dl_dh = 2.309
Layer 24, std of dl_dh = 1.917
Layer 25, std of dl_dh = 1.859
Layer 26, std of dl_dh = 2.447
Layer 27, std of dl_dh = 1.915
Layer 28, std of dl_dh = 2.005
Layer 29, std of dl_dh = 2.048
Layer 30, std of dl_dh = 2.093
Layer 31, std of dl_dh = 2.028
Layer 32, std of dl_dh = 1.852
Layer 33, std of 

In [None]:
# You can see that the gradients are increasing on average as we work backwards through the network (the standard deviation is across all hidden units at the layer
# and the 100 training examples)

# TO DO
# Change this to 50 layers with 80 hidden units per layer

# TO DO
# Now experiment with sigma_sq_omega to try to stop the variance of the gradients exploding
