In [2]:
import numpy as np
import matplotlib.pyplot as plt

# Apply the matplotlib style sheet stored under utils/
# (the path is relative to the directory the notebook is run from).
plt.style.use("utils/deeplearning.mplstyle")

import tensorflow as tf

from utils.lab_utils_common import sigmoid
import logging

# Restrict TensorFlow's Python logger to ERROR level and above, and set
# AutoGraph's verbosity to 0.
logging.getLogger("tensorflow").setLevel(logging.ERROR)
tf.autograph.set_verbosity(0)

In [3]:
# Alias the imported sigmoid helper as g; the dense-layer functions below call g as their activation.
g = sigmoid

## Original Implementation 

<center> <img src="images/numpy_nn_implementation.png" width="750" height="450"> </center> 

In [4]:
def my_dense(a_in, W, b):
    """
    Dense layer forward pass for a single example, computed one unit at a time.

    Args:
      a_in (ndarray (n, )) : 1 example (row of the data) or the previous layer's activations, n features
      W    (ndarray (n, j)) : weight matrix; column k holds the n weights of unit k, j units in total
      b    (ndarray (j, )) : bias vector, one entry per unit
    Returns
      a_out (ndarray (j,))  : activation value of each of the j units
    """
    # The number of columns of W is the number of units (neurons) in this layer.
    n_units = W.shape[1]
    activations = np.zeros(n_units)
    # Iterating over the rows of W.T visits the columns of W, i.e. one weight vector per unit.
    for k, unit_weights in enumerate(W.T):
        # Linear predictor of unit k: z = w . a_in + b_k
        z_k = np.dot(unit_weights, a_in) + b[k]
        # Activation of unit k: pass z through g (the sigmoid alias defined earlier in the notebook).
        activations[k] = g(z_k)
    return activations

In [5]:
# Toy inputs for the loop-based layer: one example with n = 2 features ...
x = np.array([200, 17])
# ... a (2, 3) weight matrix, i.e. 3 units with 2 weights each (one column per unit) ...
W = np.array([[1, -3, 5], [-2, 4, -6]])
# ... and one bias per unit.
b = np.array([-1, 1, 2])

# Display the three arrays as the cell output.
x, W, b

(array([200,  17]),
 array([[ 1, -3,  5],
        [-2,  4, -6]]),
 array([-1,  1,  2]))

In [6]:
# Forward pass through the loop-based layer; the result is a 1-D array with one activation per unit.
my_dense(x, W, b)

array([1.00e+000, 7.12e-218, 1.00e+000])

## Vectorized Implementation

Instead of the for loop

```python
a_out = np.zeros(units)
for j in range(units):               
    w = W[:,j]                       
    z = np.dot(w, a_in) + b[j]       
    a_out[j] = g(z) 
```

which computes the linear predictor and applies the sigmoid (logistic) activation for each neuron $j$ in the layer sequentially, we can use matrix multiplication to vectorize these computations. Note that the following computations are equivalent. Suppose we have the following two layers:

* layer $l$ with $j$ neuron units
* layer $l-1$ with $n$ neuron units

\begin{align*}
a^{l}_{1} &= g(\underbrace{\mathbf{w}^{l}_{1}}_{n \times 1} \cdot \underbrace{\vec{a}^{l-1}}_{n \times 1} + b^{l}_{1}) = 
g(\underbrace{(\vec{a}^{l-1})^{T}}_{1 \times n}\underbrace{\mathbf{w}^{l}_{1}}_{n\times1} + b^{l}_{1}) \\
a^{l}_{2} &= g(\underbrace{\mathbf{w}^{l}_{2}}_{n \times 1} \cdot \underbrace{\vec{a}^{l-1}}_{n \times 1} + b^{l}_{2}) =
g(\underbrace{(\vec{a}^{l-1})^{T}}_{1 \times n}\underbrace{\mathbf{w}^{l}_{2}}_{n\times1} + b^{l}_{2}) \\
& \hspace{10mm} \vdots \\
a^{l}_{j} &= g(\underbrace{\mathbf{w}^{l}_{j}}_{n \times 1} \cdot \underbrace{\vec{a}^{l-1}}_{n \times 1} + b^{l}_{j}) = 
g(\underbrace{(\vec{a}^{l-1})^{T}}_{1 \times n}\underbrace{\mathbf{w}^{l}_{j}}_{n\times1} + b^{l}_{j})
\end{align*}

If we load the above $j$ scalars into a row vector $\underbrace{(\vec{a}^{l})^{T}}_{1 \times j} = \begin{bmatrix}a^{l}_{1} & a^{l}_{2} & \ldots & a^{l}_{j}\end{bmatrix}=g(\underbrace{\vec{z}^{l}}_{1 \times j})$, we notice that the computations above can be first expressed as a matrix product as follows:

\begin{align*}
\underbrace{\vec{z}^{l}}_{1 \times j}&=\textcolor{orange}{\underbrace{(\vec{a}^{l-1})^{T}}_{1 \times n}
\underbrace{
\begin{bmatrix} 
\vert & \vert & & \vert  \\
\mathbf{w}^{l}_{1} & \mathbf{w}^{l}_{2} & \ldots & \mathbf{w}^{l}_{j} \\
\vert & \vert & & \vert 
\end{bmatrix}}_{n \times j}} + \underbrace{\begin{bmatrix}b^{l}_{1} & b^{l}_{2} & \ldots & b^{l}_{j}\end{bmatrix}}_{1\times j} \\
&=\underbrace{\begin{bmatrix}a^{l-1}_{1} & a^{l-1}_{2} & \ldots & a^{l-1}_{n}\end{bmatrix}}_{1 \times n}
\underbrace{
\begin{bmatrix} 
\vert & \vert & & \vert  \\
\mathbf{w}^{l}_{1} & \mathbf{w}^{l}_{2} & \ldots & \mathbf{w}^{l}_{j} \\
\vert & \vert & & \vert 
\end{bmatrix}}_{n \times j} + \underbrace{\begin{bmatrix}b^{l}_{1} & b^{l}_{2} & \ldots & b^{l}_{j}\end{bmatrix}}_{1\times j} \\
&=\begin{bmatrix}\underbrace{(\vec{a}^{l-1})^{T}}_{1 \times n}\underbrace{\mathbf{w}^{l}_{1}}_{n\times1} + b^{l}_{1} & \underbrace{(\vec{a}^{l-1})^{T}}_{1 \times n}\underbrace{\mathbf{w}^{l}_{2}}_{n\times1} + b^{l}_{2} & \ldots & \underbrace{(\vec{a}^{l-1})^{T}}_{1 \times n}\underbrace{\mathbf{w}^{l}_{j}}_{n\times1} + b^{l}_{j} \end{bmatrix} \\
&=\underbrace{\begin{bmatrix}z^{l}_{1} & z^{l}_{2} & \ldots & z^{l}_{j}\end{bmatrix}}_{1\times j}
\end{align*}

The expression highlighted in orange is a vector-matrix product: it computes the dot product between the row vector ($\underbrace{(\vec{a}^{l-1})^{T}}_{1 \times n}$) and each of the column vectors ($\underbrace{\mathbf{w}^{l}_{i}}_{n \times 1}$, $i = 1, \ldots, j$) of the weight matrix. 

Then, we can apply the sigmoid function to each element of the row vector above:

\begin{align*}
\underbrace{(\vec{a}^{l})^{T}}_{1 \times j}&=\begin{bmatrix}g(z^{l}_{1}) & g(z^{l}_{2}) & \ldots & g(z^{l}_{j})\end{bmatrix}
\end{align*}

In [7]:
def my_dense_vectorized(A_in, W, B):
    """
    Dense layer forward pass computed with a single matrix product.

    Args:
      A_in (ndarray (1, n)) : 1 example stored as a row vector with n features
      W    (ndarray (n, j)) : weight matrix; column k holds the n weights of unit k, j units in total
      B    (ndarray (1, j)) : bias row vector with 1 row and j columns (one per unit)
    Returns
      A_out (ndarray (1, j))  : activation values of the j units, as a row vector
    """
    # (1, n) matmul (n, j) -> (1, j): the weighted sum of every unit in one product.
    weighted_sum = np.matmul(A_in, W)
    # Add the biases to get the linear predictor Z, then pass it through the
    # activation g (the sigmoid alias defined earlier in the notebook).
    return g(weighted_sum + B)

In [8]:
# Same values as in the loop-based example, but with the example and the biases stored as 2-D row vectors.
X = np.array([[200, 17]])
W = np.array([[1, -3, 5], [-2, 4, -6]])
B = np.array([[-1, 1, 2]])

# Display the shapes to check they line up for the matrix product: (1, n) x (n, j) + (1, j).
X.shape, W.shape, B.shape

((1, 2), (2, 3), (1, 3))

In [9]:
# Forward pass through the vectorized layer; the result is a 2-D (1, 3) row of activations.
my_dense_vectorized(X, W, B)

array([[1.00e+000, 7.12e-218, 1.00e+000]])

The difference between the two implementations is that the first one, based on a for loop, works with 1-D arrays (the inputs `a_in` and `b` as well as the output), whereas the second one takes `A_in` and `B` as 2-D arrays (row vectors) and returns a 2-D array. The weight matrix `W` is 2-D in both implementations.