In [2]:
import numpy as np
import matplotlib.pyplot as plt
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
plt.style.use('seaborn-whitegrid')

# Gradient Descent with Neural Networks

Given $X$ and $T$, find $w_k$ that minimizes the squared error in the $k^{th}$ output, then use it to make predictions.

Collect all $w_k$ as columns in $W$. $\tilde{X}$ denotes $X$ with a column of constant 1's prepended as the first column. The target value for the $k^{th}$ output for the $n^{th}$ sample is $t_{n,k}$.

$$
\begin{align*}
  E(W) &= \sum_{n=1}^N \sum_{k=1}^K (t_{n,k} - \tilde{x}_n^T w_k)^2 \\
   W &= (\tilde{X}^T \tilde{X})^{-1} \tilde{X}^T T \\
\\
W &= \begin{bmatrix}
  w_{0,1} & w_{0,2} & \cdots & w_{0,K}\\
  w_{1,1} & w_{1,2} & \cdots & w_{1,K}\\
  \vdots\\
  w_{D,1} & w_{D,2} & \cdots & w_{D,K}
\end{bmatrix} \\
\\
Y &= \tilde{X} W \\
\\
\tilde{X} & \text{ is } N \times (D+1) \\
W & \text{ is } (D+1) \times K \\
Y & \text{ is } N \times K \\

\end{align*}
$$

## Adding Nonlinear Combinations of Inputs

Transform $X$ into $\Phi(X)$.

$$
\Phi(X) = \Phi\left ( 
\begin{bmatrix}
    x_{0,1} & x_{0,2} & \cdots \\
    x_{1,1} & x_{1,2} & \cdots \\
    \vdots\\
    x_{N-1,1} & x_{N-1,2} & \cdots
  \end{bmatrix}
\right ) = 
\begin{bmatrix}
    x_{0,1} & x_{0,2}^5  & x_{0,2}^3 x_{0,4}^2 & \cdots \\
    x_{1,1} & x_{1,2}^5  & x_{1,2}^3 x_{1,4}^2 & \cdots\\
    \vdots\\
    x_{N-1,1} & x_{N-1,2}^5 & x_{N-1,2}^3 x_{N-1,4}^2 & \cdots
  \end{bmatrix}
$$

Replace $X$ by $\Phi$, where $\Phi_n = \Phi(x_n)$, and minimize:

$$
E_k = \sum_{n=1}^N (t_{n,k} - \tilde{\Phi}_n^T w_k)^2
$$

where we get:

$$
\begin{align*}
W &= (\tilde{\Phi}^T \tilde{\Phi})^{-1} \tilde{\Phi}^T T \\
Y &= \tilde{\Phi} W \\
\end{align*}
$$

## Structure of a Two Layer Network

$$
\begin{align*}
\tilde{Z} & = \tilde{h}(\tilde{X} V)\\
Y & = \tilde{Z} W\\
Y & = \tilde{h}(\tilde{X} V) W 
\end{align*}
$$

We have a hidden layer $Z$ and an output layer $Y$, with an activation function $h$ applied in the hidden layer. Common choices for $h$ are:

$$
\begin{array}{l c c}
Name & Function & Derivative \\
\hline \\
\text{Logistic} & \sigma(x) = \frac{1}{1 + e^{-x}} & \sigma(x)(1 - \sigma(x)) \\
\\
\text{Hyperbolic Tangent} & tanh(x) = \frac{e^x - e^{-x}}{e^x + e^{-x}} & 1 - tanh(x)^2 \\
\\
\text{Rectified Linear Unit} & (x)^+ = max(0, x) & 0 \text{ if } x < 0 \text{ else } 1 \\
\hline \\
\end{array}
$$

### Gradient Descent

The mean squared error between each target value $t_{n,k}$ and output value $y_{n,k}$ is

$$
  E = \frac{1}{N} \frac{1}{K} \sum_{n=1}^N \sum_{k=1}^K ( t_{n,k} - y_{n,k})^2
$$

Make small changes to weights $v_{j,m}$ and $w_{m,k}$ in the negative gradient direction by step size $\rho$.

$$
\begin{align*}
  v_{j,m} &\leftarrow v_{j,m} - \rho \frac{\partial E}{\partial v_{j,m}}\\
  w_{m,k} &\leftarrow w_{m,k} - \rho \frac{\partial E}{\partial w_{m,k}}
\end{align*}
$$

Now the derivation

$$
\begin{align*}
  E &= \frac{1}{N} \frac{1}{K} \sum_{n=1}^N \sum_{k=1}^K (t_{n,k} - y_{n,k})^2\\
  
  y_{n,k'} &= \sum_{m'=0}^M  w_{m',k'} \tilde{z}_{n,m'}\\
  
  &  = \sum_{m'=0}^M  w_{m',k'} \tilde{h}\left ( \sum_{j'=0}^D
    v_{j',m'} \tilde{x}_{n,j'} \right )\\

  \frac{\partial E}{\partial w_{m,k}} & = -2 \frac{1}{N} \frac{1}{K} \sum_{n=1}^N \sum_{k'=1}^K (t_{n,k'} - y_{n,k'}) \frac{\partial y_{n,k'}}{\partial w_{m,k}}\\
  
  & = -2 \frac{1}{N} \frac{1}{K} \sum_{n=1}^N \sum_{k'=1}^K (t_{n,k'} - y_{n,k'}) 
  
  \frac{\partial \left ( \sum_{m'=0}^M w_{m',k'} \tilde{z}_{n,m'}  \right )}{\partial w_{m,k}}\\
  
  & = -2 \frac{1}{N} \frac{1}{K} \sum_{n=1}^N (t_{n,k} - y_{n,k}) \tilde{z}_{n,m} \\
    \frac{\partial E}{\partial v_{j,m}} & = -2 \frac{1}{N} \frac{1}{K} \sum_{n=1}^N \sum_{k'=1}^K (t_{n,k'} - y_{n,k'}) \frac{\partial y_{n,k'}}{\partial  v_{j,m}}\\

  & = -2 \frac{1}{N} \frac{1}{K} \sum_{n=1}^N \sum_{k'=1}^K (t_{n,k'} - y_{n,k'}) 
  \frac{\partial \left ( \sum_{m'=0}^M  w_{m',k'} \tilde{h} \left ( \sum_{j'=0}^D v_{j',m'} \tilde{x}_{n,j'} \right ) \right )}{\partial v_{j,m}}\\ 
  \text{Let } a_{n,m'} & = \sum_{j'=0}^D v_{j',m'} \tilde{x}_{n,j'} \quad \text{so that } \tilde{z}_{n,m'} = \tilde{h}(a_{n,m'})\\
  \frac{\partial E}{\partial v_{j,m}} & = -2 \frac{1}{N} \frac{1}{K} \sum_{n=1}^N \sum_{k'=1}^K (t_{n,k'} - y_{n,k'})  \sum_{m'=0}^M  w_{m',k'} 
  \frac{\partial \tilde{h}(a_{n,m'})}{\partial  a_{n,m'}} \frac{\partial a_{n,m'}}{\partial v_{j,m}}\\
  & = -2 \frac{1}{N} \frac{1}{K} \sum_{n=1}^N \sum_{k'=1}^K (t_{n,k'} - y_{n,k'})  w_{m,k'} \frac{\partial \tilde{h}(a_{n,m})}{\partial a_{n,m}} \tilde{x}_{n,j}

\end{align*}
$$

$$
$$

To summarize, with $h = \tanh$ (so that the derivative of $h$ at hidden unit $m$ is $1 - z_{n,m}^2$):

$$
\begin{align*}
      E &= \frac{1}{N} \frac{1}{K} \sum_{n=1}^N \sum_{k=1}^K (t_{n,k} - y_{n,k})^2\\
      \frac{\partial E}{\partial w_{m,k}} & = -2 \frac{1}{N}  \frac{1}{K}
       \sum_{n=1}^N (t_{n,k} - y_{n,k}) \tilde{z}_{n,m}\\
      \frac{\partial E}{\partial v_{j,m}} & = -2 \frac{1}{N} \frac{1}{K} \sum_{n=1}^N \sum_{k=1}^K (t_{n,k} - y_{n,k})   w_{m,k} (1-z_{n,m}^2) \tilde{x}_{n,j}
\end{align*}
$$

Forward pass:

$$
\begin{align*}
      z_{n,m} &= h(\sum_{j=0}^D v_{j,m} \tilde{x}_{n,j})\\
      y_{n,k} &= \sum_{m=0}^M w_{m,k} \tilde{z}_{n,m}
\end{align*}
$$

Backward pass (the constant factor of 2 from the gradient is absorbed into the step sizes $\rho_h$ and $\rho_o$):

$$
\begin{align*}
      v_{j,m} & \leftarrow v_{j,m} + \rho_h \frac{1}{N} \frac{1}{K} \sum_{n=1}^N \sum_{k=1}^K (t_{n,k} - y_{n,k})   w_{m,k} (1-z_{n,m}^2) \tilde{x}_{n,j}\\
 w_{m,k} & \leftarrow w_{m,k} + \rho_o \frac{1}{N}  \frac{1}{K}  \sum_{n=1}^N (t_{n,k} - y_{n,k}) \tilde{z}_{n,m}
\end{align*}
$$