In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import matplotlib
# The 'seaborn' style sheets were renamed to 'seaborn-v0_8*' in matplotlib 3.6 and the
# old names were removed in 3.8, so use whichever name this matplotlib version provides.
matplotlib.pyplot.style.use(
    'seaborn-v0_8' if 'seaborn-v0_8' in matplotlib.pyplot.style.available else 'seaborn'
)
# matplotlib.rcParams['figure.figsize'] = (15, 5)

%matplotlib inline

In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression statement in a cell,
# not only the last one (IPython's default is "last_expr").
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Print NumPy floats with 2 digits of precision, in fixed-point rather than scientific notation.
np.set_printoptions(precision=2, suppress=True)

In [None]:
import math
import copy

import scipy.stats as stats

In [None]:
from sklearn import model_selection, metrics, datasets

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Expand layers

$L$ — the number of neurons in the next layer

Each feature vector is multiplied by a $J \times L$ weight matrix _(not by a weight vector, as in regression)_; its second dimension $L$ defines the dimensionality of the next level.

$
\begin{bmatrix}
x_0^0 & x_0^1 & ... & x_0^j \\
... \\
x_n^0 & x_n^1 & ... & x_n^j
\end{bmatrix}
$
$
\begin{bmatrix}
w_0^0 & ... & w_0^{L} \\
... \\
w_j^0 & ... & w_j^{L}
\end{bmatrix}
\to$
$
\begin{bmatrix}
\sum_k x_0^k w_k^0 & ... & \sum_k x_0^k w_k^{L} \\
...\\
\sum_k x_n^k w_k^0 & ... & \sum_k x_n^k w_k^{L}
\end{bmatrix}
$

# Model
##### $Y = XW + \epsilon$

$
\begin{bmatrix}
y_0^0 & ... & y_0^{L} \\
... \\
y_n^0 & ... & y_n^{L}
\end{bmatrix}
=
$
$
\begin{bmatrix}
x_0^0 & x_0^1 & ... & x_0^j \\
... \\
x_n^0 & x_n^1 & ... & x_n^j
\end{bmatrix}
$
$
\begin{bmatrix}
w_0^0 & ... & w_0^{L} \\
... \\
w_j^0 & ... & w_j^{L}
\end{bmatrix}
$
$
+
\begin{bmatrix}
\epsilon_0^0 & ... & \epsilon_0^{L} \\
...\\
\epsilon_n^0 & ... & \epsilon_n^{L}
\end{bmatrix}
$

The previous level's $Y$ becomes the feature matrix (the input $X$) for the current level.

In [None]:
# Three samples (rows), three features each.
X = np.array(
    [
        [1, 2, 5],  # row sum -> 8
        [1, 5, 5],  # row sum -> 11
        [1, 8, 5],  # row sum -> 14
    ],
    dtype=np.float64,
)

# Weight matrix with one row per input feature and one column per output neuron.
# All rows are identical, so output column k is the row sum of X scaled by
# the k-th entry of [1, 2, 1, 0.5].
W = np.tile(np.array([1, 2, 1, 0.5], dtype=np.float64), (3, 1))

# (3 samples x 3 features) @ (3 features x 4 neurons) -> (3 samples x 4 neurons)
Y = X @ W
Y

In [None]:
# Per-sample feature sums, for comparison with the columns of Y.
np.sum(X, axis=1)

### Transpose for more clarity

In [None]:
# The same weights laid out one output neuron per row (4 neurons x 3 inputs),
# so each row reads as "the weights of one neuron".
W2 = np.array([[scale] * 3 for scale in (1, 2, 1, 0.5)], dtype=np.float64)

# Transpose back to (inputs x neurons) before multiplying.
X @ W2.T

### Another order of matrix multiplication

$Y = W \cdot X + \epsilon$

In [None]:
# Emulate an external source that delivers one feature vector per sample.
source_vectors = [
    [1, 2, 5],  # sum -> 8
    [1, 5, 5],  # -> 11
    [1, 8, 5],  # -> 14
]

X = np.array(source_vectors, dtype=np.float64)

# One row per output neuron (4 neurons x 3 inputs).
W = np.array([[scale] * 3 for scale in (1, 2, 1, .5)], dtype=np.float64)

# X.T puts the samples in columns, so Y is (neurons x samples).
Y = W @ X.T
Y
# Transposed, the result has one row per sample.
Y.T

# Activation functions

### Logistic function (sigmoid, the inverse of the logit)

Used in the output layer for binary classification.

### $\sigma(X,w) = \frac{1}{1 + e^{-Xw}}$

### $\frac{d}{d(Xw)} \sigma(X,w) = \frac{1}{1 + e^{-Xw}} \left(1 - \frac{1}{1 + e^{-Xw}}\right)$

In [None]:
def sigmoid(M):
    """Logistic function 1 / (1 + e^(-M)), applied element-wise.

    M may be a number or a NumPy array; the result has the same shape.
    """
    decay = np.exp(-M)
    denominator = 1 + decay
    return 1 / denominator

In [None]:
# Evaluate the sigmoid on the whole grid with one vectorized call
# (the function is element-wise, so no Python loop is needed) and plot it.
xs = np.linspace(-5, 5, 100)
ys = sigmoid(xs)
plt.plot(xs, ys)
plt.title('sigmoid(x)');

# Tanh

### $t(X, w) = \frac{e^{Xw} - e^{-Xw}}{e^{Xw} + e^{-Xw}}$

### $\frac{d}{d(Xw)} t(X, w) = 1 - \left(\frac{e^{Xw} - e^{-Xw}}{e^{Xw} + e^{-Xw}}\right)^2$

In [None]:
def tanh(M):
    """Hyperbolic tangent, element-wise: (e^M - e^(-M)) / (e^M + e^(-M)).

    Delegates to ``np.tanh`` rather than evaluating the quotient literally:
    for large |M| both exponentials overflow to ``inf`` and the literal
    formula yields ``inf / inf = nan`` instead of the correct limit of +/-1.

    M may be a number or a NumPy array; the result has the same shape.
    """
    return np.tanh(M)

In [None]:
# Evaluate tanh on the whole grid with one vectorized call
# (the function is element-wise, so no Python loop is needed) and plot it.
xs = np.linspace(-5, 5, 100)
ys = tanh(xs)
plt.plot(xs, ys)
plt.title('tanh(x)');

# Relu

$r(X, w) = max(0, Xw)$

ReLU is not differentiable at zero; as a hack, the derivative at zero is set to $0$ (this is what `d_relu` below does):

$
r'(X, w) =
\left\{
    \begin{array}{ll}
		0 & \mbox{if } Xw \le 0 \\
		1 & \mbox{if } Xw > 0
	\end{array}
\right.
$

In [None]:
def relu(M):
    """Rectified linear unit: element-wise max(0, M).

    M may be a number or a NumPy array; the result has the same shape.
    """
    floor = 0
    rectified = np.maximum(floor, M)
    return rectified

def d_relu(x):
    """Derivative of ReLU, element-wise: 0 where x <= 0, 1 where x > 0.

    The computation runs on a copy, so the caller's array is left untouched.
    Overwriting the argument in place would destroy the pre-activation values
    the caller passed in, which are usually still needed elsewhere.

    Parameters
    ----------
    x : array or array-like
        Values at which to evaluate the derivative.

    Returns
    -------
    numpy.ndarray
        New array with the same shape and dtype as ``x``, holding 0 or 1.
        NaN entries are left as NaN.
    """
    grad = np.array(x, copy=True)
    grad[grad <= 0] = 0
    grad[grad > 0] = 1
    return grad

In [None]:
# Evaluate ReLU on the whole grid with one vectorized call
# (the function is element-wise, so no Python loop is needed) and plot it.
xs = np.linspace(-5, 5, 100)
ys = relu(xs)
plt.plot(xs, ys)
plt.title('relu(x)');

# Forward propagation

$Z^{[1]} = W^{[1]} X + b^{[1]}$

$A^{[1]} = relu^{[1]}(Z^{[1]})$

---

$Z^{[2]} = W^{[2]} A^{[1]} + b^{[2]}$

$A^{[2]} = relu^{[2]}(Z^{[2]})$

# Backward propagation derivation intuition

---

##### Forward propagation graph:
$X \to \sigma(W^{[1]}X + b^{[1]}) \to \sigma(W^{[2]} A^{[1]} + b^{[2]}) \to \sigma(W^{[l]} A^{[l-1]} + b^{[l]}) \to  ... \to \sigma(W^{[L]} A^{[L-1]} + b^{[L]}) \to \frac{1}{N} \sum Cost \left( \hat{y_i}, y_i \right)$

Omit the subscript indices and pretend everything is in 'matrix' form:

$z^{[L]} = w^{[L]} a^{[L-1]} + b^{[L]}$

$a^{[L]} = \sigma(z^{[L]})$

# Chain rule (Derivation for last layer):

Assume cost is MSE: $Cost = \left( a^{[L]} - y \right)^2$

Chain rule graph:

$Cost \to a^{[L]} \to z^{[L]} \to (w^{[L]}, a^{[L-1]}, b^{[L]})$

---

## $\partial w^{[L]}$:

# $\frac{\partial Cost}{\partial w^{[L]}} =
\frac{\partial  z^{[L]}}{\partial w^{[L]}}
\frac{\partial  a^{[L]}}{\partial z^{[L]}}
\frac{\partial  Cost}{\partial a^{[L]}}
$

# $\frac{\partial Cost}{\partial w^{[L]}} =
a^{[L - 1]} \cdot
\sigma '(z^{[L]}) \cdot
2(a^{[L]} - y) \cdot (a^{[L]} - y)'
$

# $\frac{\partial Cost}{\partial w^{[L]}} =
a^{[L - 1]} \cdot
\sigma '(z^{[L]}) \cdot
2(a^{[L]} - y) \cdot 1
$

---

## $\partial a^{[L-1]}$:

# $\frac{\partial Cost}{\partial a^{[L-1]}} =
\frac{\partial  z^{[L]}}{\partial a^{[L-1]}}
\frac{\partial  a^{[L]}}{\partial z^{[L]}}
\frac{\partial  Cost}{\partial a^{[L]}}
$

# $\frac{\partial Cost}{\partial a^{[L-1]}} =
w^{[L]} \cdot
\sigma '(z^{[L]}) \cdot
2(a^{[L]} - y)
$

---

## $\partial b^{[L]}$:

# $\frac{\partial Cost}{\partial b^{[L]}} =
\frac{\partial  z^{[L]}}{\partial b^{[L]}}
\frac{\partial  a^{[L]}}{\partial z^{[L]}}
\frac{\partial  Cost}{\partial a^{[L]}}
$

# $\frac{\partial Cost}{\partial b^{[L]}} =
1 \cdot
\sigma '(z^{[L]}) \cdot
2(a^{[L]} - y)
$

# (Additional intuition) Backpropagation graph in depth:

$Cost \to \left[ A^{[L]} \to Z^{[L]} \to (W^{[L]}, A^{[L-1]}, b^{[L]}) \right] \to
\left[ Z^{[L-1]} \to (W^{[L-1]}, A^{[L-2]}, b^{[L-1]}) \right]
\to ... \to
\left[ Z^{[1]} \to (W^{[1]}, X, b^{[1]}) \right]
$

# How to derive the next level:

---

## $\partial W^{[L-1]}$:

# $\frac{\partial Cost}{\partial W^{[L-1]}} =
\frac{\partial  Z^{[L-1]}}{\partial W^{[L-1]}}
\frac{\partial  A^{[L-1]}}{\partial Z^{[L-1]}}
\frac{\partial  Z^{[L]}}{\partial A^{[L-1]}}
\frac{\partial  A^{[L]}}{\partial Z^{[L]}}
\frac{\partial  Cost}{\partial A^{[L]}}
$

# $\frac{\partial Cost}{\partial W^{[L-1]}} =
A^{[L-2]} \cdot
\sigma '(Z^{[L-1]}) \cdot
\left(
  W^{[L]} \cdot
  \sigma '(Z^{[L]}) \cdot
  2(A^{[L]} - y)
\right)
$

# $\frac{\partial Cost}{\partial W^{[L-1]}} =
A^{[L-2]} \cdot
\sigma '(Z^{[L-1]}) \cdot
\frac{\partial Cost}{\partial A^{[L-1]}}
$

---

## $\partial A^{[L-2]}$:

# $\frac{\partial Cost}{\partial A^{[L-2]}} =
\frac{\partial  Z^{[L-1]}}{\partial A^{[L-2]}}
\frac{\partial  A^{[L-1]}}{\partial Z^{[L-1]}}
\frac{\partial  Z^{[L]}}{\partial A^{[L-1]}}
\frac{\partial  A^{[L]}}{\partial Z^{[L]}}
\frac{\partial  Cost}{\partial A^{[L]}}
$

# $\frac{\partial Cost}{\partial A^{[L-2]}} =
W^{[L-1]} \cdot
\sigma '(Z^{[L-1]})
\cdot
\left(
   W^{[L]} \cdot
   \sigma '(Z^{[L]}) \cdot
   2(a^{[L]} - y)
\right)
$

# $\frac{\partial Cost}{\partial A^{[L-2]}} =
W^{[L-1]} \cdot
\sigma '(Z^{[L-1]})
\cdot
\frac{\partial Cost}{\partial A^{[L-1]}}
$

---

## $\partial b^{[L-1]}$:

# $\frac{\partial Cost}{\partial b^{[L-1]}} =
\frac{\partial  Z^{[L-1]}}{\partial b^{[L-1]}}
\frac{\partial  A^{[L-1]}}{\partial Z^{[L-1]}}
\frac{\partial  Z^{[L]}}{\partial A^{[L-1]}}
\frac{\partial  A^{[L]}}{\partial Z^{[L]}}
\frac{\partial  Cost}{\partial A^{[L]}}
$

# $\frac{\partial Cost}{\partial b^{[L-1]}} =
1 \cdot
\sigma '(Z^{[L-1]}) \cdot
W^{[L]} \cdot
\sigma '(Z^{[L]}) \cdot
2(a^{[L]} - y)
$

# $\frac{\partial Cost}{\partial b^{[L-1]}} =
\sigma '(Z^{[L-1]}) \cdot
\frac{\partial Cost}{\partial A^{[L-1]}}
$

# Next levels - same pattern applied recursively, e.g.:

$\frac{\partial Cost}{\partial W^{[L-2]}} =
\frac{\partial  Z^{[L-2]}}{\partial W^{[L-2]}}
\frac{\partial  A^{[L-2]}}{\partial Z^{[L-2]}}
\left(
  \frac{\partial  Z^{[L-1]}}{\partial A^{[L-2]}}
  \frac{\partial  A^{[L-1]}}{\partial Z^{[L-1]}}
  \left(
    \frac{\partial  Z^{[L]}}{\partial A^{[L-1]}}
    \frac{\partial  A^{[L]}}{\partial Z^{[L]}}
    \frac{\partial  Cost}{\partial A^{[L]}}
  \right)
\right)
$

$\frac{\partial Cost}{\partial W^{[L-2]}} =
A^{[L-3]} \cdot
\sigma '(Z^{[L-2]}) \cdot
\left(
  W^{[L-1]} \cdot
  \sigma '(Z^{[L-1]}) \cdot
  \left(
    W^{[L]} \cdot
    \sigma '(Z^{[L]}) \cdot
    2(a^{[L]} - y)
  \right)
\right)
$

$\frac{\partial Cost}{\partial W^{[L-2]}} =
A^{[L-3]} \cdot
\sigma '(Z^{[L-2]}) \cdot
\left(
  W^{[L-1]} \cdot
  \sigma '(Z^{[L-1]}) \cdot
  \frac{\partial Cost}{\partial A^{[L-1]}}
\right)
$

$\frac{\partial Cost}{\partial W^{[L-2]}} =
A^{[L-3]} \cdot
\sigma '(Z^{[L-2]}) \cdot
\frac{\partial Cost}{\partial A^{[L-2]}}
$

---

$\frac{\partial Cost}{\partial A^{[L-3]}} =
\frac{\partial  Z^{[L-2]}}{\partial A^{[L-3]}}
\frac{\partial  A^{[L-2]}}{\partial Z^{[L-2]}}
\left(
  \frac{\partial  Z^{[L-1]}}{\partial A^{[L-2]}}
  \frac{\partial  A^{[L-1]}}{\partial Z^{[L-1]}}
  \left(
    \frac{\partial  Z^{[L]}}{\partial A^{[L-1]}}
    \frac{\partial  A^{[L]}}{\partial Z^{[L]}}
    \frac{\partial  Cost}{\partial A^{[L]}}
  \right)
\right)
$

$\frac{\partial Cost}{\partial A^{[L-3]}} =
W^{[L-2]} \cdot
\sigma '(Z^{[L-2]}) \cdot
\left(
  W^{[L-1]} \cdot
  \sigma '(Z^{[L-1]}) \cdot
  \left(
    W^{[L]} \cdot
    \sigma '(Z^{[L]}) \cdot
    2(a^{[L]} - y)
  \right)
\right)
$

$\frac{\partial Cost}{\partial A^{[L-3]}} =
W^{[L-2]} \cdot
\sigma '(Z^{[L-2]}) \cdot
\left(
W^{[L-1]} \cdot
\sigma '(Z^{[L-1]}) \cdot \frac{\partial Cost}{\partial A^{[L-1]}}
\right)
$

$\frac{\partial Cost}{\partial A^{[L-3]}} =
W^{[L-2]} \cdot
\sigma '(Z^{[L-2]}) \cdot \frac{\partial Cost}{\partial A^{[L-2]}}
$

---

$\frac{\partial Cost}{\partial b^{[L-2]}} =
\frac{\partial  Z^{[L-2]}}{\partial b^{[L-2]}}
\frac{\partial  A^{[L-2]}}{\partial Z^{[L-2]}}
\left(
  \frac{\partial  Z^{[L-1]}}{\partial A^{[L-2]}}
  \frac{\partial  A^{[L-1]}}{\partial Z^{[L-1]}}
  \left(
    \frac{\partial  Z^{[L]}}{\partial A^{[L-1]}}
    \frac{\partial  A^{[L]}}{\partial Z^{[L]}}
    \frac{\partial  Cost}{\partial A^{[L]}}
  \right)
\right)
$

$\frac{\partial Cost}{\partial b^{[L-2]}} =
1 \cdot \sigma '(Z^{[L-2]}) \cdot
\left(
  W^{[L-1]} \cdot
  \sigma '(Z^{[L-1]}) \cdot
  \left(
    W^{[L]} \cdot
    \sigma '(Z^{[L]}) \cdot
    2(a^{[L]} - y)
  \right)
\right)
$

$\frac{\partial Cost}{\partial b^{[L-2]}} =
\sigma '(Z^{[L-2]}) \cdot
\left(
W^{[L-1]} \cdot
\sigma '(Z^{[L-1]}) \cdot \frac{\partial Cost}{\partial A^{[L-1]}}
\right)
$

$\frac{\partial Cost}{\partial b^{[L-2]}} =
\sigma '(Z^{[L-2]}) \cdot \frac{\partial Cost}{\partial A^{[L-2]}}
$