# Zajęcie 2: Metoda gradientu prostego. Stosowanie do algorytmu wstecznej propagacji błędu

https://towardsdatascience.com/lets-code-a-neural-network-in-plain-numpy-ae7e74410795
https://github.com/SkalskiP/ILearnDeepLearning.py/blob/master/01_mysteries_of_neural_networks/03_numpy_neural_net/Numpy%20deep%20neural%20network.ipynb

## Wizualizacja metody gradientu z użyciem tensorflow

### Auxiliary function

In [1]:
def makeIndexOfLength(index, length):
    """Return ``index`` as a string, left-padded with zeros to ``length`` characters.

    A value whose string form already has ``length`` characters or more is
    returned unchanged (it is never truncated).
    """
    return str(index).rjust(length, '0')

### Imports

In [2]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

### Example 1

In [None]:
# Starting point of the descent.
x = tf.Variable(2.0)
y = tf.Variable(5.0)


def target_fn(x, y):
    """Objective to minimise: f(x, y) = 3*x^2 + 2*y^2."""
    return 3 * x * x + 2 * y * y


# Plain stochastic gradient descent with learning rate 0.1.
sgd = tf.keras.optimizers.SGD(0.1)

In [None]:
""" derivative_tar_on_x = 6x = 12
    derivative_tar_on_y = 4y = 20 """
# Record the forward pass only. x and y are tf.Variable objects, which the tape
# watches automatically, so explicit g.watch() calls are not needed.
# The tape stays persistent because a later cell still refers to `g`.
with tf.GradientTape(persistent=True) as g:
    t = target_fn(x, y)

# Differentiate after leaving the context: calling gradient() inside a persistent
# tape's context also records the gradient computation itself, which is wasteful.
gradients = g.gradient(t, sources=[x, y])
print(gradients)  # 12, 20

In [None]:
# One SGD step with the gradients computed above:
# each variable moves by -learning_rate * gradient (learning rate is 0.1).
sgd.apply_gradients(zip(gradients, [x, y]))
print(x) # 2 - 0.1 * 12 = 0.8
print(y) # 5 - 0.1 * 20 = 3.0

In [None]:
# Trajectory of the descent, consumed by the 3D plot in the next cell.
# Index 0 is the state before the first step taken by this loop.
x_list = [x.numpy()]
y_list = [y.numpy()]
cost_list = [target_fn(x, y).numpy()]

# Iterate until the objective is (almost) at its minimum value of 0.
while abs(target_fn(x, y)) >= 0.01:
    # A fresh, non-persistent tape per step: it is used for exactly one gradient call.
    with tf.GradientTape() as tp:
        t = target_fn(x, y)
    # Differentiate with the tape that recorded `t` (not the stale tape `g`).
    gradients = tp.gradient(t, sources=[x, y])
    sgd.apply_gradients(zip(gradients, [x, y]))

    x_list.append(x.numpy())
    y_list.append(y.numpy())
    cost_list.append(target_fn(x, y).numpy())

In [None]:
# 3D cost figure: one PNG frame per camera angle, showing the surface of
# target_fn and the part of the descent trajectory walked so far.
# Expects x_list, y_list and cost_list to hold the recorded trajectory, and
# OUTPUT_DIR (defined in the "Settings" cell further down) to name an existing folder.
plt.style.use('dark_background')

# The surface does not depend on the camera angle, so build it once.
x3D, y3D = np.meshgrid(np.linspace(-1, 1, 100), np.linspace(-1, 1, 100))  # parameter space
cost3D = target_fn(x3D, y3D)  # the objective itself is the plotted surface

for angle in range(0, 180):
    fig = plt.figure(figsize=(8,8))
    # add_subplot attaches the 3D axes to the figure; a bare Axes3D(fig) is no
    # longer added automatically in current Matplotlib and would save empty frames.
    ax = fig.add_subplot(111, projection='3d')
    ax.plot_surface(x3D, y3D, cost3D, rstride=1, cstride=1, cmap=plt.get_cmap('rainbow'), alpha=0.6)
    ax.scatter(x_list[0], y_list[0], zs=cost_list[0], s=300, c='r')  # initial parameter place
    ax.set_xlabel('x'); ax.set_ylabel('y')
    ax.plot(x_list[:angle], y_list[:angle], zs=cost_list[:angle], zdir='z', c='r', lw=3)    # plot 3D gradient descent
    ax.view_init(30 + (90 - angle)/5, 45 + angle*2)
    plt.savefig("./" + OUTPUT_DIR + "/" + makeIndexOfLength(angle, 3) + ".png")
    plt.close()

### Example 2

### Settings

In [None]:
# learning rate passed to the SGD optimizer
LR = 0.04
# parameters a and b of the real function (the values the fit should recover)
REAL_PARAMS = [1.2, 2.5]
# starting point (initial a and b) for gradient descent
INIT_PARAMS = [-1, -1.5]
# output directory for the saved PNG frames (the folder must be created on the drive)
OUTPUT_DIR = "gradient_descent"

### Performing the simulation

In [None]:
# Inputs of the regression problem.
# (Alternative with a denser sample: x_ = np.linspace(-1, 1, 200))
x_ = [-1, 1]
x = np.array(x_, dtype=np.float32)


def y_fun(a, b):
    """NumPy version of the model y = sin(b * cos(a * x)).

    Used to generate the data and, later, to evaluate the cost surface.
    """
    return np.sin(b * np.cos(a * x))


def tf_y_fun(a, b):
    """TensorFlow version of the same model, differentiable w.r.t. a and b."""
    return tf.sin(b * tf.cos(a * x))


# Noisy observations generated with the true parameters.
noise = np.random.randn(len(x)) / 10
# Cast to float32 so the targets match the dtype of the TensorFlow predictions.
y = (y_fun(*REAL_PARAMS) + noise).astype(np.float32)

# Trainable parameters, started away from the true values.
a, b = [tf.Variable(initial_value=p, dtype=tf.float32) for p in INIT_PARAMS]
optimizer = tf.optimizers.SGD(LR)

# Eager (TF2) training loop; replaces the TF1 Session / train_op pattern.
a_list, b_list, cost_list = [], [], []
for s in range(180):
    with tf.GradientTape() as g:
        pred = tf_y_fun(a, b)
        mse = tf.reduce_mean(tf.square(y - pred))
    # Record the state before the step, so index 0 is the starting point.
    a_list.append(a.numpy()); b_list.append(b.numpy()); cost_list.append(mse.numpy())
    gradients = g.gradient(mse, [a, b])
    optimizer.apply_gradients(zip(gradients, [a, b]))

In [None]:
x

###  Creates visualization

In [None]:
# 3D cost figure: one PNG frame per camera angle, showing the MSE surface over
# the (a, b) parameter space and the part of the descent walked so far.
plt.style.use('dark_background')

# The cost surface does not depend on the camera angle, so compute it once
# instead of re-evaluating 10 000 grid points for each of the 180 frames.
a3D, b3D = np.meshgrid(np.linspace(-5, 5, 100), np.linspace(-5, 5, 100))  # parameter space
cost3D = np.array([np.mean(np.square(y_fun(a_, b_) - y)) for a_, b_ in zip(a3D.flatten(), b3D.flatten())]).reshape(a3D.shape)

for angle in range(0, 180):
    fig = plt.figure(figsize=(8,8))
    # add_subplot attaches the 3D axes to the figure; a bare Axes3D(fig) is no
    # longer added automatically in current Matplotlib and would save empty frames.
    ax = fig.add_subplot(111, projection='3d')
    ax.plot_surface(a3D, b3D, cost3D, rstride=1, cstride=1, cmap=plt.get_cmap('rainbow'), alpha=0.6)
    ax.scatter(a_list[0], b_list[0], zs=cost_list[0], s=300, c='r')  # initial parameter place
    ax.set_xlabel('a'); ax.set_ylabel('b')
    ax.plot(a_list[:angle], b_list[:angle], zs=cost_list[:angle], zdir='z', c='r', lw=3)    # plot 3D gradient descent
    ax.view_init(30 + (90 - angle)/5, 45 + angle*2)
    plt.savefig("./" + OUTPUT_DIR + "/" + makeIndexOfLength(angle, 3) + ".png")
    plt.close()

### Expected results

Go to OUTPUT_DIR, which should now be filled with subsequent keyframes of our animation. All the resulting images look more or less like this.

<img src="./final_visualisations/frames.png" alt="All frames">

Now all you need to do is enter OUTPUT_DIR and use ImageMagick to create the final GIF with one command.

```bash
convert -delay 10 -loop 0 *.png keras_class_boundaries.gif
```

<img src="./final_visualisations/gradient_descent.gif" alt="Gradient descent">

## Implementacja "od zera" (użycie numpy)

In [None]:
import numpy as np

## Architektura

In [None]:
# One dict per layer, in forward order:
#   "input_dim"  - number of inputs the layer receives,
#   "output_dim" - number of units in the layer,
#   "activation" - name of the activation function ("relu" or "sigmoid").
# Each layer's input_dim equals the previous layer's output_dim.
nn_architecture = [
    {"input_dim": 2, "output_dim": 4, "activation": "relu"},
    {"input_dim": 4, "output_dim": 6, "activation": "relu"},
    {"input_dim": 6, "output_dim": 6, "activation": "relu"},
    {"input_dim": 6, "output_dim": 4, "activation": "relu"},
    {"input_dim": 4, "output_dim": 1, "activation": "sigmoid"},
]

In [None]:
def init_layers(nn_architecture, seed = 99):
    """Create randomly initialised weights and biases for every layer.

    Args:
        nn_architecture: list of layer dicts with "input_dim" and "output_dim".
        seed: value used to seed NumPy's global random generator, so repeated
            calls with the same seed return identical parameters.

    Returns:
        dict mapping "W<l>" to an array of shape (output_dim, input_dim) and
        "b<l>" to an array of shape (output_dim, 1), with layers numbered from 1.
    """
    np.random.seed(seed)
    params_values = {}

    for layer_idx, layer in enumerate(nn_architecture, start=1):
        layer_input_size = layer["input_dim"]
        layer_output_size = layer["output_dim"]

        # Small values (scaled by 0.1) keep the initial activations moderate.
        params_values['W' + str(layer_idx)] = np.random.randn(
            layer_output_size, layer_input_size) * 0.1
        params_values['b' + str(layer_idx)] = np.random.randn(
            layer_output_size, 1) * 0.1

    return params_values

## Activation functions

In [None]:
def sigmoid(Z):
    """Element-wise logistic function 1 / (1 + exp(-Z))."""
    denominator = 1 + np.exp(-Z)
    return 1 / denominator

def relu(Z):
    """Element-wise rectified linear unit: max(0, Z)."""
    floor = 0  # values below this are replaced by it
    return np.maximum(floor, Z)

def sigmoid_backward(dA, Z):
    """Backward pass through the sigmoid: dZ = dA * s * (1 - s), with s = sigmoid(Z)."""
    activation = sigmoid(Z)
    scaled = dA * activation
    return scaled * (1 - activation)

def relu_backward(dA, Z):
    """Backward pass through ReLU.

    Returns a copy of ``dA`` in which every entry whose pre-activation ``Z``
    is not positive has been set to zero; ``dA`` itself is left untouched.
    """
    inactive = Z <= 0
    dZ = np.array(dA, copy=True)
    dZ[inactive] = 0
    return dZ

## Forward propagation

In [None]:
def single_layer_forward_propagation(A_prev, W_curr, b_curr, activation="relu"):
    """Compute one layer's forward pass.

    Args:
        A_prev: activations of the previous layer (the network input for layer 1).
        W_curr: weight matrix of this layer.
        b_curr: bias of this layer, added to ``np.dot(W_curr, A_prev)``.
        activation: "relu" or "sigmoid".

    Returns:
        Tuple ``(A_curr, Z_curr)``: the activated output and the pre-activation
        ``Z_curr = W_curr . A_prev + b_curr``.

    Raises:
        Exception: if ``activation`` is not a supported name.
    """
    # Validate the activation first so an unsupported name fails before any work is done.
    if activation == "relu":
        activation_func = relu
    elif activation == "sigmoid":
        activation_func = sigmoid
    else:
        raise Exception('Non-supported activation function')

    Z_curr = np.dot(W_curr, A_prev) + b_curr
    return activation_func(Z_curr), Z_curr

In [None]:
def full_forward_propagation(X, params_values, nn_architecture):
    """Run the input through every layer in order.

    Returns:
        Tuple ``(A_last, memory)``. ``memory`` stores, for layer number ``l``
        (counted from 1), the layer input under "A<l-1>" and the layer's
        pre-activation under "Z<l>"; the backward pass reads these values.
    """
    memory = {}
    activation = X

    for layer_number, layer in enumerate(nn_architecture, start=1):
        layer_input = activation

        activation_name = layer["activation"]
        weights = params_values["W" + str(layer_number)]
        bias = params_values["b" + str(layer_number)]
        activation, pre_activation = single_layer_forward_propagation(
            layer_input, weights, bias, activation_name)

        memory["A" + str(layer_number - 1)] = layer_input
        memory["Z" + str(layer_number)] = pre_activation

    return activation, memory

# Loss function (binary cross-entropy)

In [None]:
def get_cost_value(Y_hat, Y):
    """Return the mean binary cross-entropy between predictions and targets.

    Args:
        Y_hat: predicted probabilities; the number of samples is ``Y_hat.shape[1]``.
        Y: targets with the same layout as ``Y_hat``.

    Returns:
        The cost with singleton dimensions squeezed away.
    """
    Y_hat = np.asarray(Y_hat, dtype=float)
    m = Y_hat.shape[1]
    # Keep predictions strictly inside (0, 1): a saturated prediction would
    # otherwise produce log(0) = -inf and turn the cost into NaN (0 * -inf).
    eps = np.finfo(float).eps
    Y_hat = np.clip(Y_hat, eps, 1 - eps)
    cost = -1 / m * (np.dot(Y, np.log(Y_hat).T) + np.dot(1 - Y, np.log(1 - Y_hat).T))
    return np.squeeze(cost)

def convert_prob_into_class(probs):
    """Threshold probabilities at 0.5 and return the resulting class labels.

    Entries greater than 0.5 become 1, entries less than or equal to 0.5
    become 0. The input array is not modified; a copy is returned.
    """
    classes = np.copy(probs)
    above = classes > 0.5
    not_above = classes <= 0.5
    classes[above] = 1
    classes[not_above] = 0
    return classes

def get_accuracy_value(Y_hat, Y):
    """Return the fraction of samples (columns) whose thresholded prediction equals the target."""
    predicted_classes = convert_prob_into_class(Y_hat)
    matches = predicted_classes == Y
    # A column counts as correct only when every row in it matches.
    column_correct = matches.all(axis=0)
    return column_correct.mean()

## Backward propagation

$$\boldsymbol{dW}^{[l]} = \frac{\partial L }{\partial \boldsymbol{W}^{[l]}} = \frac{1}{m} \boldsymbol{dZ}^{[l]} \boldsymbol{A}^{[l-1] T}$$



$$\boldsymbol{db}^{[l]} = \frac{\partial L }{\partial \boldsymbol{b}^{[l]}} = \frac{1}{m} \sum_{i = 1}^{m} \boldsymbol{dZ}^{[l](i)}$$

$$\boldsymbol{dA}^{[l-1]} = \frac{\partial L }{\partial \boldsymbol{A}^{[l-1]}} = \boldsymbol{W}^{[l] T} \boldsymbol{dZ}^{[l]}$$

$$\boldsymbol{dZ}^{[l]} = \boldsymbol{dA}^{[l]} * g'(\boldsymbol{Z}^{[l]})$$

In [None]:
def single_layer_backward_propagation(dA_curr, W_curr, b_curr, Z_curr, A_prev, activation="relu"):
    """Propagate the gradient backwards through a single layer.

    Args:
        dA_curr: gradient of the loss with respect to this layer's activation.
        W_curr: weight matrix of this layer.
        b_curr: bias of this layer (not used in the computation).
        Z_curr: pre-activation values saved during the forward pass.
        A_prev: input of this layer saved during the forward pass.
        activation: "relu" or "sigmoid".

    Returns:
        Tuple ``(dA_prev, dW_curr, db_curr)``: gradients with respect to the
        layer input, the weights and the bias.

    Raises:
        Exception: if ``activation`` is not a supported name.
    """
    # Number of samples, taken from the second axis of the layer input.
    m = A_prev.shape[1]

    if activation not in ("relu", "sigmoid"):
        raise Exception('Non-supported activation function')
    backward_activation_func = relu_backward if activation == "relu" else sigmoid_backward

    dZ_curr = backward_activation_func(dA_curr, Z_curr)
    dA_prev = np.dot(W_curr.T, dZ_curr)
    dW_curr = np.dot(dZ_curr, A_prev.T) / m
    db_curr = np.sum(dZ_curr, axis=1, keepdims=True) / m

    return dA_prev, dW_curr, db_curr

$$\frac{\partial L }{\partial \boldsymbol{\hat{Y}}} = -(\frac{\boldsymbol{Y}}{\boldsymbol{\hat{Y}}}- \frac{1-\boldsymbol{Y}}{1-\boldsymbol{\hat{Y}}})$$

In [None]:
def full_backward_propagation(Y_hat, Y, memory, params_values, nn_architecture):
    """Compute the gradients of the binary cross-entropy loss for every layer.

    Args:
        Y_hat: network output from the forward pass.
        Y: targets; reshaped to the shape of ``Y_hat``.
        memory: dict of saved "A<l-1>" and "Z<l>" values from the forward pass.
        params_values: dict with the current "W<l>" and "b<l>" parameters.
        nn_architecture: list of layer dicts (the "activation" key is read).

    Returns:
        dict mapping "dW<l>" and "db<l>" to the gradients of layer ``l``.
    """
    grads_values = {}

    # Keep predictions strictly inside (0, 1): a saturated output would
    # otherwise divide by zero and fill the gradients with inf/NaN.
    eps = np.finfo(float).eps
    Y_hat = np.clip(np.asarray(Y_hat, dtype=float), eps, 1 - eps)
    Y = np.reshape(Y, Y_hat.shape)

    # Derivative of the loss with respect to the network output.
    dA_prev = - (np.divide(Y, Y_hat) - np.divide(1 - Y, 1 - Y_hat))

    # Walk the layers from last to first.
    for layer_idx_prev, layer in reversed(list(enumerate(nn_architecture))):
        layer_idx_curr = layer_idx_prev + 1
        activ_function_curr = layer["activation"]

        dA_curr = dA_prev

        A_prev = memory["A" + str(layer_idx_prev)]
        Z_curr = memory["Z" + str(layer_idx_curr)]
        W_curr = params_values["W" + str(layer_idx_curr)]
        b_curr = params_values["b" + str(layer_idx_curr)]

        dA_prev, dW_curr, db_curr = single_layer_backward_propagation(
            dA_curr, W_curr, b_curr, Z_curr, A_prev, activ_function_curr)

        grads_values["dW" + str(layer_idx_curr)] = dW_curr
        grads_values["db" + str(layer_idx_curr)] = db_curr

    return grads_values

## Updating the parameters (Gradient descent)

In [None]:
def update(params_values, grads_values, nn_architecture, learning_rate):
    """Apply one gradient-descent step to every layer's parameters.

    Args:
        params_values: dict with "W<l>" and "b<l>" arrays; updated in place.
        grads_values: dict with the matching "dW<l>" and "db<l>" gradients.
        nn_architecture: list of layer dicts; only its length is used.
        learning_rate: step size multiplying each gradient.

    Returns:
        The same ``params_values`` dict, after the update.
    """
    # Layers are keyed from 1 ("W1", "b1", ...), so the index must start at 1.
    for layer_idx in range(1, len(nn_architecture) + 1):
        params_values["W" + str(layer_idx)] -= learning_rate * grads_values["dW" + str(layer_idx)]
        params_values["b" + str(layer_idx)] -= learning_rate * grads_values["db" + str(layer_idx)]

    return params_values

## Training (putting things together): Error backpropagation algorithm

In [None]:
def train(X, Y, nn_architecture, epochs, learning_rate):
    """Train the network with full-batch gradient descent.

    Every epoch runs a forward pass, records cost and accuracy, runs the
    backward pass and updates the parameters. Parameters are initialised
    with seed 2.

    Returns:
        Tuple ``(params_values, cost_history, accuracy_history)``; both
        histories contain one entry per epoch.
    """
    params_values = init_layers(nn_architecture, 2)
    cost_history, accuracy_history = [], []

    for _ in range(epochs):
        Y_hat, cache = full_forward_propagation(X, params_values, nn_architecture)

        cost_history.append(get_cost_value(Y_hat, Y))
        accuracy_history.append(get_accuracy_value(Y_hat, Y))

        grads_values = full_backward_propagation(Y_hat, Y, cache, params_values, nn_architecture)
        params_values = update(params_values, grads_values, nn_architecture, learning_rate)

    return params_values, cost_history, accuracy_history

## Example

In [None]:
# The network works with samples along axis 1: each layer computes
# np.dot(W, A_prev) with W of shape (output_dim, input_dim), so the input must be
# (n_features, n_samples) and the targets (1, n_samples). Hence the transposes.
x = np.array([[1,2],[3,4],[2,3],[4,5]]).T   # shape (2, 4): 4 samples with 2 features
y = np.array([[3],[4],[5],[3]]).T           # shape (1, 4)
# NOTE(review): the last layer is a sigmoid trained with binary cross-entropy, so the
# targets should presumably be 0/1 labels rather than 3/4/5 - confirm the intended data.

In [None]:
simple_model_1 = train(x,y, nn_architecture, 50, 0.01)

# Backpropagation with keras

https://colab.research.google.com/drive/1N2vNzxZrj5U3-51eYJobhNcBenlP00Pw

In [None]:
! pip install keras

In [None]:
! pip install tensorflow

In [None]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense

In [None]:
# Toy regression data: every target is twice its input (y = 2x).
x = [[1],[2],[3],[4]]
y = [[2],[4],[6],[8]]

In [None]:
# Smallest possible model: a single Dense unit with linear activation
# taking one input feature.
model = Sequential()
model.add(Dense(1, activation='linear', input_shape=(1,)))
model.summary()

In [None]:
model.weights

In [None]:
model.get_weights()

In [None]:
from tensorflow.keras.optimizers import SGD
# Plain SGD with learning rate 0.01, minimising the mean squared error.
# NOTE(review): 'accuracy' is not an informative metric for a regression target - confirm it is wanted.
s= SGD(learning_rate = 0.01)
model.compile(optimizer=s,loss='mean_squared_error',metrics=['accuracy'])

In [None]:
from copy import deepcopy

In [None]:
w = deepcopy(model.get_weights())

In [None]:
w

In [None]:
w = [[[-0.07377076]], [0.]]
w

In [None]:
w[0]

In [None]:
def feed_forward(inputs, outputs, weights):
    """Return the per-sample squared error of the linear model.

    ``weights[0]`` is the multiplicative weight and ``weights[1]`` the bias;
    the prediction is ``np.dot(inputs, weights[0]) + weights[1]``.
    The errors are returned element-wise (they are not averaged here).
    """
    kernel, bias = weights[0], weights[1]
    prediction = np.dot(inputs, kernel) + bias
    return np.square(prediction - outputs)

In [None]:
def update_weights(inputs, outputs, weights, epochs, learning_rate=0.01, delta=0.0001):
    """Run gradient descent using finite-difference (numerical) gradients.

    In every epoch each entry of ``weights`` is perturbed by ``delta`` in turn,
    the resulting change of the mean squared error gives a gradient estimate,
    and all entries are then updated together.

    Args:
        inputs: input samples passed to ``feed_forward``.
        outputs: target values passed to ``feed_forward``.
        weights: list ``[weight, bias]``; it is never modified.
        epochs: number of update steps; with 0 a copy of ``weights`` is returned.
        learning_rate: step size of each update.
        delta: size of the perturbation used for the finite difference.

    Returns:
        A new list with the updated weights.
    """
    # Work on a copy so the caller's list is left untouched
    # (and so that epochs == 0 has something to return).
    weights = deepcopy(weights)

    for _ in range(epochs):
        org_loss = feed_forward(inputs, outputs, weights)
        updated = deepcopy(weights)

        for ix in range(len(weights)):
            # Perturb exactly one entry, leaving the others at their current value.
            perturbed = deepcopy(weights)
            perturbed[-(ix + 1)] += delta
            loss = feed_forward(inputs, outputs, perturbed)
            # (org_loss - loss) / delta approximates the negative gradient of the
            # mean loss, so adding it moves the entry downhill.
            del_loss = np.sum(org_loss - loss) / (delta * len(inputs))
            updated[-(ix + 1)] += del_loss * learning_rate

        weights = updated

    return weights

In [None]:
w = [2000, 0]
w[-1]

In [None]:
import numpy as np
from copy import deepcopy
update_weights(x,y,w,1)

In [None]:
# Weight and bias after 1, 2, ..., 100 epochs, starting from w.
# Each iteration continues from the previous state with a single extra epoch,
# instead of retraining from scratch for k+1 epochs every time.
w_val = []
b_val = []
current_weights = w
for k in range(100):
    current_weights = update_weights(x, y, current_weights, 1)
    w_new, b_new = current_weights
    w_val.append(w_new)
    b_val.append(b_new)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# NOTE(review): the title states an initial weight of 1.5, while the cell above
# sets w = [2000, 0] - make sure the title matches the run being plotted.
plt.plot(w_val)
plt.title('Weight value over different epochs when initial weight is 1.5')
plt.xlabel('epochs')
plt.ylabel('weight value')
# grid() expects a boolean; the string 'off' is truthy and would switch the grid on.
plt.grid(False)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# NOTE(review): w_val must come from a run started at the initial weight named
# in the title (0.01) - re-run the training cells with that value first.
plt.plot(w_val)
plt.title('Weight value over different epochs when initial weight is 0.01')
plt.xlabel('epochs')
plt.ylabel('weight value')
# grid() expects a boolean; the string 'off' is truthy and would switch the grid on.
plt.grid(False)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# NOTE(review): w_val must come from a run started at the initial weight named
# in the title (0.00001) - re-run the training cells with that value first.
plt.plot(w_val)
plt.title('Weight value over different epochs when initial weight is 0.00001')
plt.xlabel('epochs')
plt.ylabel('weight value')
# grid() expects a boolean; the string 'off' is truthy and would switch the grid on.
plt.grid(False)

In [None]:
w = list(model.get_weights().copy())
w

In [None]:
update_weights(x,y,w,100)

In [None]:
model.fit(np.array(x), np.array(y), epochs=100, batch_size = 4, verbose=1)

In [None]:
model.get_weights()