# Neural Network

## Import needed libraries:
* numpy
* tqdm
* sklearn.metrics

In [None]:
import numpy as np
from tqdm import tqdm
from sklearn.metrics import log_loss

### sigmoid():
The sigmoid function.

### d_sigmoid():
The derivative of sigmoid function.

### leakyReLu():
The leaky relu function.

### d_leakyReLu():
The derivative of leaky relu function.

In [None]:
def sigmoid(Z):
    """Return the logistic function 1 / (1 + exp(-Z)), element-wise.

    Z is first limited to [-600, 600] so that the exponential stays within
    range and does not overflow.
    """
    bounded = np.clip(Z, -600, 600)
    denominator = 1 + np.exp(-bounded)
    return 1.0 / denominator
    
def d_sigmoid(Z):
    """Return the derivative of the sigmoid at Z: sigmoid(Z) * (1 - sigmoid(Z))."""
    activation = sigmoid(Z)
    complement = 1 - activation
    return activation * complement

def leakyReLu(Z, alpha=0.01):
    """Return the leaky ReLU of Z, element-wise, as a new array.

    Negative entries are scaled by ``alpha``; all other entries pass
    through unchanged.

    Parameters:
        Z: array-like of pre-activations.
        alpha: slope applied to negative inputs (default 0.01).

    Returns:
        A new array with the same shape as Z.  The input is never modified:
        callers cache Z for back-propagation, so overwriting it in place
        would corrupt the stored pre-activations.
    """
    Z = np.asarray(Z)
    # alpha * Z is floating point, so integer inputs are not truncated.
    return np.where(Z < 0, alpha * Z, Z)

def d_leakyReLu(Z, alpha=0.01):
    """Return the derivative of the leaky ReLU at Z, element-wise.

    Parameters:
        Z: array-like of pre-activations.
        alpha: slope of the negative branch (default 0.01).

    Returns:
        A new float array shaped like Z holding ``alpha`` where Z < 0 and
        1 elsewhere.  Z itself is left untouched.
    """
    Z = np.asarray(Z)
    # Decide both branches from the original signs in a single pass.
    # Writing alpha into the negative slots first and then testing "> 0"
    # would match those slots again and turn every gradient into 1.
    return np.where(Z < 0, alpha, 1.0)

## Neural Network Model

## <span style="color:green">class</span>  <span style="color:blue">Layer</span>:

### forward_prop():
Implements the forward propagation for the current layer, using its weights, the bias term and the given input vector.

### back_prop():
Implements the back propagation(calculating the gradients) for the current layer using the upper layer's parameters and the given input vector.
<br />
Also, _lambda parameter is passed for L2 regularization. 

In [None]:
class Layer:
    """A fully-connected layer: weights W, bias b and an activation function.

    Attributes set by the methods:
        W, b:    parameters, shapes (units, weights) and (units, 1).
        Z, A:    pre-activation and activation cached by forward_prop().
        dZ, dW, db: gradients computed by back_prop().
    """

    def __init__(self, units, weights, activation='sigmoid'):
        """Create the layer.

        Parameters:
            units: number of neurons in this layer (rows of W).
            weights: number of inputs per neuron (columns of W).
            activation: 'sigmoid' selects sigmoid/d_sigmoid; any other
                value selects leakyReLu/d_leakyReLu.
        """
        # NOTE: this reseeds NumPy's global generator on every construction.
        # Initialisation is reproducible, but two layers with the same shape
        # start from identical weights and the caller's random stream is reset.
        np.random.seed(42)
        # Scale the standard-normal draws by sqrt(1 / (weights - 1));
        # fall back to 0.01 when that would divide by zero.
        initialization_factor = np.sqrt(1/(weights-1)) if weights > 1 else 0.01
        self.W = np.random.randn(units, weights) * initialization_factor
        self.b = np.zeros((units, 1))
        if activation == 'sigmoid':
            self.activation = sigmoid
            self.d_activation = d_sigmoid
        else:
            self.activation = leakyReLu
            self.d_activation = d_leakyReLu

    def forward_prop(self, A_previous):
        """Compute and cache Z = W . A_previous + b and A = activation(Z).

        Parameters:
            A_previous: input to this layer, one column per example.

        Returns:
            The activation A.
        """
        self.Z = np.dot(self.W, A_previous) + self.b
        # Hand the activation a copy so that an activation which works in
        # place cannot overwrite the cached Z (needed again in back_prop)
        # or make A and Z the same object.
        self.A = self.activation(self.Z.copy())
        return self.A

    def back_prop(self, upper_layer, A_previous, m, learning_rate, _lambda):
        """Compute this layer's gradients and apply one gradient-descent step.

        Parameters:
            upper_layer: the next layer towards the output; its ``W`` and
                ``dZ`` are read.
            A_previous: the input this layer received in the forward pass.
            m: number of examples used to average the gradients.
            learning_rate: step size of the update.
            _lambda: L2 regularization strength.
        """
        regularization_term = _lambda * self.W

        # compute gradients (d_activation gets a copy for the same reason as
        # in forward_prop: the cached Z must survive the call)
        self.dZ = upper_layer.W.T.dot(upper_layer.dZ) * self.d_activation(self.Z.copy())
        # The L2 penalty contributes +lambda * W to the gradient, so the
        # descent step below shrinks the weights instead of inflating them.
        self.dW = (1/m) * (self.dZ.dot(A_previous.T) + regularization_term)
        self.db = (1/m) * np.sum(self.dZ, axis=1, keepdims=True)
        # update weights
        self.W -= learning_rate*self.dW
        self.b -= learning_rate*self.db

## <span style="color:green">class</span>  <span style="color:blue">NeuralNetwork</span>:

### input_layer():
The input of the model:
* X: The training vector. Should be passed as X.shape=(m,n)
* y: The corresponding labels. Should be passed as y.shape=(m,1)

### add_layer():
Call for adding a new hidden layer or the output layer.
<br />
The input is an instance of <span style="color:green">class</span>  <span style="color:blue">Layer</span>.

### __forward_prop():
* Implements the forward propagation for the network over all layers and returns the y_hat-predictions of the training set.
<br />
* Calls **Layer.forward_prop()** for each layer.

### __back_prop():
* Implements the back propagation for the network over all layers, computes the gradients for each layer's weights and bias term and updates them using gradient descent with the given learning_rate and _lambda for L2 regularization.
<br />
* Calls **Layer.back_prop()** for each hidden layer.
<br />
* The gradients update for the output_layer is implemented in the __back_prop(), because of its different derivative computation.

### fit() :
Call to train the model. 
<br />
* If the early_stopping argument is True, the algorithm will stop training the model if the mean dev cost is increasing for 5 consecutive epochs or if it reaches the limit of the given epochs argument.
* If the early_stopping argument is False, the algorithm will iterate over the training examples for the given number of epochs.

### predict():
Predicts the output (0 or 1) of a given dataset X.

### accuracy():
Computes the accuracy of the model on a given dataset X and its labels y.

In [None]:
class NeuralNetwork:
    """Feed-forward network trained with mini-batch gradient descent.

    Layers are applied in the order they were added; the last one added is
    the output layer.  The output error is taken as ``y_hat - y`` and
    ``predict`` thresholds the output at 0.5, i.e. the model is written for
    binary labels.

    Attributes:
        layers: the layers, first hidden layer first.
        J:      dev-set cost appended after every epoch by fit().
        epochs: number of epochs completed by the last fit() call.
    """

    def __init__(self):
        self.layers = []
        self.J = np.array([])

    def input_layer(self, X, y):
        """Register the training data.

        Parameters:
            X: training examples, shape (m, n).  Stored transposed so that
               each column is one example.
            y: the labels, one per example; a flat (m,) vector and a
               column (m, 1) are both accepted.
        """
        self.X = X.T
        self.y = y
        self.m, self.n = X.shape

    def add_layer(self, layer):
        """Append a hidden layer or the output layer to the network."""
        self.layers.append(layer)

    def __forward_prop(self, X, Y=None):
        """Run X (one column per example) through every layer; return y_hat.

        Y is accepted for call compatibility and is not used.
        """
        activations = X
        for layer in self.layers:
            activations = layer.forward_prop(activations)
        return activations

    def __back_prop(self, X, y, y_hat, learning_rate, _lambda):
        """Back-propagate one mini-batch and update every layer.

        The output layer is handled here (its error is y_hat - y); the
        hidden layers are updated through Layer.back_prop().

        NOTE(review): each layer is updated before the layer below reads
        its weights, so lower layers see already-updated upper weights.
        Left unchanged because Layer.back_prop() both computes and applies
        the update.
        """
        batch_size = X.shape[1]
        output = self.layers[-1]

        # A column of labels (batch, 1) would broadcast against the
        # (1, batch) predictions into a (batch, batch) matrix; lay it out as
        # a row instead.
        y = np.asarray(y)
        if y.ndim == 2 and y.shape == (batch_size, 1):
            y = y.T

        A_previous = X if len(self.layers) == 1 else self.layers[-2].A

        # update output layer; gradients are averaged over the mini-batch
        # and the L2 penalty adds +lambda * W, matching the hidden layers.
        output.dZ = y_hat - y
        output.dW = (1 / batch_size) * (output.dZ.dot(A_previous.T) + _lambda * output.W)
        output.db = (1 / batch_size) * np.sum(output.dZ, axis=1, keepdims=True)

        output.W -= learning_rate * output.dW
        output.b -= learning_rate * output.db

        if len(self.layers) == 1:
            return

        # update hidden layers except the first hidden layer
        for i in range(len(self.layers) - 2, 0, -1):
            self.layers[i].back_prop(self.layers[i + 1], self.layers[i - 1].A, batch_size, learning_rate, _lambda)

        # update the first hidden layer, whose input is the batch itself
        self.layers[0].back_prop(self.layers[1], X, batch_size, learning_rate, _lambda)

    def fit(self, epochs=100, learning_rate=0.1, decay_rate=0.0, use_tqdm=True, early_stopping = False, X_dev=None, y_dev=None, _lambda=0.1, B=128):
        """Train the network on the data given to input_layer().

        Parameters:
            epochs: maximum number of passes over the training set.
            learning_rate: initial gradient-descent step size.
            decay_rate: the step size used in epoch e (0-based) is
                learning_rate / (1 + decay_rate * e).
            use_tqdm: wrap the epoch loop in a tqdm progress bar.
            early_stopping: after more than 10 epochs, stop as soon as the
                latest dev cost is >= each of the five costs before it.
            X_dev, y_dev: optional dev set, shaped like the training data.
                When given, its log loss is appended to self.J after every
                epoch.  Required when early_stopping is True.
            _lambda: L2 regularization strength.
            B: mini-batch size; a final smaller batch covers the remainder.

        Raises:
            ValueError: if B < 1, or early_stopping is requested without a
                dev set.
        """
        if B < 1:
            raise ValueError("B (mini-batch size) must be a positive integer")
        has_dev = X_dev is not None and y_dev is not None
        if early_stopping and not has_dev:
            raise ValueError("early_stopping requires X_dev and y_dev")

        self.epochs = 0
        initial_rate = learning_rate
        m = self.X.shape[1]

        try:
            for epoch in tqdm(range(epochs)) if use_tqdm else range(epochs):
                # time-based decay computed from the initial rate, so the
                # factors do not compound from one epoch to the next
                learning_rate = initial_rate / (1 + decay_rate * epoch)

                # stepping by B also yields the last, shorter batch and
                # works when the whole training set is smaller than B
                for start in range(0, m, B):
                    X = self.X[:, start:start + B]
                    y = self.y[start:start + B]

                    y_hat = self.__forward_prop(X, y)
                    self.__back_prop(X, y, y_hat, learning_rate, _lambda)

                self.epochs += 1

                if not has_dev:
                    continue

                # compute loss at dev data; log_loss already returns the
                # mean over the samples, so no further division is needed
                y_hat_dev = self.__forward_prop(X_dev.T, y_dev)
                cost = log_loss(np.reshape(y_dev, -1), y_hat_dev.reshape(-1))
                self.J = np.append(self.J, cost)

                # train for at least 10 epochs before considering a stop
                if early_stopping and self.epochs > 10:
                    if np.all(self.J[-1] >= self.J[-6:-1]):
                        break
        except KeyboardInterrupt:
            # Interrupting training keeps the weights learned so far.
            # Every other error now propagates instead of being swallowed.
            return

    def predict(self, X, y=None):
        """Return the predicted class (0 or 1) for each row of X.

        y is accepted for call compatibility and is not used.
        """
        probabilities = self.__forward_prop(X.T, y)
        predictions = (probabilities >= (0.5 + 1e-6)).astype(int)

        return predictions.reshape(-1)

    def accuracy(self, X, y):
        """Return the fraction of rows of X whose prediction equals y.

        y may be a flat vector or a column; it is flattened before the
        comparison so a column does not broadcast against the predictions.
        """
        preds = self.predict(X, y)
        return np.mean(preds == np.reshape(y, -1))