In [16]:
import numpy as np

In [28]:
# Define the DeepLearning class: a configurable fully connected neural network built on NumPy

class DeepLearning:
    '''
    Fully connected feed-forward neural network trained with full-batch gradient descent.

    Every layer computes Z[l] = W[l].A[l-1] + b[l] followed by A[l] = activation(Z[l]).
    Samples are stored column-wise internally, i.e. A[l] has shape (l_dims[l], n_samples).
    '''

    def __init__(self, l_dims, activation, max_epochs = 1000, learning_rate = 0.0075,
                 tolerance = 0.00000001, leaky_para = 0.01, cost_type = "cross entropy",
                 model_type = "Binary classification", verbose = True):

        '''
        Parameters: l_dims --> list containing the number of units of each Neural Network layer
                           Note: 1. Neural Network layers do not include the input layer.
                                 2. The list is copied; the input layer size is prepended to
                                    self.l_dims by "train" (the caller's list is left untouched).

                    activation --> list containing the type of activation function used at each layer
                           e.g.: "relu", "leaky_relu", "sigmoid", "tanh".
                           Note: any other value is treated as "leaky_relu".

                    max_epochs --> int, maximum number of full passes over the training set.
                           Default value --> 1000

                    learning_rate --> step size used when updating the parameters
                           Default value --> 0.0075

                    tolerance --> float, training stops when abs(prev_cost - current_cost) < tolerance
                           Default value --> 0.00000001

                    leaky_para --> float, negative-side slope used by the leaky_relu activation.
                           Default value --> 0.01

                    cost_type --> cost function minimised during training
                           Note: 1. loss function --> defined on one training sample
                                 2. cost function --> computed over the whole batch
                                 3. Default value --> "cross entropy"
                                 4. other possible values --> "multi_class cross entropy", "MSE", "MAE".
                                 5. any other value is treated as "MAE".

                    model_type --> type of problem, used by "predict" to post-process the output layer
                           Note: 1. Default value --> "Binary classification"
                                 2. other possible values --> "Multi_class", "Regression".
                                 3. any other value is treated as "Regression".

                    verbose --> bool, when True (default) the cost type and cost value are printed
                           at every epoch, as the class has always done.

        Attributes: params --> dictionary holding the weights "W<l>" and biases "b<l>" of the model.
                    linear_model --> dictionary holding the Z's of forward propagation.
                    activation_model --> dictionary holding the A's of forward propagation ("A0" is the input).
                    grads --> dictionary holding dA's, dZ's, dW's and db's of backward propagation.
                    costs --> list with the cost of every epoch of the last "train" call.
        '''

        # Keep a private copy of the layer sizes so that "train" can rebuild
        # self.l_dims on every call instead of growing the caller's list.
        self._layer_units = list(l_dims)
        self.l_dims = list(l_dims)
        self.activation = activation
        self.max_epochs = max_epochs
        self.learning_rate = learning_rate
        self.tolerance = tolerance
        self.leaky_para = leaky_para
        self.cost_type = cost_type
        self.model_type = model_type
        self.verbose = verbose

        self.params = {}
        self.linear_model = {}
        self.activation_model = {}
        self.grads = {}
        self.costs = []


    # helper functions to calculate activation function values for each layer
    # 1. relu function
    def ReLU(self, z):

        # element-wise maximum of 0 and z
        return np.maximum(0, z)

    # 2. leaky_ReLU function
    def leaky_ReLU(self, z):

        # returns z where z > 0, else leaky_para*z
        # (the max() form is only equivalent to that while leaky_para < 1)
        return np.maximum(self.leaky_para*z, z)

    # 3. sigmoid function
    def sigmoid(self, z):

        # sigmoid(z) = 1/(1+exp(-z))
        return 1/(1+np.exp(-z))

    # apply the activation function selected by name
    def _activate(self, z, activation):

        if activation == 'relu':
            return self.ReLU(z)
        if activation == 'sigmoid':
            return self.sigmoid(z)
        if activation == 'tanh':
            return np.tanh(z)
        # every other name falls back to leaky_relu
        return self.leaky_ReLU(z)

    # gradient of activation functions
    def grad_activation(self, z, activation):

        '''
        Return the element-wise derivative of the given activation function evaluated at z.

        z is never modified: a new array is returned, so the cached Z's stay intact.
        '''

        if activation == 'relu':

            # relu = max(0, z): derivative is 1 for z > 0, else 0
            return np.where(z > 0.0, 1.0, 0.0)

        elif activation == "sigmoid":

            # derivative of sigmoid(z) = sigmoid(z)*(1-sigmoid(z))
            s = self.sigmoid(z)
            return s*(1.0 - s)

        elif activation == "tanh":

            # derivative of tanh(z) = 1 - tanh(z)^2
            return 1.0 - np.power(np.tanh(z), 2)

        else: # for leaky_relu activation function

            # leaky_relu = max(leaky_para*z, z): derivative is 1 for z > 0, else leaky_para
            return np.where(z > 0.0, 1.0, self.leaky_para)


    # parameter initialization
    def params_set(self):

        '''
        Initialise the weights and biases of every layer.

        Uses self.l_dims, which at this point must contain the input layer size at index 0
        ("train" takes care of that before calling this function).

        Note: 1. shape of W[l] = (l_dims[l], l_dims[l-1])
              2. shape of b[l] = (l_dims[l], 1)
              3. l --> 1,2,3,...,L where L denotes the output layer
              4. W[l] is drawn from a standard normal distribution scaled by 0.1, b[l] starts at zero
        '''

        for l in range(1, len(self.l_dims)):

            self.params["W"+str(l)] = np.random.randn(self.l_dims[l], self.l_dims[l-1])*0.1
            self.params["b"+str(l)] = np.zeros((self.l_dims[l], 1))


    # forward propagation
    def forward(self, X):

        '''
        Run one forward pass and cache every Z[l] and A[l].

        Parameters: X --> input of shape (l_dims[0], batch_size), stored as "A0"

        Formula:
                Z[l] = W[l].A[l-1] + b[l]
                A[l] = activation_function(Z[l])
                shape of Z[l] = shape of A[l] = (l_dims[l], batch_size)
        '''

        self.activation_model["A0"] = X
        for l in range(1, len(self.l_dims)):

            Z = np.dot(self.params["W"+str(l)], self.activation_model["A"+str(l-1)]) + self.params["b"+str(l)]
            self.linear_model["Z"+str(l)] = Z
            self.activation_model["A"+str(l)] = self._activate(Z, self.activation[l-1])


    # cost of the model and dAL
    def cost_and_dAL(self, Y):

        '''
        Compute the cost for self.cost_type and store d(cost)/dA[L] in self.grads.

        Parameters: Y --> true values, same shape as the output layer A[L] (AL)

        Formulas (m = number of samples = Y.shape[1]):
               1. cost_type --> "cross entropy"
                   cost = (-1/m)*sum(Y*log(AL) + (1-Y)*log(1-AL))
                   "dAL" = (-Y/AL) + (1-Y)/(1-AL)

               2. cost_type --> "multi_class cross entropy"
                   cost = (-1/m)*sum(Y*log(AL))
                   "dAL" = (-Y/AL)

               3. cost_type --> "MSE" (mean squared error)
                   cost = (1/m)*sum((Y-AL)^2)
                   "dAL" = -2*(Y-AL)

               4. any other cost_type --> "MAE" (mean absolute error)
                   cost = (1/m)*sum(abs(Y-AL))
                   "dAL" = sign(AL-Y)

               Note: "dAL" is not divided by m here; "backward" applies the 1/m factor.

        Returns:  cost function value (float) for the given cost_type
        '''

        eps = 1e-15        # keeps log() and the divisions finite when AL saturates at 0 or 1
        m = Y.shape[1]
        last = str(len(self.l_dims)-1)
        AL = self.activation_model["A"+last]
        if self.verbose:
            print("cost_type = ", self.cost_type)

        # for binary cross entropy or cross entropy
        if self.cost_type == "cross entropy":

            P = np.clip(AL, eps, 1 - eps)
            cost = (-1/m)*np.sum(Y*np.log(P) + (1-Y)*np.log(1-P))
            dAL = -1*np.divide(Y, P) + np.divide(1-Y, 1-P)

        # for multi-class cross entropy
        elif self.cost_type == "multi_class cross entropy":

            P = np.clip(AL, eps, None)
            cost = (-1/m)*np.sum(np.multiply(Y, np.log(P)))
            dAL = -Y/P

        # for mean squared error
        elif self.cost_type == "MSE":

            cost = (1/m)*np.sum((Y-AL)*(Y-AL))
            dAL = -2*(Y-AL)

        # for mean absolute error
        else:

            cost = (1/m)*np.sum(np.abs(Y-AL))
            # sign() is the sub-gradient of abs(): 0 for an exact prediction instead of 0/0
            dAL = np.sign(AL-Y)

        self.grads["dA"+last] = dAL

        return float(cost)


    # back propagation
    def backward(self):

        '''
        Propagate "dA[L]" (stored by "cost_and_dAL") back through every layer and
        store dZ[l], dW[l], db[l] and dA[l-1] in self.grads.

        Formulas (m = batch_size, taken from the output layer):
                 1. "dZ[l]" = "dA[l]" * grad_activation(Z[l])           # element-wise product
                              shape --> (l_dims[l], batch_size)

                 2. "dW[l]" = (1/m)*np.dot("dZ[l]", A[l-1].T)
                              shape of "dW[l]" --> shape of W[l] --> (l_dims[l], l_dims[l-1])
                              shape of A[l-1].T --> (batch_size, l_dims[l-1])

                 3. "db[l]" = (1/m)*sum("dZ[l]")                        # summed over the samples
                              shape of "db[l]" --> shape of b[l] --> (l_dims[l], 1)

                 4. "dA[l-1]" = np.dot(W[l].T, "dZ[l]")
                              shape of "dA[l-1]" --> shape of A[l-1] --> (l_dims[l-1], batch_size)
        '''

        m = self.activation_model["A"+str(len(self.l_dims)-1)].shape[1]
        for l in reversed(range(1, len(self.l_dims))):

            dZ = self.grads["dA"+str(l)]*self.grad_activation(self.linear_model["Z"+str(l)],
                                                                self.activation[l-1])
            self.grads["dZ"+str(l)] = dZ

            self.grads["dW"+str(l)] = (1/m)*np.dot(dZ, self.activation_model["A"+str(l-1)].T)

            self.grads["db"+str(l)] = (1/m)*np.sum(dZ, axis = 1, keepdims= True)

            self.grads["dA"+str(l-1)] = np.dot(self.params["W"+str(l)].T, dZ)


    # updating the parameters Wl's and bl's
    def update_params(self):

        '''
        Apply one gradient descent step to every layer.

        Formula:
               W[l] = W[l] - learning_rate*"dW[l]"
               b[l] = b[l] - learning_rate*"db[l]"
        '''

        for l in range(1, len(self.l_dims)):

            self.params["W"+str(l)] -= self.learning_rate*self.grads["dW"+str(l)]
            self.params["b"+str(l)] -= self.learning_rate*self.grads["db"+str(l)]


    # training the model to get optimum values of Wl's and bl's
    def train(self, X, Y):

        '''
        Fit the network on the whole training set with full-batch gradient descent.

        Parameters: X --> training set, shape (no. of training examples, no. of features)
                    Y --> true labels, shape (no. of training examples, 1) or (no. of training examples,);
                          a 2-D Y with several columns (e.g. one-hot labels) is also accepted.

        The parameters are re-initialised on every call, so calling "train" again restarts
        training from scratch. The cost of every epoch is kept in self.costs.
        Training stops after max_epochs epochs, or earlier once two consecutive costs
        differ by less than self.tolerance.
        '''

        X = np.asarray(X).T                 # shape (no. of features, no. of training examples)
        Y = np.asarray(Y)
        if Y.ndim == 1:
            Y = Y.reshape(1, -1)            # shape (1, no. of training examples)
        else:
            Y = Y.T                         # shape (no. of outputs, no. of training examples)

        n_x = X.shape[0]                    # no. of features = units of the input layer
        # rebuilt from the private copy so repeated calls never stack several input layers
        self.l_dims = [n_x] + list(self._layer_units)
        self.params_set()

        costs = []
        for _ in range(self.max_epochs):
            self.forward(X)
            cost = self.cost_and_dAL(Y)
            self.backward()
            self.update_params()

            costs.append(cost)
            if self.verbose:
                print(cost)

            if len(costs) >= 2 and abs(cost-costs[-2]) < self.tolerance:
                break

        self.costs = costs


    # predicting the output for a data set
    def predict(self, X):

        '''
        Predict the output for X using the current parameters.

        Parameters: X --> data of shape (no. of examples, no. of features)

        Returns: model_type "Binary classification" --> array of shape (no. of examples, no. of outputs)
                                holding 1.0 where the output layer is > 0.5, else 0.0
                 model_type "Multi_class" --> array of shape (no. of examples, 1) holding the index
                                of the largest output unit of every example
                 any other model_type (Regression) --> the raw output layer,
                                shape (no. of examples, no. of outputs)
        '''

        X = np.asarray(X).T                 # shape (no. of features, no. of examples)
        m = X.shape[1]
        self.forward(X)
        AL = self.activation_model["A"+str(len(self.l_dims)-1)]

        # for Binary classification
        if self.model_type == "Binary classification":

            # a new array is built so the cached output activations are not overwritten
            return (AL > 0.5).astype(float).T

        # for Multi_class classification
        elif self.model_type == "Multi_class":

            # AL has shape (n_class, m): keep the index of the largest unit of every column
            return np.argmax(AL, axis = 0).reshape(m, 1)

        # for Regression the output layer already is the prediction
        return AL.T