# Multilayer Perceptron (MLP)
Multilayer Perceptron (MLP) is probably the simplist form of neuroal networks. It is mainly used for classification problem where there is stron non-linearality among features. A MLP consists of an input layer, one or more hidden layers (including the output layer), each of which is a full connected layer. The output of each layer goes through an activation function (usually ReLu) before going to the next layer.

![MLP](./imgs/multilayer_perceptron.png)



In [60]:
import numpy as np

class MultiLayerPerceptron:
    def __init__(self, ndim, hidden_layer_dims):
        self.all_dims = [ndim] + hidden_layer_dims
        np.random.seed(99)
        self.parameters = {}
        for layer in range(len(self.all_dims) - 1):
            self.parameters['W%d' % layer] = np.random.randn(self.all_dims[layer + 1], self.all_dims[layer]) * 0.1
            self.parameters['b%d' % layer] = np.random.randn(self.all_dims[layer + 1], 1) * 0.1
        
    def relu_(self, z):
        return np.maximum(0, z)
    
    def relu_backward_(self, dActivation, z):
        dz = np.array(dActivation, copy=True)
        dz[z <= 0] = 0
        dz[z > 0] = 1
        return dz
    
    def forward_one_layer_(self, activation_prev, W, b):
        z = W.dot(activation_prev) + b
#         print('W.shape:', W.shape, 'a_prev.shape:', activation_prev.shape, 'b.shape:', b.shape, 'z.shape:', z.shape)
        return self.relu_(z), z
    
    def forward(self, x):
        """Conduct feed forward for the NN."""
        activation_cur = np.expand_dims(x, axis=1)
        intermediate_results = {}  # store the intermediate results for backpropagation
        
        for i in range(len(self.all_dims) - 1):
            W = self.parameters['W%d' % i]
            b = self.parameters['b%d' % i]
            activation_prev = activation_cur
            activation_cur, z_cur = self.forward_one_layer_(activation_prev, W, b)
            intermediate_results['a%d' % i] = activation_prev
            intermediate_results['Z%d' % i] = z_cur
        
        return activation_cur, intermediate_results
    
    def cost_(self, y, hat_y):
        """Cross-entropy cost"""
        n = hat_y.shape[1]
        print('y:', y, 'hat_y:', hat_y)
        print(np.dot(y, np.log(hat_y)), np.dot(1 - y, np.log(1 - hat_y)))
        cost = -1 / n * (np.dot(y, np.log(hat_y)) + np.dot(1 - y, np.log(1 - hat_y)))
        return np.squeeze(cost)
    
    def backward_propogation_one_layer_(self, dA_cur, W, b, z, acivation_prev):
        n = activation_prev.shape[1]
        
        dz = self.relu_backward_(dA_cur, z)  # dZ = dA * g'(z)
        
        dW = dZ_curr.dot(activation_prev.T) / n  # dW = 1/n * dZ * a_prev
        db = np.sum(dz, axis=1, keepdims=True) / n  # db = 1/n * \sum_i^n dz
        da_prev = W.T.dot(dz)  # da = W^T dZ
        
        return da_prev, dW, db

    def backward_propagation(self, hat_y, y, intermediate_results):
        n = y.shape[1]
        y = y.reshape(hat_y.shape)
        grads = {}
        
        da_prev = -(np.divide(y, y_hat))  # dcross_entropy/da
        
        for layer in reversed(len(self.all_dims) - 1):
            da_cur = da_prev
            
            a_prev = intermediate_results['a%d' % layer]
            z_cur = intermediate_results['z%d' % layer]
            W_cur = self.parameters['W%d' % layer]
            b_cur = self.parameters['b%d' % layer]
            
            da_prev, dW, db = self.backward_propogation_one_layer_(da_cur, W, b, z, a_prev)
            grads['dW%d' % layer] = dW
            grads['db%d' % layer] = db
            
        return grads
    
    def update(self, grads, lr):
        for layer in range(len(self.all_dims)):
            self.parameters['W%d' % layer] -= learning_rate * grads['dW%d' % layer]
            self.parameters['b%d' % layer] -= learning_rate * grads['db%d' % layer]

    def train(self, X, y, epochs, lr):
        
        for e in range(epochs):
            for x, y in zip(X, y):
                hat_y, intermediate_results = self.forward(x)
                cost = self.cost_(y, hat_y)
                print(cost)
        

In [61]:
n = 1
n_dim = 15
hidden_layer_dims = [8, 2]

model = MultiLayerPerceptron(n_dim, hidden_layer_dims)
for name in model.parameters:
    print(name, model.parameters[name].shape)
    
X = np.random.randn(n, n_dim)
y = np.array([[[1, 0]] if r >= 2 else [[0, 1]] for r in np.sum(X, axis=1)])  # dim: (n, n_out)

model.train(X, y, epochs=1, lr=0.01)
# for it in range(1):
#     X = np.random.randn(n_batch, n_dim)
#     y = np.array([[1, 0] if r >= 2 else [0, 1] for r in np.sum(X, axis=1)])  # dim: (n_batch, n_out)
#     loss = model.train(X, y, lr=0.001)
#     if it % 100 == 0:
#         print('iteration %d, loss: %.3f' % (it, loss))

W0 (8, 15)
b0 (8, 1)
W1 (2, 8)
b1 (2, 1)
y: [[0 1]] hat_y: [[0.]
 [0.]]
[[nan]] [[0.]]
nan




In [160]:
def sigmoid(Z):
    return 1/(1+np.exp(-Z))

def relu(Z):
    return np.maximum(0,Z)

def sigmoid_backward(dA, Z):
    sig = sigmoid(Z)
    return dA * sig * (1 - sig)

def relu_backward(dA, Z):
    dZ = np.array(dA, copy = True)
    dZ[Z <= 0] = 0;
    dZ[Z > 0] = 1
    return dZ

In [176]:
class MLP:
    def __init__(self, nn_architecture):
        self.nn_architecture = nn_architecture
        number_of_layers = len(nn_architecture)
        self.params_values = {}

        for idx, layer in enumerate(nn_architecture):
            layer_idx = idx + 1
            layer_input_size = layer["input_dim"]
            layer_output_size = layer["output_dim"]

            self.params_values['W' + str(layer_idx)] = np.random.randn(
                layer_output_size, layer_input_size) * 0.1
            self.params_values['b' + str(layer_idx)] = np.random.randn(
                layer_output_size, 1) * 0.1
    
    def single_layer_forward_propagation(self, A_prev, W_curr, b_curr, activation="relu"):
        Z_curr = np.dot(W_curr, A_prev) + b_curr
        
        if activation is "relu":
            activation_func = relu
        elif activation is "sigmoid":
            activation_func = sigmoid
        else:
            raise Exception('Non-supported activation function')

        return activation_func(Z_curr), Z_curr
    
    def full_forward_propagation(self, X):
        memory = {}
        A_curr = X

        for idx, layer in enumerate(self.nn_architecture):
            layer_idx = idx + 1
            A_prev = A_curr
            
            activ_function_curr = layer["activation"]
            W_curr = self.params_values["W" + str(layer_idx)]
            b_curr = self.params_values["b" + str(layer_idx)]
            A_curr, Z_curr = self.single_layer_forward_propagation(A_prev, W_curr, b_curr, activ_function_curr)

            memory["A" + str(idx)] = A_prev
            memory["Z" + str(layer_idx)] = Z_curr

        return A_curr, memory
    
    def get_cost_value(self, Y_hat, Y):
        m = Y_hat.shape[1]
        cost = -1 / m * (np.dot(Y, np.log(Y_hat).T) + np.dot(1 - Y, np.log(1 - Y_hat).T))
        return np.squeeze(cost)
    
    def single_layer_backward_propagation(self, dA_curr, W_curr, b_curr, Z_curr, A_prev, activation="relu"):
        m = A_prev.shape[1]

        if activation is "relu":
            backward_activation_func = relu_backward
        elif activation is "sigmoid":
            backward_activation_func = sigmoid_backward
        else:
            raise Exception('Non-supported activation function')

        dZ_curr = backward_activation_func(dA_curr, Z_curr)
        dW_curr = np.dot(dZ_curr, A_prev.T) / m
        db_curr = np.sum(dZ_curr, axis=1, keepdims=True) / m
        dA_prev = np.dot(W_curr.T, dZ_curr)

        return dA_prev, dW_curr, db_curr
    
    def full_backward_propagation(self, Y_hat, Y, memory):
        grads_values = {}
        m = Y.shape[1]
        Y = Y.reshape(Y_hat.shape)

        dA_prev = - (np.divide(Y, Y_hat) - np.divide(1 - Y, 1 - Y_hat));

        for layer_idx_prev, layer in reversed(list(enumerate(self.nn_architecture))):
            layer_idx_curr = layer_idx_prev + 1
            activ_function_curr = layer["activation"]

            dA_curr = dA_prev

            A_prev = memory["A" + str(layer_idx_prev)]
            Z_curr = memory["Z" + str(layer_idx_curr)]
            W_curr = self.params_values["W" + str(layer_idx_curr)]
            b_curr = self.params_values["b" + str(layer_idx_curr)]

            dA_prev, dW_curr, db_curr = self.single_layer_backward_propagation(
                dA_curr, W_curr, b_curr, Z_curr, A_prev, activ_function_curr)

            grads_values["dW" + str(layer_idx_curr)] = dW_curr
            grads_values["db" + str(layer_idx_curr)] = db_curr

        return grads_values
    
    def update(self, grads_values, learning_rate):
        for layer_idx, layer in enumerate(self.nn_architecture, 1):
            self.params_values["W" + str(layer_idx)] -= learning_rate * grads_values["dW" + str(layer_idx)]        
            self.params_values["b" + str(layer_idx)] -= learning_rate * grads_values["db" + str(layer_idx)]
    
    def train(self, X, Y, learning_rate):
        cost_history = []
        accuracy_history = []

        y_hat, intermediate_results = self.full_forward_propagation(X)
        cost = self.get_cost_value(y_hat, Y)
        cost_history.append(cost)
#             accuracy = get_accuracy_value(Y_hat, Y)
#             accuracy_history.append(accuracy)

        grads_values = self.full_backward_propagation(y_hat, Y, intermediate_results)
        params_values = self.update(grads_values, learning_rate)

        return cost_history

In [192]:
n = 1

nn_architecture = [
    {"input_dim": 15, "output_dim": 8, "activation": "relu"},
    {"input_dim": 8, "output_dim": 5, "activation": "relu"},
    {"input_dim": 5, "output_dim": 1, "activation": "sigmoid"},
]

model = MLP(nn_architecture)

losses = []
for i in range(1000000):
    X = np.random.randn(n, n_dim).T
    y = np.array([[1.0] if r >= 1 else [0.0] for r in np.sum(X, axis=0)])
    loss = model.train(X, y, learning_rate=0.01)
    losses.append(loss[0])
    if i % 10000 == 0:
        print(sum(losses[-100:]) / 100)

0.007579618537019549
0.6529857393631405
0.6780528066303901
0.686477591477436
0.6828785042141535
0.6699286410439346
0.6854692835061437
0.6652953669701144
0.6612776896652122
0.6606872294896019
0.6814804216216969
0.6944806579815238
0.6794791658753289
0.6782283572492398
0.6781284073242644
0.695739943626792
0.6686272252631638
0.6849503555637149
0.660429481766363
0.7048727498586271
0.682716737793009
0.6611083153385592
0.6664825431505288
0.6742044022157093
0.667138656970784
0.6603844847170635
0.6384840217659447
0.6699391853618364
0.6699261512989881
0.6973432399108916
0.6908949008786938
0.6622894638516514
0.6615805604230117
0.6974290400074409
0.6601696782383679
0.6786233615497059
0.6791847421649259
0.6995592290666373
0.6818321985873607
0.6781216127526569
0.6818970272789671
0.6653213318149527
0.6700130853672326
0.6652306118323053
0.6969271653236394
0.678106815990152
0.6401046959902995
0.6422475141272203
0.6699275836372476
0.6744657949771671
0.6794623876878516
0.6615314383724106
0.71629500037782