# Multilayer Perceptron (MLP)
Multilayer Perceptron (MLP) is probably the simplist form of neuroal networks. It is mainly used for classification problem where there is stron non-linearality among features. A MLP consists of an input layer, one or more hidden layers (including the output layer), each of which is a full connected layer. The output of each layer goes through an activation function (usually ReLu) before going to the next layer.

![MLP](./imgs/multilayer_perceptron.png)



In [148]:
import numpy as np

class MultiLayerPerceptron:
    def __init__(self, ndim, hidden_layer_dims):
        all_dims = [ndim] + hidden_layer_dims
        self.Ws = [np.random.randn(all_dims[i] + 1, all_dims[i + 1]) for i in range(len(all_dims) - 1)]
        
    def relu_(self, x):
        return np.maximum(0, x)
    
    def relu_prime_(self, x):
        def func(v):
            return 1 if v > 0 else 0
        vfunc = np.vectorize(func)
        return vfunc(x)
    
    def cost_(self, y, hat_y):
        return 0.5 * (y - hat_y)**2
    
    def cost_prime_(self, y, hat_y):
        return (y - hat_y)
    
    def predict_(self, X):
        """Conduct prediction for a batch of inputs."""
        bias = np.ones((X.shape[0], 1))
        hiddens_wo_activation = []  # store the intermediate values for the hidden layers before activation
        outputs = np.concatenate((bias, X), axis=1)
        for W in self.Ws[:-1]:
            outputs = outputs.dot(W)
            outputs = np.concatenate((np.ones((outputs.shape[0], 1)), outputs), axis=1)  # prepend bias
            hiddens_wo_activation.append(outputs)
            outputs = self.relu_(outputs)
        # apply the last layer without relu
        outputs = outputs.dot(self.Ws[-1])  # dim: (n_batch, n_out)
        hiddens_wo_activation.append(outputs)
#         for h in hiddens_wo_activation:
#             print(h.shape)
        return outputs, hiddens_wo_activation
    
    def predict(self, X):
        return self.predict_(X)[0]
    
    def train(self, X, y, lr=0.01):
        pred_y, hiddens_wo_activation = self.predict_(X)
        # backpropagation with chain rule
        # Last layer: E_out = C'(Wn) = cost' * relu'(Hn)
        # Other layers: E_n = E_n+1 * W_n * relu'(Zn)
        error_by_layers = []
#         print("Output layer:", self.cost_prime_(y, pred_y).shape, self.relu_prime_(hiddens_wo_activation[-1]).shape)
        error_outputs = self.cost_prime_(y, pred_y) * self.relu_prime_(hiddens_wo_activation[-1])  # dim: (n_batch, n_out)
        error_by_layers.append(error_outputs)
        for layer in range(len(self.Ws) - 1, 0, -1):  # from later layers to earlier layers except the first layer
#             batchW = np.stack([self.Ws[layer]] * X.shape[0])
            print(
                "layer: ", layer, 
                "error[%d].shape:" % (len(error_by_layers) - 1), np.sum(error_by_layers[-1], axis=0).shape, 
                ", batchW[%d]: " % layer, self.Ws[layer].shape, 
                ", relu' hidden_wo_activation[%d]" % (layer - 1), self.relu_prime_(hiddens_wo_activation[layer-1]).shape)
            error = np.expand_dims(np.sum(error_by_layers[-1], axis=0), axis=1).T.dot(self.Ws[layer].T)  # dim: (1, h_n)
            print("error.shape:", error.shape)
            relu_prime_z = np.sum(self.relu_prime_(hiddens_wo_activation[layer-1]), axis=0)  # dim: (, h_n)
            relu_prime_z = np.expand_dims(relu_prime_z, axis=1)  # dim: (1, h_n)
            error = error * relu_prime_z.T  # dim: (h_n, 1)
            print("error of layer %d: " % layer, error.shape)
            error_by_layers.append(error)
        
        
        return np.abs(error_outputs).sum() / error_outputs.size

In [149]:
n_batch = 10
n_dim = 15
hidden_layer_dims = [8, 2]

model = MultiLayerPerceptron(n_dim, hidden_layer_dims)

for it in range(1):
    X = np.random.randn(n_batch, n_dim)
    y = np.array([[1, 0] if r >= 2 else [0, 1] for r in np.sum(X, axis=1)])  # dim: (n_batch, n_out)
    loss = model.train(X, y, lr=0.001)
    if it % 100 == 0:
        print('iteration %d, loss: %.3f' % (it, loss))

layer:  1 error[0].shape: (2,) , batchW[1]:  (9, 2) , relu' hidden_wo_activation[0] (10, 9)
error.shape: (1, 9)
error of layer 1:  (1, 9)
iteration 0, loss: 3.903
