## Multi Layer Perceptron

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Fix NumPy's global random seed so the random weight initialisation
# (np.random.randn in weights_init) is reproducible between runs.
np.random.seed(1)
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

## 1. Define functions

In [None]:
def one_hot_encode(x, n_class):
    """One Hot encoding

    parameter
    ---------
    x: array_like [n_sample] integer valued class labels; every label is
       used directly as a column index of the result

    n_class: number of class

    return
    ------
    en_1hot: array_like [n_sample, n_class] one hot encoding matrix

    raise
    -----
    ValueError: a label is not integer valued (e.g. 1.5)
    IndexError: a label is out of range for n_class
    """

    raw = np.asarray(x).ravel()
    labels = raw.astype(np.intp)

    # Integer valued floats (e.g. 3.0) are accepted, but anything that
    # would be silently truncated by the cast is rejected.
    if not np.array_equal(labels, raw):
        raise ValueError('labels must be integer valued')

    en_1hot = np.zeros([labels.size, n_class])

    # One vectorized assignment: row i gets a 1 in column labels[i].
    en_1hot[np.arange(labels.size), labels] = 1

    return en_1hot


def sigmoid(x):
    """Logistic sigmoid, applied element-wise

    parameter
    ---------
    x: ndarray or scalar

    return
    ------
    1 / (1 + exp(-x)), element by element
    """
    decay = np.exp(-x)
    return 1 / (decay + 1)


def softmax(x):
    """Softmax over the last axis

    parameter
    ---------
    x: array_like [..., n_class] scores

    return
    ------
    array_like, same shape as x; entries along the last axis sum to 1
    """
    # Subtracting the per-row maximum leaves the result mathematically
    # unchanged but keeps np.exp from overflowing to inf (which would
    # produce inf / inf = nan) for large scores.
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exp_x = np.exp(shifted)
    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)


def foward(X, params):
    """Forward pass of the two layer network

    parameter
    ----------
    X: array_like [n_sample, n_input]
    params: dictionary of layer weights, keys 'W1', 'b1', 'W2', 'b2'

    return
    ------
    output: final layer output (sigmoid activation of the second layer)
    caches: dictionary of forward layer outputs, keys 'Z1', 'A1', 'Z2', 'A2'
    """

    # Hidden layer: affine map followed by a sigmoid.
    hidden_pre = np.matmul(X, params['W1']) + params['b1']
    hidden_act = sigmoid(hidden_pre)

    # Output layer: same structure, fed by the hidden activations.
    out_pre = np.matmul(hidden_act, params['W2']) + params['b2']
    out_act = sigmoid(out_pre)

    caches = dict(Z1=hidden_pre, A1=hidden_act, Z2=out_pre, A2=out_act)

    return out_act, caches
    

def backward(X, y, params, caches):
    """Backward pass of the two layer network

    parameter
    ----------
    X: array_like [n_sample, n_input]
    y: array_like [n_sample, n_class]
    params: dictionary of layer weights (only 'W2' is read)
    caches: dictionary of forward layer outputs (only 'A1' and 'A2' are read)

    return
    ------
    grads: dictionary of layer weights gradient, keys 'dW1', 'db1',
           'dW2', 'db2'; every gradient is averaged over the n_sample rows

    """

    W2 = params['W2']

    A1 = caches['A1']
    A2 = caches['A2']

    m = X.shape[0]

    # Output layer. With a sigmoid output and the cross-entropy cost the
    # derivative with respect to Z2 collapses to (A2 - y).
    dZ2 = A2 - y
    dW2 = np.matmul(A1.T, dZ2) / m
    db2 = np.sum(dZ2, axis=0) / m

    # Hidden layer. Push the error back through W2, then multiply by the
    # sigmoid derivative A1 * (1 - A1).
    dZ1 = np.matmul(dZ2, W2.T) * (A1 * (1 - A1))
    dW1 = np.matmul(X.T, dZ1) / m
    db1 = np.sum(dZ1, axis=0) / m

    grads = {'dW1': dW1, 'db1': db1,
             'dW2': dW2, 'db2': db2}

    return grads


def update_parameters(params, grads, learning_rate):
    """Apply one gradient descent step to the layer weights

    parameters
    ----------
    params: dictionary of layer weights, keys 'W1', 'b1', 'W2', 'b2'
    grads: dictionary of layer weights gradient, keys 'dW1', 'db1',
           'dW2', 'db2'
    learning_rate: learning rate to update weights

    return
    ------
    params: the same dictionary object, with every entry rebound to
            its updated value

    """
    for name in ('W1', 'b1', 'W2', 'b2'):
        step = learning_rate * grads['d' + name]
        params[name] = params[name] - step

    return params


def compute_cost(output, y):
    """Compute the cross-entropy loss

    Per sample the binary cross-entropy terms are summed over the classes,
    then the result is averaged over the samples.

    parameter
    ---------
    output: array_like [n_sample, n_class] nn predict output, values in [0, 1]
    y: array_like [n_sample, n_class]

    return
    ------
    loss: scalar cost
    """
    # A saturated output of exactly 0 or 1 would give log(0) = -inf and
    # 0 * -inf = nan. Clipping by machine epsilon keeps the cost finite
    # and leaves non-saturated outputs untouched.
    eps = np.finfo(float).eps
    probs = np.clip(np.asarray(output, dtype=float), eps, 1 - eps)

    loss = np.multiply(y, np.log(probs)) + np.multiply((1 - y), np.log(1 - probs))
    loss = -np.mean(np.sum(loss, axis=-1))

    return loss


def gradient_descent(X, y, params, learning_rate=0.01, max_iter=None, tol=1e-4,
                     show_cost=None):
    """Gradient descent

    parameters
    ----------
    X: array_like [n_samples, n_features]
    y: array_like [n_samples, n_class]
    params: dictionary of layer weights; the dictionary passed in is
            updated by update_parameters on every iteration
    learning_rate: step size of each update
    max_iter: maximum number of iterations (1000 when None)
    tol: stop once the cost changes by less than tol between two
         consecutive iterations
    show_cost: print the cost every show_cost iterations; None or 0
               disables printing

    return
    ------
    params: dictionary of layer weights
    costs: cost of every iteration that was run, including the one on
           which the tolerance was reached
    """

    if max_iter is None:
        max_iter = 1000

    costs = np.zeros(max_iter)

    for n_iter in range(max_iter):

        output, caches = foward(X, params)
        costs[n_iter] = compute_cost(output, y)
        grads = backward(X, y, params, caches)
        params = update_parameters(params, grads, learning_rate)

        # Truthiness check also guards against a modulo by zero.
        if show_cost and n_iter % show_cost == 0:
            print('costs[%4d]: %.4e' % (n_iter, costs[n_iter]))

        # The first iteration has no previous cost to compare with;
        # costs[-1] would wrap around to the last (still zero) slot.
        if n_iter > 0 and np.abs(costs[n_iter - 1] - costs[n_iter]) < tol:
            # Keep the cost of the current iteration in the history.
            costs = costs[:n_iter + 1]
            break

    return params, costs

## 2. Split data

In [None]:
def mnist_split_data(csv_file):
    """Read an mnist csv file and separate images from labels

    The last column of the csv is taken as the label, every other
    column as a feature.

    parameter
    ---------
    csv_file: csv path (or anything pandas.read_csv accepts)

    return
    ------
    X: array_like [n_sample, n_feature] mnist flat image
    y: array_like [n_sample] mnist image label
    """

    table = pd.read_csv(csv_file)

    features = table.iloc[:, :-1]
    labels = table.iloc[:, -1]

    return features.values, labels.values

In [None]:
# Load the train and test splits as (features, labels) pairs.
X_train, y_train = mnist_split_data('ex3_train.csv')
X_test, y_test = mnist_split_data('ex3_test.csv')

# The number of classes is the number of distinct training labels.
n_class = np.unique(y_train).size
y_train_en = one_hot_encode(y_train, n_class)

## 3. Initialize parameters

In [None]:
def weights_init(layer_dims):
    """Create the initial weights and biases of every layer

    Weights are drawn from a standard normal distribution scaled by 0.01,
    biases start at zero. Layers are numbered from 1.

    parameter
    ---------
    layer_dims: list of (n_input, n_output) pairs, one per layer

    return
    ------
    model: dictionary of layer weights, keys 'W1', 'b1', 'W2', 'b2', ...
    """

    model = {}

    for layer_no, (n_input, n_output) in enumerate(layer_dims, start=1):
        suffix = str(layer_no)
        model['W' + suffix] = 0.01 * np.random.randn(n_input, n_output)
        model['b' + suffix] = np.zeros(n_output)

    return model

## 4. Neural Network model with 1 hidden layer

In [None]:
# (n_input, n_output) of each layer: 400 -> 25 -> 10
layer_dims = [(400, 25), (25, 10)]

model = weights_init(layer_dims)
model, costs = gradient_descent(X_train, y_train_en, model,
                                learning_rate=0.1, max_iter=3000, tol=1e-6)

## 5. Predictions

In [None]:
def predict(X, model):
    """Predict sample labels from the softmax probabilities

    parameter
    ---------
    X: array_like [n_sample, n_feature]
    model: dictionary of layer weights

    return
    ------
    array_like [n_sample] index of the most probable class per sample
    """

    scores = foward(X, model)[0]
    probs = softmax(scores)

    return np.argmax(probs, axis=-1)

def accuracy(y_pred, y_true):
    """Fraction of predictions that equal the true label

    parameter
    ---------
    y_pred: array_like [n_sample] predicted labels
    y_true: array_like [n_sample] true labels

    return
    ------
    number of matching entries divided by len(y_true)
    """
    hits = np.equal(y_pred, y_true)
    n_correct = np.sum(hits)

    return n_correct / len(y_true)
    

In [None]:
# Accuracy on the training set, printed as a percentage.
y_train_pred = predict(X_train, model)
train_acc = accuracy(y_train_pred, y_train)
print("train acc: {:6.4f}".format(train_acc * 100))

# Accuracy on the test set. Report test_acc here, not train_acc.
y_test_pred = predict(X_test, model)
test_acc = accuracy(y_test_pred, y_test)
print("test acc: {:6.4f}".format(test_acc * 100))

## 6. Optimization

In [None]:
# Set hyper parameter tuning grid search
n_iters = [2000, 3000, 6000]
#n_iters = [100, 300, 500]
learning_rates = [0.01, 0.1, 1]
grid = [(learning_rate, n_iter) for learning_rate in learning_rates for n_iter in n_iters]

In [None]:
layer_dims = [(400, 25),
              (25,  10)]

# Train a fresh model for every (learning rate, iteration budget) pair,
# then plot its cost curve and report train / test accuracy.
for learning_rate, n_iter in grid:
    # Re-initialise so that runs do not share weights.
    model = weights_init(layer_dims)
    model, costs = gradient_descent(X_train, y_train_en, model, tol=1e-10,
                                    learning_rate=learning_rate, max_iter=n_iter)
    y_train_pred = predict(X_train, model)
    train_acc = accuracy(y_train_pred, y_train)

    y_test_pred = predict(X_test, model)
    test_acc = accuracy(y_test_pred, y_test)

    plt.xlabel('iterations')
    plt.ylabel('cost')
    # Ten evenly spaced ticks across the iteration budget.
    plt.xticks(np.arange(0, n_iter, n_iter / 10))
    plt.xlim(xmin=1, xmax=n_iter)
    plt.plot(costs)
    plt.show()
    print('learning rate {:.2f} iters {:d}'.format(learning_rate, n_iter))
    print('Train acc: {:6.4f}'.format(train_acc * 100))
    print('Test acc: {:6.4f}'.format(test_acc * 100))