In [20]:
import pandas as pd
import numpy as np
import collections
import itertools
from sklearn import metrics, model_selection
import matplotlib.pyplot as plt
from matplotlib.colors import colorConverter, ListedColormap
import csv

In [21]:
def load_data():
    X_train = []
    X_test = []
    T_train = []
    T_test = []

    with open(
            './data/x_train.csv') as csvfile:
        r = csv.reader(csvfile, delimiter='\n', quotechar='|')
        for row in r:
            str = row[0].split(',')
            arr = list(map(int, str))
            X_train.append(arr)
    X_train = np.asarray(X_train)

    with open(
            './data/x_test.csv') as csvfile:
        r = csv.reader(csvfile, delimiter='\n', quotechar='|')
        for row in r:
            str = row[0].split(',')
            arr = list(map(int, str))
            X_test.append(arr)
    X_test = np.asarray(X_test)

    with open(
            './data/y_train.csv') as csvfile:
        r = csv.reader(csvfile, delimiter='\n')
        for row in r:
            if row[0] == '0':
                T_train.append([1, 0, 0, 0])
            elif row[0] == '1':
                T_train.append([0, 1, 0, 0])
            elif row[0] == '2':
                T_train.append([0, 0, 1, 0])
            elif row[0] == '3':
                T_train.append([0, 0, 0, 1])
    T_train = np.asarray(T_train)

    with open(
            './data/y_test.csv') as csvfile:
        r = csv.reader(csvfile, delimiter='\n')
        for row in r:
            if row[0] == '0':
                T_test.append([1, 0, 0, 0])
            elif row[0] == '1':
                T_test.append([0, 1, 0, 0])
            elif row[0] == '2':
                T_test.append([0, 0, 1, 0])
            elif row[0] == '3':
                T_test.append([0, 0, 0, 1])
    T_test = np.asarray(T_test)

    # Divide the test set into a validation set and final test set.
    X_validation, X_test, T_validation, T_test = cross_validation.train_test_split(
        X_test, T_test, test_size=0.3)

    return (X_train, X_test, X_validation, T_train, T_test, T_validation)

In [22]:
# =================
# Utility functions
# =================

def readin(filepath):
    df = pd.read_csv(filepath, header=None)
    return df.values

def convert_to_onehot(Y, C):
    Y = np.eye(C)[Y.reshape(-1)].T
    return Y

def relu(z):
    return np.maximum(0, z)

def softmax(z):
    """
    Calculate the softmax over n input samples
    
    @param x: of shape(n, m). n is the number of data samples, m is the dimension
    """
    z_exp = np.exp(z)
    z_sum = np.sum(z_exp, axis=1, keepdims=True)
    s = z_exp / z_sum
    return s


def softmax_stable(z):
    """
    Compute the softmax of vector x in a numerically stable way.
    """
    shiftz = z - np.max(z)
    z_exp = np.exp(shiftz)
    z_sum = np.sum(z_exp, axis=1, keepdims=True)
    s = z_exp / z_sum
    return s

# def relu_deriv(y):
#     return (y > 0).astype(int)*1.0

def relu_deriv(y):
    return 1. * (y > 0)

In [23]:
class Layer(object):
    """Base class for the different layers
    """
    
    def get_params_iter(self):
        """Return an itertor over the parameters (if any)
        The iterator has the same order as get_params_grad
        The elements returned are editable in-place
        """
        return []
    
    def get_params_grad(self, X, output_grad):
        """Return a list of gradients over the params
        The list has the same order as the get_params_iter iterator
        X is the input
        output_grad is the gradient at the output of this layer"""
        return []
    
    def get_output(self, X):
        """Perform forward step linear transformation"""
        pass
    
    def get_input_grad(self, Y, output_grad=None, T=None):
        """Return the gradeint at the intputs of this layer
        Y is the pre-computed output
        output_grad is the gradient at the output of this layer
        T is the target, which will be only used in output layer"""
        pass
    
    
class LinearLayer(Layer):
    """The linear layer performs a linear transformation to its input."""

    def __init__(self, n_in, n_out):
        """Initialize hidden layer parameters.
        n_in is the number of input variables.
        n_out is the number of output variables."""
        self.W = np.random.randn(n_in, n_out) * 0.1
        self.b = np.zeros(n_out)

    def get_params_iter(self):
        """Return an iterator over the parameters."""
        return itertools.chain(np.nditer(self.W, op_flags=['readwrite']),
                               np.nditer(self.b, op_flags=['readwrite']))

    def get_output(self, X):
        """Perform the forward step linear transformation."""
        return X.dot(self.W) + self.b

    def get_params_grad(self, X, output_grad):
        """Return a list of gradients over the parameters."""
        JW = X.T.dot(output_grad)
        Jb = np.sum(output_grad, axis=0)
        return [g for g in itertools.chain(np.nditer(JW), np.nditer(Jb))]

    def get_input_grad(self, Y, output_grad):
        """Return the gradient at the inputs of this layer."""
        return output_grad.dot(self.W.T)
    
class ReluLayer(Layer):
    """The relu layer applies the relu function to its inputs."""
    
    def get_output(self, X):
        """Perform the forward step transformation."""
        return relu(X)
    
    def get_input_grad(self, Y, output_grad):
        """Return the gradient at the inputs of this layer."""
        return np.multiply(relu_deriv(Y), output_grad)
    
class SoftmaxOutputLayer(Layer):
    """The softmax output layer computes the classification propabilities at the output."""
    
    def get_output(self, X):
        """Perform the forward step transformation."""
        return softmax(X)
    
    def get_input_grad(self, Y, T):
        """Return the gradient at the inputs of this layer."""
        return (Y - T) / Y.shape[0]
    
    def get_cost(self, Y, T):
        """Return the cost at the output of this output layer."""
        return -1.0*np.multiply(T, np.log(Y)).sum()/ Y.shape[0]

In [25]:
def forward_step(input_samples, layers):
    """
    Compute the forward activations of each layer.
    
    @param input_samples: a matrix of input samples (each row is an input vector)
    @param layers: a list of layers
    @returns a list of activations. activations[0] contains the input
    """
    activations = [input_samples]
    
    X = input_samples
    for layer in layers:
        Y = layer.get_output(X)
        activations.append(Y)
        X = activations[-1]
    return activations


def backward_step(activations, targets, layers):
    param_grads = collections.deque()
    output_grad = None
    
    for layer in reversed(layers):
        Y = activations.pop()  # Get the activations of the last layer on the stack
        # Compute the error at the output layer.
        # The output layer error is calculated different then hidden layer error.
        if output_grad is None:
            input_grad = layer.get_input_grad(Y, targets)
        else:  # output_grad is not None (layer is not output layer)
            input_grad = layer.get_input_grad(Y, output_grad)
        # Get the input of this layer (activations of the previous layer)
        X = activations[-1]
        # Compute the layer parameter gradients used to update the parameters
        grads = layer.get_params_grad(X, output_grad)
        param_grads.appendleft(grads)
        # Compute gradient at output of previous layer (input of current layer):
        output_grad = input_grad
    return list(param_grads)


def update_params(layers, param_grads, learning_rate):
    """
    Update the parameters of the given layers with the given gradients
    by gradient descent with the given learning rate.
    """
    for layer, layer_backprop_grads in zip(layers, param_grads):
        for param, grad in zip(layer.get_params_iter(), layer_backprop_grads):
            # The parameter returned by the iterator 
            # point to the memory space of the original layer,
            # so it can be modified inplace.
            param -= learning_rate * grad
            

In [26]:
def plot_costs(minibatch_costs, training_costs, validation_costs, nb_of_iterations, nb_of_batches):
    # Plot the minibatch, full training set, and validation costs
    minibatch_x_inds = np.linspace(0, nb_of_iterations, num=nb_of_iterations * nb_of_batches)
    iteration_x_inds = np.linspace(1, nb_of_iterations, num=nb_of_iterations)
    # Plot the cost over the iterations
    plt.plot(minibatch_x_inds, minibatch_costs, 'k-', linewidth=0.5, label='cost minibatches')
    plt.plot(iteration_x_inds, training_costs, 'r-', linewidth=2, label='cost full training set')
    plt.plot(iteration_x_inds, validation_costs, 'b-', linewidth=3, label='cost validation set')
    # Add labels to the plot
    plt.xlabel('iteration')
    plt.ylabel('$\\xi$', fontsize=15)
    plt.title('Decrease of cost over backprop iteration')
    plt.legend()
    x1, x2, y1, y2 = plt.axis()
    plt.axis((0, nb_of_iterations, 0, 2.5))
    plt.grid()
    plt.show()


def plot_accuracys(train_accuracys, validation_accuracys, nb_of_iterations):
    # Plot the minibatch, full training set, and validation costs
    iteration_x_inds = np.linspace(1, nb_of_iterations, num=nb_of_iterations)
    # Plot the cost over the iterations
    plt.plot(iteration_x_inds, train_accuracys, 'r-', linewidth=2, label='acc. full training set')
    plt.plot(iteration_x_inds, validation_accuracys, 'b-', linewidth=3, label='acc. validation set')
    # Add labels to the plot
    plt.xlabel('iteration')
    plt.ylabel('accuracy')
    plt.title('Increase of accuracy over backprop iteration')
    plt.legend(loc=4)
    x1, x2, y1, y2 = plt.axis()
    plt.axis((0, nb_of_iterations, 0, 1.0))
    plt.grid()
    plt.show()
    
def gradient_check(layers):
    # Perform gradient checking
    nb_samples_gradientcheck = 10 # Test the gradients on a subset of the data
    X_temp = X_train[0:nb_samples_gradientcheck,:]
    T_temp = T_train[0:nb_samples_gradientcheck,:]
    # Get the parameter gradients with backpropagation
    activations = forward_step(X_temp, layers)
    param_grads = backward_step(activations, T_temp, layers)

    # Set the small change to compute the numerical gradient
    eps = 0.0001
    # Compute the numerical gradients of the parameters in all layers.
    for idx in range(len(layers)):
        layer = layers[idx]
        layer_backprop_grads = param_grads[idx]
        # Compute the numerical gradient for each parameter in the layer
        for p_idx, param in enumerate(layer.get_params_iter()):
            grad_backprop = layer_backprop_grads[p_idx]
            # + eps
            param += eps
            plus_cost = layers[-1].get_cost(forward_step(X_temp, layers)[-1], T_temp)
            # - eps
            param -= 2 * eps
            min_cost = layers[-1].get_cost(forward_step(X_temp, layers)[-1], T_temp)
            # reset param value
            param += eps
            # calculate numerical gradient
            grad_num = (plus_cost - min_cost)/(2*eps)
            # Raise error if the numerical grade is not close to the backprop gradient
            if not np.isclose(grad_num, grad_backprop):
                raise ValueError('Numerical gradient of {:.6f} is not close to the backpropagation gradient of {:.6f}!'.format(float(grad_num), float(grad_backprop)))
    print('No gradient errors found')

In [None]:
X_train_all = readin("./data/x_train.csv")
Y_train_all = readin("./data/y_train.csv")
T_train_all = convert_to_onehot(Y_train_all, 4).T
X_test = readin("./data/x_test.csv")
Y_test = readin("./data/y_test.csv")
T_test = convert_to_onehot(Y_test, 4).T

np.random.shuffle(X_train_all)
np.random.shuffle(T_train_all)
X_train, X_validation = X_train_all[:10000,:], X_train_all[10000:,:]
T_train, T_validation = T_train_all[:10000,:], T_train_all[10000:,:]

In [32]:
X_train, X_test, X_validation, T_train, T_test, T_validation = load_data()

In [33]:
# Define a sample model to be trained on the data
hidden_neurons = 28  # Number of neurons in the first hidden-layer
# Create the model
layers = []  # Define a list of layers
# Add first hidden layer
layers.append(LinearLayer(X_train.shape[1], hidden_neurons))
layers.append(ReluLayer())
for i in range(5):
    # Add second hidden layer
    layers.append(LinearLayer(hidden_neurons, hidden_neurons))
    layers.append(ReluLayer())
# Add output layer
layers.append(LinearLayer(hidden_neurons, T_train.shape[1]))
layers.append(SoftmaxOutputLayer())

In [34]:
gradient_check(layers)

ValueError: Numerical gradient of -0.002699 is not close to the backpropagation gradient of -0.002622!

In [30]:
# Create the minibatches
batch_size = 50
nb_of_batches = X_train.shape[0] / batch_size  # Number of batches
# Create batches (X,Y) from the training set
XT_batches = zip(
    np.array_split(X_train, nb_of_batches, axis=0),  # X samples
    np.array_split(T_train, nb_of_batches, axis=0))  # Y targets

In [31]:
# Perform backpropagation
# initalize some lists to store the cost for future analysis        
minibatch_costs = []
training_costs = []
validation_costs = []
train_accuracys = []
validation_accuracys = []
max_nb_of_iterations = 50  # Train for a maximum of 300 iterations
learning_rate = 0.01  # Gradient descent learning rate

y_true = np.argmax(T_test, axis=1)  # Get the target outputs
x_train_true = np.argmax(T_train, axis=1)
x_val_true = np.argmax(T_validation, axis=1)
    
# Train for the maximum number of iterations
for iteration in range(max_nb_of_iterations):
    for X, T in XT_batches:  # For each minibatch sub-iteration
        activations = forward_step(X, layers)  # Get the activations
        minibatch_cost = layers[-1].get_cost(activations[-1], T)  # Get cost
        minibatch_costs.append(minibatch_cost)
        param_grads = backward_step(activations, T, layers)  # Get the gradients
        update_params(layers, param_grads, learning_rate)  # Update the parameters
    
    # Get full training cost for future analysis (plots)
    activations = forward_step(X_train, layers)
    train_cost = layers[-1].get_cost(activations[-1], T_train)
    training_costs.append(train_cost)
    y_pred = np.argmax(activations[-1], axis=1)  # Get the predictions made by the network
    train_accuracy = metrics.accuracy_score(x_train_true, y_pred)  # Test set accuracy
    train_accuracys.append(train_accuracy)
        
    # Get full validation cost
    activations = forward_step(X_validation, layers)
    validation_cost = layers[-1].get_cost(activations[-1], T_validation)
    validation_costs.append(validation_cost)
    y_pred = np.argmax(activations[-1], axis=1)  # Get the predictions made by the network
    validation_accuracy = metrics.accuracy_score(x_val_true, y_pred)  # Test set accuracy
    validation_accuracys.append(validation_accuracy)
    print('iter {}: train loss {:.4f} acc {:.4f}, val loss {:.4f} acc {:.4f}'.format(iteration + 1, train_cost, train_accuracy, validation_cost, validation_accuracy))

#     if len(validation_costs) > 5:
#         # Stop training if the cost on the validation set doesn't decrease
#         #  for 3 iterations
#         if validation_costs[-1] >= validation_costs[-2] >= validation_costs[-3] >= \
#             validation_costs[-4] >= validation_costs[-5]:
#             break
    
nb_of_iterations = iteration + 1

iter 1: train loss 1.3757 acc 0.2726, val loss 1.3772 acc 0.2761
iter 2: train loss 1.3757 acc 0.2726, val loss 1.3772 acc 0.2761
iter 3: train loss 1.3757 acc 0.2726, val loss 1.3772 acc 0.2761
iter 4: train loss 1.3757 acc 0.2726, val loss 1.3772 acc 0.2761
iter 5: train loss 1.3757 acc 0.2726, val loss 1.3772 acc 0.2761
iter 6: train loss 1.3757 acc 0.2726, val loss 1.3772 acc 0.2761
iter 7: train loss 1.3757 acc 0.2726, val loss 1.3772 acc 0.2761
iter 8: train loss 1.3757 acc 0.2726, val loss 1.3772 acc 0.2761
iter 9: train loss 1.3757 acc 0.2726, val loss 1.3772 acc 0.2761
iter 10: train loss 1.3757 acc 0.2726, val loss 1.3772 acc 0.2761
iter 11: train loss 1.3757 acc 0.2726, val loss 1.3772 acc 0.2761
iter 12: train loss 1.3757 acc 0.2726, val loss 1.3772 acc 0.2761
iter 13: train loss 1.3757 acc 0.2726, val loss 1.3772 acc 0.2761
iter 14: train loss 1.3757 acc 0.2726, val loss 1.3772 acc 0.2761
iter 15: train loss 1.3757 acc 0.2726, val loss 1.3772 acc 0.2761
iter 16: train loss

In [None]:
plot_costs(minibatch_costs, training_costs, validation_costs, nb_of_iterations, nb_of_batches)
plot_accuracys(train_accuracys, validation_accuracys, nb_of_iterations)