# Multi-linear classifier

We will train and test a two layer network with multiple outputs to classify images from the CIFAR-10 dataset. We will train the network using mini-batch gradient descent applied to a cost function that computes the cross-entropy loss of the classifier applied to the labelled training data and an L2 regularization term on the weight matrix.

In [None]:
# import functions to load batch, softmax function, compute gradient, display image for each label
# and transfer model to matlab
import functions as functions

import tensorflow.keras.utils as np_utils
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Locations of the CIFAR-10 batch files used in the first experiments:
# batch 1 is loaded as the training set, batch 2 as the validation set.
data_batch_1, data_batch_2 = (
    "cifar-10-batches-py/data_batch_1",
    "cifar-10-batches-py/data_batch_2",
)

In [None]:
# Parameter of the network: number of hidden nodes.
# W1 is created with shape (m, data_dimension) and W2 with shape (labels_count, m).
m = 50

# Training the multi-linear classifier

## Read in the data & initialize the parameters of the network

We start by extracting the data:

In [None]:
# Reads one CIFAR-10 batch file and splits it into image data and labels.
def LoadBatch(filename):
    """Load a CIFAR-10 batch and return ``(images, onehot, labels)``.

    - images: the batch's ``b'data'`` entry, transposed
      (callers in this file treat it as (data_dimension, data_count));
    - onehot: the one-hot encoding of the labels, transposed
      (callers treat it as (labels_count, data_count));
    - labels: the batch's ``b'labels'`` entry, returned unchanged.
    """
    batch = functions.load_batch(filename)
    images = batch[b'data']
    raw_labels = batch[b'labels']

    # one column per sample once transposed
    onehot = np_utils.to_categorical(raw_labels).T

    return images.T, onehot, raw_labels

In [None]:
# Training split (batch 1). Layout after LoadBatch:
#   data          -> (data_dimension, data_count)
#   onehot_labels -> (labels_count, data_count)
data, onehot_labels, labels = LoadBatch(data_batch_1)

# Validation split (batch 2), same layout.
X_validation, Y_validation, labels_validation = LoadBatch(data_batch_2)

# Input dimensionality and number of classes are the row counts of these matrices.
data_dimension = data.shape[0]
labels_count = onehot_labels.shape[0]

Now we preprocess the raw data by normalizing it (we assume the noise is Gaussian and that the data is normally distributed):

In [None]:
def normalize_data(X, mean=None, std=None):
    """Standardize each feature (row) of X to zero mean and unit variance.

    Parameters
    ----------
    X : array of shape (features, samples)
        Data to normalize; every row is one feature, every column one sample.
    mean, std : array-like of length ``features``, optional
        Per-feature statistics to use. When omitted they are computed from X
        itself (the original behaviour). Passing the statistics of the
        training set lets validation/test data go through exactly the same
        transformation as the training data.

    Returns
    -------
    ndarray of the same shape as X.

    Features whose standard deviation is zero are only centred (divided by 1)
    instead of producing NaN/inf through a division by zero.
    """
    X = np.asarray(X)

    if mean is None:
        mean = np.mean(X, axis=1, keepdims=True)
    else:
        # accept both flat vectors and column vectors
        mean = np.asarray(mean, dtype=float).reshape(-1, 1)

    if std is None:
        std = np.std(X, axis=1, keepdims=True)
    else:
        std = np.asarray(std, dtype=float).reshape(-1, 1)

    # constant features: avoid dividing by zero
    std = np.where(std == 0, 1.0, std)

    return (X - mean) / std

In [None]:
# Then normalize the data.
# The per-feature mean and standard deviation are taken from the raw TRAINING
# data and applied to both splits, so that training and validation inputs go
# through the same transformation.
train_mean = np.mean(data, axis=1)[:, np.newaxis]
train_std = np.std(data, axis=1)[:, np.newaxis]

data = normalize_data(data)
X_validation = (X_validation - train_mean) / train_std

In [None]:
# Sanity check of the normalization: overall mean and standard deviation
# of the training data ...
print("data mean:", np.mean(data))
print("data std:", np.std(data))

# ... and of the validation data (previously these lines printed the
# training statistics a second time).
print("validation mean:", np.mean(X_validation))
print("validation std:", np.std(X_validation))

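The network computes
$$s_1 = W_1 x + b_1$$
$$h = \max(0, s_1)$$
$$s = W_2 h + b_2$$
$$p = \mathrm{softmax}(s)$$
We initialize the entries of $W_1$ and $W_2$ with Gaussian random values with zero mean and standard deviation $1/\sqrt{d}$ and $1/\sqrt{m}$ respectively (where $d$ is the input dimension and $m$ the number of hidden nodes), and we set the biases $b_1$ and $b_2$ to zero: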

In [None]:
# Shapes of the parameters of the two-layer network
W1_shape = (m, data_dimension)
b1_shape = (m, 1)
W2_shape = (labels_count, m)
b2_shape = (labels_count, 1)

# Weights: Gaussian random values with zero mean and a standard deviation of
# 1/sqrt(number of inputs of the layer)
W1 = np.random.normal(loc=0.0, scale=1/np.sqrt(data_dimension), size=W1_shape)
W2 = np.random.normal(loc=0.0, scale=1/np.sqrt(m), size=W2_shape)

# Biases: zero column vectors of the declared shapes (instead of the scalar 0,
# which only worked through broadcasting and left b1_shape/b2_shape unused).
# Two separate arrays are created so that b1 and b2 never alias each other.
b1 = np.zeros(b1_shape)
b2 = np.zeros(b2_shape)

## Compute the gradients for the network parameters

We write the evaluation and cost function:

In [None]:
# Forward pass of the two-layer network.
# Returns the hidden activations and a (label_count, data_count) probability matrix.
def evaluate_classifier(data, W1, b1, W2, b2):
    """Compute ``h = max(0, W1 @ data + b1)`` and ``p = softmax(W2 @ h + b2)``.

    Returns the pair ``(h, p)``; ``h`` is needed later by the backward pass.
    """
    s1 = W1 @ data + b1
    hidden = np.maximum(0, s1)   # ReLU
    scores = W2 @ hidden + b2
    return hidden, functions.softmax(scores)

# returns the cost function that we need to minimize
def compute_cost(data, onehot_labels, W1, b1, W2, b2, lbd):
    """Return the regularized cross-entropy cost of the network on a data set.

    cost = (1/n) * sum_d  -y_d^T log(p_d)  +  lbd * (||W1||^2 + ||W2||^2)

    Parameters
    ----------
    data : (data_dimension, n) array, one sample per column.
    onehot_labels : (label_count, n) array of target distributions (one-hot).
    W1, b1, W2, b2 : network parameters, passed to ``evaluate_classifier``.
    lbd : weight of the L2 regularization term.
    """
    # p is (label_count, data_count)
    _, p = evaluate_classifier(data, W1, b1, W2, b2)

    # Cross-entropy summed over all samples in one vectorized step.
    # Only the entries with a non-zero target weight are used: multiplying a
    # zero target by log(0) = -inf would otherwise turn the whole cost into NaN
    # as soon as one (irrelevant) probability underflows to 0.
    targets = np.asarray(onehot_labels)
    active = targets != 0
    cross_entropy = -np.sum(targets[active] * np.log(np.asarray(p)[active]))

    # we divide by the number of samples
    cost = cross_entropy / len(data[0])

    # we add the regularization term
    cost += lbd * (np.sum(W1**2) + np.sum(W2**2))

    return cost

In [None]:
# Unregularized cost of the randomly initialized network on the training data.
compute_cost(data, onehot_labels, W1, b1, W2, b2, lbd=0)

We write a function that computes the accuracy of the network:

In [None]:
def compute_accuracy(data, labels, W1, b1, W2, b2):
    """Return the fraction of samples whose predicted label equals ``labels``.

    The predicted label of a sample is the row index of the largest
    probability in its column of the network output.
    """
    _, probabilities = evaluate_classifier(data, W1, b1, W2, b2)

    # most probable class for every column (sample)
    predictions = np.argmax(probabilities, axis=0)
    n_correct = np.sum(labels == predictions)

    return n_correct / len(labels)

We compute the accuracy for the randomly initialized parameters. We should get an accuracy of approximately 10%, since the predictions are random and there are 10 labels:

In [None]:
# Training accuracy of the untrained (randomly initialized) network.
compute_accuracy(data, labels, W1=W1, b1=b1, W2=W2, b2=b2)

We write the function that evaluates for a mini-batch the gradient of the cost function.

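The mini-batch gradient descent update is defined as follows (and similarly for $\textbf{b}$, without the regularization term):

$$\textbf{W}^{t+1} = \textbf{W}^t - \eta \left( \frac{1}{|B^t|} \sum_{(\textbf{x},\textbf{y}) \in B^t} \nabla_{\textbf{W}}\, l_{cross}(\textbf{x},\textbf{y},\textbf{W}^t,\textbf{b}^t) + 2\lambda \textbf{W}^t \right)$$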

In [None]:
# Shapes, as used by the matrix products below:
#   data             (data_dimension, data_size)
#   onehot_labels    (label_count, data_size)
#   predicted_labels (label_count, data_size)
#   H                (m, data_size)
#   W1               (m, data_dimension)
#   W2               (label_count, m)
def compute_gradient(data, onehot_labels, predicted_labels, W1, W2, H, lbd):
    """Backward pass: gradients of the regularized cost on one mini-batch.

    ``predicted_labels`` and ``H`` are the probabilities and hidden
    activations produced by the forward pass on ``data``.

    Returns ``(grad_W1, grad_b1, grad_W2, grad_b2)``; the bias gradients are
    column vectors.
    """
    n_samples = len(data[0])

    # Output layer: derivative of the cross-entropy w.r.t. the scores
    delta_out = predicted_labels - onehot_labels
    grad_W2 = (delta_out @ H.T) / n_samples + 2 * lbd * W2
    grad_b2 = np.mean(delta_out, axis=1, keepdims=True)

    # Hidden layer: propagate through W2, then through the ReLU
    # (only units that were active let the gradient pass)
    delta_hidden = (W2.T @ delta_out) * (H > 0)
    grad_W1 = (delta_hidden @ data.T) / n_samples + 2 * lbd * W1
    grad_b1 = np.mean(delta_hidden, axis=1, keepdims=True)

    return grad_W1, grad_b1, grad_W2, grad_b2

## Train the network with cyclical learning rates

Now that we have made sure the gradient computation is correct, we implement the mini-batch gradient descent algorithm:

In [None]:
# gd_params = (s_batch, eta_min, eta_max, step_size, n_epochs)
def mini_batch_gd(data, onehot_labels, gd_params, W1, b1, W2, b2, lbd, X_validation, Y_validation):
    """Train the network with mini-batch gradient descent and a cyclical learning rate.

    Parameters
    ----------
    data, onehot_labels : training data and one-hot targets, one sample per column.
    gd_params : (s_batch, eta_min, eta_max, step_size, n_epochs)
        mini-batch size, bounds of the learning rate, number of update steps
        in half a learning-rate cycle, and number of passes over the data.
    W1, b1, W2, b2 : initial parameters (not modified in place).
    lbd : weight of the L2 regularization term.
    X_validation, Y_validation : validation data and one-hot targets.

    The learning rate grows linearly from eta_min to eta_max during
    ``step_size`` update steps, then decreases linearly back to eta_min during
    the next ``step_size`` steps, and the cycle repeats.

    Every 400 update steps the cost is recorded on the current mini-batch
    ("train") and on the whole validation set; the curves are plotted at the
    end. The number of completed cycles is printed.

    Returns the trained ``(W1, b1, W2, b2)``.
    """
    # define the parameters
    s_batch, eta_min, eta_max, step_size, n_epochs = gd_params[:5]

    # number of update steps between two recorded cost values
    log_every = 400

    # cost values recorded during training
    cost = []
    cost_vald = []
    # NOTE: per-epoch accuracy tracking (see plot_accuracy) is currently disabled.

    W1_star, b1_star, W2_star, b2_star = W1, b1, W2, b2

    # cyclical learning rate state
    step_t = 0   # update steps performed so far
    cycle = 0    # completed learning-rate cycles

    # construct the mini batches (the same batches are reused at every epoch)
    mini_batches = construct_mini_batches(s_batch, data, onehot_labels)

    for _ in range(n_epochs):
        for mini_batch_X, mini_batch_y in mini_batches:
            # compute the predictions for the mini_batch
            h, p = evaluate_classifier(mini_batch_X, W1_star, b1_star, W2_star, b2_star)

            # compute the learning rate: rising half of the cycle, then falling half
            cycle_start = 2*cycle*step_size
            cycle_peak = (2*cycle + 1)*step_size
            if cycle_start <= step_t <= cycle_peak:
                eta = eta_min + (eta_max - eta_min) * (step_t - cycle_start) / step_size
            else:
                eta = eta_max - (eta_max - eta_min) * (step_t - cycle_peak) / step_size

            step_t += 1
            if step_t % (step_size*2) == 0:
                cycle += 1

            # compute the new gradients and take one descent step
            grad_W1, grad_b1, grad_W2, grad_b2 = compute_gradient(mini_batch_X, mini_batch_y, p, W1_star, W2_star, h, lbd)
            W1_star = W1_star - eta * grad_W1
            b1_star = b1_star - eta * grad_b1
            W2_star = W2_star - eta * grad_W2
            b2_star = b2_star - eta * grad_b2

            if step_t % log_every == 0:
                # record the cost on the current mini-batch and on the validation set
                cost.append(compute_cost(mini_batch_X, mini_batch_y, W1_star, b1_star, W2_star, b2_star, lbd))
                cost_vald.append(compute_cost(X_validation, Y_validation, W1_star, b1_star, W2_star, b2_star, lbd))

    # number of completed learning-rate cycles
    print(cycle)
    plot_loss_cost(cost, cost_vald)
    return W1_star, b1_star, W2_star, b2_star
    
# returns a list of (x_batch, y_batch) tuples
def construct_mini_batches(s_batch, data, onehot_labels):
    """Cut the data set into consecutive mini-batches of ``s_batch`` columns.

    ``data`` and ``onehot_labels`` are sliced along their second axis with the
    same column ranges; the last batch is shorter when the number of columns
    is not a multiple of ``s_batch``.
    """
    n_batches = int(np.ceil(len(data[0]) / s_batch))

    # column range [k*s_batch, (k+1)*s_batch) for the k-th batch
    bounds = [(k * s_batch, (k + 1) * s_batch) for k in range(n_batches)]

    return [
        (data[:, start:stop], onehot_labels[:, start:stop])
        for start, stop in bounds
    ]

# plot the recorded training and validation cost values
def plot_loss_cost(cost, cost_vald, step_size=400):
    """Plot the training and validation cost curves against the update step.

    Parameters
    ----------
    cost, cost_vald : sequences of equal length with the recorded cost values.
    step_size : number of update steps between two recorded values.
        The default of 400 matches mini_batch_gd in this file, which records a
        value each time the step counter reaches a multiple of 400 (so the
        first point belongs to step 400, not step 0).
    """
    # update step at which each value was recorded
    x_axis = [(i + 1) * step_size for i in range(len(cost))]
    plt.plot(x_axis, cost, label='Train cost')
    plt.plot(x_axis, cost_vald, label='Validation cost')
    plt.xlabel('update step')
    plt.ylabel('cost')
    plt.legend()
    plt.show()
    
# plot the training and validation accuracy curves
def plot_accuracy(accuracy, accuracy_vald):
    """Plot two accuracy curves of equal length on a shared x-axis.

    Consecutive points are placed 100 units apart on the x-axis, starting at 0.
    """
    spacing = 100
    positions = list(range(0, spacing * len(accuracy), spacing))

    for values, name in ((accuracy, 'Train'), (accuracy_vald, 'Validation')):
        plt.plot(positions, values, label=name)

    plt.xlabel('step_size')
    plt.ylabel('accuracy')
    plt.legend()
    plt.show()

We test the code:

In [None]:
# First run: batch size 100, learning rate cycling between 1e-5 and 1e-1,
# half-cycle of 500 update steps, 10 epochs, with regularization 0.01.
# gd_params = (s_batch, eta_min, eta_max, step_size, n_epochs)
gd_params = (100, 1e-5, 1e-1, 500, 10)
lbd = 0.01
W1_star, b1_star, W2_star, b2_star = mini_batch_gd(
    data, onehot_labels, gd_params,
    W1, b1, W2, b2,
    lbd=lbd, X_validation=X_validation, Y_validation=Y_validation,
)

# accuracy of the trained network on the validation set
compute_accuracy(X_validation, labels_validation, W1=W1_star, b1=b1_star, W2=W2_star, b2=b2_star)

## Train the network

We run it for more than 1 cycle (say 3), and for a larger step size $n_s = 800$.

In [None]:
# Longer run: half-cycle of 800 update steps, 48 epochs, no regularization.
# gd_params = (s_batch, eta_min, eta_max, step_size, n_epochs)
gd_params = (100, 1e-5, 1e-1, 800, 48)
lbd = 0
W1_star, b1_star, W2_star, b2_star = mini_batch_gd(
    data, onehot_labels, gd_params,
    W1, b1, W2, b2,
    lbd=lbd, X_validation=X_validation, Y_validation=Y_validation,
)

# accuracy of the trained network on the validation set
compute_accuracy(X_validation, labels_validation, W1=W1_star, b1=b1_star, W2=W2_star, b2=b2_star)

Load all the data for the coarse search of lambda

In [None]:
# Locations of all the CIFAR-10 batch files
data_batch_1 = "cifar-10-batches-py/data_batch_1"
data_batch_2 = "cifar-10-batches-py/data_batch_2"
data_batch_3 = "cifar-10-batches-py/data_batch_3"
data_batch_4 = "cifar-10-batches-py/data_batch_4"
data_batch_5 = "cifar-10-batches-py/data_batch_5"
test_batch = "cifar-10-batches-py/test_batch"

# Batches 1 to 4 are used entirely for training
data_1, onehot_labels_1, labels_1 = LoadBatch(data_batch_1)
data_2, onehot_labels_2, labels_2 = LoadBatch(data_batch_2)
data_3, onehot_labels_3, labels_3 = LoadBatch(data_batch_3)
data_4, onehot_labels_4, labels_4 = LoadBatch(data_batch_4)

# Batch 5 is split below; the test batch is kept apart for the final evaluation
t_data_5, t_onehot_labels_5, t_labels_5 = LoadBatch(data_batch_5)
data_test, onehot_labels_test, labels_test = LoadBatch(test_batch)

# first 9000 samples of batch 5 -> training
data_5 = t_data_5[:, :9000]
onehot_labels_5 = t_onehot_labels_5[:, :9000]
labels_5 = t_labels_5[:9000]

# samples 9000..9999 of batch 5 -> validation
data_validation = t_data_5[:, 9000:10000]
onehot_labels_validation = t_onehot_labels_5[:, 9000:10000]
labels_validation = t_labels_5[9000:10000]

In [None]:
# merge all the training data (samples are stacked along the column axis)
data = np.hstack((data_1, data_2, data_3, data_4, data_5))
onehot_labels = np.hstack((onehot_labels_1, onehot_labels_2, onehot_labels_3, onehot_labels_4, onehot_labels_5))

# Normalize.
# The per-feature mean and standard deviation are computed on the raw TRAINING
# data and applied to every split, so that the validation and test inputs go
# through exactly the same transformation as the training inputs.
train_mean = np.mean(data, axis=1)[:, np.newaxis]
train_std = np.std(data, axis=1)[:, np.newaxis]

data = normalize_data(data)
data_validation = (data_validation - train_mean) / train_std
data_test = (data_test - train_mean) / train_std

Compute random values of lambda:

In [None]:
# Random search for the regularization weight lambda, in two stages.
# Every run starts again from the same initial parameters W1, b1, W2, b2.

# Stage 1 (coarse): lambda = 10**e with e drawn uniformly between l_min and l_max
l_min, l_max = -5, -1

# gd_params = (s_batch, eta_min, eta_max, step_size, n_epochs)
gd_params = (100, 1e-5, 1e-1, 980, 8)
best_accuracy = 0

for trial in range(8):
    exponent = l_min + (l_max - l_min)*np.random.rand(1, 1)
    lbd = (10 ** exponent)[0][0]

    W1_star, b1_star, W2_star, b2_star = mini_batch_gd(
        data, onehot_labels, gd_params, W1, b1, W2, b2, lbd,
        data_validation, onehot_labels_validation)
    accuracy = compute_accuracy(data_validation, labels_validation,
                                W1_star, b1_star, W2_star, b2_star)

    # keep the lambda with the best validation accuracy so far
    if accuracy > best_accuracy:
        best_accuracy, best_lbd = accuracy, lbd

# Stage 2 (fine): lambda drawn from a Gaussian centred on the current best
# value, with a standard deviation of 1% of it; trained for twice as many epochs
# gd_params = (s_batch, eta_min, eta_max, step_size, n_epochs)
gd_params = (100, 1e-5, 1e-1, 980, 16)

for trial in range(8):
    lbd = np.random.normal(best_lbd, best_lbd*0.01)

    W1_star, b1_star, W2_star, b2_star = mini_batch_gd(
        data, onehot_labels, gd_params, W1, b1, W2, b2, lbd,
        data_validation, onehot_labels_validation)
    accuracy = compute_accuracy(data_validation, labels_validation,
                                W1_star, b1_star, W2_star, b2_star)

    # keep the lambda with the best validation accuracy so far
    if accuracy > best_accuracy:
        best_accuracy, best_lbd = accuracy, lbd

print("best accuracy:", best_accuracy)
print("best lambda:", best_lbd)

In [None]:
# Final training with the best lambda found by the search, followed by the
# evaluation on the test set.
# gd_params = (s_batch, eta_min, eta_max, step_size, n_epochs)
gd_params = (100, 1e-5, 1e-1, 980, 12)
lbd = best_lbd
W1_star, b1_star, W2_star, b2_star = mini_batch_gd(
    data, onehot_labels, gd_params,
    W1, b1, W2, b2,
    lbd=lbd, X_validation=data_validation, Y_validation=onehot_labels_validation,
)

# accuracy of the final network on the test set
compute_accuracy(data_test, labels_test, W1=W1_star, b1=b1_star, W2=W2_star, b2=b2_star)