In [None]:
# Design a simple 2 layer NN to train CIFAR10 dataset
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline



In [None]:
# List the contents of the dataset directory when it lives outside the current folder.
# For simplicity, download the dataset to the current folder and skip this cell.
import os

dataset_dir = 'Path to your dataset'
# Guard against the placeholder/missing path so that "Run All" does not stop here
if os.path.isdir(dataset_dir):
    with os.scandir(dataset_dir) as entries:
        for entry in entries:
            print(entry.name)
else:
    print(f'Dataset directory not found: {dataset_dir!r}')


In [None]:
# List the contents of the directory that holds the data loader module.
# 'os' is imported here so that this cell also works on its own.
import os

loader_dir = 'Path to dataloader function'
# Guard against the placeholder/missing path so that "Run All" does not stop here
if os.path.isdir(loader_dir):
    with os.scandir(loader_dir) as entries:
        for entry in entries:
            print(entry.name)
else:
    print(f'Loader directory not found: {loader_dir!r}')

In [None]:
# Import the 'data_utils' loader module into this notebook when it lives in another directory.
# For simplicity, download the loader module to the current folder together with the dataset and skip this cell
import importlib
import sys

my_path_dir = 'Path to dataloader function'
# Temporarily put the loader directory first on the module search path
sys.path.insert(0, my_path_dir)
try:
    mod = importlib.import_module('data_utils') # this is available in cs231 stanford assignment
finally:
    # Always undo the temporary search-path entry, even when the import fails.
    # Removing by value stays correct if the imported module edited sys.path itself.
    if my_path_dir in sys.path:
        sys.path.remove(my_path_dir)

In [None]:
# Sanity check: display the documentation of load_CIFAR10 from the loader module bound to `mod`.
# Evaluating `mod.load_CIFAR10` raises AttributeError if the module does not define that name.
help(mod.load_CIFAR10)

In [None]:
# Read the CIFAR-10 arrays through the loader module bound to `mod`
# (the original note says to run the next cell instead if this does not work)
cifar10_dir = ' path to the data '
X_training, y_training, X_test, y_test = mod.load_CIFAR10(cifar10_dir)
# Report the shape of every returned array on a single line
print(*(loaded.shape for loaded in (X_training, y_training, X_test, y_test)))

In [None]:
# Shuffle the sample indices and hold out a validation set from the training data.
# Sizes are derived from the loaded array instead of being hard-coded, so the split
# stays correct if the dataset does not contain exactly 50000 training samples.
NUM_VALIDATION = 2000  # number of training samples reserved for validation
num_samples = X_training.shape[0]
assert num_samples > NUM_VALIDATION, 'not enough samples to hold out a validation set'
num_train_samples = num_samples - NUM_VALIDATION
rand_indices = np.random.permutation(num_samples)

# Split the shuffled indices: the first part trains, the remainder validates
training_indices = rand_indices[:num_train_samples]
validation_indices = rand_indices[num_train_samples:]

# Re-assign the training and validation sets (fancy indexing returns copies)
X_train = X_training[training_indices]
X_val = X_training[validation_indices]
y_train = y_training[training_indices]
y_val = y_training[validation_indices]
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)



In [None]:
# Flatten each instance into one long row (32 x 32 x 3 = 3072 values for a CIFAR-10 image).
# The row count is taken from each array itself: a hard-coded count would either fail or,
# when the total size happens to be divisible, silently produce the wrong number of rows.
X_train = X_train.reshape(X_train.shape[0], -1)
X_val = X_val.reshape(X_val.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)
print(X_train.shape, X_val.shape, X_test.shape)

In [None]:
# Zero-center the data: the per-feature mean is computed on the training split only
# and then subtracted in place from the training, validation and test splits
mean_image = X_train.mean(axis=0)
np.subtract(X_train, mean_image, out=X_train)
np.subtract(X_val, mean_image, out=X_val)
np.subtract(X_test, mean_image, out=X_test)

In [None]:
# Summarize the shapes of all data and label arrays, one line per array
print(
    *(
        f'{caption} {array.shape}'
        for caption, array in (
            ('Train data shape: ', X_train),
            ('Train labels shape: ', y_train),
            ('Validation data shape: ', X_val),
            ('Validation labels shape: ', y_val),
            ('Test data shape: ', X_test),
            ('Test labels shape: ', y_test),
        )
    ),
    sep='\n',
)

In [None]:
# Create a simple 2 layer neural network to train the cifar 10 data
from sklearn.metrics import log_loss

class Two_Layer_Neural_Network():

    '''Simple two-layer fully connected classifier (written for flattened CIFAR-10 images).

    Architecture: input -> affine -> leaky ReLU -> inverted dropout -> affine ->
    inverted dropout -> softmax. The loss is the average negative log-likelihood of the
    correct class. No L2 regularization term is used because dropout is the regularizer.
    Parameters are optimized with mini-batch Adam (with bias correction).

    Attributes:
        params: dict with weights 'w1' (D, H), 'w2' (H, C) and biases 'b1' (H,), 'b2' (C,).
        grads: dict with the gradient of the last backward pass, same keys/shapes as params.
        loss_history, train_accuracy_history, val_accuracy_history: values recorded
            every 50 training iterations.

    Weights are kept in memory only; persist them with e.g. np.savetxt(name, params[key])
    once the hyper-parameters are tuned.
    '''

    def __init__(self, training_data, training_labels, val_data, val_labels, test_data, test_labels,
                 batch_size = 400 , std = 1e-3):
        '''Store the data splits and initialize parameters.

        Args:
            training_data, val_data, test_data: 2-D arrays of shape (num_samples, num_features).
            training_labels, val_labels, test_labels: 1-D integer class labels in [0, 10).
            batch_size: number of samples drawn per training iteration.
            std: scale of the Gaussian weight initialization (tune between 1e-1 and 1e-5).
        '''
        self.training_data = training_data
        self.training_labels = training_labels
        self.val_data = val_data
        self.val_labels = val_labels
        self.test_data = test_data
        self.test_labels = test_labels
        self.batch_size = batch_size
        # layer sizes: the input size is the number of features per instance
        self.input_size = training_data.shape[1]
        # there are 10 classes in the labels
        self.output_size = 10
        # the optimum hidden size should be chosen with a grid or random search between 50 - 2000
        self.hidden_size = 200
        # slope of the leaky ReLU for negative inputs
        self.alpha_for_leaky_relu = 1e-6
        # probability of keeping a unit in the inverted dropout layers
        self.keep_prob = 0.5
        # weights are small Gaussian values; biases are one value per unit (NOT per sample),
        # so they broadcast over batches of any size
        self.params = {}
        self.params['w1'] = std * np.random.randn(self.input_size, self.hidden_size)
        self.params['w2'] = std * np.random.randn(self.hidden_size, self.output_size)
        self.params['b1'] = np.zeros(self.hidden_size)
        self.params['b2'] = np.zeros(self.output_size)
        # gradients of the most recent backward pass
        self.grads = {key: np.zeros_like(value) for key, value in self.params.items()}
        # keep track of loss and accuracies while training the network
        self.loss_history = []
        self.train_accuracy_history = []
        self.val_accuracy_history = []


    def _hidden_activation(self, input_data):
        '''Return (pre-activation, leaky-ReLU activation) of the hidden layer; caches nothing.'''
        pre_activation = np.dot(input_data, self.params['w1']) + self.params['b1']
        # for 0 < alpha < 1 the leaky ReLU equals max(alpha * x, x)
        return pre_activation, np.maximum(self.alpha_for_leaky_relu * pre_activation, pre_activation)


    def forward_prop(self, input_data, training = True):
        '''Run the forward pass and return the softmax probabilities, shape (N, 10).

        Intermediate values (input, pre-activation, dropout masks, hidden layer, probs)
        are cached on the instance for loss() and backward_prop().

        Args:
            input_data: 2-D array of shape (N, input_size).
            training: when True (default) inverted dropout is applied; when False the
                masks are all ones, which makes the pass deterministic.
        '''
        self.input_data = input_data
        # first layer: affine + leaky ReLU
        self.pre_activation, activated = self._hidden_activation(input_data)
        # inverted dropout: keep each unit with probability keep_prob and rescale the kept
        # units by 1 / keep_prob. The mask must come from a UNIFORM distribution (rand);
        # a normal one (randn) would keep ~69% of the units for a 0.5 threshold.
        if training:
            self.drop_out_mask = (np.random.rand(*activated.shape) < self.keep_prob) / self.keep_prob
        else:
            self.drop_out_mask = np.ones_like(activated)
        self.hidden_layer = activated * self.drop_out_mask
        # second layer: affine
        fw2 = np.dot(self.hidden_layer, self.params['w2']) + self.params['b2']
        # NOTE(review): dropout on the output scores is unusual; kept because it is part of
        # the original design. The mask is cached so the gradient accounts for it.
        if training:
            self.drop_out_mask2 = (np.random.rand(*fw2.shape) < self.keep_prob) / self.keep_prob
        else:
            self.drop_out_mask2 = np.ones_like(fw2)
        scores = fw2 * self.drop_out_mask2
        # numerically stable softmax: shift every row by its maximum before exponentiating
        shifted_scores = scores - np.max(scores, axis = 1, keepdims = True)
        exp_scores = np.exp(shifted_scores)
        self.probs = exp_scores / np.sum(exp_scores, axis = 1, keepdims = True)
        return self.probs


    def loss(self, input_labels):
        '''Return the average negative log-likelihood of `input_labels` under self.probs.

        Must be called after forward_prop() on the batch that `input_labels` belongs to.
        '''
        num_instances = len(input_labels)
        # probability assigned to the correct class of every instance
        corr_probs = self.probs[np.arange(num_instances), input_labels]
        # clip to avoid log(0) = -inf when a probability underflows
        data_loss = -np.sum(np.log(np.clip(corr_probs, 1e-12, None))) / num_instances
        # since dropout is used for regularization, no regularization loss is added
        return data_loss


    def backward_prop(self, input_labels):
        '''Backpropagate the loss of the last forward pass and return the gradient dict.

        Gradients are overwritten (not accumulated) on every call and have the same
        shapes as the corresponding entries of self.params.
        '''
        num_instances = len(input_labels)
        # gradient of softmax + negative log-likelihood w.r.t. the scores: probs - one_hot
        dscores = np.copy(self.probs)
        dscores[np.arange(num_instances), input_labels] -= 1
        # the loss is an average over the batch
        dscores /= num_instances
        # undo the output dropout
        dfw2 = dscores * self.drop_out_mask2
        # second affine layer; the bias gradient is summed over the batch
        self.grads['w2'] = np.dot(self.hidden_layer.T, dfw2)
        self.grads['b2'] = np.sum(dfw2, axis = 0)
        # back through the hidden dropout
        dhidden = np.dot(dfw2, self.params['w2'].T) * self.drop_out_mask
        # back through the leaky ReLU: SCALE the gradient by 1 (x > 0) or alpha (x <= 0)
        dhidden = dhidden * np.where(self.pre_activation > 0, 1.0, self.alpha_for_leaky_relu)
        # first affine layer
        self.grads['w1'] = np.dot(self.input_data.T, dhidden)
        self.grads['b1'] = np.sum(dhidden, axis = 0)
        # since we used dropout we are not adding a regularization gradient contribution
        return self.grads


    def train(self, learning_rate = 1e-3, learning_rate_decay = 0.99, training_iter = 1000):
        '''Train with mini-batch Adam.

        Every 50 iterations the batch loss, the accuracy on the current training batch and
        the accuracy on a random validation batch are appended to the history lists.

        Args:
            learning_rate: initial Adam step size.
            learning_rate_decay: factor applied to the learning rate after every iteration.
            training_iter: number of mini-batch updates.
        '''
        eps = 1e-8
        beta1 = 0.9
        beta2 = 0.999
        # Adam moment estimates must persist ACROSS iterations, so they are created once
        ms = {key: np.zeros_like(value) for key, value in self.params.items()}
        vs = {key: np.zeros_like(value) for key, value in self.params.items()}
        num_training = len(self.training_data)
        num_validation = len(self.val_data)

        for i in range(training_iter):
            # draw a random mini-batch without replacement
            batch_indices = np.random.permutation(num_training)[:self.batch_size]
            training_batch_data = self.training_data[batch_indices]
            training_batch_labels = self.training_labels[batch_indices]
            self.last_training_batch_data = training_batch_data
            # forward pass (with dropout) and loss on the training batch
            self.forward_prop(training_batch_data)
            total_data_loss = self.loss(training_batch_labels)

            if i % 50 == 0:
                # predict() does not touch the values cached by forward_prop(), so it is
                # safe to call it between the forward and the backward pass
                self.loss_history.append(total_data_loss)
                predicted_training_batch = self.predict(training_batch_data)
                self.train_accuracy_history.append(
                    self.accuracy(predicted_training_batch, training_batch_labels))
                # validation accuracy on one random batch of the validation set
                val_indices = np.random.permutation(num_validation)[:self.batch_size]
                predicted_val_results = self.predict(self.val_data[val_indices])
                self.val_accuracy_history.append(
                    self.accuracy(predicted_val_results, self.val_labels[val_indices]))

            self.backward_prop(training_batch_labels)
            # Adam update with bias correction:
            #   m = beta1*m + (1-beta1)*dx ;  v = beta2*v + (1-beta2)*dx**2
            #   x -= lr * (m / (1-beta1**t)) / (sqrt(v / (1-beta2**t)) + eps)
            step = i + 1
            for key in self.params:
                gradient = self.grads[key]
                ms[key] = beta1 * ms[key] + (1 - beta1) * gradient
                vs[key] = beta2 * vs[key] + (1 - beta2) * np.square(gradient)
                m_hat = ms[key] / (1 - beta1 ** step)
                v_hat = vs[key] / (1 - beta2 ** step)
                self.params[key] -= learning_rate * m_hat / (np.sqrt(v_hat) + eps)

            # decay the learning rate
            learning_rate *= learning_rate_decay


    def predict(self, data_for_prediction):
        '''Return the predicted class index for every instance.

        Uses the trained weights without dropout. Accepts a 2-D array (N, input_size) or a
        single 1-D instance (input_size,), and returns an integer array of shape (N,).
        '''
        batch = np.atleast_2d(data_for_prediction)
        _, hidden = self._hidden_activation(batch)
        scores = np.dot(hidden, self.params['w2']) + self.params['b2']
        # softmax is monotonic, so the arg-max of the raw scores is the predicted class
        return np.argmax(scores, axis = 1)


    def accuracy(self, predictions, labels):
        '''Return the fraction of `predictions` that equal `labels`.'''
        return np.mean(np.asarray(predictions) == np.asarray(labels))


    def show_loss(self):
        '''Print the loss, validation accuracy and training accuracy histories (in that order).'''
        print(self.loss_history)
        print(self.val_accuracy_history)
        print(self.train_accuracy_history)


    def test_accuracy(self):
        '''Predict the whole test set and return the accuracy against the test labels.

        The result is also stored in self.test_accuracy_history.
        '''
        test_predictions = self.predict(self.test_data)
        test_set_accuracy = self.accuracy(test_predictions, self.test_labels)
        self.test_accuracy_history = [test_set_accuracy]
        return test_set_accuracy
       
    


    
    

In [None]:

# Build the network from the prepared data splits and train it with the default hyper-parameters
test_NN = Two_Layer_Neural_Network(
    training_data=X_train,
    training_labels=y_train,
    val_data=X_val,
    val_labels=y_val,
    test_data=X_test,
    test_labels=y_test,
)
test_NN.train()




In [None]:
# Print the histories recorded during training (loss, validation accuracy, training accuracy)
test_NN.show_loss()
# Test-set evaluation is left disabled here
#test_NN.test_accuracy()
# NOTE(review): this runs after show_loss(), so it cannot affect what was printed above;
# it only changes NumPy's print precision for later output — move it up if that was the intent
np.set_printoptions(precision = 3)