# COMS 4995_002 Deep Learning Assignment 1
Due on Monday, Oct 9, 11:59pm

## Part 1

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.misc
import glob
import sys
# you shouldn't need to make any more imports

In [14]:
class NeuralNetwork(object):
    """
    Abstraction of a fully-connected neural network.
    Stores parameters and provides the functions needed for training and prediction.

    Every hidden layer is affine -> ReLU (-> inverted dropout when drop_prob > 0).
    The last layer is affine only; the softmax is applied inside costFunction.
    """
    def __init__(self, layer_dimensions, drop_prob=0.0, reg_lambda=0.0):
        """
        Initializes the weights and biases for each layer
        :param layer_dimensions: (list) number of nodes in each layer, input and output included
        :param drop_prob: drop probability for dropout layers; dropout is only applied when > 0
        :param reg_lambda: L2 regularization strength; the penalty is only applied when > 0
        :raises ValueError: if fewer than two layer sizes are given
        """
        if len(layer_dimensions) < 2:
            raise ValueError("layer_dimensions needs at least an input and an output size")

        # NOTE: this reseeds numpy's *global* RNG so that every network starts from the same weights
        np.random.seed(1)

        self.parameters = {}
        self.num_layers = len(layer_dimensions)
        self.drop_prob = drop_prob
        self.reg_lambda = reg_lambda

        # init parameters: W<l> has shape (units in layer l, units in layer l-1), b<l> is a column
        for l in range(1, self.num_layers):
            fan_out, fan_in = layer_dimensions[l], layer_dimensions[l - 1]
            # glorot init
            eps = np.sqrt(2.0 / (fan_out + fan_in))
            self.parameters["W" + str(l)] = np.random.randn(fan_out, fan_in) * eps
            # small positive bias
            self.parameters["b" + str(l)] = np.zeros((fan_out, 1)) + 0.01

    def affineForward(self, A, W, b):
        """
        Forward pass for the affine layer.
        :param A: input matrix, shape (L, S), where L is the number of hidden units in the previous layer and S is
        the number of samples
        :param W: weight matrix applied from the left
        :param b: bias column, broadcast over the samples
        :returns: the affine product WA + b, along with the cache (A, W, b) required for the backward pass
        """
        return np.dot(W, A) + b, (A, W, b)

    def activationForward(self, A, activation="relu"):
        """
        Common interface to access all activation functions.
        :param A: input to the activation function
        :param activation: activation function to apply to A. Just "relu" for this assignment.
        :returns: tuple (activation(A), cache for the backward pass)
        :raises ValueError: for an activation name that is not supported
        """
        if activation == "relu":
            return self.relu(A)
        raise ValueError("Unsupported activation: %r" % (activation,))

    def relu(self, X):
        """
        :returns: tuple (max(0, X) element-wise, X) -- X is kept as the cache for the backward pass
        """
        return np.maximum(0, X), X

    def softmax(self, X):
        """
        Softmax over axis 0 (one distribution per column).
        """
        # subtracting the per-column max does not change the result but keeps exp() from overflowing
        shifted = X - np.max(X, axis=0, keepdims=True)
        exps = np.exp(shifted)
        return exps / np.sum(exps, axis=0, keepdims=True)

    def dropoutForward(self, A, prob):
        """
        Inverted dropout.
        :param A: activations to apply dropout to
        :param prob: drop prob, must satisfy 0 <= prob < 1
        :returns: tuple (A, M)
            WHERE
            A is matrix after applying dropout
            M is dropout mask (already scaled by 1 / (1 - prob)), used in the backward pass
        :raises ValueError: if prob is outside [0, 1)
        """
        if not 0 <= prob < 1:
            raise ValueError("drop probability must be in [0, 1), got %r" % (prob,))
        # entries are 0 (dropped) or 1 / (1 - prob) (kept), which keeps the expected value the same
        M = (np.random.rand(*A.shape) > prob) / (1.0 - prob)
        return A * M, M

    def forwardPropagation(self, X):
        """
        Runs an input X through the neural network to compute activations
        for all layers. Returns the output computed at the last layer along
        with the cache required for backpropagation.
        :param X: input samples, each column is a sample
        :returns: (tuple) AL, cache
            WHERE
            AL is the output of the last (affine) layer, i.e. the scores before softmax
            cache is a dict with keys 'linear<l>', 'activation<l>' and, when dropout
                     is active, 'dropout<l>'
        """
        cache = {}
        A = X
        L = self.num_layers
        for l in range(1, L - 1):
            A, cache['linear' + str(l)] = self.affineForward(
                A, self.parameters["W" + str(l)], self.parameters["b" + str(l)])
            A, cache['activation' + str(l)] = self.activationForward(A, "relu")

            if self.drop_prob > 0:
                # use the configured probability (not a hard-coded one)
                A, cache['dropout' + str(l)] = self.dropoutForward(A, self.drop_prob)

        # for the last layer (without activation)
        AL, cache['linear' + str(L - 1)] = self.affineForward(
            A, self.parameters["W" + str(L - 1)], self.parameters["b" + str(L - 1)])
        return AL, cache

    def costFunction(self, AL, y):
        """
        Softmax cross-entropy averaged over the batch, plus the L2 penalty when reg_lambda > 0.
        :param AL: output of the last layer (scores before softmax), shape (num_classes, S)
        :param y: integer labels, shape (S)
        :returns: tuple (cost, dAL) where cost is a scalar and dAL is the gradient of the
                  data term of the cost with respect to AL
        """
        m = y.shape[0]
        sample_idx = np.arange(m)

        # log-softmax computed directly so that large scores neither overflow nor give log(0)
        shifted = AL - np.max(AL, axis=0, keepdims=True)
        log_probs = shifted - np.log(np.sum(np.exp(shifted), axis=0, keepdims=True))
        cost = -np.sum(log_probs[y, sample_idx]) / m

        # add regularization; the matching gradient is added at the end of backPropagation
        if self.reg_lambda > 0:
            for l in range(1, self.num_layers):
                cost += 0.5 * self.reg_lambda * np.sum(np.square(self.parameters["W" + str(l)]))

        # gradient of cost: (softmax - one_hot(y)) / m
        dAL = np.exp(log_probs)
        dAL[y, sample_idx] -= 1
        dAL /= float(m)
        return cost, dAL

    def affineBackward(self, dA_prev, cache):
        """
        Backward pass for the affine layer.
        :param dA_prev: gradient from the next layer (w.r.t. this layer's affine output).
        :param cache: cache returned in affineForward
        :returns dA: gradient on the input to this layer
                 dW: gradient on the weights
                 db: gradient on the bias
        """
        A, W, b = cache

        dW = np.dot(dA_prev, A.T)
        # the bias is shared by all samples, so its gradient is summed over them
        db = np.sum(dA_prev, axis=1, keepdims=True)
        dA = np.dot(W.T, dA_prev)
        return dA, dW, db

    def activationBackward(self, dA, cache, activation="relu"):
        """
        Interface to call backward on activation functions.
        :raises ValueError: for an activation name that is not supported
        """
        if activation == "relu":
            return self.relu_derivative(dA, cache)
        raise ValueError("Unsupported activation: %r" % (activation,))

    def relu_derivative(self, dx, cached_x):
        """
        :param dx: upstream gradient
        :param cached_x: input that was given to relu in the forward pass
        :returns: a new array equal to dx with the entries where cached_x < 0 set to 0
                  (dx itself is left untouched)
        """
        return np.where(cached_x < 0, 0, dx)

    def dropoutBackward(self, dA, cache):
        """
        Dropout in backprop: the cached mask already carries the 1 / (1 - prob) scaling.
        """
        return np.multiply(dA, cache)

    def backPropagation(self, dAL, Y, cache):
        """
        Run backpropagation to compute gradients on all paramters in the model
        :param dAL: gradient on the last layer of the network. Returned by the cost function.
        :param Y: labels (not used here; kept for interface compatibility)
        :param cache: cached values during forwardprop
        :returns gradients: dict with 'dW<l>' and 'db<l>' for each weight/bias
        """
        gradients = {}
        L = self.num_layers

        # last layer has no activation/dropout
        dA_prev, dW, db = self.affineBackward(dAL, cache['linear' + str(L - 1)])
        gradients['dW' + str(L - 1)] = dW
        gradients['db' + str(L - 1)] = db

        # hidden layers, undoing the forward order: dropout -> relu -> affine
        for l in range(L - 2, 0, -1):
            if self.drop_prob > 0:
                dA_prev = self.dropoutBackward(dA_prev, cache['dropout' + str(l)])

            dZ = self.activationBackward(dA_prev, cache['activation' + str(l)], "relu")
            dA_prev, dW, db = self.affineBackward(dZ, cache['linear' + str(l)])

            gradients['dW' + str(l)] = dW
            gradients['db' + str(l)] = db

        # add gradients from L2 regularization to each dW
        if self.reg_lambda > 0:
            for l in range(1, L):
                gradients["dW" + str(l)] += self.reg_lambda * self.parameters["W" + str(l)]

        return gradients

    def updateParameters(self, gradients, alpha):
        """
        In-place gradient descent step on every weight and bias.
        :param gradients: gradients for each weight/bias
        :param alpha: step size for gradient descent
        """
        for l in range(1, self.num_layers):
            self.parameters["W" + str(l)] += -alpha * gradients["dW" + str(l)]
            self.parameters["b" + str(l)] += -alpha * gradients["db" + str(l)]

    def train(self, X, y, iters=1000, alpha=0.0001, batch_size=100, print_every=100):
        """
        :param X: input samples, each column is a sample
        :param y: labels for input samples, y.shape[0] must equal X.shape[1]
        :param iters: number of training iterations
        :param alpha: step size for gradient descent
        :param batch_size: number of samples in a minibatch
        :param print_every: print loss/accuracy every this many iterations; 0 or None disables printing
        """
        for i in range(0, iters):
            # get minibatch
            X_batch, y_batch = self.get_batch(X, y, batch_size)
            # forward prop
            AL, cache = self.forwardPropagation(X_batch)
            # compute loss
            cost, dAL = self.costFunction(AL, y_batch)
            # compute gradients
            gradients = self.backPropagation(dAL, y_batch, cache)
            # update weights and biases based on gradient
            self.updateParameters(gradients, alpha)
            if print_every and i % print_every == 0:
                # accuracy is measured on the current minibatch, after the update
                y_pred = self.predict(X_batch)
                acc = float(np.sum(y_pred == y_batch)) / y_batch.shape[0]
                print("iter = %i, loss = %f, accuracy=%f" % (i, cost, acc))

    def predict(self, X):
        """
        Make predictions for each sample
        Same as forwardPropagation, just skips the dropout layer and storing cache
        :returns: index of the highest-scoring class for each column of X
        """
        A = X
        L = self.num_layers
        for l in range(1, L - 1):
            A, _ = self.affineForward(A, self.parameters["W" + str(l)], self.parameters["b" + str(l)])
            A, _ = self.activationForward(A, "relu")

        # for the last layer (without activation); softmax is monotonic so argmax on scores suffices
        AL, _ = self.affineForward(A, self.parameters["W" + str(L - 1)], self.parameters["b" + str(L - 1)])
        return np.argmax(AL, axis=0)

    def get_batch(self, X, y, batch_size):
        """
        Return minibatch of samples and labels

        :param X, y: samples and corresponding labels
        :param batch_size: minibatch size
        :returns: (tuple) X_batch, y_batch
        """
        num_samples = X.shape[1]
        # indices are drawn independently, so a sample can appear more than once in a batch
        indices = np.random.randint(num_samples, size=batch_size)
        return X[:, indices], y[indices]

In [3]:
# Helper functions, DO NOT modify this

def get_img_array(path):
    """
    Reads the image stored at path and returns it as a numpy array.

    NOTE(review): relies on scipy.misc.imread, which the notebook output reports as
    deprecated in SciPy 1.0.0 and slated for removal in 1.2.0.
    """
    img_arr = scipy.misc.imread(path)
    return img_arr


def resize_img(img_arr, target_dim=(50, 50)):
    """
    Returns img_arr (an image held as a numpy array) resized to target_dim.

    NOTE(review): scipy.misc.imresize presumably shares the deprecation of the other
    scipy.misc image helpers -- confirm against the installed SciPy version.
    """
    resized = scipy.misc.imresize(img_arr, target_dim)
    return resized


def get_files(folder):
    """
    Returns the sorted list of paths matching folder + '*/*'.

    folder is used as a glob prefix: every directory whose path starts with it is
    searched one level deep.
    """
    return sorted(glob.glob(folder + '*/*'))

def get_label(filepath, label2id):
    """
    Maps a file path of the form /path/to/file/999_frog.png to its numeric label.

    The class name is the text after the first '_' in the file name, minus the last
    four characters (the extension). Exits the program via sys.exit when that name
    is not a key of label2id.
    """
    file_name = filepath.split('/')[-1]
    class_name = file_name.split('_')[1][:-4]
    if class_name not in label2id:
        sys.exit("Invalid label: " + class_name)
    return label2id[class_name]

In [4]:
# Functions to load data

def get_labels(folder, label2id):
    """
    Builds the label vector for all files found under folder.
    :param folder: path to data folder
    :param label2id: mapping of text labels to numeric ids. (Eg: automobile -> 0)
    :returns: numpy array with one numeric label per file, in the order given by get_files
    """
    return np.array([get_label(path, label2id) for path in get_files(folder)])

def one_hot(y, num_classes=10):
    """
    Converts each label index in y to vector with one_hot encoding
    :param y: 1-D array of integer class indices, each in [0, num_classes)
    :param num_classes: number of classes (length of each one-hot vector)
    :returns: array of shape (num_classes, S); column i holds a single 1, in row y[i]
    """
    y = np.asarray(y)
    num_samples = y.shape[0]
    y_one_hot = np.zeros((num_samples, num_classes))
    # pair every sample (row) with its own class (column); indexing with y alone
    # would set whole rows to 1 instead
    y_one_hot[np.arange(num_samples), y] = 1
    return y_one_hot.T

def get_label_mapping(label_file):
    """
    Reads a file with one label per line and builds both lookup directions.
    :returns: tuple (id2label, label2id) -- the list of stripped lines, and a dict
              from each label to its line index (counted from 0)
    """
    with open(label_file, 'r') as handle:
        id2label = [line.strip() for line in handle.readlines()]
    label2id = {name: index for index, name in enumerate(id2label)}
    return id2label, label2id

def get_images(folder):
    """
    Returns numpy array of all samples in folder.
    Each column is one image, flattened and scaled by 1/255. Images are kept at
    their stored size (no resizing is done).
    :param folder: path prefix handed to get_files
    :returns: array with one column per file, in the sorted order returned by get_files
    :raises ValueError: if no files are found
    """
    files = get_files(folder)
    if not files:
        raise ValueError("No image files found for: " + str(folder))

    images = []
    for count, f in enumerate(files, start=1):
        if count % 10000 == 0:
            print("Loaded {}/{}".format(count, len(files)))

        # no need to resize for CIFAR10, already 32x32
        img_arr = get_img_array(f)
        images.append(img_arr.flatten() / 255.0)

    return np.column_stack(images)

def get_train_data(data_root_path, suffix="train"):
    """
    Loads one data split.
    Paths are built by plain string concatenation: the labels file is
    data_root_path + 'labels.txt' and the images are looked up under data_root_path + suffix.
    Prints the label -> id mapping.
    :returns: tuple (X, y) as produced by get_images and get_labels
    """
    _, label2id = get_label_mapping(data_root_path + 'labels.txt')
    print(label2id)
    split_path = data_root_path + suffix
    return get_images(split_path), get_labels(split_path, label2id)

def save_predictions(filename, y):
    """
    Writes the array y to filename with np.save.
    """
    np.save(file=filename, arr=y)

In [7]:
# Load the training data
# NOTE(review): data_root_path is a machine-specific absolute path; point it at your
# local dataset folder. It must end with '/' because get_train_data builds the labels
# file and split paths by plain string concatenation.
data_root_path = '/Users/yjc/columbia_course/deep learning/HW/cifar10-hw1/'
X_train, y_train = get_train_data(data_root_path) # this may take a few minutes
# X_test = get_images(data_root_path + 'test')
# X_test, y_test = get_train_data(data_root_path, 'test')
# print(X_train.shape)
# print(y_train.shape)
print('Data loading done')

{'deer': 4, 'truck': 9, 'dog': 5, 'frog': 6, 'bird': 2, 'cat': 3, 'automobile': 1, 'airplane': 0, 'ship': 8, 'horse': 7}


`imread` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``imageio.imread`` instead.


Loaded 10000/50000
Loaded 20000/50000
Loaded 30000/50000
Loaded 40000/50000
Loaded 50000/50000
Data loading done


<matplotlib.figure.Figure at 0x109a73080>

# Part 1

In [15]:
# Layer sizes: one input unit per row of X_train (samples are columns), three hidden layers, 10 outputs
layer_dimensions = [X_train.shape[0], 512, 256, 128, 10]  # including the input and output layers
NN = NeuralNetwork(layer_dimensions)

In [16]:
# Train the Part 1 network on random minibatches of 200 samples; loss and minibatch accuracy are printed every 200 iterations
NN.train(X_train, y_train, iters=10000, alpha=0.0001, batch_size=200, print_every=200)

iter = 0, loss = 2.390885, accuracy=0.105000
iter = 200, loss = 2.305319, accuracy=0.130000
iter = 400, loss = 2.287392, accuracy=0.105000
iter = 600, loss = 2.261579, accuracy=0.160000
iter = 800, loss = 2.274771, accuracy=0.145000
iter = 1000, loss = 2.227966, accuracy=0.220000
iter = 1200, loss = 2.249587, accuracy=0.195000
iter = 1400, loss = 2.235504, accuracy=0.180000
iter = 1600, loss = 2.253706, accuracy=0.170000
iter = 1800, loss = 2.226301, accuracy=0.190000
iter = 2000, loss = 2.200227, accuracy=0.250000
iter = 2200, loss = 2.216754, accuracy=0.225000
iter = 2400, loss = 2.181884, accuracy=0.290000
iter = 2600, loss = 2.172604, accuracy=0.220000
iter = 2800, loss = 2.176344, accuracy=0.260000
iter = 3000, loss = 2.170724, accuracy=0.250000
iter = 3200, loss = 2.168253, accuracy=0.225000
iter = 3400, loss = 2.146997, accuracy=0.275000
iter = 3600, loss = 2.149460, accuracy=0.255000
iter = 3800, loss = 2.107983, accuracy=0.275000
iter = 4000, loss = 2.153116, accuracy=0.275000

In [None]:
# Predict on the test split and save the Part 1 answers
# NOTE(review): in the cells shown here X_test is only assigned further down
# (X_test, y_test = get_train_data(data_root_path, 'test')); run that cell first,
# otherwise this line raises NameError.
y_predicted = NN.predict(X_test)
save_predictions('ans1-uni', y_predicted)

In [None]:
# test if your numpy file has been saved correctly
# (the predictions were saved under the name 'ans1-uni'; the file is read back as 'ans1-uni.npy')
a = np.load('ans1-uni.npy')
print(a.shape)
# last expression of the cell: the notebook displays the first 10 predictions
a[:10]

## Part 2: Regularizing the neural network
##### Add dropout and L2 regularization

In [None]:
# Part 2 network: dropout is switched on via drop_prob > 0.
# NOTE(review): reg_lambda=0 leaves the L2 penalty off (the class only applies it when
# reg_lambda > 0) -- set a positive value to actually use L2 regularization.
NN2 = NeuralNetwork(layer_dimensions, drop_prob=0.3, reg_lambda=0)
NN2.train(X_train, y_train, iters=10000, alpha=0.001, batch_size=1000, print_every=200)

In [None]:
# Predict on the test split with the Part 2 network
y_predicted2 = NN2.predict(X_test)
# save the Part 2 predictions (y_predicted2), not the Part 1 array y_predicted
save_predictions('ans2-uni', y_predicted2)

In [None]:
# Load the test split (images and labels) the same way as the training data, with suffix 'test'
X_test, y_test = get_train_data(data_root_path, 'test')

# Evaluating on test set

In [None]:
# Accuracy = fraction of test samples whose predicted class equals the true label
test_acc = float(np.sum(y_predicted == y_test))/y_test.shape[0]
print("Part 1 test accuracy is", test_acc)

# Same metric for the Part 2 predictions; test_acc is rebound here
test_acc = float(np.sum(y_predicted2 == y_test))/y_test.shape[0]
print("Part 2 test accuracy is", test_acc)