# COMS 4995_002 Deep Learning Assignment 1
Due on Thursday, Feb 8, 11:59pm

This assignment can be done in groups of at most 2 students. Everyone must submit on Courseworks individually.

Write down the UNIs of your group (if applicable)

Member 1: Jiachen Yang, jy2865

Member 2: Zijing Wu, zw2442

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.misc
import glob
import sys
# you shouldn't need to make any more imports

In [74]:
class NeuralNetwork(object):
    """
    Abstraction of neural network.
    Stores parameters, activations, cached values. 
    Provides necessary functions for training and prediction. 
    """
    def __init__(self, layer_dimensions, drop_prob=0.0, reg_lambda=0.0, optimization_methods = "SGD", beta = 1.0 , reg_methods="L2"):
        """
        Initializes the weights and biases for each layer
        :param layer_dimensions: (list) number of nodes in each layer
        :param drop_prob: drop probability for dropout layers. Only required in part 2 of the assignment
        :param reg_lambda: regularization parameter. Only required in part 2 of the assignment
        """
        np.random.seed(1)
        
        self.parameters = {}
        self.num_layers = len(layer_dimensions)
        self.drop_prob = drop_prob
        self.reg_lambda = reg_lambda
        self.opt_methods = optimization_methods
        self.traning = True
        self.beta = beta
        self.reg_methods = reg_methods
        
        # init parameters
        self.parameters['W'] = {}
        self.parameters['B'] = {}
        self.parameters['Z'] = {}
        self.parameters['A'] = {}
        self.parameters['D'] = {}
        self.parameters['m'] = {}
        self.parameters['mb'] = {}
        for i in range(0,len(layer_dimensions)):
            self.parameters['W'][i] = np.random.randn(layer_dimensions[i], layer_dimensions[i - 1]) / np.sqrt(layer_dimensions[i - 1])
            self.parameters['B'][i] = np.zeros([layer_dimensions[i], 1])
            self.parameters['m'][i] = None
            self.parameters['mb'][i] = None
            self.parameters['D'][i] = None
            # self.parameters['Z'][i] = np.zeros((layer_dimensions[i],1))
            # self.parameters['A'][i] = np.zeros((layer_dimensions[i],1))
        

    def affineForward(self, A, W, b):
        """
        Forward pass for the affine layer.
        :param A: input matrix, shape (L, S), where L is the number of hidden units in the previous layer and S is
        the number of samples
        :returns: the affine product WA + b, along with the cache required for the backward pass
        """
        Z = np.dot(W, A) + b
        
        # return affine product
        return Z
        
        

    def activationForward(self, A, activation="relu"):
        """
        Common interface to access all activation functions.
        :param A: input to the activation function
        :param prob: activation funciton to apply to A. Just "relu" for this assignment.
        :returns: activation(A)
        """ 
        if activation == "relu":
            return self.relu(A)
        elif activation == "softmax":
            return self.softmax(A)


    def relu(self, X):
        return np.maximum(0, X)
    
    def softmax(self, X):
        return np.exp(X) / np.sum(np.exp(X), axis = 0)
    
            
    def dropout(self, A, prob):
        """
        :param A: 
        :param prob: drop prob
        :returns: tuple (A, M) 
            WHERE
            A is matrix after applying dropout
            M is dropout mask, used in the backward pass
        """
        M = np.random.rand(A.shape[0],A.shape[1])
        M = (M>prob)*1.0
        M /= (1-prob)
        A *= M
        return A, M

    def forwardPropagation(self, X):
        """
        Runs an input X through the neural network to compute activations
        for all layers. Returns the output computed at the last layer along
        with the cache required for backpropagation.
        :returns: (tuple) AL, cache
            WHERE 
            AL is activation of last layer
            cache is cached values for each layer that
                     are needed in further steps
        """
        self.parameters["A"][0] = X
        
        for i in range(1, self.num_layers-1):
            Z = self.affineForward(self.parameters["A"][i-1], self.parameters["W"][i], self.parameters["B"][i])
            A = self.activationForward(Z)
            self.parameters["Z"][i] = Z
            self.parameters["A"][i] = A
            
            if self.traning and self.drop_prob > 0:
                A, M = self.dropout(A,self.drop_prob)
                self.parameters["D"][i] = M
        
        Z = self.affineForward(self.parameters["A"][self.num_layers-2], self.parameters["W"][self.num_layers-1], self.parameters["B"][self.num_layers-1])
        AL = self.activationForward(Z,"softmax")
        self.parameters["Z"][self.num_layers-1] = Z
        self.parameters["A"][self.num_layers-1] = AL
        self.parameters["D"][self.num_layers-1] = None
            
        return AL
    
    def costFunction(self, AL, y):
        """
        :param AL: Activation of last layer, shape (num_classes, S)
        :param y: labels, shape (S)
        :param alpha: regularization parameter
        :returns cost, dAL: A scalar denoting cost and the gradient of cost
        """
        # compute loss
        size = y.shape[0]
        cost = -np.sum(np.log(AL[y,range(size)]))/size
        
        if self.reg_lambda > 0:
            if self.reg_methods == "L2":
                for i in range(1,self.num_layers):
                    cost += 0.5*self.reg_lambda*np.sum(np.square(self.parameters['W'][i]))
            elif self.reg_methods == "L1":
                for i in range(1,self.num_layers):
                    cost += self.reg_lambda*np.sum(np.abs(self.parameters['W'][i]))

        # gradient of cost
        AL[y,range(size)] -= 1
        dAL = AL / size
        
        return cost, dAL

    def affineBackward(self, dA_prev, cache):
        """
        Backward pass for the affine layer.
        :param dA_prev: gradient from the next layer.
        :param cache: cache returned in affineForward
        :returns dA: gradient on the input to this layer
                 dW: gradient on the weights
                 db: gradient on the bias
        """
        W,Z,A,D,mW,mb = cache

        if self.drop_prob > 0 and not D is None:
            #call dropout_backward
            A = self.dropout_backward(A,D)
            
        dA = self.activationBackward(W.T@dA_prev,Z)
        dW = dA_prev@A.T
        db = np.expand_dims(np.sum(dA_prev, axis=1), axis=1)
        
        if self.opt_methods == "Momentum":
            if mW is not None:
                dW = (1-self.beta) * dW + self.beta*mW
                db = (1-self.beta) * db + self.beta*mb
            mW = dW
            mb = db
        elif self.opt_methods == "rmsprop": 
            if mW is None:
                mW = np.zeros([dW.shape[0],dW.shape[1]])
                mb = np.zeros([db.shape[0],db.shape[1]])
            ds = (1-self.beta) * dW * dW + self.beta * mW
            dW = dW / np.sqrt(ds+1e-10)
            #ds[ds<1e-5] = 1e-5
            #dW = dW / np.sqrt(ds)
            dbs = (1-self.beta) * db * db + self.beta * mb
            db = dbs / np.sqrt(dbs+1e-10)
            #dbs[dbs < 1e-5] = 1e-5
            #db = dbs / np.sqrt(dbs)
            mW = ds
            mb = dbs
            
        
        return dA, dW, db, mW, mb

    def activationBackward(self, dA, cache, activation="relu"):
        """
        Interface to call backward on activation functions.
        In this case, it's just relu. 
        """
        Z = cache
        if activation == "relu":
            return dA*self.relu_derivative(Z)

        
    def relu_derivative(self, cached_x):
        
        return np.where(cached_x>0,1,0)
    
    def dropout_backward(self, dA, cache):
        return dA * cache

    def backPropagation(self, dAL, Y, cache):
        """
        Run backpropagation to compute gradients on all paramters in the model
        :param dAL: gradient on the last layer of the network. Returned by the cost function.
        :param Y: labels
        :param cache: cached values during forwardprop
        :returns gradients: dW and db for each weight/bias
        """
        gradients = {}
        L = self.num_layers
        dA_prev,dW,db,mW,mb = self.affineBackward(dAL,(self.parameters['W'][L-1],self.parameters['Z'][L-2],self.parameters['A'][L-2],None,self.parameters['m'][L-1],self.parameters['mb'][L-1]))
        self.parameters['m'][L-1] = mW
        self.parameters['mb'][L-1] = mb
        gradients[L-1] = {}
        gradients[L-1]['dW'] = dW
        if self.reg_lambda > 0:
            if self.reg_methods == "L2":
                gradients[L-1]['dW'] += self.reg_lambda * self.parameters['W'][L-1]
            elif self.reg_methods == "L1":
                gradients[L-1]['dW'] += self.reg_lambda * np.sign(self.parameters['W'][L-1])
        gradients[L-1]['db'] = db
        
        self.parameters["D"][0] = None
        
        for i in range(L-3,-1,-1):
            dA_prev,dW,db,mW,mb = self.affineBackward(dA_prev,(self.parameters['W'][i+1],self.parameters['Z'][i],self.parameters['A'][i],self.parameters["D"][i],self.parameters['m'][i+1],self.parameters['mb'][i+1]))
            self.parameters['m'][i+1] = mW
            self.parameters['mb'][i+1] = mb
            gradients[i+1] = {}
            gradients[i+1]['dW'] = dW
            gradients[i+1]['db'] = db
            
            
            if self.reg_lambda > 0:
                if self.reg_methods == "L2":
                # add gradients from L2 regularization to each dW
                    gradients[i+1]['dW'] += self.reg_lambda * self.parameters['W'][i+1]
                elif self.reg_methods == "L1":
                    gradients[i+1]['dW'] += self.reg_lambda * np.sign(self.parameters['W'][i+1])
        
        return gradients


    def updateParameters(self, gradients, alpha):
        """
        :param gradients: gradients for each weight/bias
        :param alpha: step size for gradient descent 
        """
        L = self.num_layers
        
        for i in range(1,L):
#             print(gradients[i]['dW'].shape)
#             print(np.sum(np.abs(gradients[i]['dW'])))
            self.parameters['W'][i] -= alpha*gradients[i]['dW']
            self.parameters['B'][i] -= alpha*gradients[i]['db']

    def train(self, X, y, iters=1000, alpha=0.0001, batch_size=100, print_every=100):
        """
        :param X: input samples, each column is a sample
        :param y: labels for input samples, y.shape[0] must equal X.shape[1]
        :param iters: number of training iterations
        :param alpha: step size for gradient descent
        :param batch_size: number of samples in a minibatch
        :param print_every: no. of iterations to print debug info after
        """
        self.traning = True
        data_size = X.shape[1]
        train_size = int(data_size*0.9)
        random_range = np.random.permutation(data_size)
        X_train = X[:,random_range[0:train_size]]
        X_validation = X[:,random_range[train_size:data_size]]

        y_train = y[random_range[0:train_size]]
        y_validation = y[random_range[train_size:data_size]]
        
        for i in range(0, iters):
            # get minibatch
            batch_X,batch_y = self.get_batch(X_train,y_train,batch_size)
            # forward prop
            AL = self.forwardPropagation(batch_X)
            # compute loss
            cost,dAL = self.costFunction(AL,batch_y)
            # compute gradients
            self.parameters['Z'][0] = batch_X
            gradients = self.backPropagation(dAL,batch_y,None)
            # update weights and biases based on gradient
            self.updateParameters(gradients,alpha)
            if i % print_every == 0:
                # print cost, train and validation set accuracies
                y_pred = self.predict(batch_X)
                acc = float(np.sum(y_pred == batch_y))/batch_y.shape[0]
                                
                y_validation_predicted = self.predict(X_validation)
                validation_acc = float(np.sum(y_validation_predicted == y_validation))/y_validation.shape[0]
                
                print("Iteration %5d: Cost is %.6f, Accuracy is %.6f The accuracy of validation set is %f"%(i,cost,acc,validation_acc))
                
    def predict(self, X):
        """
        Make predictions for each sample
        """
        self.traning = False
        A = X
        for i in range(1, self.num_layers):
            Z = self.affineForward(A, self.parameters["W"][i], self.parameters["B"][i])
            A = self.activationForward(Z)
        
        y_pred = np.argmax(A,axis=0)

        return y_pred

    def get_batch(self, X, y, batch_size):
        """
        Return minibatch of samples and labels
        
        :param X, y: samples and corresponding labels
        :parma batch_size: minibatch size
        :returns: (tuple) X_batch, y_batch
        """
        index = np.random.randint(0,X.shape[1],size=batch_size)
        X_batch = X[:,index]
        y_batch = y[index]

        return X_batch, y_batch

In [4]:
# Helper functions, DO NOT modify this

def get_img_array(path):
    """
    Given path of image, returns it's numpy array
    """
    return scipy.misc.imread(path)

def get_files(folder):
    """
    Given path to folder, returns list of files in it
    """
    filenames = [file for file in glob.glob(folder+'*/*')]
    filenames.sort()
    return filenames

def get_label(filepath, label2id):
    """
    Files are assumed to be labeled as: /path/to/file/999_frog.png
    Returns label for a filepath
    """
    tokens = filepath.split('/')
    label = tokens[-1].split('_')[1][:-4]
    if label in label2id:
        return label2id[label]
    else:
        sys.exit("Invalid label: " + label)

In [5]:
# Functions to load data, DO NOT change these

def get_labels(folder, label2id):
    """
    Returns vector of labels extracted from filenames of all files in folder
    :param folder: path to data folder
    :param label2id: mapping of text labels to numeric ids. (Eg: automobile -> 0)
    """
    files = get_files(folder)
    y = []
    for f in files:
        y.append(get_label(f,label2id))
    return np.array(y)

def one_hot(y, num_classes=10):
    """
    Converts each label index in y to vector with one_hot encoding
    """
    y_one_hot = np.zeros((y.shape[0], num_classes))
    y_one_hot[y] = 1
    return y_one_hot.T

def get_label_mapping(label_file):
    """
    Returns mappings of label to index and index to label
    The input file has list of labels, each on a separate line.
    """
    with open(label_file, 'r') as f:
        id2label = f.readlines()
        id2label = [l.strip() for l in id2label]
    label2id = {}
    count = 0
    for label in id2label:
        label2id[label] = count
        count += 1
    return id2label, label2id

def get_images(folder):
    """
    returns numpy array of all samples in folder
    each column is a sample resized to 30x30 and flattened
    """
    files = get_files(folder)
    images = []
    count = 0
    
    for f in files:
        count += 1
        if count % 10000 == 0:
            print("Loaded {}/{}".format(count,len(files)))
        img_arr = get_img_array(f)
        img_arr = img_arr.flatten() / 255.0
        images.append(img_arr)
    X = np.column_stack(images)

    return X

def get_train_data(data_root_path):
    """
    Return X and y
    """
    train_data_path = data_root_path + 'train'
    id2label, label2id = get_label_mapping(data_root_path+'labels.txt')
    print(label2id)
    X = get_images(train_data_path)
    y = get_labels(train_data_path, label2id)
    return X, y

def save_predictions(filename, y):
    """
    Dumps y into .npy file
    """
    np.save(filename, y)

In [6]:
# Load the data
data_root_path = '/Users/yjc/columbia_course/deep learning/HW/cifar10-hw1/'
X_train, y_train = get_train_data(data_root_path) # this may take a few minutes
X_test = get_images(data_root_path + 'test')
print('Data loading done')

{'ship': 8, 'airplane': 0, 'automobile': 1, 'dog': 5, 'truck': 9, 'bird': 2, 'deer': 4, 'frog': 6, 'cat': 3, 'horse': 7}


`imread` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``imageio.imread`` instead.


Loaded 10000/50000
Loaded 20000/50000
Loaded 30000/50000
Loaded 40000/50000
Loaded 50000/50000
Loaded 10000/10000
Data loading done


## Part 1

#### Simple fully-connected deep neural network

In [74]:
layer_dimensions = [X_train.shape[0], 512, 512, 256, 256, 128, 128, 64, 32, 10]  # including the input and output layers
NN = NeuralNetwork(layer_dimensions)
NN.train(X_train, y_train, iters=15000, alpha=0.05, batch_size=200, print_every=1000)

Iteration     0: Cost is 2.299764, Accuracy is 0.140000 The accuracy of validation set is 0.121800
Iteration  1000: Cost is 1.751489, Accuracy is 0.365000 The accuracy of validation set is 0.384400
Iteration  2000: Cost is 1.530456, Accuracy is 0.495000 The accuracy of validation set is 0.374600
Iteration  3000: Cost is 1.526228, Accuracy is 0.500000 The accuracy of validation set is 0.422200
Iteration  4000: Cost is 1.430042, Accuracy is 0.615000 The accuracy of validation set is 0.470800
Iteration  5000: Cost is 1.253294, Accuracy is 0.600000 The accuracy of validation set is 0.465800
Iteration  6000: Cost is 1.251801, Accuracy is 0.680000 The accuracy of validation set is 0.502800
Iteration  7000: Cost is 0.996773, Accuracy is 0.700000 The accuracy of validation set is 0.481600
Iteration  8000: Cost is 0.958680, Accuracy is 0.790000 The accuracy of validation set is 0.520200
Iteration  9000: Cost is 0.837242, Accuracy is 0.840000 The accuracy of validation set is 0.515200
Iteration 

In [75]:
y_predicted = NN.predict(X_test)
save_predictions('ans1-jy2865', y_predicted)

In [76]:
# test if your numpy file has been saved correctly
loaded_y = np.load('ans1-jy2865.npy')
print(loaded_y.shape)
loaded_y[:10]

(10000,)


array([4, 8, 0, 7, 5, 8, 8, 4, 8, 1])

## Part 2: Improving the performance

### Rmsprop optimization

In [None]:
layer_dimensions = [X_train.shape[0], 1024, 512, 32, 10]
NN2 = NeuralNetwork(layer_dimensions, drop_prob=0.0, reg_lambda=0.001,optimization_methods="rmsprop",beta=0.99)
NN2.train(X_train, y_train, iters=12000, alpha=0.001, batch_size=200, print_every=200)

Iteration     0: Cost is 3.139226, Accuracy is 0.140000 The accuracy of validation set is 0.092400
Iteration   200: Cost is 2.788609, Accuracy is 0.360000 The accuracy of validation set is 0.303200
Iteration   400: Cost is 2.696365, Accuracy is 0.435000 The accuracy of validation set is 0.338000


In [None]:
y_predicted2 = NN2.predict(X_test)
save_predictions( 'ans2_1_rmsprop-jy2865',y_predicted2)

### Momentum

In [26]:
layer_dimensions = [X_train.shape[0], 512, 512, 256, 256, 128, 128, 64, 32, 10]
NN3 = NeuralNetwork(layer_dimensions, drop_prob=0.0, reg_lambda=0.001,optimization_methods="Momentum",beta=0.1)
NN3.train(X_train, y_train, iters=12000, alpha=0.05, batch_size=200, print_every=200)

Iteration     0: Cost is 3.247533, Accuracy is 0.140000 The accuracy of validation set is 0.121800
Iteration   200: Cost is 2.941849, Accuracy is 0.240000 The accuracy of validation set is 0.230000
Iteration   400: Cost is 2.947781, Accuracy is 0.280000 The accuracy of validation set is 0.294800
Iteration   600: Cost is 2.751282, Accuracy is 0.320000 The accuracy of validation set is 0.338000
Iteration   800: Cost is 2.587683, Accuracy is 0.360000 The accuracy of validation set is 0.342000
Iteration  1000: Cost is 2.641108, Accuracy is 0.390000 The accuracy of validation set is 0.376200
Iteration  1200: Cost is 2.729076, Accuracy is 0.410000 The accuracy of validation set is 0.323000
Iteration  1400: Cost is 2.603846, Accuracy is 0.365000 The accuracy of validation set is 0.392400
Iteration  1600: Cost is 2.521889, Accuracy is 0.390000 The accuracy of validation set is 0.416600
Iteration  1800: Cost is 2.343106, Accuracy is 0.470000 The accuracy of validation set is 0.428000
Iteration 

In [28]:
y_predicted3 = NN3.predict(X_test)
save_predictions( 'ans2_2_momentum-jy2865',y_predicted3)

### L1 regularization

In [75]:
layer_dimensions = [X_train.shape[0], 512, 512, 256, 256, 128, 128, 64, 32, 10]
NN4 = NeuralNetwork(layer_dimensions, drop_prob=0.0, reg_lambda=0.001,optimization_methods="Momentum",beta=0.1,reg_methods="L1")
NN4.train(X_train, y_train, iters=12000, alpha=0.05, batch_size=200, print_every=200)

Iteration     0: Cost is 45.672819, Accuracy is 0.140000 The accuracy of validation set is 0.121800
Iteration   200: Cost is 27.042657, Accuracy is 0.190000 The accuracy of validation set is 0.166600
Iteration   400: Cost is 16.553707, Accuracy is 0.170000 The accuracy of validation set is 0.179400
Iteration   600: Cost is 11.128965, Accuracy is 0.140000 The accuracy of validation set is 0.168200
Iteration   800: Cost is 7.997338, Accuracy is 0.190000 The accuracy of validation set is 0.183200
Iteration  1000: Cost is 6.152236, Accuracy is 0.210000 The accuracy of validation set is 0.176200
Iteration  1200: Cost is 4.843216, Accuracy is 0.195000 The accuracy of validation set is 0.192600
Iteration  1400: Cost is 4.071917, Accuracy is 0.175000 The accuracy of validation set is 0.198200
Iteration  1600: Cost is 3.407433, Accuracy is 0.160000 The accuracy of validation set is 0.194600
Iteration  1800: Cost is 3.067523, Accuracy is 0.225000 The accuracy of validation set is 0.190000


KeyboardInterrupt: 