# COMS 4995_002 Deep Learning Assignment 1
Due on Monday, Oct 9, 11:59pm

This assignment can be done in groups of at most 3 students. Everyone must submit on Courseworks individually.

Write down the UNIs of your group (if applicable)

Member 1: Wilson Lui, wl2522

Member 2: Vanessa Saldana Fountain, vsf2106

Member 3: Craig Brandon Barretto, cbb2151

In [256]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.misc
import glob
import sys
# you shouldn't need to make any more imports

In [9]:
class NeuralNetwork(object):
    """
    Abstraction of neural network.
    Stores parameters, activations, cached values. 
    Provides necessary functions for training and prediction. 
    """
    def __init__(self, layer_dimensions, drop_prob=0.0, reg_lambda=0.0):
        """
        Initializes the weights and biases for each layer
        :param layer_dimensions: (list) number of nodes in each layer
        :param drop_prob: drop probability for dropout layers. Only required in part 2 of the assignment
        :param reg_lambda: regularization parameter. Only required in part 2 of the assignment
        """
        np.random.seed(1)
        
        self.parameters = {'num_layers': self.num_layers,
                           'drop_prob': self.drop_prob,
                           'reg_lambda': self.reg_lambda}
        self.num_layers = len(layer_dimensions)
        self.drop_prob = drop_prob
        self.reg_lambda = reg_lambda
        
        self.weights = [np.random.randn(layer_dimensions[layer],
                                        layer_dimensions[layer - 1]) for layer in range(1, num_layers)]
        self.biases = [np.random.randn(layer, 1) for layer in layer_dimensions[1:]]
        
        # init parameters
        

    def affineForward(self, A, W, b):
        """
        Forward pass for the affine layer.
        :param A: input matrix, shape (L, S), where L is the number of hidden units in the previous layer and S is
        the number of samples
        :returns: the affine product WA + b, along with the cache required for the backward pass
        """
        Z = np.matmul(W, A) + b
        cache = [A, W, b]
        
        return Z, cache
        

    def activationForward(self, A, activation="relu"):
        """
        Common interface to access all activation functions.
        :param A: input to the activation function
        :param prob: activation funciton to apply to A. Just "relu" for this assignment.
        :returns: activation(A)
        """ 
        if activation == 'relu':
            return relu(dropout(self, A, self.drop_prob)[0])
        
        elif activation == 'relu_no_dropout':
            return relu(self, A, self.drop_prob)

        
    def relu(self, X):
        return np.maximum(0, X)
            
        
    def:
        """
        :param A: 
        :param prob: drop prob
        :returns: tuple (A, M) 
            WHERE
            A is matrix after applying dropout
            M is dropout mask, used in the backward pass
        """
        M = np.random.binomial(1, 1 - prob, A.shape)
        A = (M*A)/(1 - prob)
        
        return A, M

    
    def forwardPropagation(self, X):
        """
        Runs an input X through the neural network to compute activations
        for all layers. Returns the output computed at the last layer along
        with the cache required for backpropagation.
        :returns: (tuple) AL, cache
            WHERE 
            AL is activation of last layer
            cache is cached values for each layer that
                     are needed in further steps
        """
        
        weights = self.weights
        biases = self.biases
        
        cache = []
        AL = X
        
        for i in range(self.num_layers):
            h, c = affineForward(AL, weights[i], biases[i])
            cache.append(c)
            
            if i == 0 or i == num_layers:
                AL = activationForward(h, activation='relu_no_dropout')
            
            else:
                AL = activationForward(h)
            
        return AL, cache


    def costFunction(self, AL, y):
        """
        :param AL: Activation of last layer, shape (num_classes, S)
        :param y: labels, shape (S)
        :param alpha: regularization parameter
        :returns cost, dAL: A scalar denoting cost and the gradient of cost
        """
        # compute loss
        a = np.exp(X[y, np.arange(y.shape[0])])/np.sum(np.exp(X), axis=0)
        
        if self.reg_lambda > 0:
            # add regularization
            l2 = 0
            
            for W in self.weights:
                l2 += np.sum(np.power(W, 2))
            
            l2 = l2 + self.reg_lambda/(2*y.shape[0])
            cost = np.sum(-np.log(a))/X.shape[1] + l2
        
        elif self.reg_lambda == 0:
            cost = np.sum(-np.log(a))/X.shape[1]
        
        # gradient of cost
        dAL = -(y - a) - (1 - y)/(1 - a)
        
        return cost, dAL

    def affineBackward(self, dA_prev, cache):
        """
        Backward pass for the affine layer.
        :param dA_prev: gradient from the next layer.
        :param cache: cache returned in affineForward
        :returns dA: gradient on the input to this layer
                 dW: gradient on the weights
                 db: gradient on the bias
        """
        #fully connected layer with no non-linear activation function 
        #dot product between inputs and weights
        
        A = cache[0]
        W = cache[1]
        b = cache[2]
        
        dA = dA_prev.dot(dW.T).reshape(A.shape[0])
        dW = A.reshape(A.shape[0], -1).T.dot(dA_prev)
        db = np.sum(dA_prev, axis = 0)
        
        return dA, dW, db

    def activationBackward(self, dA, cache, activation="relu"):
        """
        Interface to call backward on activation functions.
        In this case, it's just relu. 
        """
        if activation == 'relu':
            return relu(dropout_backward(self, dA, self.cache)[1])
        
        else:
            return dropout_backward(self, dA, self.cache)[1]
        
    def relu_derivative(self, dx, cached_x):
        dx = np.where(cached_x > 0)
        return dx

    def dropout_backward(self, dA, cache):
        dA = 1 - cache **2
        return dA 

    def backPropagation(self, dAL, Y, cache):
        """
        Run backpropagation to compute gradients on all paramters in the model
        :param dAL: gradient on the last layer of the network. Returned by the cost function.
        :param Y: labels
        :param cache: cached values during forwardprop
        :returns gradients: dW and db for each weight/bias
        """
        gradients = {'dW': [np.zeros(w.shape) for w in self.weights],
                     'db': [np.zeros(b.shape) for b in self.biases]}
        w_len = (self.num_layers) - 1
        for i in range(w_len):
            if self.drop_prob > 0:
                return dropout_backward(self, dA, cache)
                #call dropout_backward
           
            
        if self.reg_lambda > 0:
            dW += dAL*self.weights
            # add gradients from L2 regularization to each dW
            
        return gradients


    def updateParameters(self, gradients, alpha):
        """
        :param gradients: gradients for each weight/bias
        :param alpha: step size for gradient descent 
        """
        
        self.weights = [w - (alpha * dw) for w, dw in zip(self.weights,gradients['dW'])]
        self.biases = [b - (alpha * db) for b, db in zip(self.biases,gradients['db'])]

    def train(self, X, y, iters=1000, alpha=0.0001, batch_size=100, print_every=100):
        """
        :param X: input samples, each column is a sample
        :param y: labels for input samples, y.shape[0] must equal X.shape[1]
        :param iters: number of training iterations
        :param alpha: step size for gradient descent
        :param batch_size: number of samples in a minibatch
        :param print_every: no. of iterations to print debug info after
        """
        
        for i in range(0, iters):
            # get minibatch
            X_batch, y_batch = get_batch(X, y, batch_size)
            
            # forward prop
            AL, cache = forwardPropagation(X_batch)

            # compute loss
            cost, dAL = costFunction(AL,y_batch)

            # compute gradients
            gradients = backPropagation(dAL,y_batch,cache)

            # update weights and biases based on gradient
            updateParameters(gradients,alpha)

            if i % print_every == 0:
                # print cost, train and validation set accuracies
                
    def predict(self, X):
        """
        Make predictions for each sample
        """

        y_pred = [np.argmax(forwardPropagation(x)) for x in X]
        return y_pred

    def get_batch(self, X, y, batch_size):
        """
        Return minibatch of samples and labels
        
        :param X, y: samples and corresponding labels
        :parma batch_size: minibatch size
        :returns: (tuple) X_batch, y_batch
        """
        num_samples = len(y)
        idx = np.random.randint(num_samples, size=batch_size)
        
        X_batch = X[:,idx]
        y_batch = y[idx]

        return X_batch, y_batch
    

SyntaxError: invalid syntax (<ipython-input-9-00586248cd93>, line 122)

In [3]:
def affineForward(A, W, b):

    Z = np.matmul(W, A) + b
    cache = [A,W,b]
        
    return Z, cache
    
def forwardPropagation(X):
    layer_dimensions = [20, 100, 50, 4]
    num_layers = len(layer_dimensions)
    layer_dimensions = [X.shape[0]] + layer_dimensions
    weights = [np.random.randn(layer_dimensions[layer + 1], layer_dimensions[layer]) for layer in range(num_layers)]
    biases = [np.random.randn(layer, 1) for layer in layer_dimensions[1:]]
    cache = []
    AL = X #do we have to activate X before running as well?
    for i in range(num_layers):
        h, c = affineForward(AL, weights[i], biases[i])
        cache.append(c)
        AL = activationForward(h)
    
    return AL, cache

def activationForward(A, activation="relu"):
       
    if activation == 'relu':
        return relu(dropout(A, 0)[0])
    else:
        return dropout(A, 0)[0]

def relu(X):
    return np.maximum(0, X)

def dropout(A, prob):

    M = np.random.binomial(1, 1 - prob, A.shape)
    A = M*A
        
    return A, M

def costFunction(AL, y):
        """
        :param AL: Activation of last layer, shape (num_classes, S)
        :param y: labels, shape (S)
        :param alpha: regularization parameter
        :returns cost, dAL: A scalar denoting cost and the gradient of cost
        """
        # compute loss
        print(X)
        print(AL)
        return sum(-np.log(np.exp(X[y, np.arange(y.shape[0])])/np.sum(np.exp(X), axis=0)))

X = np.random.randn(4, 6)
y = np.array([1, 2, 1, 0, 3, 0])
z = forwardPropagation(X)[0]
print(z)
print(costFunction(z, y))

[[   0.            0.            0.            0.            0.            0.        ]
 [   0.            0.            0.            0.            0.            0.        ]
 [ 346.11393379  342.60731341  382.5911783   211.71014542  484.10177676
   439.99221486]
 [   0.            0.            0.            0.            0.            0.        ]]
[[-1.0512458   0.59673156  1.17942456 -0.49144803  1.60792931  1.06653289]
 [-1.04056201  0.14560455 -0.76195773 -0.62429588  1.45913116  1.08310527]
 [-0.20205452 -0.45376338  0.39750631  0.20755261 -0.20782217 -0.13270209]
 [ 0.246701    0.38693215 -0.59426762  1.13966871  0.93655367 -1.83375772]]
[[   0.            0.            0.            0.            0.            0.        ]
 [   0.            0.            0.            0.            0.            0.        ]
 [ 346.11393379  342.60731341  382.5911783   211.71014542  484.10177676
   439.99221486]
 [   0.            0.            0.            0.            0.            0.        

In [273]:
X = np.array([range(6), range(6), np.arange(2, 8), np.arange(2, 8)])
y = np.array([1, 2, 1, 0, 3, 0])
print(X)
print(y)
print(np.exp(X[y, np.arange(y.shape[0])]))
print(np.sum(np.exp(X), axis=0))
print(np.exp(X[y, np.arange(y.shape[0])])/np.sum(np.exp(X), axis=0))
print(np.sum(-np.log(np.exp(X[y, np.arange(y.shape[0])])/np.sum(np.exp(X), axis=0))))
a = np.exp(X[y, np.arange(y.shape[0])])/np.sum(np.exp(X), axis=0)
print(X.shape[1])
cost = np.sum(-np.log(a))/X.shape[1]
print(cost)
print(-(y - a) - (1 - y)/(1 - a))

[[0 1 2 3 4 5]
 [0 1 2 3 4 5]
 [2 3 4 5 6 7]
 [2 3 4 5 6 7]]
[1 2 1 0 3 0]
[   1.           20.08553692    7.3890561    20.08553692  403.42879349
  148.4131591 ]
[   16.7781122     45.6076375    123.97441226   336.99739205   916.05388705
  2490.09263506]
[ 0.05960146  0.44039854  0.05960146  0.05960146  0.44039854  0.05960146]
12.9204511496
6
2.15340852494
[-0.94039854  0.22738458 -0.94039854 -1.00377748  1.01437062 -1.00377748]


In [274]:
a = np.exp(X[y, np.arange(y.shape[0])])/np.sum(np.exp(X), axis=0)
cost = np.sum(-np.log(a))/X.shape[1]
        
dAL = -(y - a) - (1 - y)/(1 - a)

print(cost)
print(dAL)

2.15340852494
[-0.94039854  0.22738458 -0.94039854 -1.00377748  1.01437062 -1.00377748]


In [None]:
def get_batch(X, y, start_index, batch_size):

    num_samples = y.shape[0]
        
    end_index = start_index + batch_size
    if end_index <= num_samples:
        X_batch = X[:, start_index:end_index]
        y_batch = y[start_index:end_index]
    else:
        num_needed = end_index - num_samples
        idx = np.random.randint(low=0, high=start_index + 1, size=num_needed)
        X_batch = np.hstack((X[:, start_index:num_samples], X[:, idx]))
        y_batch = np.hstack((y[start_index:num_samples], y[idx]))

    return X_batch, y_batch

tmp = np.vstack((X, y.reshape(1, y.shape[0])))

np.random.shuffle(tmp.T)
X_shuffle = tmp[:-1, :]
y_shuffle = tmp[-1]

for i in range(0, 4):
            # get minibatch
    minibatch = get_batch(X_shuffle, y_shuffle, i*6, 6)

            

In [242]:
layer_dimensions = [1024, 2, 3, 10]
num_layers = len(layer_dimensions)
w = [np.random.randn(layer_dimensions[layer],
                                        layer_dimensions[layer - 1]) for layer in range(1, num_layers)]

sum = 0
for arr in w:
    print(arr)
    sum += np.sum(np.power(arr, 2))
    print(sum)
print(len(layer_dimensions))

[[ 0.46545872 -1.18451186 -0.14435158 ..., -1.10285693  0.2096132
   0.09647918]
 [ 0.41830311 -0.58718594  1.15162257 ...,  0.35347054  1.02449449
   1.59520472]]
2026.51851294
[[-1.44841695  1.47592368]
 [ 0.9179381   0.29827026]
 [ 0.53612275 -0.41796752]]
2032.18847529
[[ 1.51503008 -0.84580818  0.65553981]
 [-0.10485777  0.83321957 -0.84311856]
 [-0.46200693  0.99529294  0.21866331]
 [ 0.56383977 -1.84154886  1.46773239]
 [ 0.1304769  -0.62208291  0.08807739]
 [ 0.86629397 -0.14887122 -1.06486437]
 [ 0.09059368 -1.40816211 -0.58023079]
 [-0.1845622   1.98474232  0.64060703]
 [-0.57914649  0.62148421  1.19365481]
 [ 0.29024885 -0.68542022  0.12202937]]
2055.90551489
4


In [10]:
# Helper functions, DO NOT modify this

def get_img_array(path):
    """
    Given path of image, returns it's numpy array
    """
    return scipy.misc.imread(path)

def get_files(folder):
    """
    Given path to folder, returns list of files in it
    """
    filenames = [file for file in glob.glob(folder+'*/*')]
    filenames.sort()
    return filenames

def get_label(filepath, label2id):
    """
    Files are assumed to be labeled as: /path/to/file/999_frog.png
    Returns label for a filepath
    """
    tokens = filepath.split('/')
    label = tokens[-1].split('_')[1][:-4]
    if label in label2id:
        return label2id[label]
    else:
        sys.exit("Invalid label: " + label)

In [11]:
# Functions to load data, DO NOT change these

def get_labels(folder, label2id):
    """
    Returns vector of labels extracted from filenames of all files in folder
    :param folder: path to data folder
    :param label2id: mapping of text labels to numeric ids. (Eg: automobile -> 0)
    """
    files = get_files(folder)
    y = []
    for f in files:
        y.append(get_label(f,label2id))
    return np.array(y)

def one_hot(y, num_classes=10):
    """
    Converts each label index in y to vector with one_hot encoding
    """
    y_one_hot = np.zeros((y.shape[0], num_classes))
    y_one_hot[y] = 1
    return y_one_hot.T

def get_label_mapping(label_file):
    """
    Returns mappings of label to index and index to label
    The input file has list of labels, each on a separate line.
    """
    with open(label_file, 'r') as f:
        id2label = f.readlines()
        id2label = [l.strip() for l in id2label]
    label2id = {}
    count = 0
    for label in id2label:
        label2id[label] = count
        count += 1
    return id2label, label2id

def get_images(folder):
    """
    returns numpy array of all samples in folder
    each column is a sample resized to 30x30 and flattened
    """
    files = get_files(folder)
    images = []
    count = 0
    
    for f in files:
        count += 1
        if count % 10000 == 0:
            print("Loaded {}/{}".format(count,len(files)))
        img_arr = get_img_array(f)
        img_arr = img_arr.flatten() / 255.0
        images.append(img_arr)
    X = np.column_stack(images)

    return X

def get_train_data(data_root_path):
    """
    Return X and y
    """
    train_data_path = data_root_path + 'train'
    id2label, label2id = get_label_mapping(data_root_path+'labels.txt')
    print(label2id)
    X = get_images(train_data_path)
    y = get_labels(train_data_path, label2id)
    return X, y

def save_predictions(filename, y):py
    """
    Dumps y into .npy file
    """
    np.save(filename, y)

IndentationError: unexpected indent (<ipython-input-11-72e163b042df>, line 70)

In [12]:
# Load the data
data_root_path = '/path/to/extracted/data/cifar10-hw1/'
X_train, y_train = get_train_data(data_root_path) # this may take a few minutes
X_test = get_images(data_root_path + 'test')
print('Data loading done')

NameError: name 'get_train_data' is not defined

## Part 1

#### Simple fully-connected deep neural network

In [None]:
layer_dimensions = [X_train.shape[0], ..., 10]  # including the input and output layers
NN = NeuralNetwork(layer_dimensions)
NN.train(X_train, y_train, iters=, alpha=, batch_size=, print_every=)

In [None]:
y_predicted = NN.predict(X_test)
save_predictions('ans1-uni', y_predicted)

In [None]:
# test if your numpy file has been saved correctly
loaded_y = np.load('ans1-uni.npy')
print(loaded_y.shape)
loaded_y[:10]

## Part 2: Regularizing the neural network
#### Add dropout and L2 regularization

In [None]:
NN2 = NeuralNetwork(layer_dimensions, drop_prob=0, reg_lambda=0)
NN2.train(X_train, y_train, iters=1000, alpha=0.00001, batch_size=1000, print_every=10)

In [None]:
y_predicted2 = NN2.predict(X)
save_predictions(y_predicted, 'ans2-uni')