# In [14]:
import numpy as np
import unittest

np.random.seed(1024)
from models._base_network import _baseNetwork


# In [81]:

class TwoLayerNet(_baseNetwork):
    """
    Two-layer fully connected classifier: linear => sigmoid => linear => softmax.

    Parameters live in ``self.weights`` and their gradients in ``self.gradients``
    under the keys ``'W1'``, ``'b1'``, ``'W2'`` and ``'b2'``. Both containers, as
    well as ``sigmoid``, ``sigmoid_dev``, ``softmax``, ``cross_entropy_loss`` and
    ``compute_accuracy``, are inherited from ``_baseNetwork`` (not shown here).
    """

    def __init__(self, input_size=28 * 28, num_classes=10, hidden_size=128):
        """
        :param input_size: the input dimension
        :param num_classes: the number of classes in total
        :param hidden_size: the number of units in the hidden layer
        """
        super().__init__(input_size, num_classes)

        self.hidden_size = hidden_size
        self._weight_init()

    def _weight_init(self):
        """
        initialize weights of the network
        :return: None; self.weights is filled based on method
        - W1: The weight matrix of the first layer of shape (input_size, hidden_size)
        - b1: The bias term of the first layer of shape (hidden_size,)
        - W2: The weight matrix of the second layer of shape (hidden_size, num_classes)
        - b2: The bias term of the second layer of shape (num_classes,)
        """
        # Biases start at zero.
        self.weights['b1'] = np.zeros(self.hidden_size)
        self.weights['b2'] = np.zeros(self.num_classes)

        # The generator is deliberately re-seeded before EACH draw so the initial
        # weights are reproducible. NOTE(review): presumably the stored gradient
        # fixtures were produced with exactly this initialisation -- do not
        # "simplify" to a single seed call without regenerating them.
        np.random.seed(1024)
        self.weights['W1'] = 0.001 * np.random.randn(self.input_size, self.hidden_size)
        np.random.seed(1024)
        self.weights['W2'] = 0.001 * np.random.randn(self.hidden_size, self.num_classes)

        # Gradients start as zeros with the same shapes as their parameters.
        self.gradients['W1'] = np.zeros((self.input_size, self.hidden_size))
        self.gradients['b1'] = np.zeros(self.hidden_size)
        self.gradients['W2'] = np.zeros((self.hidden_size, self.num_classes))
        self.gradients['b2'] = np.zeros(self.num_classes)

    def forward(self, X, y, mode='train'):
        """
        The forward pass of the two-layer net. The activation function used in between the two layers is sigmoid
        (self.sigmoid); the output of the second layer is passed through self.softmax before the cross entropy
        loss is computed.

        :param X: a batch of images (N, input_size)
        :param y: integer labels of images in the batch (N,)
        :param mode: if mode is 'train', compute and store gradients; else, just return the loss and accuracy
                     and leave self.gradients untouched
        :return:
            loss: the loss associated with the batch
            accuracy: the accuracy of the batch
            self.gradients: gradients are not explicitly returned but rather updated in the class member self.gradients
        """
        # ---- forward pass ----
        Z1 = np.dot(X, self.weights['W1']) + self.weights['b1']
        A1 = self.sigmoid(Z1)

        Z2 = np.dot(A1, self.weights['W2']) + self.weights['b2']
        probs = self.softmax(Z2)

        accuracy = self.compute_accuracy(probs, y)
        loss = self.cross_entropy_loss(probs, y)

        # Evaluation must not touch the stored gradients (same contract as
        # SoftmaxRegression.forward).
        if mode != 'train':
            return loss, accuracy

        # ---- backward pass ----
        batch_size = len(y)

        # Gradient of softmax + cross entropy w.r.t. the logits Z2 is (probs - one_hot(y)).
        # Work on a copy so the softmax output itself is never mutated.
        error = probs.copy()
        error[np.arange(batch_size), y] -= 1

        # Second layer: average over the batch.
        self.gradients['W2'] = 1 / batch_size * np.dot(A1.T, error)
        self.gradients['b2'] = 1 / batch_size * np.sum(error, axis=0)

        # Back-propagate through W2 (the 1/N factor is folded in here, so the
        # first-layer gradients below need no further scaling) ...
        dhidden = np.dot(1 / batch_size * error, self.weights['W2'].T)
        # ... and through the sigmoid, whose derivative is evaluated at the pre-activation Z1.
        diff_sigmoid = dhidden * self.sigmoid_dev(Z1)

        self.gradients['W1'] = np.dot(X.T, diff_sigmoid)
        self.gradients['b1'] = np.sum(diff_sigmoid, axis=0)

        return loss, accuracy

# Manual driver cell for the two-layer network gradient check.
# The input batch and labels are read from the softmax_grad_check fixture directory.
test_batch = np.load('tests/softmax_grad_check/test_batch.npy')
test_label = np.load('tests/softmax_grad_check/test_label.npy')

model = TwoLayerNet(hidden_size=128)
# expected_loss = 2.30285
# Reference gradients are loaded here but not compared in this cell; the actual
# comparison is performed by TestNetwork.test_two_layer_net, invoked below.
w1_grad_expected = np.load('tests/twolayer_grad_check/w1.npy')
b1_grad_expected = np.load('tests/twolayer_grad_check/b1.npy')
w2_grad_expected = np.load('tests/twolayer_grad_check/w2.npy')
b2_grad_expected = np.load('tests/twolayer_grad_check/b2.npy')

# One training-mode pass; the accuracy is discarded and model.gradients is filled as a side effect.
loss, _ = model.forward(test_batch, test_label, mode='train')


# Run the two-layer test by hand, calling setUp() explicitly instead of going through a unittest runner.
# NOTE(review): TestNetwork is defined further down in this file, so these lines only work
# if that definition has already been executed (e.g. notebook cells run out of order).
tn = TestNetwork()
tn.setUp()
tn.test_two_layer_net()

# In [64]:
class TestNetwork(unittest.TestCase):
    """Loss and gradient regression checks for the networks of this assignment."""

    def setUp(self):
        """Load the input batch and label fixtures shared by every test."""
        self.test_batch = np.load('tests/softmax_grad_check/test_batch.npy')
        self.test_label = np.load('tests/softmax_grad_check/test_label.npy')

    def test_one_layer_softmax_relu(self):
        """SoftmaxRegression: check the loss and the W1 gradient against the stored reference."""
        net = SoftmaxRegression()
        reference_grad = np.load('tests/softmax_grad_check/softmax_relu_grad.npy')

        batch_loss, _ = net.forward(self.test_batch, self.test_label, mode='train')

        self.assertAlmostEqual(2.3029, batch_loss, places=5)
        # Total absolute deviation between stored and computed gradient must vanish.
        total_abs_err = np.abs(reference_grad - net.gradients['W1']).sum()
        self.assertAlmostEqual(total_abs_err, 0)

    def test_two_layer_net(self):
        """TwoLayerNet: check the loss and all four gradients against the stored references."""
        net = TwoLayerNet(hidden_size=128)
        # Keyed by gradient name; insertion order fixes the order of the checks below.
        reference = {
            'W1': np.load('tests/twolayer_grad_check/w1.npy'),
            'b1': np.load('tests/twolayer_grad_check/b1.npy'),
            'W2': np.load('tests/twolayer_grad_check/w2.npy'),
            'b2': np.load('tests/twolayer_grad_check/b2.npy'),
        }

        batch_loss, _ = net.forward(self.test_batch, self.test_label, mode='train')

        self.assertAlmostEqual(2.30285, batch_loss, places=5)

        for key, expected in reference.items():
            total_abs_err = np.abs(expected - net.gradients[key]).sum()
            self.assertAlmostEqual(total_abs_err, 0)


# In [79]:
class SoftmaxRegression(_baseNetwork):
    """
    Single-layer classifier: linear layer without bias => ReLU => softmax.

    ``self.weights`` / ``self.gradients`` and the helpers ``ReLU``, ``softmax``,
    ``cross_entropy_loss`` and ``compute_accuracy`` are inherited from
    ``_baseNetwork`` (not shown here).
    """

    def __init__(self, input_size=28 * 28, num_classes=10):
        """
        A single layer softmax regression. The network is composed by:
        a linear layer without bias => (activation) => Softmax
        :param input_size: the input dimension
        :param num_classes: the number of classes in total
        """
        super().__init__(input_size, num_classes)
        self._weight_init()

    def _weight_init(self):
        '''
        initialize weights of the single layer regression network. No bias term included.
        :return: None; self.weights is filled based on method
        - W1: The weight matrix of the linear layer of shape (input_size, num_classes)
        '''
        # Fixed seed so the initial weights are reproducible.
        np.random.seed(1024)
        self.weights['W1'] = 0.001 * np.random.randn(self.input_size, self.num_classes)
        self.gradients['W1'] = np.zeros((self.input_size, self.num_classes))

    def one_hot(self, y):
        """
        One-hot encode integer class labels.

        :param y: integer labels, each in [0, num_classes); any shape, flattened to (N,)
        :return: float array of shape (N, num_classes) with a single 1 per row
        """
        labels = np.asarray(y, dtype=int).reshape(-1)
        # Row k of the identity matrix is the one-hot vector of class k.
        return np.eye(self.num_classes)[labels]

    def forward(self, X, y, mode='train'):
        """
        Compute loss and gradients using softmax with vectorization.

        :param X: a batch of image (N, 28x28)
        :param y: integer labels of images in the batch (N,)
        :param mode: if 'train', also compute the gradient and store it in
                     self.gradients['W1']; otherwise only loss and accuracy are computed
        :return:
            loss: the loss associated with the batch
            accuracy: the accuracy of the batch
        """
        # ---- forward pass ----
        Z = np.matmul(X, self.weights['W1'])
        A = self.ReLU(Z)
        p = self.softmax(A)

        accuracy = self.compute_accuracy(p, y)
        loss = self.cross_entropy_loss(p, y)

        if mode != 'train':
            return loss, accuracy

        # ---- backward pass ----
        batch_size = len(y)

        # Gradient of softmax + cross entropy w.r.t. the softmax input is (p - one_hot(y)).
        # Work on a copy so the softmax output itself is never mutated.
        error = p.copy()
        error[np.arange(batch_size), y] -= 1

        # Chain through ReLU: the gradient only flows where the activation is positive.
        dZ = 1 / batch_size * np.multiply(error, np.int64(A > 0))

        # Store (not accumulate) the gradient of this batch, so repeated calls do
        # not pile up stale gradients from earlier batches.
        self.gradients['W1'] = np.dot(X.T, dZ)

        return loss, accuracy

# Run the single-layer softmax/ReLU check by hand, calling setUp() explicitly
# instead of going through a unittest runner. Rebinds the module-level name `tn`.
tn = TestNetwork()
tn.setUp()
tn.test_one_layer_softmax_relu()