# **Task 1: Activations**

---


In [1]:
import numpy as np
import math

In [2]:
class Activation(object):

    """
    Interface for activation functions (non-linearities).

    In all implementations, the state attribute must contain the result,
    i.e. the output of forward.
    """

    # This class acts as an abstract base class: subclasses override
    # `forward` and `derivative`.

    # Note that these activation functions are element-wise operations, i.e.
    # they shouldn't change the shape of the input.

    def __init__(self):
        # Output of the most recent forward pass (None until forward runs).
        self.state = None

    def __call__(self, x):
        """Apply the activation; shorthand for ``forward(x)``."""
        return self.forward(x)

    def forward(self, x):
        """Compute the activation of ``x``. Must be overridden.

        Raises:
            NotImplementedError: always, in this base class.
        """
        # `NotImplemented` is a comparison sentinel, not an exception;
        # raising it produces a confusing TypeError.
        raise NotImplementedError("forward must be implemented by subclasses")

    def derivative(self):
        """Return the derivative at the last forward input. Must be overridden.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError("derivative must be implemented by subclasses")

In [3]:
class Identity(Activation):

    """
    Pass-through activation: the output equals the input.
    """

    # Provided as a worked example of the Activation interface.

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Remember ``x`` in ``state`` and hand it back unchanged."""
        self.state = x
        return self.state

    def derivative(self):
        """Slope of the identity map, the constant 1.0."""
        slope = 1.0
        return slope

In [4]:
class Sigmoid(Activation):

    """
    Sigmoid non-linearity: 1 / (1 + exp(-x)), applied element-wise.
    """

    # Remember do not change the function signatures as those are needed
    # to stay the same for AutoLab.

    def __init__(self):
        super(Sigmoid, self).__init__()
        # Kept alongside `state` for code that reads `output`.
        self.output = None

    def forward(self, x):
        """Compute the sigmoid of ``x`` and cache it.

        The result is stored in both ``state`` (as the Activation interface
        requires) and ``output``.
        """
        self.state = 1 / (1 + np.exp(-x))
        self.output = self.state
        return self.state

    def derivative(self):
        """Return s * (1 - s), where s is the cached forward output.

        Raises:
            ValueError: if forward has not been called yet.
        """
        if self.state is None:
            raise ValueError("Forward pass must be called before derivative.")
        return self.state * (1 - self.state)

In [5]:
class Tanh(Activation):

    """
    Tanh non-linearity, applied element-wise.
    """

    def __init__(self):
        super(Tanh, self).__init__()
        # Kept alongside `state` for code that reads `output`.
        self.output = None

    def forward(self, x):
        """Compute tanh(x) and cache it.

        The result is stored in both ``state`` (as the Activation interface
        requires) and ``output``.
        """
        self.state = np.tanh(x)
        self.output = self.state
        return self.state

    def derivative(self):
        """Return 1 - t**2, where t is the cached forward output.

        Raises:
            ValueError: if forward has not been called yet.
        """
        if self.state is None:
            raise ValueError("Forward pass must be called before derivative.")
        # `**` instead of np.pow: the np.pow alias only exists in NumPy >= 2.0.
        return 1 - self.state ** 2


In [6]:
class ReLU(Activation):

    """
    ReLU non-linearity: max(0, x), applied element-wise.
    """

    def __init__(self):
        super(ReLU, self).__init__()
        # Kept alongside `state` for code that reads `output`.
        self.output = None

    def forward(self, x):
        """Compute max(0, x) and cache it.

        The result is stored in both ``state`` (as the Activation interface
        requires) and ``output``.
        """
        self.state = np.maximum(0, x)
        self.output = self.state
        return self.state

    def derivative(self):
        """Return 1.0 where the cached output is positive and 0.0 elsewhere.

        Raises:
            ValueError: if forward has not been called yet.
        """
        if self.state is None:
            raise ValueError("Forward pass must be called before derivative.")
        # The gradient flows only through units that were active (output > 0).
        return np.where(self.state > 0, 1.0, 0.0)

# **Task 2: Loss**

---



In [7]:
# The following Criterion class is the common base for the loss functions
# below. Losses are written as classes so that they can be swapped easily,
# which is also how PyTorch and other ML libraries structure them.

class Criterion(object):
    """
    Interface for loss functions.
    """

    # Nothing needs to be done to this class; the concrete loss classes
    # derive from it.

    def __init__(self):
        # Filled in by subclasses during their forward pass.
        self.logits = None
        self.labels = None
        self.loss = None

    def __call__(self, x, y):
        """Evaluate the loss; shorthand for ``forward(x, y)``."""
        return self.forward(x, y)

    def forward(self, x, y):
        """Compute the loss for inputs ``x`` and targets ``y``. Must be overridden.

        Raises:
            NotImplementedError: always, in this base class.
        """
        # `NotImplemented` is a comparison sentinel, not an exception;
        # raising it produces a confusing TypeError.
        raise NotImplementedError("forward must be implemented by subclasses")

    def derivative(self):
        """Return the gradient of the loss. Must be overridden.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError("derivative must be implemented by subclasses")

In [8]:
class SoftmaxCrossEntropy(Criterion):
    """
    Softmax followed by cross-entropy, evaluated per sample.
    """

    def __init__(self):
        super(SoftmaxCrossEntropy, self).__init__()
        # Class probabilities from the latest forward pass.
        self.softmax_probs = None

    def forward(self, x, y):
        """
        Argument:
            x (np.array): (batch size, num classes) logits
            y (np.array): (batch size, num classes) target distribution
                          (e.g. one-hot labels)
        Return:
            out (np.array): (batch size, ) per-sample loss
        """
        self.logits = x
        self.labels = y

        # Subtract the row-wise max before exponentiating: softmax is
        # invariant to this shift and it prevents overflow in np.exp.
        shifted = x - np.max(x, axis=1, keepdims=True)
        exp_scores = np.exp(shifted)
        # Normalize across the class axis (axis=1) so every row sums to 1.
        normalizer = np.sum(exp_scores, axis=1, keepdims=True)
        self.softmax_probs = exp_scores / normalizer

        # Log-softmax via log-sum-exp. The normalizer is >= 1 because the
        # largest shifted score is 0, so the log is always finite.
        log_probs = shifted - np.log(normalizer)

        # Cross-entropy of each sample against its target row.
        self.loss = -np.sum(y * log_probs, axis=1)
        return self.loss

    def derivative(self):
        """
        Return:
            out (np.array): (batch size, num classes) gradient of the
                            per-sample loss with respect to the logits

        Raises:
            ValueError: if forward has not been called yet.
        """
        if self.softmax_probs is None:
            raise ValueError("Forward pass must be called before derivative.")

        # Gradient of cross-entropy with softmax: softmax_probs - labels.
        # No division by the batch size here: forward returns unreduced
        # per-sample losses, and Linear.backward already averages over the
        # batch, so dividing here as well would normalize twice.
        return self.softmax_probs - self.labels

# **Task 3: Linear Layer**

---



In [11]:
class Linear():
    """Fully connected layer computing ``x @ W + b``."""

    def __init__(self, in_feature, out_feature, weight_init_fn, bias_init_fn):

        """
        Argument:
            in_feature (int): number of input features
            out_feature (int): number of output features
            weight_init_fn (callable): called as weight_init_fn(in_feature,
                out_feature); returns the initial W
            bias_init_fn (callable): called as bias_init_fn(out_feature);
                returns the initial b

        Attributes:
            W (np.array): (in feature, out feature)
            dW (np.array): (in feature, out feature)
            momentum_W (np.array): (in feature, out feature)

            b (np.array): (1, out feature)
            db (np.array): (1, out feature)
            momentum_b (np.array): (1, out feature)
        """

        self.W = weight_init_fn(in_feature, out_feature)
        self.b = bias_init_fn(out_feature)

        # np.zeros expects a shape, not an array, so build the gradient and
        # momentum buffers from the shapes of the parameters they mirror.
        self.dW = np.zeros(np.shape(self.W))
        self.db = np.zeros(np.shape(self.b))

        self.momentum_W = np.zeros(np.shape(self.W))
        self.momentum_b = np.zeros(np.shape(self.b))

        # Input of the latest forward pass; backward needs it for dW.
        self.x = None

    def __call__(self, x):
        """Shorthand for ``forward(x)``."""
        return self.forward(x)

    def forward(self, x):
        """
        Argument:
            x (np.array): (batch size, in feature)
        Return:
            out (np.array): (batch size, out feature)
        """
        self.x = x
        return np.dot(x, self.W) + self.b

    def backward(self, delta):

        """
        Store the batch-averaged parameter gradients in dW / db and return
        the gradient with respect to the layer input.

        Argument:
            delta (np.array): (batch size, out feature)
        Return:
            out (np.array): (batch size, in feature)
        Raises:
            ValueError: if forward has not been called yet.
        """
        if self.x is None:
            raise ValueError("Forward pass must be called before backward.")

        batch_size = self.x.shape[0]    # number of samples
        self.dW = np.dot(self.x.T, delta) / batch_size
        self.db = np.sum(delta, axis=0, keepdims=True) / batch_size
        return np.dot(delta, self.W.T)


# **Task 4: Simple MLP**

---



In [10]:
class MLP(object):

    """
    A simple multilayer perceptron: a stack of linear layers, each followed
    by an activation, trained with plain gradient descent.
    """

    def __init__(self, input_size, output_size, hiddens, activations, weight_init_fn,
                 bias_init_fn, criterion, lr):
        """
        Argument:
            input_size (int): number of input features
            output_size (int): number of output features
            hiddens (sequence of int): sizes of the hidden layers
            activations (sequence): one activation object per linear layer
            weight_init_fn (callable): forwarded to every Linear layer
            bias_init_fn (callable): forwarded to every Linear layer
            criterion: loss object providing __call__(x, y) and derivative()
            lr (float): learning rate used by step()
        """

        # Don't change this -->
        self.train_mode = True
        self.nlayers = len(hiddens) + 1
        self.input_size = input_size
        self.output_size = output_size
        self.activations = activations
        self.criterion = criterion
        self.lr = lr
        # <---------------------

        # Don't change the name of the following class attributes,
        # the autograder will check against these attributes.

        # Consecutive pairs of layer widths give each layer's (in, out) size.
        # list(...) lets `hiddens` be any sequence, not only a list.
        widths = [input_size] + list(hiddens) + [output_size]
        self.linear_layers = [
            Linear(in_feature, out_feature, weight_init_fn, bias_init_fn)
            for in_feature, out_feature in zip(widths[:-1], widths[1:])
        ]

    def forward(self, x):
        """
        Argument:
            x (np.array): (batch size, input_size)
        Return:
            out (np.array): (batch size, output_size)
        """
        # Each linear layer is followed by its paired activation.
        for linear, activation in zip(self.linear_layers, self.activations):
            x = activation(linear(x))
        # Cached for backward / error / total_loss.
        self.output = x
        return x

    def zero_grads(self):
        """Reset the stored gradients of every linear layer to zero in place."""
        for layer in self.linear_layers:
            layer.dW.fill(0.0)
            layer.db.fill(0.0)

    def step(self):
        """Apply one gradient-descent update to every linear layer."""
        # (Momentum is added to the linear layers later in the assignment.)
        for layer in self.linear_layers:
            layer.W = layer.W - self.lr * layer.dW
            layer.b = layer.b - self.lr * layer.db
        # Do the same for batchnorm layers

    def backward(self, labels):
        """
        Backpropagate the loss for the latest forward output through every
        activation and linear layer, filling each layer's dW / db.

        Argument:
            labels (np.array): targets matching the latest forward output
        """
        # Run the criterion forward so its derivative refers to the current
        # output and labels, then take the gradient w.r.t. the network output.
        self.criterion(self.output, labels)
        delta = self.criterion.derivative()

        # Walk the layers from last to first. Activations expose a local
        # derivative (to be multiplied in), whereas Linear.backward is a full
        # backward pass that consumes the gradient w.r.t. its own output.
        for layer, activation in reversed(list(zip(self.linear_layers, self.activations))):
            delta = layer.backward(delta * activation.derivative())

    def error(self, labels):
        """Count samples whose argmax prediction differs from the argmax label."""
        return (np.argmax(self.output, axis = 1) != np.argmax(labels, axis = 1)).sum()

    def total_loss(self, labels):
        """Sum of the criterion's loss over the latest forward output."""
        return self.criterion(self.output, labels).sum()

    def __call__(self, x):
        """Shorthand for ``forward(x)``."""
        return self.forward(x)

    def train(self):
        """Switch the model to training mode."""
        self.train_mode = True

    def eval(self):
        """Switch the model to evaluation mode."""
        self.train_mode = False



In [13]:
#mlp = MLP(784, 10, [64, 64, 32], [Sigmoid(), Sigmoid(), Sigmoid(), Identity()], weight_init_fn, bias_init_fn, SoftmaxCrossEntropy(), 0.008)