## Exercise - DL Tutorial 04: FCNN - BP 

Please complete the following notebook and submit your solutions to manuel.milling@informatik.uni-augsburg.de OR maurice.gerczuk@informatik.uni-augsburg.de till 26 May 23:59.

## student name: 
> Benedikt Bauer, David Heim, Franz Schulze

Solutions from exercise sheet 3 (class methods below).

In [2]:
import numpy as np
#numpy random seed
np.random.seed(42)

trainx, trainy, testx, testy = np.load('./res/mnist.npy', allow_pickle=True)
print("Trainx shape: {}".format(trainx.shape))
print("Trainy shape: {}".format(trainy.shape))
print("Testx shape:  {}".format(testx.shape))
print("Testy shape:  {}".format(testy.shape))

def sigmoid(X):
    return 1/(1 +np.exp(-X))

def softmax(X):
    #more stable
    eps = X.max()
    return np.exp(X + eps)/(np.sum(np.exp(X + eps), axis=1).reshape((X.shape[0],1)))

def fcc_one_layer(X, W, b, activation):
    return activation(np.matmul(X, W) + b)

def cross_entropy(pred_logits, y):
    num_data_points = pred_logits.shape[0]
    correct_logits = pred_logits[np.arange(num_data_points),y]
    return np.mean(-np.log(correct_logits))

def accuracy(logits, labels):
    class_predictions = np.argmax(logits, axis=1)
    return np.mean(class_predictions == labels)



Trainx shape: (60000, 784)
Trainy shape: (60000,)
Testx shape:  (10000, 784)
Testy shape:  (10000,)


1.   Implement the error of the last layer.

In [3]:
def delta_last_layer(H, y):
    """
    :param H: softmax activations of shape (num_examples, num_classes)
    :param y: correct labels of shape (num_examples,)
    :return: delta of of last layer, i.e. derivative of cross entropy times derivative of softmax
    """
    # note: this is destructive concerning H 
    # (which is not really a problem for the fcc class because H is not used after this, and you can regain H by running forward_propagation again)
    H[range(H.shape[0]), y] -= 1 # using formula 32 
    return H.T


2.   Implement the derivative of the sigmoid function in terms of the sigmoid function.

In [4]:
def del_sigmoid(H: np.ndarray) -> np.ndarray:
    """
    :param H: output of the sigmoid function shape (num_examples, num_units)
    :return: element-wise derivative of the sigmoid function
    """
    return H * (1 - H)


3.   Implement the backpropagation as a class method.
4.   Implement the the optimisation step as a class method.


In [5]:
class fcc:
    def __init__(self, n_input, n_hidden1, n_hidden2, n_out):
        #parameters
        self.W_i_h1 = np.random.randn(n_input, n_hidden1)
        self.W_h1_h2 = np.random.randn(n_hidden1, n_hidden2)
        self.W_h2_o = np.random.randn(n_hidden2, n_out)
        self.b_h1 = np.random.randn(n_hidden1)
        self.b_h2 = np.random.randn(n_hidden2)
        self.b_out = np.random.randn(n_out)
        #neuron activations and input H^n
        self.X = None
        self.h1 = None
        self.h2 = None
        self.out = None
        #components of the gradient
        self.dW_i_h1 = None
        self.db_h1 = None
        self.dW_h1_h2 = None
        self.db_h2 = None
        self.dW_h2_o = None
        self.db_out = None

        n_trainable_bias = self.b_h1.shape[0] + self.b_h2.shape[0] + self.b_out.shape[0]
        n_trainable_weights = self.W_i_h1.shape[0] * self.W_i_h1.shape[1] + self.W_h1_h2.shape[0] * self.W_h1_h2.shape[1] + self.W_h2_o.shape[0] * self.W_h2_o.shape[1]
        print("Number of parameters: {}".format(n_trainable_bias + n_trainable_weights))

    def forward_propagation(self, X):
        self.X = X
        self.h1 = fcc_one_layer(X, self.W_i_h1, self.b_h1, sigmoid)
        self.h2 = fcc_one_layer(self.h1, self.W_h1_h2, self.b_h2, sigmoid)
        self.out = fcc_one_layer(self.h2, self.W_h2_o, self.b_out, softmax)
        return self.out

    def backprop(self, y):
        """
        :param y: labels, i.e. numbers of correct classes of shape (num_examples,)
        """
        count_samples = self.X.shape[0]
        delta_o = delta_last_layer(self.out, y)
        self.dW_h2_o = (delta_o @ self.h2).T / count_samples # using formula 27
        self.db_out = np.mean(delta_o, axis = 1) # using formula 28 
        delta_h2 = (self.W_h2_o @ delta_o) * del_sigmoid(self.h2).T # using formula 26 and 33
        self.dW_h1_h2 = (delta_h2 @ self.h1).T / count_samples # formula 27
        self.db_h2 = np.mean(delta_h2, axis = 1) # using formula 28
        delta_h1 = (self.W_h1_h2 @ delta_h2) * del_sigmoid(self.h1).T # using formula 26 and 33
        self.dW_i_h1 = (delta_h1 @ self.X).T / count_samples # formula 27
        self.db_h1 = np.mean(delta_h1, axis = 1) # using formula 28

    def gradient_step(self, learning_rate):
        """
        :param learning_rate: learning_rate for training
        """
        self.W_i_h1 -= learning_rate * self.dW_i_h1
        self.W_h1_h2 -= learning_rate * self.dW_h1_h2
        self.W_h2_o -= learning_rate * self.dW_h2_o
        self.b_h1 -= learning_rate * self.db_h1
        self.b_h2 -= learning_rate * self.db_h2
        self.b_out -= learning_rate * self.db_out

    def train_single(self, X, y, learning_rate):
        self.forward_propagation(X)
        self.backprop(y)
        self.gradient_step(learning_rate)

    def train_mini_batch(self, X, y, learning_rate, batch_size):
        for batchx, batchy in self.minibatches(X, y, batch_size):
            self.train_single(batchx, batchy, learning_rate)

    def minibatches(self, X: np.ndarray, y: np.ndarray, batch_size: int):
        num_samples = X.shape[0]
        all_indices = np.arange(num_samples)
        # shuffle batches at every step
        np.random.shuffle(all_indices)
        for start_index in range(0, num_samples - batch_size + 1, batch_size):
            batch_indices = all_indices[start_index : start_index + batch_size]
            yield X[batch_indices], y[batch_indices]

    def write_data(self, trainx: np.ndarray, trainy: np.ndarray, testx: np.ndarray, testy: np.ndarray, iteration: int, file: str):
        logits_train = self.forward_propagation(trainx)
        logits_test = self.forward_propagation(testx)
        f = open(file, "a")
        f.write(f"Epoch: {iteration}:\n")
        f.write(f"\taccuracy: train_data: {accuracy(logits_train, trainy)}; test_data: {accuracy(logits_test, testy)}\n")
        f.write(f"\tcross_entropy: train_data: {cross_entropy(logits_train, trainy)}; test_data: {cross_entropy(logits_test, testy)}\n")
        f.close()

    def train(self, trainx: np.ndarray, trainy: np.ndarray, testx: np.ndarray, testy: np.ndarray, learning_rate: int, iterations: int, write_frequency: int, output_file: str, batch_size = 0):
        for i in range(iterations + 1):
            if (i % write_frequency == 0):
                self.write_data(trainx, trainy, testx, testy, i, output_file)
            if batch_size > 0:
                self.train_mini_batch(trainx, trainy, learning_rate, batch_size)
            else: 
                self.train_single(trainx, trainy, learning_rate)



5.   Implement the training routine.

In [6]:
learning_rate = 0.01
num_iterations = 1000
num_iterations_mini_batch = 100

# since we store network parameters as class variables we need for both variants a net
neural_net = fcc(784, 400, 400, 10)
neural_net_mini_batch = fcc(784, 400, 400, 10)

# since we call train without batch_size, train(...) runs gradient decent without mini batches
neural_net.train(trainx, trainy, testx, testy, learning_rate, num_iterations, write_frequency=100, output_file="normal.txt")
neural_net_mini_batch.train(trainx, trainy, testx, testy, learning_rate, num_iterations_mini_batch, write_frequency=10, output_file="mini_batch.txt", batch_size=64)

logits = neural_net.forward_propagation(trainx)
logits_minibatch = neural_net_mini_batch.forward_propagation(trainx)

print(f"Train Loss normal gradient descent:\t{cross_entropy(logits, trainy)}")
print(f"Train Accuracy normal gradient descent:\t{accuracy(logits, trainy)}")

print(f"\nTrain Loss mini batch gradient descent:\t{cross_entropy(logits_minibatch, trainy)}")
print(f"Train Accuracy mini batch gradient descent:\t{accuracy(logits_minibatch, trainy)}")



Number of parameters: 478410
Number of parameters: 478410
Train Loss normal gradient descent:	2.505757390249101
Train Accuracy normal gradient descent:	0.57275

Train Loss mini batch gradient descent:	0.16209650866632427
Train Accuracy mini batch gradient descent:	0.9557
