Okay, let's start over after we've understood some maths. 

In [128]:
# Imports:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Data Setup:
X = np.load('data/X-data.npy')
y = np.load('data/y-data.npy')

np.random.seed(0)

In [129]:
# Abstract Class Definitions: 
class Layer():
    def __init__(self): 
        self.inputs = None  # The inputs into this layer. 
        self.outputs = None # The ouputs of this layer. 
        
    # Forward propagation method.
    def forward(self, inputs):
        pass
    
    # Backward propagation method.
    def backward(self):
        pass
    
class Activation():
    def activate():
        pass
    
    def prime():
        pass
    
class Loss():
    def calculate_loss():
        pass

In [140]:
class ReLU(Activation):
    def activate(self, x): 
        return np.maximum(x, 0.0)
    
    def derivative(self, x):
        return (x > 0) * 1  # * 1 to return a number.
    
class Sigmoid(Activation):
    def activate(self, x):
        return 1 / (1 + np.exp(-x))
    
    def derivative(self, x):
        s = self.activate(x)
        return s * (1 - s)

In [141]:
class CrossEntropyLoss():
    def calculate_loss(self, y_true, y_pred):
        return np.mean(-y_true * np.log(y_pred) - (1 - y_true) * np.log(1 - y_pred))
    
    def derivative(self, y_true, y_pred): 
        r = ((1 - y_true) / (1 - y_pred) - y_true / y_pred) / np.size(y_true)
        return r

In [142]:
# Dense (fully connected) Layer Class:
class Dense(Layer): 
    def __init__(self, input_size, output_size, activation_function='relu', name='unnamed'):
        self.name = name
        self.weights = np.random.randn(output_size, input_size)
        self.biases = np.zeros((output_size, 1))
        self.outputs = None
        
        if activation_function == 'relu':
            self.activation = ReLU()
        elif activation_function == 'sigmoid':
            self.activation = Sigmoid()
        else:
            self.activation = ReLU()  # Default to ReLU activation function.
        
    def print_weights(self):
        print('Weights:\n', pd.DataFrame(self.weights))
        
    def print_biases(self):
        print('Biases:\n', pd.DataFrame(self.biases))
    
    def forward(self, inputs):
        self.inputs = inputs
        self.outputs = self.activation.activate(np.dot(self.weights, inputs) + self.biases)
        return self.outputs
    
    def backward(self, gradient_vector, learning_rate):
        # We can skip updating self.outputs because they will be changed in the next iteration.
        print('weights', self.weights.shape)
        print('biases', self.biases.shape)
        print('gradient vec', gradient_vector.shape)
        print('inputs', self.inputs.shape)
        self.weights -= learning_rate * np.dot(gradient_vector, self.inputs.T)
        self.biases -= learning_rate * gradient_vector
        return np.dot(self.weights.T, gradient_vector)

In [192]:
class Network:
    def __init__(self, layers, loss_function='cross_entropy'):
        self.layers = layers
        self.output = None
        
        if loss_function == 'cross_entropy':
            self.loss = CrossEntropyLoss()
        else:
            self.loss = CrossEntropyLoss()  # Default to cross entropy loss. 
    
    def train(self, X_train, y_train, number_epochs, learning_rate=0.1):
        for epoch in range(number_epochs):
            error = 0
            
            for x, y in zip(X_train, y_train):
                # Process the forward pass. This goes through every layer.
                self.predict(x)
                
                # Calculate the error after the forward pass. 
                error += self.loss.calculate_loss(y, self.output)
                
                # Calculate the derivative of the loss in respect to the output.
                dL_dA = self.loss.derivative(y, self.output)  
                print('dl_da', dL_dA)
                # Shape is the number of neurons of the last layer x 1. 
                
                # Calculate the derivative of the activation function in respect to the weighted sum
                dA_dZ = self.layers[1].activation.derivative(self.layers[1].outputs)
                print('da dz', dA_dZ)
                
                # The derivative of the weighted sum in respect to the weights
                dZ_dW = self.layers[1].inputs
                print('dz dw', dZ_dW[0])
                
                # The derivative of the loss in respect to the weights
                dL_dW_L1 = np.multiply(np.multiply(dL_dA, dA_dZ), dZ_dW[0])
                
                print('dl dw', dL_dW_L1)
                
                # Update the weights
                self.layers[1].weights = self.layers[1].weights - learning_rate * dL_dW_L1
                
                # For the first layer                
                # Calculate the derivative of the activation function in respect to the weighted sum
                delta_L1 = np.multiply(dL_dA, dA_dZ).T
                print('delta_L1 shape', delta_L1.shape)
                weights_L1 = self.layers[1].inputs
                inputs_L1 = self.layers[0].inputs
                dA_dZ_L0 = self.layers[0].activation.derivative(self.layers[0].outputs)
                
                dL_dW_L0 = np.dot(delta_L1, weights_L1)
                
                # Now perform backpropagation.
                # for layer in reversed(self.layers):
                #     # Go through all the layers from 'right to left'. 
                #     dA_dZ = layer.activation.derivative(layer.outputs)
                #     # dZ_dW = the input (or output of the previous layer) because there is only one function connected through the edge that effects the z node, that being the input. 
                #     dZ_dW = layer.inputs
    
    def predict(self, x):
        outputs = x
        for layer in self.layers:
            outputs = layer.forward(outputs)
        self.output = outputs
            

For backpropagation, the formula for the very last layer is: 
![last-layer-backprop](https://miro.medium.com/max/828/1*zRDMl-GxVO7qENH5dNrZ-g.png)

In [193]:
np.random.seed(42)  # To keep results consistent.

number_inputs = X.shape[1]
epochs = 1

layers = [
    Dense(number_inputs, 2, activation_function='relu', name='Layer 1'),
    Dense(2, 1, activation_function='sigmoid', name='Layer 2')
]

In [194]:
def print_layer_outputs(network):
    for layer in network.layers:
        print(layer.name, ':\n', layer.outputs)

In [195]:
X_practice = X[0:2]
X_practice.shape

(2, 21)

In [196]:
network = Network(layers)
network.train(X_practice, y, number_epochs=epochs)
# print_layer_outputs(network)

dl_da [[1.97531727 2.        ]]
da dz [[0.23536143 0.23500371]]
dz dw [0.05996961 0.        ]
dl dw [[0.02788068 0.        ]]


ValueError: shapes (2,1) and (2,2) not aligned: 1 (dim 1) != 2 (dim 0)