Okay, let's start over after we've understood some maths. 

In [523]:
# Imports:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Data Setup:
X = np.load('data/X-data.npy')
y = np.load('data/y-data.npy')

np.random.seed(0)

In [524]:
# Abstract Class Definitions: 
class Layer():
    def __init__(self): 
        self.inputs = None  # The inputs into this layer. 
        self.outputs = None # The ouputs of this layer. 
        
    # Forward propagation method.
    def forward(self, inputs):
        pass
    
    # Backward propagation method.
    def backward(self):
        pass
    
class Activation():
    def activate():
        pass
    
    def prime():
        pass
    
class Loss():
    def calculate_loss():
        pass

In [525]:
class ReLU(Activation):
    def activate(self, x): 
        return np.maximum(x, 0.0)
    
class Sigmoid(Activation):
    def activate(self, x):
        return 1 / (1 + np.exp(-x))

In [526]:
class CrossEntropyLoss():
    def calculate_loss(self, y_true, y_pred):
        return np.mean(-y_true * np.log(y_pred) - (1 - y_true) * np.log(1 - y_pred))
    
    def gradient(self, y_true, y_pred): 
        r = ((1 - y_true) / (1 - y_pred) - y_true / y_pred) / np.size(y_true)
        return r

In [527]:
# Dense (fully connected) Layer Class:
class Dense(Layer): 
    def __init__(self, input_size, output_size, activation_function='relu'):
        self.weights = np.random.randn(output_size, input_size)
        self.biases = np.zeros(output_size)
        self.outputs = None
        
        if activation_function == 'relu':
            self.activation = ReLU()
        elif activation_function == 'sigmoid':
            self.activation = Sigmoid()
        else:
            self.activation = ReLU()  # Default to ReLU activation function.
        
    def print_weights(self):
        print('Weights:\n', pd.DataFrame(self.weights))
        
    def print_biases(self):
        print('Biases:\n', pd.DataFrame(self.biases))
    
    def forward(self, inputs):
        self.outputs = self.activation.activate(np.dot(self.weights, inputs) + self.biases)
        return self.outputs
    
    def backward(self, gradient_vector, x, learning_rate):
        # We can skip updating self.outputs because they will be changed in the next iteration.
        self.weights -= learning_rate * np.dot(gradient_vector, x.T)
        self.biases -= learning_rate * gradient_vector
        return np.dot(self.weights.T, gradient_vector)

In [528]:
class Network:
    def __init__(self, layers, loss_function='cross_entropy'):
        self.layers = layers
        self.output = None
        
        if loss_function == 'cross_entropy':
            self.loss = CrossEntropyLoss()
        else:
            self.loss = CrossEntropyLoss()  # Default to cross entropy loss. 
    
    def train(self, X_train, y_train, number_epochs, learning_rate=0.01):
        for epoch in range(number_epochs):
            error = 0
            
            for x, y in zip(X_train, y_train):
                # Process the forward pass.
                self.predict(x)
                
                # Calculate the error after this forward pass. 
                error += self.loss.calculate_loss(y, self.output)
                
                # Calculate the derivative of the loss in respect to the output for each output.
                gradient_vector = self.loss.gradient(y, self.output)  # Shape is 569 x 1
                
                # Now perform backpropagation.
                for layer in reversed(self.layers):
                    gradient_vector = layer.backward(gradient_vector, x.reshape(len(x), 1), learning_rate)
    
    def predict(self, x):
        outputs = x
        for layer in self.layers:
            outputs = layer.forward(outputs)
        self.output = outputs
            

In [529]:
np.random.seed(42)  # To keep results consistent.

number_inputs = X.shape[1]
epochs = 1

layers = [
    Dense(number_inputs, 10, activation_function='sigmoid'),
    Dense(10, 10, activation_function='relu'),
    Dense(10, 1, activation_function='sigmoid')
]
layers[0].biases + np.dot(layers[0].weights, X[1])

array([ 0.32418208, -3.24279964, -1.9316259 , -0.77079184, -1.35954831,
        3.30868468, -2.74028547,  1.73267878, -1.53916903,  2.18857362])

In [530]:
network = Network(layers)
network.train(X, y, number_epochs=epochs)
network.output

ValueError: shapes (1,) and (10,) not aligned: 1 (dim 0) != 10 (dim 0)