In [3]:
import numpy as np

# TO DO
- ADAPT THE HIDDEN AND OUTPUT LAYERS TO BE DIFFERENT
- ADD THE OPTION TO ADD WEIGHTS AND BIASES AS WELL AS THE OPTION TO SAVE FROM DISK AND LOAD TO DISK
- SET THE LAST LAYER IN THE BACKPROPAGATION DINAMICALLY
- SET WAYS TO PREPROCESS DATA TO NOT BREAK THE CODE

In [None]:
class neural_network:
    # TODO: ADD A WAY TO USE DIFFERENT HIDDEN LAYERS ACTIVATION FUNCTION AND LAST LAYER ACTIVATION FUNCIONS (GENERALLY: HIDDEN ARE RELU OR TANH)

    def __init__(
            self, 
            architecture:list=[2,3,3,1], 
            hidden_layers_activation_function="relu", 
            output_layer_activation_function="sigmoid",
            activation_function:str="sigmoid", 
            learning_rate=0.01,
            ):

        self.architecture = architecture
        self.cache = dict()
        self.learning_rate = learning_rate
        
        # weights initialized using random numbers from normal distribution
        self.weights = [
            np.random.randn(architecture[x+1], architecture[x])
            for x in range(len(architecture)-1)
            ]
        
        # biases initialized using zeros for each neuron
        self.biases = [
            np.zeros((architecture[x+1], 1))
            for x in range(len(architecture)-1)
        ]
        
        # activation functions supported
        activation_functions_available = {
            "relu": lambda x: np.maximum(0, x),
            "tanh": np.tanh,
            "sigmoid": lambda x: ((1) / (1 + np.exp(-x)))
        }

        if hidden_layers_activation_function.lower() in activation_functions_available.keys() and output_layer_activation_function.lower() in activation_functions_available.keys():
            self.hidden_layers_activation_function = activation_functions_available[hidden_layers_activation_function.lower()]
            self.output_layer_activation_function = activation_functions_available[output_layer_activation_function.lower()]
        else:
            raise NameError("Activation function not supported")

# =================
        # validation of the activation function
        if activation_function.lower() in ["relu", "tanh", "sigmoid"]:
            self.activation_function = activation_functions_available[activation_function.lower()]
        else: 
            raise NameError("Activation Function not supported")
# =================

        derivate_activation_function = {
            # relu'(Z) = 1 if Z > 0 else 0
            "relu": lambda Z: (Z > 0).astype(float),
            # tanh'(Z) = 1 - tanh(Z)²
            "tanh": lambda Z: 1 - np.tanh(Z)**2,
            # sig'(Z) = sig(Z) * (1 - sig(Z))
            "sigmoid": lambda Z: ((1) / (1 + np.exp(-Z))) * (1 - ((1) / (1 + np.exp(-Z))))
        }
    
        self.derivate_activation_function = derivate_activation_function[activation_function.lower()]

        
    def input_data(self, data, y):
        # TODO validation
        self.data = data
        self.y = y
        self.num_instances = len(self.data)

    def _forward_propagation(self):
        """
        Basic formula:
        Z[l] = W[l] A[l-1] + b[l]
        A[l] = g(Z[l])
        
        Where:
            - l: Current Layer
            - W: Weights
            - A: Activation Vector
            - b: Biases
            - g: Activation Function
        """
        # dict to save the values of each layer
        A = self.data
        self.cache["A0"] = A
        for layer_idx in range(len(self.architecture)-1):
            Z = self.weights[layer_idx] @ A + self.biases[layer_idx]
            A = self.activation_function(Z)
            self.cache[f"A{layer_idx + 1}"] = A
            self.cache[f"Z{layer_idx + 1}"] = Z
        y_hat = A
        return y_hat
    
    def _calculate_loss(self, y_hat):
        """
        Use of cross entropy to calculate the loss

        For a single example:
            - L(y_hat, y) = -(y * log y_hat + (1 - y) * log(1 - y_hat))

        For all training samples:
            - C = (1 / m) * sum(L(y_hat, y))
        """
        
        # original data and prediction
        y = self.y
        # y_hat = self._forward_propagation()

        # loss calculation based on the matrices
        prediction_losses = -((y * np.log(y_hat)) + (1 - y) * np.log(1 - y_hat))

        # num of entities extracted to get the global loss
        y_total = y_hat.reshape(-1).shape[0]

        # global loss
        losses_sum = (1 / y_total) * np.sum(prediction_losses, axis=1)

        return np.sum(losses_sum)

    def _backpropagation(self, y_hat, y_real, m):
        """ULTIMO LAYER: DERIVADA DA FUNCAO DE ATIVAÇÃO ISOLADA"""
        gradient_W = [None] * len(self.weights)
        gradient_b = [None] * len(self.weights)

        # last layer dZ
        # only one that uses explicitly the derivative dC/dZ
        # predicted value - real value * scalar factor (1/m)
        # TODO: THIS ONE ISALL  BOUT THE SIGMOID FUNCTION, ADAPT IT TO RECEIVE THE RELU DERIVATIVE + TANH DERIVATIVE
        dZ = (1/m) * (y_hat - y_real)

        for layer_idx in reversed(range(len(self.weights))):
            
            # correto (dW = dZ * A^t[l-1])
            W = self.cache[f"A{layer_idx}"]
            dW = dZ @ W.T
            
            #correto (db = sum(dZ))
            db = np.sum(dZ, axis=1, keepdims=True)

            # saving the weights and biases gradients
            gradient_W[layer_idx] = dW
            gradient_b[layer_idx] = db

            if layer_idx > 0:
                # correto - derivada da camada anterior (w^t * dz)
                dA_back = self.weights[layer_idx]
                dA_back = dA_back.T @ dZ

                dZ = dA_back * self.derivate_activation_function(self.cache[f"Z{layer_idx}"])

        for layer_idx in range(len(self.weights)):
            # weights and biases adaptation using gradient descent
            # theta = theta - learning rate * slope     (derivative)
            self.weights[layer_idx] -= self.learning_rate * gradient_W[layer_idx]
            self.biases[layer_idx] -= self.learning_rate * gradient_b[layer_idx]

    def train(self, max_iterations:int=10000, min_loss_difference:float=None, file_to_save_weights_and_biases:str=None):
        
        for _ in range(max_iterations):
            y_hat = self._forward_propagation()
            loss = self._calculate_loss(
                y_hat=y_hat
                )
            self._backpropagation(
                y_hat=y_hat,
                y_real=self.y,
                m=self.num_instances
                )
        return loss
    
    def predict(self, data):
        A = data
        for layer_idx in range(len(self.architecture)-1):
            Z = self.weights[layer_idx] @ A + self.biases[layer_idx]
            A = self.activation_function(Z)
        y_hat = A
        return y_hat #(y_hat>0.5).astype(float) can be used for binary classification

In [30]:
nn = neural_network()

def prepare_data():
  X = np.array([
      [150, 70],
      [254, 73],
      [312, 68],
      [120, 60],
      [154, 61],
      [212, 65],
      [216, 67],
      [145, 67],
      [184, 64],
      [130, 69]
  ])
  y = np.array([0,1,1,0,0,1,1,0,1,0])
  m = 10
  A0 = X.T
  Y = y.reshape(1, m)

  return A0, Y, m

A0, Y, m = prepare_data()

nn.input_data(A0, Y)

nn.train()

nn.predict(data=np.array([150,70]).reshape(2,1))
# print(nn.weights)
# print(nn.biases)

# y_hat = nn._forward_propagation()
# loss1 = nn._calculate_loss(y_hat)

# nn._backpropagation(y_hat, Y, m)

# y_hat2 = nn._forward_propagation()
# loss2 = nn._calculate_loss(y_hat2)

# print(nn.weights)
# print(nn.biases)


# print(loss1, loss2)

# for i in range(100000):
#   y_hat = nn._forward_propagation()
#   print(nn._calculate_loss(y_hat))
#   nn._backpropagation(y_hat, Y, m)
# print(loss)

array([[0.]])