<h1 style="font-size:32px; color:#F4EE00; font-family:cambria"><i>Multi-Layered, Feedforward Neural Network</i></h1>

<span style="color:#53C8FE; font-family:aparajita; font-size:24px">
    <i>This is a multi-layered, feedforward neural network written from scratch in Python.</i>
</span>
<hr>

<h2 style="font-size:32px; color:#F4EE00; font-family:cambria"><i>Importing Libraries</i></h2>
<hr>

In [1]:
import numpy as np
from time import time
from typing import Any, List, Tuple, TypedDict
from nptyping import NDArray, Float64
# Escalate floating-point overflow from NumPy's default warning to a
# FloatingPointError so a diverging computation stops immediately.
# NOTE: the sigmoid defined further down calls np.exp( -x ), so a sufficiently
# large negative weighted input will trigger this error.
np.seterr( over = 'raise' ) # raise error if overflow is encountered

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

<h2 style="font-size:32px; color:orange; font-family:cambria"><i>Defining Data Structures</i></h2>
<hr>

<table width=100%>
    <tr><td style="font-size:20px; text-align:left">$Input\ Matrix$</td></tr>
    <tr>
        <td style="font-size:24px; text-align:left">
            $X = \begin{bmatrix}
                x_{1,1} & x_{1,2} & \dots & x_{1,n} \\
                x_{2,1} & x_{2,2} & \dots & x_{2,n} \\
                \vdots  & \vdots  & \ddots & \vdots \\
                x_{m,1} & x_{m,2} & \dots  & x_{m,n}
            \end{bmatrix}$
        </td>
    </tr>
</table>

In [2]:
# Both axes of the input matrix are unconstrained.
Number_of_Examples = Number_of_Features = Any
# X: declared as ( examples, features ) with 64-bit float entries.
Input_Matrix = NDArray[
    ( Number_of_Examples, Number_of_Features ),
    Float64
]

<table width=100%>
    <tr><td style="font-size:20px; text-align:left">$Target\ Matrix$</td></tr>
    <tr>
        <td style="font-size:24px; text-align:left">
            $Y = \begin{bmatrix}
                y_{1,1} & y_{1,2} & \dots  & y_{1,k} \\
                y_{2,1} & y_{2,2} & \dots  & y_{2,k} \\
                \vdots  & \vdots  & \ddots & \vdots \\
                y_{m,1} & y_{m,2} & \dots  & y_{m,k}
            \end{bmatrix}$
        </td>
    </tr>
</table>

In [3]:
# The number of target columns is unconstrained.
Number_of_Targets = Any
# Y: declared as ( examples, targets ) with 64-bit float entries.
Target_Matrix = NDArray[ ( Number_of_Examples, Number_of_Targets ), Float64 ]

<table width=100%>
    <tr><td style="font-size:20px; text-align:left">$Output\ Matrix$</td></tr>
    <tr>
        <td style="font-size:24px; text-align:left">
            $\hat{Y} = \begin{bmatrix}
                \hat{y}_{1,1} & \hat{y}_{1,2} & \dots & \hat{y}_{1,k} \\
                \hat{y}_{2,1} & \hat{y}_{2,2} & \dots & \hat{y}_{2,k} \\
                \vdots & \vdots & \ddots & \vdots \\
                \hat{y}_{m,1} & \hat{y}_{m,2} & \dots & \hat{y}_{m,k}
            \end{bmatrix}$
        </td>
    </tr>
</table>

In [4]:
# Y-hat: the network's output, declared with the same shape and dtype as Target_Matrix.
Output_Matrix = NDArray[ ( Number_of_Examples, Number_of_Targets ), Float64 ]

<table width=100%>
    <tr><td style="font-size:20px; text-align:left">$Perceptron$</td></tr>
    <tr>
        <td style="font-size:24px; text-align:left">
            $P^{(ℓ)}_k = \begin{bmatrix}
                \omega_{1,k}^{(ℓ)} \\
                \omega_{2,k}^{(ℓ)} \\
                \vdots \\ 
                \omega_{j,k}^{(ℓ)}
            \end{bmatrix}$
        </td>
    </tr>
    <tr>
        <td style="font-size:20px; text-align:left">
            $b_k^{(ℓ)} = \omega_{0,k}^{(ℓ)}$
        </td>
    </tr>
</table>

In [5]:
# A perceptron accepts an unconstrained number of inputs.
Number_of_Inputs = Any
# Biases and weights are both stored as 64-bit floats.
Bias = Float64
Weight = Float64
# P: column vector holding one weight per input.
Perceptron = NDArray[
    ( Number_of_Inputs, 1 ),
    Weight
]

<table width=100%>
    <tr><td style="font-size:20px; text-align:left">$Layer$</td></tr>
    <tr>
        <td style="font-size:24px; text-align:left">
            $\Omega^{(ℓ)} = \begin{bmatrix}
                P_1^{(ℓ)} & P_2^{(ℓ)} & \dots & P_k^{(ℓ)}
            \end{bmatrix}$
        </td>
    </tr>
    <tr>
        <td style="font-size:20px; text-align:left">
            $\beta^{(ℓ)} = \begin{bmatrix}
                b_1^{(ℓ)} & b_2^{(ℓ)} & \dots & b_k^{(ℓ)}
            \end{bmatrix}$
        </td>
    </tr>
</table>

In [6]:
# A layer holds an unconstrained number of perceptrons.
Number_of_Perceptrons = Any
# Omega: ( inputs, perceptrons ) matrix whose columns are the perceptrons' weight
# vectors. Its elements are therefore individual weights, so the dtype is Weight
# (not Perceptron, which is itself an array type).
Layer_Weights = NDArray[ ( Number_of_Inputs, Number_of_Perceptrons ), Weight ]
# beta: one bias per perceptron.
Layer_Biases = NDArray[ ( Number_of_Perceptrons, ), Bias ]

<table width=100%>
    <tr><td style="font-size:20px; text-align:left">$Network$</td></tr>
    <tr>
        <td style="font-size:24px; text-align:left">
            $N = \begin{bmatrix}
                \begin{bmatrix} \Omega^{(1)} & \Omega^{(2)} & \dots & \Omega^{(ℓ)} \end{bmatrix} & 
                \begin{bmatrix} \beta^{(1)} & \beta^{(2)} & \dots & \beta^{(ℓ)} \end{bmatrix}
            \end{bmatrix}$
        </td>
    </tr>
</table>

In [7]:
# Per-layer containers: one entry per layer of the network.
Network_Weights = List[ Layer_Weights ]
Network_Biases = List[ Layer_Biases ]
class Network( TypedDict ) :
    ''' Parameters of the whole network, grouped by kind. '''
    # weight matrices, one per layer
    weights : Network_Weights
    # bias vectors, one per layer
    biases  : Network_Biases

<h1 style="font-size:32px; color:orange; font-family:cambria"><i>Creating the Neural Network</i></h1>
<hr>

<table width=100%>
    <tr><td style="font-size:20px; text-align:left">$Initialization$</td></tr>
</table>

In [8]:
class FeedForwardNeuralNetwork :
    ''' Multi-layered, feedforward neural network.

    This cell only sets up the instance state; the remaining methods are
    attached by the later cells that re-declare the class on top of itself.
    '''
    
    def __init__( self, perceptrons_per_hidden_layer : List[ int ] = [] ) -> None :
        ''' Create an untrained network.

        Parameters
        ----------
        perceptrons_per_hidden_layer : number of perceptrons in each hidden
            layer, in order; an empty list means no hidden layers.
        '''
        # cost of the network, 0.0 until it is computed
        self.score = 0.0
        # Copy the argument: the default list object is created once and shared by
        # every call, and a caller-supplied list may be mutated later. Storing a
        # private copy keeps instances independent of each other and of the caller.
        self.perlayer = list( perceptrons_per_hidden_layer )
        self.network : Network = { 'weights' : [], 'biases' : [] }

<table width=100%>
    <tr><td style="font-size:20px; text-align:left">$Initialize\ Random\ Weights\ and\ Biases$</td></tr>
</table>

In [9]:
class FeedForwardNeuralNetwork( FeedForwardNeuralNetwork  ) :
    
    def initialize( self, X : Input_Matrix, Y : Target_Matrix ) -> None :
        ''' Discard the current parameters and draw new random ones.

        One weight matrix and one bias vector are created for every hidden
        layer listed in self.perlayer, followed by an output layer with one
        perceptron per column of Y. Every value is np.random.rand() - 0.5.
        '''
        fan_in = X.shape[ 1 ]
        self.network = { 'weights' : [], 'biases' : [] }

        def add_layer( n_in : int, n_out : int ) -> None :
            # weights are drawn before biases, layer by layer
            self.network[ 'weights' ].append( np.random.rand( n_in, n_out ) - 0.5 )
            self.network[ 'biases' ].append( np.random.rand( n_out ) - 0.5 )

        # hidden layers: each one feeds the next
        for width in self.perlayer :
            add_layer( fan_in, width )
            fan_in = width
        # output layer
        add_layer( fan_in, Y.shape[ 1 ] )

<table width=100%>
    <tr>
        <td style="font-size:20px; text-align:center" colspan="2" >$Activation\ Function$</td>
    </tr>
    <tr>
        <td style="font-size:20px; text-align:center">$Sigmoid$</td>
        <td style="font-size:24px; text-align:left">$
        f(x) = \frac{1}{1+e^{-x}}
        $</td>
    </tr>
    <tr>
        <td style="font-size:20px; text-align:center">$Derivative$</td>
        <td style="font-size:24px; text-align:left">$
        f'(x) = \frac{e^{-x}}{(1+e^{-x})^{2}}
        $</td>
    </tr>
</table>

In [10]:
class FeedForwardNeuralNetwork( FeedForwardNeuralNetwork  ) :
    
    def activation( self, x : NDArray[ Float64 ] ) -> NDArray[ Float64 ] :
        ''' Element-wise sigmoid, f(x) = 1 / ( 1 + e^(-x) ).

        Evaluated through exp( -|x| ), which lies in (0, 1] and therefore
        cannot overflow. The direct form calls exp( -x ), which overflows for
        large negative x and, with overflow errors set to 'raise', aborts
        even though the true result is simply close to 0.
        '''
        decay = np.exp( -np.abs( x ) )
        # x >= 0 : 1 / ( 1 + e^(-x) )
        # x <  0 : e^x / ( 1 + e^x ), the same value written without e^(-x)
        return np.where( x >= 0, 1.0 / ( 1.0 + decay ), decay / ( 1.0 + decay ) )
    
    def derivative( self, x : NDArray[ Float64 ] ) -> NDArray[ Float64 ] :
        ''' Element-wise derivative of the sigmoid, f'(x) = e^(-x) / ( 1 + e^(-x) )^2.

        f' is an even function, so it is evaluated at |x|; this keeps the
        exponential in (0, 1] and avoids overflow for large negative x.
        '''
        decay = np.exp( -np.abs( x ) )
        return decay / np.square( 1.0 + decay )

<table width=100%>
    <tr>
        <td style="font-size:20px; text-align:center" colspan="2" >$Cost\ and\ Gradient$</td>
    </tr>
    <tr>
        <td style="font-size:20px; text-align:center">$Cost\ Function$</td>
        <td style="font-size:24px; text-align:left">$
        C(Y;\hat{Y}) = 
        \sum_{h=1}^{m}\sum_{i=1}^{k} (\hat{y}_{h,i} - y_{h,i})^{2} =
        \sum_{h=1}^{m}\sum_{i=1}^{k} (a^{(ℓ)}_{h,i} - y_{h,i})^{2}
        $</td>
    </tr>
    <tr>
        <td style="font-size:20px; text-align:center">$Gradient$</td>
        <td style="font-size:24px; text-align:left">$
        \nabla^{(ℓ)}_{a}C = \begin{bmatrix}
                \frac{\partial C}{\partial a^{(ℓ)}_{1,1}} &
                \frac{\partial C}{\partial a^{(ℓ)}_{1,2}} &
                \dots & 
                \frac{\partial C}{\partial a^{(ℓ)}_{1,k}} 
                \\
                \frac{\partial C}{\partial a^{(ℓ)}_{2,1}} &
                \frac{\partial C}{\partial a^{(ℓ)}_{2,2}} &
                \dots & 
                \frac{\partial C}{\partial a^{(ℓ)}_{2,k}}
                \\
                \vdots & \vdots & \ddots & \vdots \\
                \\
                \frac{\partial C}{\partial a^{(ℓ)}_{m,1}} &
                \frac{\partial C}{\partial a^{(ℓ)}_{m,2}} &
                \dots & 
                \frac{\partial C}{\partial a^{(ℓ)}_{m,k}}
            \end{bmatrix} = 2(\hat{Y}-Y) 
        $</td>
    </tr>
</table>

In [11]:
class FeedForwardNeuralNetwork( FeedForwardNeuralNetwork  ) :
    
    def cost( self, A : Output_Matrix, Y : Target_Matrix ) -> Float64 :
        ''' Sum of squared differences between the output A and the target Y. '''
        residual = A - Y
        return np.square( residual ).sum()
    
    def grad( self, A : Output_Matrix, Y : Target_Matrix ) -> NDArray[ Float64 ] :
        ''' Gradient of the cost with respect to the output A, i.e. 2( A - Y ). '''
        residual = A - Y
        return 2 * residual

<table width=100%>
    <tr>
        <td style="font-size:20px; text-align:center" colspan="2" >$Forward\ Propagation$</td>
    </tr>
    <tr>
        <td style="font-size:20px">$Output\ and\ Hidden\ Layer$</td>
        <td style="font-size:24px">
            $L^{(ℓ)}=f*(L^{(ℓ-1)}\Omega^{(ℓ)} + \beta^{(ℓ)})$
        </td>
    </tr>
    <tr>
        <td style="font-size:20px">$Input\ Layer$</td>
        <td style="font-size:24px; text-align:left">$L^{(0)} = X$</td>
    </tr>
</table>

In [12]:
class FeedForwardNeuralNetwork( FeedForwardNeuralNetwork  ) :
    
    def forwardpropagation( self, L : Input_Matrix ) -> Output_Matrix :
        ''' Feed L through every layer in order and return the final layer's output.

        Each layer applies the activation to ( previous output x weights + biases ).
        '''
        weights, biases = self.network[ 'weights' ], self.network[ 'biases' ]
        signal = L
        # walk the layers pairwise, stopping at the shorter of the two lists
        for index in range( min( len( weights ), len( biases ) ) ) :
            weighted_input = np.matmul( signal, weights[ index ] ) + biases[ index ]
            signal = self.activation( weighted_input )
        return signal

<table width=100%>
    <tr>
        <td style="font-size:20px; text-align:center" colspan="2" >$Backpropagation$</td>
    </tr>
    <tr>
        <td style="font-size:20px" rowspan="2">$Containers$</td>
        <td style="font-size:24px; text-align:left">$Z[ℓ] = L^{(ℓ-1)}\Omega^{(ℓ)}+\beta^{(ℓ)}$</td>
    </tr>
    <tr><td style="font-size:24px">$A[ℓ] = L^{(ℓ)}$</td></tr>
</table>

In [13]:
class FeedForwardNeuralNetwork( FeedForwardNeuralNetwork  ) :
    
    def __forwardpropagation( self, X : Input_Matrix, A : List, Z : List ) -> Output_Matrix :
        ''' Forward pass that records every intermediate value for backpropagation.

        Appends to the caller's lists: A receives X followed by each layer's
        output, Z receives each layer's weighted input, so A ends up one
        element longer than Z. Returns the last layer's output.
        '''
        A.append( X )
        layers = zip( self.network[ 'weights' ], self.network[ 'biases' ] )
        for layer_weights, layer_biases in layers :
            # weighted input to the layer, computed from the previous output
            weighted_input = np.matmul( A[ -1 ], layer_weights ) + layer_biases
            Z.append( weighted_input )
            # output of the layer
            A.append( self.activation( weighted_input ) )
        return A[ -1 ]

<table width=100%>
    <tr>
        <td style="font-size:20px; text-align:center" colspan="2" >$Backpropagation$</td>
    </tr>
    <tr>
        <td style="font-size:20px">$Initialization$</td>
        <td style="font-size:24px; text-align:left">
            $
            \nabla^{(ℓ)}_{z}C=
            (f'*Z[ℓ])*\nabla^{(ℓ)}_{a}C
            $
        </td>
    </tr>
    <tr>
        <td style="font-size:20px" rowspan="3">$Output\ and\ Hidden\ Layers$</td>
        <td style="font-size:24px">$\nabla^{(ℓ)}_{\Omega}C=A[ℓ-1]^{T}\nabla^{(ℓ)}_{z}C$</td>
    </tr>
    <tr>
        <td style="font-size:24px">$\nabla^{(ℓ)}_{\beta}C=\pmb{1}_{1 \times m}\nabla^{(ℓ)}_{z}C$</td>
    </tr>
    <tr>
        <td style="font-size:24px">$\nabla^{(ℓ-1)}_{z}C=(f'*Z[ℓ-1])*\nabla^{(ℓ)}_{z}C(\Omega^{(ℓ)})^{T}$</td>
    </tr>
</table>

In [14]:
class FeedForwardNeuralNetwork( FeedForwardNeuralNetwork  ) :
    
    def backpropagate( self, grad_z : NDArray[ Float64 ], A : List, Z : List, layer_index : int ) -> Tuple :
        ''' Compute one layer's parameter gradients and pass the error one layer back.

        grad_z is the gradient with respect to this layer's weighted input.
        Returns ( grad_z, grad_w, grad_b ): the gradient with respect to the
        previous layer's weighted input (the incoming grad_z unchanged for
        layer 0, which has no predecessor) and the gradients for this layer's
        weights and biases.
        '''
        # A is offset by one against Z, so A[ layer_index ] is the input this layer received
        layer_input = A[ layer_index ]
        grad_w = np.matmul( layer_input.T, grad_z )
        # every example contributes to each bias, hence the sum over rows
        grad_b = np.sum( grad_z, axis = 0 )
        if layer_index <= 0 :
            # there is no weighted input for layer 0
            return grad_z, grad_w, grad_b
        slope = self.derivative( Z[ layer_index - 1 ] )
        upstream = np.matmul( grad_z, self.network[ 'weights' ][ layer_index ].T )
        return slope * upstream, grad_w, grad_b

<table width=100%>
    <tr>
        <td style="font-size:20px; text-align:center">$Gradient\ Descent$</td>
    </tr>
    <tr>
        <td style="font-size:24px; text-align:left">
            $\Omega^{(ℓ)} \rightarrow \Omega^{(ℓ)} - \frac{r}{m}\nabla^{(ℓ)}_{\Omega}C$
        </td>
    </tr>
    <tr>
        <td style="font-size:24px; text-align:left">
            $\beta^{(ℓ)} \rightarrow \beta^{(ℓ)} - \frac{r}{m}\nabla^{(ℓ)}_{\beta}C$
        </td>
    </tr>
</table>

In [15]:
class FeedForwardNeuralNetwork( FeedForwardNeuralNetwork  ) :
    
    def train( self, X : Input_Matrix, Y : Target_Matrix, 
               learning_rate = 1.0, convergence = 0.01, 
               batch_size = 10, max_epoch = 500, max_time = 60 ) -> None :
        ''' Train the network with mini-batch gradient descent.

        The parameters are re-initialized, then every epoch shuffles the
        examples, splits them into batches and updates each layer once per
        batch. Afterwards self.score holds the cost over all of X.

        Parameters
        ----------
        X, Y          : training inputs and targets, one row per example.
        learning_rate : step size; each step is scaled by learning_rate / ( batch length ).
        convergence   : training stops once the square root of the epoch's summed
                        squared output-gradient norms is no longer above this value.
        batch_size    : minimum number of examples per batch. When fewer examples
                        than this are available they are processed as one batch.
        max_epoch     : maximum number of epochs to run.
        max_time      : time limit in seconds, checked once per epoch.

        Raises
        ------
        ValueError : if X contains no examples.
        '''
        if len( X ) == 0 :
            raise ValueError( 'Cannot train on an empty input matrix.' )
        epoch = 1
        start = time()
        totgrad = np.inf
        self.initialize( X, Y )
        output_layer_index = len( self.network[ 'weights' ] ) - 1
        # np.array_split rejects 0 sections, which is what len( X ) // batch_size
        # yields for a dataset smaller than one batch; always keep at least one.
        batches = max( 1, len( X ) // batch_size )
        while np.sqrt( totgrad ) > convergence :
            totgrad = 0
            # indexing with the permutation creates new arrays; the caller's data is untouched
            shuffle = np.random.permutation( len( X ) )
            X, Y = X[ shuffle ], Y[ shuffle ]
            for batch_x, batch_y in zip( 
                np.array_split( X, batches ), 
                np.array_split( Y, batches ) 
                ) :
                A, Z = [], []
                output = self.__forwardpropagation( batch_x, A, Z )
                # gradient with respect to the output layer
                grad_a = self.grad( output, batch_y )
                totgrad += np.linalg.norm( grad_a )**2
                # gradient with respect to the weighted input of the output layer
                grad_z = self.derivative( Z[ -1 ] ) * grad_a
                for layer_index in range( output_layer_index, -1, -1 ) :
                    # backpropagate reads this layer's weights before they are updated below
                    grad_z, grad_w, grad_b = self.backpropagate( grad_z, A, Z, layer_index )
                    # updating the weights and biases of layer
                    self.network[ 'weights' ][ layer_index ] -= learning_rate * grad_w / len( batch_x )
                    self.network[ 'biases' ][ layer_index ] -= learning_rate * grad_b / len( batch_x )
            epoch += 1
            if time() - start > max_time :
                print( 'Maximum runtime encountered.' )
                break
            if epoch > max_epoch :
                print( 'Maximum epoch encountered.' )
                break
        self.score = self.cost( self.forwardpropagation( X ), Y )