In [1]:
import numpy as np

In [43]:
class layer_dense:
    """Fully connected (dense) neural-network layer.

    Attributes set by ``__init__``:
        weights: array of shape ``(n_inputs, n_neurons)``.
        biases:  array of shape ``(1, n_neurons)``.
    ``forward`` additionally sets ``output`` and ``inputs``;
    ``backward`` sets ``dinputs``, ``dweights`` and ``dbiases``.
    """

    def __init__(self, n_inputs, n_neurons):
        """Create the layer parameters.

        Weights are standard-normal draws scaled by 0.01 (taken from NumPy's
        global random state); biases start at zero.
        """
        weight_shape = (n_inputs, n_neurons)
        self.weights = 0.01 * np.random.randn(*weight_shape)
        self.biases = np.zeros((1, n_neurons))

    def forward(self, inputs):
        """Compute ``inputs . weights + biases`` and store it in ``self.output``.

        ``inputs`` must provide ``.dot`` (e.g. a NumPy array) and is expected to
        be laid out as one row per sample so it can be multiplied by the weights.
        """
        weighted_sum = inputs.dot(self.weights)
        self.output = weighted_sum + self.biases

        # keep the inputs: backward() needs them for the weight gradient
        self.inputs = inputs

    def backward(self, dvalues):
        """Back-propagate ``dvalues`` (gradient w.r.t. this layer's output).

        Requires a prior ``forward`` call, because the stored inputs are used.
        """
        # gradient w.r.t. the layer inputs, handed on to the previous stage
        self.dinputs = dvalues.dot(self.weights.T)

        # gradients w.r.t. the parameters
        cached_inputs_t = self.inputs.T
        self.dweights = cached_inputs_t.dot(dvalues)
        # one bias per neuron, so collapse the sample axis (keeps shape (1, n_neurons))
        self.dbiases = dvalues.sum(axis=0, keepdims=True)

        
class activation_ReLU:
    """Rectified linear unit (ReLU) activation function."""

    def forward(self, inputs):
        """Store ``max(0, inputs)`` element-wise in ``self.output``.

        The raw ``inputs`` are kept in ``self.inputs`` for the backward pass.
        """
        rectified = np.maximum(0, inputs)
        self.output = rectified

        # backward() needs to know which entries were not positive
        self.inputs = inputs

    def backward(self, dvalues):
        """Store the gradient w.r.t. the inputs in ``self.dinputs``.

        The incoming gradient passes through unchanged where the forward input
        was positive and is replaced by 0 where it was <= 0. ``dvalues`` itself
        is not modified; a new array is produced.
        """
        blocked = self.inputs <= 0
        self.dinputs = np.where(blocked, 0, dvalues)
        
class activation_softmax:
    """Softmax activation function applied row-wise (axis 1)."""

    def forward(self, inputs):
        """Turn each row of ``inputs`` into a probability distribution.

        The result is stored in ``self.output``; ``inputs`` is kept in
        ``self.inputs``.
        """
        # unnormalized probabilities; subtracting the row maximum keeps
        # np.exp from overflowing and does not change the softmax result
        exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))

        # normalized probabilities (each row sums to 1)
        self.output = exp_values / np.sum(exp_values, axis=1, keepdims=True)

        # remember input values
        self.inputs = inputs

    def backward(self, dvalues):
        """Store the gradient w.r.t. the softmax inputs in ``self.dinputs``.

        For one sample with softmax output ``s`` and incoming gradient ``d``,
        the Jacobian is ``diag(s) - s s^T``, so the product ``d . J`` reduces to
        ``s * (d - sum(d * s))``. Using that closed form for all rows at once
        avoids building one (classes x classes) matrix per sample in a Python
        loop. Requires a prior ``forward`` call.
        """
        dvalues = np.asarray(dvalues)

        # per-row scalar sum_i d_i * s_i, kept 2-D so it broadcasts over classes
        weighted_sum = np.sum(dvalues * self.output, axis=1, keepdims=True)

        self.dinputs = self.output * (dvalues - weighted_sum)


class loss:
    """Base class for losses; subclasses provide ``forward(output, y)``."""

    def calculate(self, output, y):
        """Return the mean of the per-sample losses given by ``self.forward``."""
        return np.mean(self.forward(output, y))


# cross-entropy loss
class loss_crossentropy(loss):
    """Categorical cross-entropy loss.

    ``y_true`` may be a 1-D sequence of integer class indices (the original
    behaviour) or a 2-D one-hot array with the same shape as the predictions.
    """

    @staticmethod
    def _clip_predictions(y_pred):
        """Keep predictions inside [1e-7, 1 - 1e-7] so log() and 1/p stay finite."""
        return np.clip(y_pred, 1e-7, 1 - 1e-7)

    def forward(self, y_pred, y_true):
        """Return the per-sample losses ``-log(p_true_class)``.

        ``y_pred`` holds one row of predicted class probabilities per sample.
        """
        # number of data points
        n_samples = len(y_pred)

        # clip data to prevent log(0)
        y_pred_clipped = self._clip_predictions(y_pred)

        # confidence values: the probability assigned to the true class
        if np.ndim(y_true) == 2:
            # one-hot labels: the mask keeps only the true-class column
            confidence_values = np.sum(y_pred_clipped * y_true, axis=1)
        else:
            # integer labels: pick one column per row
            confidence_values = y_pred_clipped[range(n_samples), y_true]

        # sample losses
        return -np.log(confidence_values)

    def backward(self, dvalues, y_true):
        """Store the gradient of the mean loss w.r.t. the predictions.

        ``dvalues`` are the predicted probabilities that were given to
        ``forward``. They are clipped exactly like in ``forward``; without the
        clipping a probability of exactly 0 would produce ``-inf`` (true class)
        or ``nan`` (0 / 0, other classes) in the gradient. The result is
        written to ``self.dinputs``.
        """
        # Number of samples
        n_samples = len(dvalues)

        # same safeguard as in forward(): never divide by zero
        dvalues_clipped = self._clip_predictions(dvalues)

        # one-hot matrix
        if np.ndim(y_true) == 2:
            Y = np.asarray(y_true, dtype=float)
        else:
            Y = np.zeros(dvalues_clipped.shape)
            Y[np.arange(len(y_true)), y_true] = 1

        # Calculate gradient
        self.dinputs = -Y / dvalues_clipped

        # Normalize gradient (the optimized quantity is the mean over samples)
        self.dinputs = self.dinputs / n_samples

In [6]:
# toy inputs: 6 samples with 2 features each, drawn from a standard normal
# NOTE(review): no random seed is set anywhere in view, so X (and every output
# printed below) changes on each fresh run -- consider seeding.
X = np.random.randn(6,2)

In [7]:
# integer class label (0, 1 or 2) for each of the 6 samples in X
y = np.array([0,1,0,2,2,0])

In [73]:
# neural network layers: 2 inputs -> dense (3 neurons) -> ReLU -> dense (3 neurons) -> softmax,
# scored with the cross-entropy loss
layer1 = layer_dense(n_inputs = 2, n_neurons = 3)
activation1 = activation_ReLU()
layer2 = layer_dense(n_inputs = 3, n_neurons = 3)
activation2 = activation_softmax()
loss_function = loss_crossentropy()

**Forward pass**

In [74]:
# push X through the network; each stage reads the previous stage's .output
layer1.forward(X)
activation1.forward(layer1.output)
layer2.forward(activation1.output)
activation2.forward(layer2.output)
# per-sample losses: forward() is called directly, so no mean is taken
# NOTE(review): this rebinds the name `loss`, which until now referred to the
# base class `loss`; the calls below are unaffected, but a different variable
# name would be less confusing.
loss = loss_function.forward(activation2.output,y)

In [75]:
# per-sample cross-entropy losses, one value per data point
loss

array([1.09882851, 1.09858052, 1.09858894, 1.09869365, 1.09859159,
       1.09897046])

In [76]:
# probabilities predicted by the softmax: one row per sample, one column per class
activation2.output

array([[0.33326127, 0.33337982, 0.33335891],
       [0.33329624, 0.33334392, 0.33335984],
       [0.33334112, 0.33334961, 0.33330927],
       [0.33334211, 0.33335168, 0.33330621],
       [0.33332886, 0.33333091, 0.33334023],
       [0.33321396, 0.33336741, 0.33341863]])

**Backward pass**

In [77]:
# loss function: gradient of the cross-entropy loss w.r.t. the predicted probabilities
loss_function.backward(activation2.output,y)

In [78]:
# gradient w.r.t. the predicted probabilities; only the true-class entry of each row is non-zero
loss_function.dinputs

array([[-0.50010812, -0.        , -0.        ],
       [-0.        , -0.49998412, -0.        ],
       [-0.49998832, -0.        , -0.        ],
       [-0.        , -0.        , -0.50004068],
       [-0.        , -0.        , -0.49998965],
       [-0.50017912, -0.        , -0.        ]])

In [79]:
# softmax function: propagate the loss gradient back through the softmax
activation2.backward(loss_function.dinputs)

In [80]:
# gradient w.r.t. the softmax inputs (i.e. layer2.output)
activation2.dinputs

array([[-0.11112312,  0.0555633 ,  0.05555982],
       [ 0.05554937, -0.11110935,  0.05555997],
       [-0.11110981,  0.05555827,  0.05555154],
       [ 0.05555702,  0.05555861, -0.11111563],
       [ 0.05555481,  0.05555515, -0.11110996],
       [-0.11113101,  0.05556123,  0.05556977]])

In [81]:
# layer 2: compute dweights, dbiases and dinputs from the softmax gradient
layer2.backward(activation2.dinputs)

In [82]:
# gradient w.r.t. layer 2's inputs (the ReLU outputs)
layer2.dinputs

array([[ 0.00213383, -0.00099012, -0.00018758],
       [-0.000609  ,  0.00148789, -0.00039226],
       [ 0.00213357, -0.00099002, -0.00018755],
       [-0.00152454, -0.00049798,  0.00057984],
       [-0.00152447, -0.00049794,  0.00057981],
       [ 0.00213402, -0.00099011, -0.00018763]])

In [83]:
# ReLU activation function: keep the gradient where the ReLU input was positive, zero it elsewhere
activation1.backward(layer2.dinputs)

In [84]:
# gradient w.r.t. the ReLU inputs; zero wherever activation1.inputs <= 0
activation1.dinputs

array([[ 0.00213383,  0.        , -0.00018758],
       [-0.000609  ,  0.        ,  0.        ],
       [ 0.        ,  0.        , -0.00018755],
       [ 0.        ,  0.        ,  0.00057984],
       [-0.00152447, -0.00049794,  0.        ],
       [ 0.00213402,  0.        ,  0.        ]])

In [87]:
# values that were fed into the ReLU during the forward pass (layer1.output)
activation1.inputs

array([[ 0.0194448 , -0.00753888,  0.02906676],
       [ 0.00869302, -0.0001234 , -0.00918796],
       [-0.00661898, -0.00191948,  0.02075133],
       [-0.00948066, -0.00182178,  0.02338598],
       [ 0.00176142,  0.00153605, -0.01252654],
       [ 0.02797608, -0.00089092, -0.02619534]])

In [88]:
# layer 1: current weights, shape (n_inputs, n_neurons) = (2, 3)
layer1.weights

array([[-0.00599458,  0.0036955 , -0.01832981],
       [-0.01470309,  0.00197681,  0.00346087]])

In [89]:
# layer 1: compute dweights, dbiases and dinputs from the ReLU gradient
layer1.backward(activation1.dinputs)

In [90]:
# gradient w.r.t. layer 1's weights (same shape as layer1.weights)
layer1.dweights

array([[-0.00267239, -0.00030551, -0.00011932],
       [-0.00525021,  0.00018421,  0.00058618]])

In [91]:
# gradient w.r.t. layer 1's biases (incoming gradient summed over the samples)
layer1.dbiases

array([[ 0.00213438, -0.00049794,  0.0002047 ]])