In [26]:
import numpy as np

# Stochastic Gradient Descent (SGD)

In [27]:
class layer_dense:
    'Neural network dense layer'
    
    # initialization (weights and biases)
    def __init__(self, n_inputs, n_neurons):
        self.weights = 0.01*np.random.randn(n_inputs, n_neurons) 
        self.biases = np.zeros((1,n_neurons))

    # output
    def forward(self, inputs):
        self.output = inputs.dot(self.weights) + self.biases
        
        # remember input values
        self.inputs = inputs
      
    def backward(self, dvalues):
        
        # gradient
        self.dinputs = dvalues.dot(self.weights.T)
        
        # Gradients on parameters
        self.dweights = self.inputs.T.dot(dvalues)
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)

        
class activation_ReLU:
    'rectified linear unit activation function'
    
    def forward(self, inputs):
        self.output = np.maximum(0, inputs)
        
        # remember input values
        self.inputs = inputs
        
    def backward(self, dvalues):

        # copy dvalues
        self.dinputs = dvalues.copy()
        
        # zero gradient where input values were negative
        self.dinputs[self.inputs <= 0] = 0
        
class activation_softmax:
    'softmax activation function'
    
    def forward(self, inputs):
        
        # unnormalized probabilities
        exp_values = np.exp(inputs-np.max(inputs,axis=1, keepdims=True))
        
        # normalized probabilities
        probabilities = exp_values / np.sum(exp_values, axis=1, keepdims=True)
        self.output = probabilities
        
        # remember input values
        self.inputs = inputs
        
    def backward(self,dvalues):

        
        # initialized gradient (derivative) array
        self.dinputs = np.zeros(dvalues.shape)
        
        # For each data point, compute gradient
        for i in range(len(dvalues)):
            output_row = self.output[i]
            dvalues_row = dvalues[i]
            
            # Flatten output array
            output_row = output_row.reshape(-1, 1)

            # Calculate derivative matrix of the output
            derivative_matrix = np.diagflat(output_row) - output_row.dot(output_row.T)
            
            # store gradient
            self.dinputs[i] = dvalues_row.dot(derivative_matrix)


class loss:
    def calculate(self, output,y):
        sample_losses = self.forward(output,y)
        loss = np.mean(sample_losses)
        return loss
    
# cross-entropy loss        
class loss_crossentropy(loss):
    
    def forward(self, y_pred, y_true):
        
        # number of data points
        n_samples = len(y_pred)
        
        # clip data to prevent division by 0
        y_pred_clipped = np.clip(y_pred, 1e-7,1 - 1e-7)
        
        # confidence values
        confidence_values = y_pred_clipped[range(n_samples),y_true]
       
        # sample losses 
        losses = -np.log(confidence_values)
        return losses
    
    
    def backward(self, dvalues, y_true):
        
        
        # Number of samples
        n_samples = len(dvalues)
    
        #one-hot matrix
        Y = np.zeros(dvalues.shape)
        Y[np.arange(len(y_true)),y_true] = 1
        
        # Calculate gradient
        self.dinputs = -Y / dvalues
        
        # Normalize gradient
        self.dinputs = self.dinputs / n_samples
        
# SGD optimizer
class optimizer_SGD:

    def __init__(self, learning_rate=1.0):
        self.learning_rate = learning_rate
        
    def update_parameters(self, layer):
        layer.weights = layer.weights - self.learning_rate*layer.dweights
        layer.biases = layer.biases - self.learning_rate*layer.dbiases

In [10]:
sgd = optimizer_SGD(learning_rate=0.1)

In [11]:
sgd.learning_rate

0.1

## Example

In [43]:
X = np.zeros((10,2))
X[0:5] = 0.1*np.random.randn(5,2)
X[5:10] = 0.1*np.random.randn(5,2) + [2,2]
y = np.array([0,0,0,0,0,1,1,1,1,1])

In [44]:
X

array([[-0.0156219 ,  0.03021195],
       [-0.0917136 , -0.04809307],
       [-0.10580178, -0.11221288],
       [ 0.0356625 , -0.12174811],
       [-0.18838272, -0.03874585],
       [ 1.98734905,  1.99640412],
       [ 1.99311345,  1.9817274 ],
       [ 2.02913733,  2.16177591],
       [ 2.05214561,  2.01604176],
       [ 1.8421564 ,  2.12892829]])

In [114]:
# neural network layers
layer1 = layer_dense(n_inputs = 2, n_neurons = 10)
activation1 = activation_ReLU()
layer2 = layer_dense(n_inputs = 10, n_neurons = 2)
activation2 = activation_softmax()
loss_function = loss_crossentropy()

In [115]:
# optimizer
optimizer = optimizer_SGD(learning_rate=1)

In [116]:
n_epochs = 100
Loss = np.zeros(n_epochs)
for i in range(n_epochs):
    
    # forward pass
    layer1.forward(X)
    activation1.forward(layer1.output)
    layer2.forward(activation1.output)
    activation2.forward(layer2.output)
    losses = loss_function.forward(activation2.output,y)
    Loss[i] = np.mean(losses)
    
    # backward pass
    loss_function.backward(activation2.output,y)
    activation2.backward(loss_function.dinputs)
    layer2.backward(activation2.dinputs)
    activation1.backward(layer2.dinputs)
    layer1.backward(activation1.dinputs)
    
    # update weights and biases
    optimizer.update_parameters(layer1)
    optimizer.update_parameters(layer2)

In [117]:
plt.plot(Loss)

array([0.6929139 , 0.69201413, 0.68854884, 0.67474493, 0.62397221,
       0.48839149, 0.33081774, 0.2398186 , 0.18297364, 0.14546283,
       0.11954223, 0.10060874, 0.08659536, 0.07564038, 0.06697266,
       0.06000844, 0.05416445, 0.04935264, 0.04522849, 0.04166167,
       0.03862741, 0.03592732, 0.03353813, 0.03145836, 0.02956561,
       0.0278601 , 0.02634787, 0.02495484, 0.02368004, 0.02253106,
       0.0214683 , 0.02048167, 0.01958009, 0.01874424, 0.01796065,
       0.01723502, 0.01656224, 0.0159265 , 0.01533018, 0.01477858,
       0.01425373, 0.0137573 , 0.01329432, 0.01285601, 0.01243773,
       0.01204385, 0.0116726 , 0.01131647, 0.01097758, 0.01065975,
       0.01035352, 0.01006036, 0.00978348, 0.00951871, 0.00926333,
       0.00902022, 0.00878904, 0.00856523, 0.00835025, 0.00814702,
       0.00794957, 0.00775912, 0.00757781, 0.00740324, 0.00723368,
       0.00707123, 0.00691566, 0.00676412, 0.00661785, 0.00647855,
       0.00634248, 0.0062106 , 0.00608452, 0.0059622 , 0.00584

In [118]:
# probabilities 
activation2.output

array([[9.92714122e-01, 7.28587775e-03],
       [9.92911975e-01, 7.08802457e-03],
       [9.93010391e-01, 6.98960857e-03],
       [9.92844762e-01, 7.15523798e-03],
       [9.93020977e-01, 6.97902286e-03],
       [5.22135495e-04, 9.99477865e-01],
       [5.37677453e-04, 9.99462323e-01],
       [2.70147734e-04, 9.99729852e-01],
       [4.01658407e-04, 9.99598342e-01],
       [5.33580172e-04, 9.99466420e-01]])

In [119]:
# predictions
np.argmax(activation2.output,axis=1)

array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1], dtype=int64)