In [51]:
from scipy.signal import correlate2d, convolve2d

In [38]:
class convolutional_layer():
    def __init__(self, input_shape, kernel_size, depth):
        
        # input_shape = (depth, width, height) of each input
        input_depth, input_height, input_width = input_shape
        self.input_shape = input_shape
        self.input_depth = input_depth
        
        
        # depth = number of kernels
        self.depth = depth
        
        # output shape
        self.output_shape = (depth, input_height - kernel_size + 1, input_width - kernel_size + 1)
        
        # initialize kernels (weights) and biases
        self.kernels_shape = (depth, input_depth, kernel_size, kernel_size)
        self.weights = 0.1*np.random.randn(depth, input_depth, kernel_size, kernel_size)
        self.biases = np.zeros(self.output_shape)

        # initialize momentums and caches (for ADAM)
        self.weight_momentums = np.zeros(self.weights.shape)
        self.weight_caches = np.zeros(self.weights.shape)
        self.bias_momentums = np.zeros(self.biases.shape)
        self.bias_caches = np.zeros(self.biases.shape)
        
    def forward(self, inputs):
        
        # save inputs
        self.inputs = inputs
        
        # initialize output array
        self.output = np.copy(self.biases)
        
        # loop over kernels
        for i in range(self.depth):
            for j in range(self.input_depth):
                self.output[i] += correlate2d(self.inputs[j], self.weights[i, j], "valid")
                
    def backward(self, dinputs):

        # initialize gradients
        self.dweights = np.zeros(self.kernels_shape) # weights = kernels Kij
        self.dinputs = np.zeros(self.input_shape)
        self.dbiases = dinputs
        
        for i in range(self.depth):
            for j in range(self.input_depth):
                self.dweights[i, j] = correlate2d(self.inputs[j], dinputs[i], "valid")
                self.dinputs[j] += convolve2d(dinputs[i], self.weights[i, j], "full")

    
    
    
class reshape:
    def __init__(self, input_shape, output_shape):
        self.input_shape = input_shape
        self.output_shape = output_shape

    def forward(self, inputs):
        self.output = np.reshape(inputs, self.output_shape)

    def backward(self, dinputs):
        self.dinputs = np.reshape(dinputs, self.input_shape)
        
# neural network layer
class layer:
    
    # initialization (weights and biases)
    def __init__(self,n_inputs, n_neurons): # self represents the instance of the class
        self.weights = 0.1*np.random.randn(n_inputs,n_neurons) # small random numbers
        self.biases = np.zeros((1,n_neurons))# zeros
        
        # initialize momentums and caches
        self.weight_momentums = np.zeros(self.weights.shape)
        self.weight_caches = np.zeros(self.weights.shape)
        self.bias_momentums = np.zeros(self.biases.shape)
        self.bias_caches = np.zeros(self.biases.shape)
        
    # forward pass: pass the data through the layer
    def forward(self, inputs):
        # we need to store the output
        self.output = inputs.dot(self.weights) + self.biases
        
        # remember input values
        self.inputs = inputs
        
    def backward(self, dinputs):
        
        # gradient
        self.dinputs = dinputs.dot(self.weights.T)
        
        # gradients on parameters
        self.dweights = self.inputs.T.dot(dinputs)
        self.dbiases = np.sum(dinputs, axis=0, keepdims=True)
        

        
# activation function
class ReLU:
    def forward(self, inputs):
        self.output = np.maximum(0, inputs)
        
        # remember input values
        self.inputs = inputs
        
    def backward(self, dinputs):

        # copy dvalues
        self.dinputs = dinputs.copy()
        
        # zero gradient where input values were negative
        self.dinputs[self.inputs <= 0] = 0
        
        

        
# softmax activation function
class softmax:
    
    def forward(self, inputs):
        
        # unnormalized probabilities
        exp_values = np.exp(inputs-np.max(inputs,axis=1, keepdims=True))
        
        # normalized probabilities
        probabilities = exp_values / np.sum(exp_values, axis=1, keepdims=True)
        self.output = probabilities
        
        # remember input values
        self.inputs = inputs
        
        
    def backward(self,dinputs):

        
        # initialized gradient array
        self.dinputs = np.zeros(dinputs.shape)
        
        # For each data point, compute gradient
        for i in range(len(dinputs)):
            
            # probabilities 
            proba = self.output[i]
            
            # loss gradients
            dinputs_row = dinputs[i]


            # calculate derivative matrix of the output
            derivative_matrix = np.diag(proba) - proba.reshape(-1,1).dot(proba.reshape(1,-1))
            
            # store gradient
            self.dinputs[i] = dinputs_row.dot(derivative_matrix)
            

class loss_crossentropy():
    
    def forward(self, y_hat, y):
        
        # number of data points
        n_samples = len(y)
        
        # clip data to prevent division by 0
        y_hat_clipped = np.clip(y_hat, 1e-7,1 - 1e-7)
        
        # predicted class probabilities
        pred_proba = y_hat_clipped[range(n_samples),y]
       
        # sample losses 
        losses = -np.log(pred_proba)
        
        return losses
    
    
    def backward(self, p, y):
        
        
        # number of samples
        n_samples = len(y)
    
        # one-hot matrix
        Y = np.zeros(p.shape)
        Y[np.arange(len(y)),y] = 1
        
        # calculate gradient
        self.dinputs = -Y / np.clip(p,a_min=1e-200,a_max=1)

        # normalized gradient
        self.dinputs = self.dinputs / n_samples
            
            
        
class optimizer_GD:

    def __init__(self, learning_rate=1):
        self.learning_rate = learning_rate
        
    def update_parameters(self, layer):
        
        # apply gradient descent iteration
        layer.weights = layer.weights - self.learning_rate*layer.dweights
        layer.biases = layer.biases - self.learning_rate*layer.dbiases
        
        
# Adam optimizer
class optimizer_ADAM:
    
    # Initialize optimizer - set parameters
    def __init__(self, learning_rate=0.001, decay=0., epsilon=1e-7,beta_1=0.9, beta_2=0.999):
        
        # initial learning rate
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        
        # decay parameters
        self.decay = decay
        
        # number of epochs
        self.epoch = 0
        
        # update learning rate parameter
        self.epsilon = epsilon
        
        # momentum parameter
        self.beta_1 = beta_1
        
        # cache parameter
        self.beta_2 = beta_2
        
        
    # update learning rate
    def pre_update_parameters(self):
        
        # update learning rate
        self.current_learning_rate = self.learning_rate * (1 / (1 + self.decay * self.epoch))
            
    # Update weights/biases
    def update_parameters(self, layer):
                   
        # momentums
        layer.weight_momentums = self.beta_1 * layer.weight_momentums + (1 - self.beta_1) * layer.dweights
        layer.bias_momentums = self.beta_1 * layer.bias_momentums + (1 - self.beta_1) * layer.dbiases
        
        # corrected momentums
        weight_momentums_corrected = layer.weight_momentums / (1 - self.beta_1 ** (self.epoch + 1))
        bias_momentums_corrected = layer.bias_momentums / (1 - self.beta_1 ** (self.epoch + 1))
        
        # cache
        layer.weight_caches = self.beta_2 * layer.weight_caches + (1 - self.beta_2) * layer.dweights**2  
        layer.bias_caches = self.beta_2 * layer.bias_caches + (1 - self.beta_2) * layer.dbiases**2
        
        # corrected cache
        weight_caches_corrected = layer.weight_caches / (1 - self.beta_2 ** (self.epoch + 1))
        bias_caches_corrected = layer.bias_caches / (1 - self.beta_2 ** (self.epoch + 1))
        
        # update weights and biases
        layer.weights += -self.current_learning_rate * weight_momentums_corrected / (np.sqrt(weight_caches_corrected) + self.epsilon)
        layer.biases += -self.current_learning_rate * bias_momentums_corrected / (np.sqrt(bias_caches_corrected) + self.epsilon)
        
    # update epoch
    def post_update_parameters(self):
        self.epoch += 1
        
        
# dropout layer
class dropout:
    
    # initialization - set the dropout rate
    def __init__(self, rate):
        self.rate = rate
        
    # forward pass
    def forward(self, inputs):
        # save inputs 
        self.inputs = inputs
        # mask
        self.mask = np.random.binomial(1, 1 - self.rate, size = inputs.shape) / (1 - self.rate)
        # apply mask
        self.output = inputs * self.mask
        
    # backward pass
    def backward(self, dinputs):
        self.dinputs = dinputs*self.mask

In [2]:
import numpy as np
import matplotlib.pyplot as plot
import pandas as pd

In [3]:
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Theory/master/Data/fashion-mnist_train.csv'
data = pd.read_csv(url)
X = data.drop('label',axis=1).to_numpy()
y = data['label']

In [4]:
X.shape

(5000, 784)

In [46]:
# layer 1
layer1 = layer(n_inputs = 784, n_neurons = 128)
activation1 = ReLU()
dropout1 = dropout(0.4)

# layer 2
layer2 = layer(n_inputs = 128, n_neurons = 10)
activation2 = softmax()


# loss function 
loss_function = loss_crossentropy()

# optimizer
optimizer = optimizer_ADAM(learning_rate=0.01)

In [47]:
# fit the neural network to the training dataset
n_epochs = 501
loss = np.zeros(n_epochs)
accuracy = np.zeros(n_epochs)

# gradient descent iterations
for epoch in range(n_epochs):

    # forward pass
    layer1.forward(X)
    activation1.forward(layer1.output)
    dropout1.forward(activation1.output)
    layer2.forward(dropout1.output)
    activation2.forward(layer2.output)


    # loss (mean squared error)
    losses = loss_function.forward(activation2.output,y)
    loss[epoch] = np.mean(losses)
    
    # accuracy
    y_pred = np.argmax(activation2.output,axis=1)
    accuracy[epoch] = 100*np.mean(y_pred == y)

    # backward pass
    loss_function.backward(activation2.output,y)
    activation2.backward(loss_function.dinputs)
    layer2.backward(activation2.dinputs)
    dropout1.backward(layer2.dinputs)
    activation1.backward(dropout1.dinputs)
    layer1.backward(activation1.dinputs)
    
    
    # update weights and biases
    optimizer.pre_update_parameters()
    optimizer.update_parameters(layer1)
    optimizer.update_parameters(layer2)
    optimizer.post_update_parameters()

    
    
    # print loss and accuracy (every 100 epochs)
    if epoch % 10 == 0:
        print('epoch: '+str(epoch))
        print('loss: '+str(np.round(loss[epoch],3)))
        print('accuracy: '+str(accuracy[epoch]))
        print('------------------------')

epoch: 0
loss: 2.574
accuracy: 6.419999999999999
------------------------
epoch: 10
loss: 1.853
accuracy: 30.0
------------------------
epoch: 20
loss: 1.727
accuracy: 39.42
------------------------
epoch: 30
loss: 1.668
accuracy: 41.44
------------------------
epoch: 40
loss: 1.605
accuracy: 44.12
------------------------
epoch: 50
loss: 1.545
accuracy: 46.339999999999996
------------------------
epoch: 60
loss: 1.474
accuracy: 48.94
------------------------
epoch: 70
loss: 1.401
accuracy: 50.54
------------------------
epoch: 80
loss: 1.34
accuracy: 53.559999999999995
------------------------
epoch: 90
loss: 1.288
accuracy: 54.54
------------------------
epoch: 100
loss: 1.233
accuracy: 55.1
------------------------
epoch: 110
loss: 1.175
accuracy: 57.52
------------------------
epoch: 120
loss: 1.13
accuracy: 59.12
------------------------
epoch: 130
loss: 1.079
accuracy: 60.919999999999995
------------------------
epoch: 140
loss: 1.041
accuracy: 63.019999999999996
----------------

In [41]:
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Theory/master/Data/fashion-mnist_test.csv'
data_test = pd.read_csv(url)
X_test = data_test.drop('label',axis=1).to_numpy()
y_test = data_test['label']

In [48]:
# forward pass
layer1.forward(X_test)
activation1.forward(layer1.output)
dropout1.forward(activation1.output)
layer2.forward(dropout1.output)
activation2.forward(layer2.output)

# prediction
y_pred_test = np.argmax(activation2.output,axis=1)
100*np.mean(y_pred_test == y_test)

74.25

In [49]:
# layer 1
layer1 = convolutional_layer(input_shape=(1,28,28), kernel_size=5, depth=5)
activation1 = ReLU()

# layer 2
layer2 = convolutional_layer(input_shape=(5,24,24), kernel_size=5, depth=5) # 24 = 28-5+1
activation2 = ReLU()

# layer 3
layer3 = reshape(input_shape=(5,20,20), output_shape=(1,5*20*20)) # 20 = 24-5+1

# layer 4
layer4 = layer(n_inputs = 5 * 20 * 20, n_neurons = 10)
activation4 = softmax()


# loss function 
loss_function = loss_crossentropy()

# optimizer
optimizer = optimizer_ADAM(learning_rate=0.01)

In [53]:
X = X.reshape(len(X),1,28,28)

In [54]:
n_epochs = 10
loss = np.zeros(n_epochs)
accuracy = np.zeros(n_epochs)
# gradient descent iterations
for epoch in range(n_epochs):
    
    # EPOCH = loop over all datapoints
    losses = np.zeros(len(X))
    y_pred = np.zeros(len(X))
    for i in range(len(X)):

        # forward pass
        layer1.forward(X[i])
        activation1.forward(layer1.output)

        layer2.forward(activation1.output)
        activation2.forward(layer2.output)
        
        layer3.forward(activation2.output)
        
        layer4.forward(layer3.output)
        activation4.forward(layer4.output)
        

        # sample loss
        losses[i] = loss_function.forward(activation4.output,y[i].reshape(1))
        # prediction
        y_pred[i] = np.argmax(activation4.output,axis=1)
        

        # backward pass
        loss_function.backward(activation4.output,y[i].reshape(1))
        activation4.backward(loss_function.dinputs)
        layer4.backward(activation4.dinputs)

        layer3.backward(layer4.dinputs)
        
        activation2.backward(layer3.dinputs)
        layer2.backward(activation2.dinputs)

        activation1.backward(layer2.dinputs)
        layer1.backward(activation1.dinputs)


        # update weights and biases
        optimizer.pre_update_parameters()
        optimizer.update_parameters(layer1)
        optimizer.update_parameters(layer2)
        optimizer.update_parameters(layer4)
        optimizer.post_update_parameters()

    # loss and accuracy
    loss[epoch] = np.mean(losses)
    accuracy[epoch] = 100*np.mean(y_pred == y)
    
    
    # print loss and accuracy
    print('epoch: '+str(epoch))
    print('loss: '+str(np.round(loss[epoch],3)))
    print('accuracy: '+str(accuracy[epoch]))
    print('------------------------')

epoch: 0
loss: 2.311
accuracy: 9.74
------------------------
epoch: 1
loss: 2.311
accuracy: 9.78
------------------------
epoch: 2
loss: 2.311
accuracy: 9.78
------------------------


KeyboardInterrupt: 

In [64]:
# layer 1
layer1 = convolutional_layer(input_shape=(1,28,28), kernel_size=5, depth=10)
activation1 = ReLU()


# layer 2
layer2 = reshape(input_shape=(10,24,24), output_shape=(1,10*24*24)) # 24 = 28-5+1

# layer 3
layer3 = layer(n_inputs = 10 * 24 * 24, n_neurons = 10)
activation3 = softmax()


# loss function 
loss_function = loss_crossentropy()

# optimizer
optimizer = optimizer_ADAM(learning_rate=0.01)

In [65]:
n_epochs = 10
loss = np.zeros(n_epochs)
accuracy = np.zeros(n_epochs)
# gradient descent iterations
for epoch in range(n_epochs):
    
    # EPOCH = loop over all datapoints
    losses = np.zeros(len(X))
    y_pred = np.zeros(len(X))
    for i in range(len(X)):

        # forward pass
        layer1.forward(X[i])
        activation1.forward(layer1.output)

        layer2.forward(activation1.output)
        
        layer3.forward(layer2.output)
        activation3.forward(layer3.output)
        

        # sample loss
        losses[i] = loss_function.forward(activation3.output,y[i].reshape(1))
        # prediction
        y_pred[i] = np.argmax(activation3.output,axis=1)
        

        # backward pass
        loss_function.backward(activation3.output,y[i].reshape(1))
        activation3.backward(loss_function.dinputs)
        layer3.backward(activation3.dinputs)


        layer2.backward(layer3.dinputs)

        activation1.backward(layer2.dinputs)
        layer1.backward(activation1.dinputs)


        # update weights and biases
        optimizer.pre_update_parameters()
        optimizer.update_parameters(layer1)
        optimizer.update_parameters(layer3)
        optimizer.post_update_parameters()

    # loss and accuracy
    loss[epoch] = np.mean(losses)
    accuracy[epoch] = 100*np.mean(y_pred == y)
    
    
    # print loss and accuracy 
    print('epoch: '+str(epoch))
    print('loss: '+str(np.round(loss[epoch],3)))
    print('accuracy: '+str(accuracy[epoch]))
    print('------------------------')

epoch: 0
loss: 1.277
accuracy: 54.74
------------------------
epoch: 1
loss: 0.84
accuracy: 70.34
------------------------
epoch: 2
loss: 0.735
accuracy: 74.18
------------------------
epoch: 3
loss: 0.69
accuracy: 76.46
------------------------
epoch: 4
loss: 0.646
accuracy: 77.86
------------------------
epoch: 5
loss: 0.623
accuracy: 79.17999999999999
------------------------
epoch: 6
loss: 0.611
accuracy: 79.88
------------------------
epoch: 7
loss: 0.571
accuracy: 80.78
------------------------
epoch: 8
loss: 0.58
accuracy: 80.64
------------------------
epoch: 9
loss: 0.576
accuracy: 81.28
------------------------


In [69]:
X_test = X_test.reshape(len(X_test),1,28,28)

In [70]:
y_test_pred = np.zeros(len(X_test))
for i in range(len(X_test)):

    # forward pass
    layer1.forward(X_test[i])
    activation1.forward(layer1.output)

    layer2.forward(activation1.output)

    layer3.forward(layer2.output)
    activation3.forward(layer3.output)

    # prediction
    y_test_pred[i] = np.argmax(activation3.output,axis=1)
        

In [71]:
100*np.mean(y_test==y_test_pred)

68.0