In [1]:
class NeuralNetwork(object):
    def __init__(self, epochs, learning_rate, batch_size,nodes, rho):
        self.learning_rate=learning_rate
        self.rho=rho
        self.nodes=nodes
        self.epochs=epochs
        self.batch_size=batch_size
    
    def fit(self, x, y, w_b, t):
        z1, z2, z3, a1, a2, output=NeuralNetwork.feed_forward(self,x, w_b, t)
        grads=NeuralNetwork.backprop(self, x, y, z1, z2, z3, a1, a2, output, w_b)
        return output, grads    
      
    def evaluate(self, output, y):
        accuracy = np.mean(np.argmax(y, axis=1) == np.argmax(output, axis=1))
        return accuracy*100
    
    def feed_forward(self, x, w_b, t):
        z1=np.dot(x, w_b[0]) + w_b[1]
        a1=NeuralNetwork.ReLu(self, z1)
        if t==0:
            a1=NeuralNetwork.dropout(self,a1, 0.2)

        z2=np.dot(a1, w_b[2]) + w_b[3]
        a2=NeuralNetwork.ReLu(self,z2)
        if t==0:
            a2=NeuralNetwork.dropout(self, a2, 0.2)
            
        z3=np.dot(a2, w_b[4]) + w_b[5]
        output=NeuralNetwork.softmax(self,z3)        
        
#    print("z1", z1.shape)
#    print("a1", a1.shape)
#    print("w1", w_b[0].shape)
#    print("b1", w_b[1].shape)
#    print("z2", z2.shape)
#    print("a2", a2.shape)
#    print("w2", w_b[2].shape)
#    print("b2", w_b[3].shape)
#    print("z3", z3.shape)
#    print("output", output.shape)
#    print("w3", w_b[4].shape)
#    print("b3", w_b[5].shape)
        if t==1:
            return output
        else:
            return z1, z2, z3, a1, a2, output
  
    def backprop(self, x, y, z1, z2, z3, a1, a2, output, w_b):
        
        #https://towardsdatascience.com/neural-networks-from-scratch-easy-vs-hard-b26ddc2e89c7
        #memoization
        a3_delta = output - y
        z2_delta = np.dot(a3_delta, w_b[4].T)
        a2_delta = z2_delta * NeuralNetwork.derv_relu(self,z2) 
        z1_delta = np.dot(a2_delta, w_b[2].T)
        a1_delta = z1_delta * NeuralNetwork.derv_relu(self, z1) 
        
        #gradient calculation
        L_dw3 = np.dot(a2.T, a3_delta)
        L_db3 = np.sum(a3_delta, axis=0).reshape(1,-1)
        L_dw2 = np.dot(a1.T, a2_delta)
        L_db2 = np.sum(a2_delta, axis=0).reshape(1,-1)
        L_dw1 = np.dot(x.T, a1_delta)
        L_db1 = np.sum(a1_delta, axis=0).reshape(1,-1)

        return [L_dw1, L_db1, L_dw2, L_db2,  L_dw3, L_db3]            


    def ReLu(self,layer):
        return np.maximum(0, layer)

#https://gluon.mxnet.io/chapter03_deep-neural-networks/mlp-dropout-scratch.html
    def dropout(self, x, drop_probability):
        keep_probability = 1 - drop_probability
        mask = np.random.uniform(0, 1.0, x.shape) < keep_probability
        if keep_probability > 0.0:
            scale = (1/keep_probability)
        else:
            scale = 0.0
        return mask * x * scale

    def softmax(self, layer):
        exp = np.exp(layer-np.max(layer, axis=1).reshape(-1,1))
        sum_e = np.sum(exp, axis=1).reshape(-1,1)
        return exp / sum_e
#        for i in range(layer.shape[0]):
#            e_x=np.exp(layer[i,:]-np.max(layer[i,:]))
#            s=np.sum(e_x)
#            layer[i,:]= e_x/s
#        return layer

    def derv_softmax(self, layer):
        return np.dot(layer.T,(1-layer))

    def derv_relu(self, layer):
        layer[layer > 0]=1
        layer[layer <= 0]=0
        return layer
    
#https://gluon.mxnet.io/chapter06_optimization/rmsprop-scratch.html
    def RMSProp(self, w_b, S, grads, rho, lr):
        for wb, grad, s in zip(w_b, grads, S):
            s=rho*s + (1-rho)*np.square(grad) 
            d=lr*grad/np.sqrt(s+1e-8)
            wb-=d 
        return w_b
    
    

In [2]:
import keras
from keras.datasets import fashion_mnist

# the data, split between train and test sets
(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

x_train = x_train.reshape(60000, 784)
x_test = x_test.reshape(10000, 784)
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

# convert class vectors to binary class matrices
num_classes = 10
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

Using TensorFlow backend.


60000 train samples
10000 test samples


In [None]:
import numpy as np
from sklearn.model_selection import KFold

data_x= np.concatenate((x_train, x_test))
data_y= np.concatenate((y_train, y_test))

nodes=512
epochs=20
rho=0.9
lr=0.001
batch_size=128

model=NeuralNetwork(epochs=epochs, learning_rate=lr, batch_size=batch_size,nodes=nodes, rho=rho)

#https://medium.com/usf-msds/deep-learning-best-practices-1-weight-initialization-14e5c0295b94
#w1=np.random.randn(x_train.shape[1], nodes) * np.sqrt(2/(x_train.shape[1]+nodes))
#w2=np.random.randn(nodes, nodes) * np.sqrt(2/(nodes+nodes))
#w3=np.random.randn(nodes, y_train.shape[1]) * np.sqrt(2/(nodes+y_train.shape[1]))
#tried out different intialisations

kfold = KFold(n_splits=5)
count=1
for train_index, test_index in kfold.split(data_x):
    print("TRAIN:", train_index, "TEST:", test_index)
    x_train, x_test = data_x[train_index], data_x[test_index]
    y_train, y_test = data_y[train_index], data_y[test_index]

    print("FOLD: ", count)
    count+=1

    x=x_train
    y=y_train
    
    w1=np.random.normal(0, 2/(x_train.shape[1]+nodes), (x_train.shape[1], nodes))
    w2=np.random.normal(0, 2/(nodes+nodes), (nodes, nodes))
    w3=np.random.normal(0, 2/(nodes+y.shape[1]), (nodes,y_train.shape[1]))

    b1=np.zeros((1, nodes))
    b2=np.zeros((1, nodes))
    b3=np.zeros((1, y_train.shape[1]))
    w_b=[w1,b1,w2,b2,w3,b3]
    S=[0,0,0,0,0,0]

    for i in range(epochs):

        iter=int(x.shape[0]/batch_size)
        print("Epoch", i+1)
        l=[]

        for j in range(iter):
            output, grads = model.fit(x[j*batch_size:(j+1)*batch_size, :], y[j*batch_size:(j+1)*batch_size, :], w_b, 0)
            l.append(np.sum(-y[j*batch_size:(j+1)*batch_size, :]*np.log(output+1e-12)/batch_size))
            #grads = model.backprop(x[j*batch_size:(j+1)*batch_size, :], y[j*batch_size:(j+1)*batch_size, :], z1, z2, z3, a1, a2, output, w_b)
            w_b=model.RMSProp(w_b, S, grads ,rho, lr)
            
            if j==iter-1 and x.shape[0]/batch_size!=0:
                output, grads = model.fit(x[(j+1)*batch_size:, :], y[(j+1)*batch_size:, :], w_b, 0)
                l.append(np.sum(-y[(j+1)*batch_size:, :]*np.log(output+1e-12)/y[(j+1)*batch_size:, :].shape[0]))
                #grads = model.backprop(x[j*batch_size:(j+1)*batch_size, :], y[j*batch_size:(j+1)*batch_size, :], z1, z2, z3, a1, a2, output, w_b)
                w_b=model.RMSProp(w_b, S, grads ,rho, lr)
                

    output_train=model.feed_forward(x, w_b, 1)
    b=model.evaluate(output_train, y)
    output_test=model.feed_forward(x_test, w_b, 1)
    a=model.evaluate(output_test, y_test)    
    print("Train Accuracy:", b)   
    print("Test Accuracy:", a) 


TRAIN: [14000 14001 14002 ... 69997 69998 69999] TEST: [    0     1     2 ... 13997 13998 13999]
FOLD:  1
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Train Accuracy: 73.99464285714285
Test Accuracy: 74.12857142857143
TRAIN: [    0     1     2 ... 69997 69998 69999] TEST: [14000 14001 14002 ... 27997 27998 27999]
FOLD:  2
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Train Accuracy: 73.89642857142857
Test Accuracy: 73.7
TRAIN: [    0     1     2 ... 69997 69998 69999] TEST: [28000 28001 28002 ... 41997 41998 41999]
FOLD:  3
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Train Accuracy: 74.2625
Test Accuracy: 74.25
TRA