In [2]:
#!/usr/bin/env python
# coding: utf-8

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the dataset directory and print the full path of every file found, as a
# quick check that the files loaded by the later cells are actually present.
for dirname, _, filenames in os.walk('../input/deepleaninghw1'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import random
import math
from sklearn.model_selection import train_test_split

In [3]:
class Layer():
    """Parameter container for one fully connected layer.

    Attributes:
        in_size: number of input features.
        out_size: number of output units.
        N: number of instances in the training data; 1/N scales the initial weights.
        weights: (in_size, out_size) array, uniform samples from [0, 1) times 1/N.
        bias: (1, out_size) array, uniform samples from [0, 1).
    """

    def __init__(self, in_size, out_size,N):
        self.in_size, self.out_size, self.N = in_size, out_size, N

        # The weights are drawn before the bias so the global random stream is
        # consumed in a fixed order.
        weight_scale = 1/self.N
        weight_shape = (self.in_size, self.out_size)
        self.weights = np.random.rand(*weight_shape) * weight_scale

        bias_shape = (1, self.out_size)
        self.bias = np.random.rand(*bias_shape)
        
class NN():
    """Three-layer fully connected classifier: tanh -> tanh -> softmax.

    Each layer object only needs a ``weights`` array of shape (in_size, out_size)
    and a ``bias`` array of shape (1, out_size). Samples are rows, so a batch of
    inputs has shape (n_samples, n_features) and labels are one-hot rows.
    """

    # Step size used for every bias update, independent of the weight learning rate.
    BIAS_LEARNING_RATE = 0.001
    # Floor applied to probabilities before taking the log so the loss stays finite.
    LOG_EPSILON = 1e-12

    def __init__(self,layer1,layer2,layer3):
        self.layer1=layer1
        self.layer2=layer2
        self.layer3=layer3

    def softmax(self,x):
        """Return the softmax of ``x`` along its last axis (one distribution per row)."""
        # Subtracting the row maximum leaves the result unchanged but avoids overflow in exp.
        e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return e_x / np.sum(e_x, axis=-1, keepdims=True)

    def tanh(self,x):
        """Element-wise hyperbolic tangent."""
        return np.tanh(x)

    def forward_propagation(self,inputs):
        """Run a forward pass.

        Args:
            inputs: array of shape (n_samples, layer1 in_size).

        Returns:
            Tuple ``(output_layer_1, output_layer_2, output_layer_3)``: the two
            tanh activations and the softmax class probabilities.
        """
        z_layer_1 = np.dot(inputs, self.layer1.weights) + self.layer1.bias
        output_layer_1 = self.tanh(z_layer_1)

        z_layer_2 = np.dot(output_layer_1, self.layer2.weights) + self.layer2.bias
        output_layer_2 = self.tanh(z_layer_2)

        z_layer_3 = np.dot(output_layer_2, self.layer3.weights) + self.layer3.bias
        # The loss takes log(output), so the last layer must emit probabilities
        # (softmax), not tanh values that can be zero or negative.
        output_layer_3 = self.softmax(z_layer_3)

        return output_layer_1, output_layer_2, output_layer_3

    def loss(self,labels,layer_output):
        """Return the summed cross-entropy between one-hot ``labels`` and ``layer_output``."""
        safe_output = np.clip(layer_output, self.LOG_EPSILON, None)
        return np.negative(np.sum(np.multiply(labels, np.log(safe_output))))

    def loss_derivative(self,labels,layer_output):
        """Gradient of the mean cross-entropy w.r.t. the softmax pre-activations.

        For softmax combined with cross-entropy this is ``(probabilities - labels)``
        divided by the number of samples.
        """
        return (layer_output - labels) / labels.shape[0]

    def backward_propagation(self, learning_rate, labels, training_inputs, output_layer_1, output_layer_2, output_layer_3):
        """Back-propagate the loss and update all weights and biases in place.

        Args:
            learning_rate: step size for the weight matrices.
            labels: one-hot targets, shape (n_samples, n_classes).
            training_inputs: the inputs that produced the given layer outputs.
            output_layer_1, output_layer_2, output_layer_3: values returned by
                ``forward_propagation`` for ``training_inputs``.
        """
        targets = labels
        # All deltas are computed before any parameter is modified, so every
        # gradient is taken with respect to the same (pre-update) weights.
        delta_layer3 = self.loss_derivative(targets, output_layer_3)
        # d tanh(z)/dz = 1 - tanh(z)**2, and output_layer_k already holds tanh(z).
        delta_layer2 = (delta_layer3).dot(self.layer3.weights.T) * (1 - output_layer_2 ** 2)
        delta_layer1 = (delta_layer2).dot(self.layer2.weights.T) * (1 - output_layer_1 ** 2)

        learning_rate_bias = self.BIAS_LEARNING_RATE

        self.layer3.weights -= learning_rate * output_layer_2.T.dot(delta_layer3)
        self.layer3.bias -= learning_rate_bias * (delta_layer3).sum(axis=0)

        self.layer2.weights -= learning_rate * output_layer_1.T.dot(delta_layer2)
        self.layer2.bias -= learning_rate_bias * (delta_layer2).sum(axis=0)

        self.layer1.weights -= learning_rate * training_inputs.T.dot(delta_layer1)
        self.layer1.bias -= learning_rate_bias * (delta_layer1).sum(axis=0)

    def train(self, batch_size, training_inputs, labels, n_epochs, learning_rate):
        """Train with mini-batch gradient descent and report the loss per epoch.

        Args:
            batch_size: samples per update; a falsy value or one that is at least
                the dataset size means one full-batch update per epoch.
            training_inputs: array of shape (n_samples, n_features).
            labels: one-hot targets of shape (n_samples, n_classes).
            n_epochs: number of passes over the data.
            learning_rate: step size for the weight matrices.

        Returns:
            List with the mean per-sample loss of every epoch.

        Raises:
            ValueError: if ``training_inputs`` is empty.
        """
        self.batch_size = batch_size
        self.learning_rate = learning_rate

        n_samples = len(training_inputs)
        if n_samples == 0:
            raise ValueError("training_inputs is empty")
        if batch_size and 0 < batch_size < n_samples:
            step = batch_size
        else:
            step = n_samples

        costs=[]
        for j in range(n_epochs):
            print("/// EPOCH: ", j+1, "/", n_epochs, " ///")

            epoch_loss = 0.0
            # Batches are taken in order (no shuffling) so runs stay deterministic.
            for start in range(0, n_samples, step):
                batch_inputs = training_inputs[start:start + step]
                batch_labels = labels[start:start + step]

                output_layer_1, output_layer_2, output_layer_3 = self.forward_propagation(batch_inputs)
                epoch_loss += self.loss(batch_labels, output_layer_3)
                self.backward_propagation(learning_rate, batch_labels, batch_inputs, output_layer_1, output_layer_2, output_layer_3)

            error_result = epoch_loss / n_samples
            print("\nError: ", error_result)

            costs.append(error_result)

        print(costs)
        return costs

    def test_accuracy(self, test_inputs, test_outputs):
        """Print and return the percentage of rows whose arg-max prediction matches the one-hot target."""
        output_layer_1, output_layer_2, output_layer_3 = self.forward_propagation(test_inputs)
        number_of_instance = len(test_outputs)
        nearest_node = np.argmax(output_layer_3, axis=1)
        test_real_value = np.argmax(test_outputs, axis=1)
        correct = (nearest_node == test_real_value).sum()
        accuracy = correct*100/ number_of_instance
        print ("Accuracy: ", accuracy,"%")
        return accuracy

    def print_weights(self):
        """Print the weight matrix and bias vector of every layer."""
        print ("============= LAYER 1 =============")
        print ("------------ WEIGHT 1 -------------:")
        print (self.layer1.weights)
        print ("------------ BIAS 1 -------------:")
        print (self.layer1.bias)
        print ("============= LAYER 2 =============")
        print ("------------ WEIGHT 2 -------------:")
        print (self.layer2.weights)
        print ("------------ BIAS 2 -------------:")
        print (self.layer2.bias)
        print ("============= LAYER 3 =============")
        print ("------------ WEIGHT 3 -------------:")
        print (self.layer3.weights)
        print ("------------ BIAS 3 -------------:")
        print (self.layer3.bias)

In [4]:
#test_y = pd.read_csv('../input/deepleaninghw1/sample_submission.csv')
#test_y=pd.get_dummies(test_y,prefix=['col2'])

In [9]:
if __name__=='__main__':
    # Network dimensions: 784 input pixels -> 512 -> 256 -> 10 classes.
    number_of_inputs_node=784
    number_of_layer1_node=512
    number_of_layer2_node=256
    number_of_outputs_node = 10

    # Load the training images, scale the pixel values by 1/255 and flatten
    # every image into one row of `number_of_inputs_node` values.
    train_x = np.load('../input/deepleaninghw1/mnist.train.npy')
    train_x = train_x / 255
    train_x = train_x.reshape(train_x.shape[0], number_of_inputs_node)

    train_y= np.load('../input/deepleaninghw1/mnist.trainlabel.npy')

    def one_hot_encoding(Y):
        """Return a (len(Y), max(Y) + 1) array with a 1. at column Y[i] of every row i."""
        n_col = np.amax(Y) + 1
        binarized = np.zeros((len(Y), n_col))
        for i in range(len(Y)):
            binarized[i, Y[i]] = 1.
        return binarized

    train_y = train_y.reshape(-1, 1)
    train_y = one_hot_encoding(train_y)

    N=train_x.shape[0]

    # Layer takes (in_size, out_size, N) and builds weights of shape
    # (in_size, out_size); the forward pass computes np.dot(inputs, weights),
    # so each layer's in_size must equal the width of what feeds it.
    layer1 = Layer(number_of_inputs_node, number_of_layer1_node, N)
    layer2 = Layer(number_of_layer1_node, number_of_layer2_node, N)
    layer3 = Layer(number_of_layer2_node, number_of_outputs_node, N)

    neural_network = NN(layer1,layer2,layer3)
    neural_network.train(32,train_x,train_y,n_epochs=50, learning_rate=0.01)
    #neural_network.print_weights()


In [36]:
    # Reload the raw training/test arrays and the sample-submission CSV, then
    # normalise, flatten and one-hot encode them. Every name assigned here
    # replaces any earlier binding of the same name.
    train_x = np.load('../input/deepleaninghw1/mnist.train.npy')
    # NOTE(review): the CSV is read with all of its columns; if it carries an id
    # column next to the label, to_numpy().reshape(-1, 1) below will mix the ids
    # into the label vector — confirm the file's schema.
    test_y = pd.read_csv('../input/deepleaninghw1/sample_submission.csv')
    train_y = np.load('../input/deepleaninghw1/mnist.trainlabel.npy')
    test_x = np.load('../input/deepleaninghw1/mnist.test.npy')


    # one-hot encode the labels
    # Convert an array of integer labels to one-hot encoding
    def one_hot_encoding(Y):
        """Return a (len(Y), max(Y) + 1) array of zeros with a 1. at [i, Y[i]] for every row i.

        The number of columns is inferred from the largest value in ``Y``.
        """
        n_col = np.amax(Y) + 1
        binarized = np.zeros((len(Y), n_col))
        for i in range(len(Y)):
            binarized[i, Y[i]] = 1.
        return binarized

    # Change the shape of the array.
    # e.g. a 56000 * 28 * 28 3-D array becomes a 56000 * 784 2-D array
    def matrix_mul(matrix, remain_dimen, pixels):
        """Reshape ``matrix`` to 2-D with shape (matrix.shape[remain_dimen], pixels).

        Despite the name, no multiplication is performed. Not called in this cell.
        """
        matrix = matrix.reshape(matrix.shape[remain_dimen], pixels)
        return matrix

    # Prepare X for train and test: scale by 1/255, then flatten each sample to `pixels` values
    pixels = 784
    train_x = train_x / 255
    test_x = test_x / 255   
    train_x = train_x.reshape(train_x.shape[0], pixels)
    test_x = test_x.reshape(test_x.shape[0], pixels)        

    # Prepare Y for train and test: column vectors first, then one-hot matrices
    train_y = train_y.reshape(-1, 1)
    test_y = test_y.to_numpy()
    test_y = test_y.reshape(-1, 1)
    train_y = one_hot_encoding(train_y)
    # NOTE(review): the one-hot width is inferred separately for train and test,
    # so the two matrices can end up with different numbers of columns.
    test_y = one_hot_encoding(test_y) 

In [143]:
def seed_everything(SEED):
    """Seed NumPy's global generator and Python's ``random`` module with ``SEED``."""
    for seed_fn in (np.random.seed, random.seed):
        seed_fn(SEED)

seed_everything(1313)

In [144]:
class FCLayer:
    """Fully connected layer computing ``input @ weights + bias`` on row vectors.

    Attributes:
        weights: (input_size, output_size) array, standard-normal samples divided
            by sqrt(input_size + output_size).
        bias: (1, output_size) array initialised the same way.
    """

    def __init__(self, input_size, output_size):
        self.input_size = input_size
        self.output_size = output_size

        self.weights = np.random.randn(input_size, output_size) / np.sqrt(input_size + output_size)
        self.bias = np.random.randn(1, output_size) / np.sqrt(input_size + output_size)

    def forward(self, input):
        """Cache ``input`` for the backward pass and return the affine output."""
        self.input = input
        return np.dot(input, self.weights) + self.bias

    def backward(self, output_error, learning_rate):
        """Apply one gradient step and return the error w.r.t. this layer's input.

        Args:
            output_error: gradient of the loss w.r.t. this layer's output, with
                one row per sample passed to the last ``forward`` call.
            learning_rate: step size for both weights and bias.

        Returns:
            Gradient of the loss w.r.t. the layer input, computed with the
            weights as they were before this update.
        """
        input_error = np.dot(output_error, self.weights.T)
        weights_error = np.dot(self.input.T, output_error)
        # The bias is shared by every sample, so its gradient is the sum over the
        # batch axis. For a single row this equals output_error itself; without
        # the sum a batch of n > 1 rows cannot be subtracted from the (1, out) bias.
        bias_error = np.atleast_2d(output_error).sum(axis=0, keepdims=True)

        self.weights -= learning_rate * weights_error
        self.bias -= learning_rate * bias_error
        return input_error

In [145]:
class ActivationLayer:
    """Layer that applies an activation function and, on the way back, its derivative.

    Args:
        activation: callable applied to the input in ``forward``.
        activation_prime: callable giving the derivative of ``activation``,
            evaluated at the cached input in ``backward``.
    """

    def __init__(self, activation, activation_prime):
        self.activation, self.activation_prime = activation, activation_prime

    def forward(self, input):
        """Remember ``input`` and return ``activation(input)``."""
        self.input = input
        activated = self.activation(self.input)
        return activated

    def backward(self, output_error, learning_rate):
        """Return ``output_error`` scaled by the derivative at the cached input.

        ``learning_rate`` is accepted for interface compatibility and is unused.
        """
        local_gradient = self.activation_prime(self.input)
        return output_error * local_gradient

In [146]:
# bonus
class FlattenLayer:
    """Reshape a sample into a single row going forward and back to ``input_shape`` going backward."""

    def __init__(self, input_shape):
        self.input_shape = input_shape

    def forward(self, input):
        """Return ``input`` reshaped to one row: shape (1, number of elements)."""
        single_row = (1, -1)
        return np.reshape(input, single_row)

    def backward(self, output_error, learning_rate):
        """Return ``output_error`` reshaped to ``input_shape``; ``learning_rate`` is unused."""
        restored = np.reshape(output_error, self.input_shape)
        return restored

In [147]:
# bonus
class SoftmaxLayer:
    """Softmax over the last axis, with the matching backward pass.

    Args:
        input_size: number of entries per row; stored for reference.
    """

    def __init__(self, input_size):
        self.input_size = input_size

    def forward(self, input):
        """Cache ``input`` and return one softmax distribution per row."""
        self.input = input
        logits = np.asarray(input)
        # Subtracting the row maximum does not change the result but keeps
        # np.exp from overflowing on large scores.
        tmp = np.exp(logits - np.max(logits, axis=-1, keepdims=True))
        self.output = tmp / np.sum(tmp, axis=-1, keepdims=True)
        return self.output

    def backward(self, output_error, learning_rate):
        """Return the error w.r.t. the softmax input; ``learning_rate`` is unused.

        With s = softmax(z) and e = dL/ds, the Jacobian-vector product is
        ``dL/dz_j = s_j * (e_j - sum_i e_i * s_i)``. This matches the
        identity-minus-tiled-output form without building an
        (input_size, input_size) matrix.
        """
        weighted_error = np.sum(output_error * self.output, axis=-1, keepdims=True)
        return self.output * (output_error - weighted_error)

In [148]:
def tanh(x):
    """Element-wise hyperbolic tangent (thin wrapper around ``np.tanh``)."""
    activated = np.tanh(x)
    return activated

def tanh_prime(x):
    """Derivative of tanh evaluated at ``x``: ``1 - tanh(x)**2``."""
    tanh_x = np.tanh(x)
    return 1 - tanh_x**2

In [149]:
def softmax(X):
    """Return the softmax of ``X`` along its last axis.

    For a (num_examples x num_classes) matrix every row becomes its own
    probability distribution; a 1-D vector is treated as a single example.
    """
    scores = np.asarray(X)
    # Shifting by the row maximum leaves the result unchanged and prevents overflow.
    exps = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
    return exps / np.sum(exps, axis=-1, keepdims=True)


def _class_indices(y, num_examples):
    """Return ``y`` as a flat integer index vector with ``num_examples`` entries.

    Accepts a flat vector, a (num_examples x 1) column, or one-hot rows (more
    than one column), which are converted with ``argmax(axis=1)``.
    """
    labels = np.asarray(y)
    if labels.ndim == 2 and labels.shape[1] > 1:
        labels = labels.argmax(axis=1)
    labels = labels.reshape(-1).astype(int)
    if labels.shape[0] != num_examples:
        raise ValueError(
            "expected %d labels, got %d" % (num_examples, labels.shape[0])
        )
    return labels


def cross_entropy(X,y):
    """Mean cross-entropy loss of the softmax of ``X`` against class labels ``y``.

    X is the output from fully connected layer (num_examples x num_classes)
    y is labels (num_examples x 1)
    	Note that y is not one-hot encoded vector.
    	It can be computed as y.argmax(axis=1) from one-hot encoded vectors of labels if required.

    Raises:
        ValueError: if the number of labels differs from the number of rows of X.
    """
    p = np.atleast_2d(softmax(X))
    m = p.shape[0]
    # A (m, 1) column used directly as an index would broadcast against
    # range(m) into an (m, m) selection, so the labels are flattened first.
    labels = _class_indices(y, m)
    # Advanced indexing picks the probability assigned to the correct class of
    # each sample; the clip keeps log() finite when that probability underflows.
    picked = np.clip(p[np.arange(m), labels], 1e-12, None)
    log_likelihood = -np.log(picked)
    loss = np.sum(log_likelihood) / m
    return loss


def delta_cross_entropy(X,y):
    """Gradient of ``cross_entropy(X, y)`` with respect to ``X``.

    X is the output from fully connected layer (num_examples x num_classes)
    y is labels (num_examples x 1)
    	Note that y is not one-hot encoded vector.
    	It can be computed as y.argmax(axis=1) from one-hot encoded vectors of labels if required.

    Returns:
        Array shaped like a 2-D ``X``: (softmax(X) - one_hot(y)) / num_examples.
    """
    grad = np.atleast_2d(softmax(X))
    m = grad.shape[0]
    labels = _class_indices(y, m)
    grad[np.arange(m), labels] -= 1
    grad = grad/m
    return grad

In [150]:
# Load the raw training/test arrays and the sample-submission CSV from the input directory.
train_x = np.load('../input/deepleaninghw1/mnist.train.npy')
# NOTE(review): the CSV is read with all of its columns and is later used as
# test labels — confirm which column actually holds the label.
test_y = pd.read_csv('../input/deepleaninghw1/sample_submission.csv')
train_y = np.load('../input/deepleaninghw1/mnist.trainlabel.npy')
test_x = np.load('../input/deepleaninghw1/mnist.test.npy')

# Define two functions for preprocessing
# One-hot encoding is used for the labels
# one-hot encode the labels
#Convert array to one-hot encoding
def one_hot_encoding(Y, n_classes=None):
    """Convert integer class labels to a one-hot matrix.

    Args:
        Y: integer labels, either flat (n,) or a column (n, 1).
        n_classes: number of columns of the result. When omitted it is inferred
            as ``max(Y) + 1``. Pass it explicitly when encoding several label
            sets, so they all get the same width even if one of them does not
            contain the highest class.

    Returns:
        Float array of shape (len(Y), n_classes) with a 1. at [i, Y[i]].

    Raises:
        ValueError: if ``Y`` is empty and ``n_classes`` is not given.
    """
    labels = np.asarray(Y)
    if n_classes is None:
        n_classes = int(np.amax(labels)) + 1
    n_rows = len(labels)
    binarized = np.zeros((n_rows, n_classes))
    if n_rows:
        # Row indices are shaped to broadcast against the labels, so flat and
        # column label arrays both set the entries [i, Y[i]] in one assignment.
        row_index = np.arange(n_rows).reshape((n_rows,) + (1,) * (labels.ndim - 1))
        binarized[row_index, labels] = 1.
    return binarized

# Change the shape of the array.
# (AxB * BxC = AxC) 
# e.g We will shape the 56000 * 28 * 28 3-D array to 56000 * 784 2-D array
def matrix_mul(matrix, remain_dimen, pixels):
    """Reshape ``matrix`` to 2-D, keeping axis ``remain_dimen`` as the row count.

    The result has shape (matrix.shape[remain_dimen], pixels). Despite the
    name, no multiplication takes place.
    """
    n_rows = matrix.shape[remain_dimen]
    return matrix.reshape((n_rows, pixels))

# Prepare X for train and test
# Scale pixel values by 1/255 and flatten every sample to a row of `pixels` values.
# NOTE: train_x / test_x are rebound in place, so running these lines twice
# without reloading the data divides by 255 a second time.
pixels = 784
train_x = train_x / 255
test_x = test_x / 255   
train_x = train_x.reshape(train_x.shape[0], pixels)
test_x = test_x.reshape(test_x.shape[0], pixels)        
     
# Prepare Y for train and test: column vectors first, then one-hot matrices
train_y = train_y.reshape(-1, 1)
# NOTE(review): to_numpy() keeps every column of the CSV, so reshape(-1, 1)
# stacks all columns into one label vector — confirm the file only holds labels.
test_y = test_y.to_numpy()
test_y = test_y.reshape(-1, 1)
train_y = one_hot_encoding(train_y)
# NOTE(review): the one-hot width is inferred separately for each call, so
# train_y and test_y may end up with different numbers of columns.
test_y = one_hot_encoding(test_y) 

In [151]:
# unlike the Medium article, I am not encapsulating this process in a separate class
# I think it is nice just like this
# The network ends with raw class scores (logits). cross_entropy and
# delta_cross_entropy apply softmax themselves, so no SoftmaxLayer is stacked
# on top: adding one would apply softmax twice.
network = [
    FlattenLayer(input_shape=(28, 28)),  # turns each sample into a (1, 784) row
    FCLayer(28 * 28, 128),
    ActivationLayer(tanh, tanh_prime),
    FCLayer(128, 10),
]

epochs = 50
learning_rate = 0.1


# training: one parameter update per sample
for epoch in range(epochs):
    error = 0
    for x, y_true in zip(train_x, train_y):
        # The loss helpers take (scores, class_indices) in that order, so the
        # one-hot row is converted to its class index.
        label = np.array([np.argmax(y_true)])

        # forward
        output = x
        for layer in network:
            output = layer.forward(output)

        # error (display purpose only)
        error += cross_entropy(output, label)

        # backward
        output_error = delta_cross_entropy(output, label)
        for layer in reversed(network):
            output_error = layer.backward(output_error, learning_rate)

    # mean loss per training sample for this epoch
    error /= len(train_x)
    print('%d/%d, error=%f' % (epoch + 1, epochs, error))

In [None]:
import numpy as np

from network import Network
from fc_layer import FCLayer
from activation_layer import ActivationLayer
from activations import tanh, tanh_prime
from losses import mse, mse_prime

from keras.datasets import mnist
from keras.utils import np_utils

# load MNIST from server
# Stand-alone demo: builds a tanh network with the imported Network / FCLayer /
# ActivationLayer classes, trains it with MSE on the first 1000 MNIST samples
# and prints predictions for the first 3 test samples.
# NOTE(review): the imports at the top of this cell rebind FCLayer,
# ActivationLayer, tanh and tanh_prime, replacing the definitions made in
# earlier cells — confirm that is intended.

# load MNIST from server
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# training data : 60000 samples
# reshape each image to a (1, 784) row, convert to float32 and scale by 1/255
x_train = x_train.reshape(x_train.shape[0], 1, 28*28)
x_train = x_train.astype('float32')
x_train /= 255
# encode output which is a number in range [0,9] into a vector of size 10
# e.g. number 3 will become [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
# NOTE(review): newer Keras releases may no longer ship keras.utils.np_utils —
# confirm against the installed version.
y_train = np_utils.to_categorical(y_train)

# same for test data : 10000 samples
x_test = x_test.reshape(x_test.shape[0], 1, 28*28)
x_test = x_test.astype('float32')
x_test /= 255
y_test = np_utils.to_categorical(y_test)

# Network: 784 -> 100 -> 50 -> 10, tanh after every fully connected layer
net = Network()
net.add(FCLayer(28*28, 100))                # input_shape=(1, 28*28)    ;   output_shape=(1, 100)
net.add(ActivationLayer(tanh, tanh_prime))
net.add(FCLayer(100, 50))                   # input_shape=(1, 100)      ;   output_shape=(1, 50)
net.add(ActivationLayer(tanh, tanh_prime))
net.add(FCLayer(50, 10))                    # input_shape=(1, 50)       ;   output_shape=(1, 10)
net.add(ActivationLayer(tanh, tanh_prime))

# train on 1000 samples
# as mini-batch GD is not implemented, training would be very slow if we updated at each iteration on all 60000 samples
net.use(mse, mse_prime)
net.fit(x_train[0:1000], y_train[0:1000], epochs=35, learning_rate=0.1)

# test on 3 samples
out = net.predict(x_test[0:3])
print("\n")
print("predicted values : ")
print(out, end="\n")
print("true values : ")
print(y_test[0:3])

In [None]:
# NOTE(review): assignment skeleton only. The `...` placeholders in the
# signatures and the missing method bodies mean this cell is not valid Python
# as written, and the name NN would shadow any NN class defined earlier.
class NN:
    
    # The following member functions are mandatory
    
    def __init__(self, ...):
    
    # forward propagation
    def forward_propagation(self, ...):

    # loss function
    def loss(self, ...):
       
    # backward propagation
    def backward_propagation(self, ...):
 
    
    # The following member functions are optional, but can be helpful
    
    # define softmax function
    def softmax(self, ...):

    # step 6. update parameters
    def update_parameters(self, ...):

In [None]:
class NN(Layer):
    """Input -> hidden -> output network with tanh units, using column-vector inputs.

    ``weight1`` has shape (layer2, layer1) and ``weight2`` has shape
    (out_size, layer2), so signals are computed as ``weights @ inputs`` and a
    sample is a column of ``layer1`` values.

    Args:
        in_size, out_size, N: forwarded to ``Layer.__init__``; ``out_size`` is
            also the number of output units and ``N`` sets the mean (1 / N) of
            the initial weights.
        activation_func: stored on the instance; the methods below use tanh.
        layer1: number of input units (columns of ``weight1``).
        layer2: number of hidden units.
        lr: learning rate used by ``backward_propagation``.
    """

    def __init__(self, in_size, out_size, activation_func,layer1, layer2, N, lr=0.1):

        # Layer.__init__ takes (in_size, out_size, N); its third argument is the
        # number of training instances, not the activation function.
        super(NN,self).__init__(in_size, out_size, N)

        self.activation_func = activation_func
        self.layer1=layer1
        self.layer2=layer2
        self.N=N
        self.lr = lr
        self.out_unit = out_size
        self.weight = 1 / N # mean of the initial weight distribution
        self.weight1 = np.random.normal(self.weight, pow(self.layer1, -0.5), (self.layer2, self.layer1))
        self.weight2 = np.random.normal(self.weight, pow(self.layer2, -0.5), (self.out_unit, self.layer2))

        # Activations of the most recent forward pass.
        self.hidden_outputs = None
        self.final_outputs = None

    def tanh(self, x):
        """Element-wise hyperbolic tangent."""
        return np.tanh(x)

    @staticmethod
    def _as_columns(values):
        """Return ``values`` as a 2-D array whose columns are samples (a 1-D vector becomes one column)."""
        values = np.asarray(values, dtype=float)
        return values.reshape(-1, 1) if values.ndim == 1 else values

    # forward propagation: compute the network output for the given inputs
    def forward_propagation(self, inputs, target=None):
        """Run a forward pass, cache the activations and return the final outputs.

        ``target`` is accepted for interface compatibility and is not used.
        """
        # calculate signals into hidden layer
        hidden_inputs = np.dot(self.weight1, inputs)
        # calculate the signals emerging from hidden layer
        self.hidden_outputs = self.tanh(hidden_inputs)

        # calculate signals into final output layer
        final_inputs = np.dot(self.weight2, self.hidden_outputs)
        # calculate the signals emerging from final output layer
        self.final_outputs = self.tanh(final_inputs)

        return self.final_outputs

    def mini_batch(self, matrix, batch_size):
        """Return ``batch_size`` rows of ``matrix`` sampled at random with replacement."""
        size = matrix.shape[0]
        mask = np.random.choice(size, batch_size)
        matrix_batch = matrix[mask]
        return matrix_batch

    # loss function
    def loss(self,y,t):
        """Mean cross-entropy of predictions ``y`` against targets ``t`` on a random batch of 32 rows."""
        batch_size = 32
        # One shared mask keeps every sampled prediction paired with its own
        # target; sampling y and t independently would compare unrelated rows.
        mask = np.random.choice(y.shape[0], batch_size)
        y_batch = y[mask]
        t_batch = t[mask]
        delta = 1e-7
        # tanh outputs can be negative; clamp at zero so the log stays defined.
        whole_loss = -np.sum(t_batch * np.log(np.maximum(y_batch, 0.0) + delta))
        result = whole_loss / batch_size
        return result

    # backward propagation: derivative of the error with respect to its output
    def backward_propagation(self, inputs, target):
        """Run a forward pass on ``inputs`` and apply one gradient step towards ``target``."""
        inputs = self._as_columns(inputs)
        targets = self._as_columns(target)

        final_outputs = self.forward_propagation(inputs)
        hidden_outputs = self.hidden_outputs

        # output layer error is the (target - actual)
        output_errors = targets - final_outputs
        # d tanh(z)/dz = 1 - tanh(z)**2, expressed through the stored activations
        output_delta = output_errors * (1.0 - final_outputs ** 2)
        # hidden layer error: output deltas propagated back through weight2
        # (computed before weight2 is modified)
        hidden_errors = np.dot(self.weight2.T, output_delta)
        hidden_delta = hidden_errors * (1.0 - hidden_outputs ** 2)

        # update the weights for the links between the hidden and output layers
        self.weight2 += self.lr * np.dot(output_delta, np.transpose(hidden_outputs))

        # update the weights for the links between the input and hidden layers
        self.weight1 += self.lr * np.dot(hidden_delta, np.transpose(inputs))

    def query(self, inputs):
        """Return the network output for ``inputs`` without touching the cached activations."""
        # calculate signals into hidden layer
        hidden_inputs = np.dot(self.weight1, inputs)
        # calculate the signals emerging from hidden layer
        hidden_outputs = self.tanh(hidden_inputs)

        # calculate signals into final output layer
        final_inputs = np.dot(self.weight2, hidden_outputs)
        # calculate the signals emerging from final output layer
        final_outputs = self.tanh(final_inputs)

        return final_outputs

    # Optional helpers from the assignment template (softmax, update_parameters)
    # are not implemented in this class.


In [None]:
# Define two functions for preprocessing
# One-hot encoding is used for the labels
def one_hot_encoding(a, classes):
    """Return one one-hot row of length ``classes`` per label in ``a``.

    ``a`` is flattened first, so a column of labels yields a
    (number of labels, classes) float matrix.
    """
    flat_labels = a.reshape(-1)
    identity = np.eye(classes)
    # Row k of the identity matrix is the one-hot vector for class k.
    return identity[flat_labels]

# Change the shape of the array.
# (AxB * BxC = AxC) 
# e.g We will shape the 56000 * 28 * 28 3-D array to 56000 * 784 2-D array
def matrix_mul(matrix, remain_dimen, pixels):
    """Reshape ``matrix`` to 2-D, keeping axis ``remain_dimen`` as the row count.

    The result has shape (matrix.shape[remain_dimen], pixels). Despite the
    name, no multiplication takes place.
    """
    n_rows = matrix.shape[remain_dimen]
    return matrix.reshape((n_rows, pixels))

# Prepare X for train and test
# NOTE(review): this cell does not load train_x / test_x / train_y / test_y
# itself; it relies on an earlier cell having defined them. If that earlier cell
# already normalised and converted them, re-running these lines divides by 255
# again, and test_y.to_numpy() fails once test_y is a NumPy array — confirm
# the intended execution order.
pixels = 784
train_x = train_x / 255
test_x = test_x / 255   
train_x = train_x.reshape(train_x.shape[0], pixels)
test_x = test_x.reshape(test_x.shape[0], pixels)        
     
# Prepare Y for train and test: column vectors first, then 10-class one-hot matrices
train_y = train_y.reshape(-1, 1)
test_y = test_y.to_numpy()
test_y = test_y.reshape(-1, 1)
train_y = one_hot_encoding(train_y, classes = 10)
test_y = one_hot_encoding(test_y, classes = 10)       


# Parameters
# NOTE(review): hidden1 equals the number of input pixels (784) — presumably the
# input width rather than a hidden-layer size; confirm against the model code.
hidden1 = 784
hidden2 = 256
output_unit = 10
learning_rate = 0.1
N = train_x.shape[0]  # number of training samples


In [None]:
"""
# Driver function

x_train = np.load('../input/deepleaninghw1/mnist.train.npy')
submission = pd.read_csv('../input/deepleaninghw1/sample_submission.csv')
y_train = np.load('../input/deepleaninghw1/mnist.trainlabel.npy')
x_test = np.load('../input/deepleaninghw1/mnist.test.npy')


# one-hot encode the labels
def one_hot_encoding(a, classes):
    targets = a.reshape(-1)
    a = np.eye(classes)[targets]
    return a

# prepare training (80%) and validation data (20%)
x, x_validation, y, y_validation = train_test_split(x_train,y_train,test_size = 0.2)


# Create your NN with two hidden layers
n_1 = 512
n_2 = 256
n_3 = 10



learnning_rate_pool = [0.001,0.002,0.0015]
learnning_rate = np.random.choice(learnning_rate_pool)


# Train your Model using x, x_validation, y, y_validation
for e in range(100):    
    for i in range(0,x.shape[0],32):
        data,target = x[i:i+32],y[i:i+32]   # this is one mini-batch
        data = data.reshape(-1,784)         
        
        ## pass the data to your model and perform forward and backward passes
        ## must show the training process, i.e., running loss, accuracy, etc.
        ## must using the above learning rate to train. 
        

# get predictions for test data x_test


#submit my predictions
submission = pd.read_csv('../input/deepleaninghw1/sample_submission.csv',index_col = 0)
submission['class'] = pred_test
submission.to_csv('Neural_Network_Submission.csv')


submission.head()
"""