In [20]:
import numpy as np 
import pandas as pd 

In [21]:
data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

test_data = np.array(test_data)
X_test = test_data.T

data = np.array(data)
m, n = data.shape
dev_data = data[0:1000].T
Y_dev = dev_data[0]
X_dev = dev_data[1:n]
X_dev = X_dev / 255

train_data = data[1000:m].T
Y_train = train_data[0]
X_train = train_data[1:n]
X_train = X_train / 255

In [22]:
def init_params(layers_dims):
    L = len(layers_dims)
    parameters = {}
    for l in range(1, L):
        
        parameters['W' + str(l)] = np.random.randn(layers_dims[l], layers_dims[l-1]) * np.sqrt(2 / layers_dims[l-1])
        parameters['b' + str(l)] = np.random.randn(layers_dims[l], 1) * np.sqrt(2 / layers_dims[l-1])
        
    return parameters

In [23]:
def relu(Z):
    return np.maximum(Z, 0)

In [24]:
def softmax(Z):
    return np.exp(Z - np.max(Z, axis = 0)) / np.sum(np.exp(Z - np.max(Z, axis = 0)), axis = 0, keepdims = True)

In [25]:
def forward(X, parameters):
    
    L = len(parameters) // 2
    cache = {}
    activations = {}
    activations['A0'] = X
    for l in range(1, L):
        
        cache['Z' + str(l)] = np.matmul(parameters['W' + str(l)], activations['A' + str(l-1)]) + parameters['b' + str(l)]
        activations['A' + str(l)] = relu(cache['Z' + str(l)])
        
    cache['Z' + str(L)] = np.matmul(parameters['W' + str(L)], activations['A' + str(L-1)]) + parameters['b' + str(L)]
    activations['A' + str(L)] = softmax(cache['Z' + str(L)])
    
    return cache, activations

In [26]:
def one_hot(Y):
    one_hot_Y = np.zeros((Y.size, Y.max() + 1))
    one_hot_Y[np.arange(Y.size), Y] = 1 
    one_hot_Y = one_hot_Y.T
    return one_hot_Y

In [27]:
def compute_cost(activations, Y):
    m = Y.size
    Y = one_hot(Y)
    L = len(activations)
    cost = -np.sum(Y * np.log(activations['A' + str(L - 1)]))/m
    cost = np.squeeze(cost)
    return cost

In [28]:
def d_relu(Z):
    return Z > 0

In [29]:
def backprop(cache, activations, parameters, Y):
    
    L = len(activations)
    m = Y.size
    Y = one_hot(Y)
    grads = {}
    
    grads['dZ' + str(L-1)] = activations['A' + str(L-1)] - Y
    for l in reversed(range(2, L)):
        
        grads['dW' + str(l)] = 1/m * np.matmul(grads['dZ' + str(l)], activations['A' + str(l-1)].T)
        grads['db' + str(l)] = 1/m * np.sum(grads['dZ' + str(l)])
        grads['dZ' + str(l-1)] = np.matmul(parameters['W' + str(l)].T, grads['dZ' + str(l)]) * d_relu(cache['Z' + str(l-1)])
        grads['dW' + str(l-1)] = 1/m * np.matmul(grads['dZ' + str(l-1)], activations['A' + str(l-2)].T)
        grads['db' + str(l-1)] = 1/m * np.sum(grads['dZ' + str(l-1)])
    
    return grads

In [30]:
def update_params(grads, parameters, learning_rate):
    
    L = len(parameters) // 2
    for l in range(1, L + 1):
        
        parameters['W' + str(l)] = parameters['W' + str(l)] - learning_rate * grads['dW' + str(l)]
        parameters['b' + str(l)] = parameters['b' + str(l)] - learning_rate * grads['db' + str(l)]
            
    return parameters

In [31]:
def get_predictions(AL):
    return np.argmax(AL, 0)

def get_accuracy(predictions, Y):
    return np.sum(predictions == Y) / Y.shape[0]

In [32]:
def model(X, Y, X_dev, Y_dev, layers_dims, learning_rate, iterations):
    
    costs = []
    L = len(layers_dims) - 1
    on_train = []
    on_test = []
    
    parameters = init_params(layers_dims)
    for i in range(1, iterations+1):
        
        cache, activations = forward(X, parameters)
    
        cost = compute_cost(activations, Y)
        costs.append(cost)
    
        grads = backprop(cache, activations, parameters, Y)
    
        parameters = update_params(grads, parameters, learning_rate)
        
        _, As = forward(X_dev, parameters)
    
        if i % 50 == 0:
            print(f'Cost after iterations {i} is {cost}')
            print(f'Accuracy on train after iteration {i} is {get_accuracy(get_predictions(activations["A" + str(L)]), Y)}')
            print(f'Accuracy on test after iteration {i} is {get_accuracy(get_predictions(As["A" + str(L)]), Y_dev)}')
            
        on_train.append(get_accuracy(get_predictions(activations["A" + str(L)]), Y))
        on_test.append(get_accuracy(get_predictions(As["A" + str(L)]), Y_dev))
    return parameters, costs, on_train, on_test

In [33]:
parameters, costs, on_train, on_test = model(X_train, Y_train, X_dev, Y_dev, layers_dims = [784, 100, 100, 100, 200, 10], learning_rate = 0.075, iterations = 150)

Cost after iterations 50 is 0.4837990453888651
Accuracy on train after iteration 50 is 0.8561707317073171
Accuracy on test after iteration 50 is 0.862
Cost after iterations 100 is 0.32061746349005105
Accuracy on train after iteration 100 is 0.9045365853658537
Accuracy on test after iteration 100 is 0.895
Cost after iterations 150 is 0.26385661307421276
Accuracy on train after iteration 150 is 0.9221951219512196
Accuracy on test after iteration 150 is 0.911
