In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import MinMaxScaler


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



/kaggle/input/mnist-in-csv/mnist_test.csv
/kaggle/input/mnist-in-csv/mnist_train.csv


In [2]:
# verified!!
def initialize_random_weights(layer_sizes):
    """Create the initial weight matrices and bias vectors for a dense network.

    Parameters
    ----------
    layer_sizes : sequence of int
        Number of units per layer, input layer first.

    Returns
    -------
    dict
        For each layer ``k`` (1-based): ``'W<k>'`` with shape
        ``(layer_sizes[k], layer_sizes[k - 1])`` drawn from a standard normal
        and scaled by 0.01, and ``'b<k>'`` as a zero column of shape
        ``(layer_sizes[k], 1)``.
    """
    params = {}
    # Pair each layer with its successor: (units feeding in, units produced).
    for index, (n_in, n_out) in enumerate(zip(layer_sizes[:-1], layer_sizes[1:]), start=1):
        params[f'W{index}'] = .01 * np.random.randn(n_out, n_in)
        params[f'b{index}'] = np.zeros((n_out, 1))
    return params

In [3]:
# verified!!
def relu(z):
    """Rectified linear unit: entries of ``z`` below zero become 0, the rest pass through."""
    below_zero = np.less(z, 0)
    return np.where(below_zero, 0, z)

In [4]:
# verified!!
def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, evaluated element-wise without overflow.

    Parameters
    ----------
    z : scalar or array-like
        Pre-activation values.

    Returns
    -------
    Same shape as ``z``; every value lies in ``[0, 1]``.

    Notes
    -----
    Computing ``exp(-z)`` directly overflows (RuntimeWarning, intermediate
    ``inf``) for large negative ``z``. Here only ``exp(-|z|)`` is evaluated,
    which is always in ``(0, 1]``, and the algebraically equivalent form
    ``exp(z) / (1 + exp(z))`` is used on the negative side.
    """
    z = np.asarray(z)
    decay = np.exp(-np.abs(z))
    a = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with an empty tuple turns a 0-d result back into a scalar and
    # leaves arrays of any other rank untouched.
    return a[()]

In [5]:
# verified!!
def drelu(z):
    """Derivative of ReLU with respect to ``z``: 0 where ``z < 0``, otherwise 1 (including at 0)."""
    is_negative = np.less(z, 0)
    return np.where(is_negative, 0, 1)

In [6]:
# verified!!
def dsigmoid(z):
    """Derivative of the logistic function at ``z``: ``sigmoid(z) * (1 - sigmoid(z))``.

    The activation is evaluated once and reused rather than being computed
    twice for the same input.
    """
    s = sigmoid(z)
    return s * (1 - s)

In [7]:
# i think verified?
def forward_prop(W, a, b, activation):
    """Run one dense layer forward.

    Computes ``z = W . a + b`` and applies the layer's non-linearity.

    Parameters
    ----------
    W, b : weights and bias of this layer.
    a : activations coming from the previous layer.
    activation : ``'relu'`` selects ReLU; any other value selects the sigmoid.

    Returns
    -------
    (z, a) : the pre-activation and the activated output of this layer.
    """
    z = np.dot(W, a) + b
    nonlinearity = relu if activation == 'relu' else sigmoid
    return z, nonlinearity(z)

In [8]:
# generates predictions based on input and parameters
def predict(params, X, activations):
    """Forward-propagate ``X`` through every layer of the network.

    Parameters
    ----------
    params : dict holding ``'W<k>'`` / ``'b<k>'`` for each layer ``k`` (1-based).
    X : network input; it is fed to layer 1 as-is and stored as ``'a0'``.
    activations : one activation name per layer, in layer order.

    Returns
    -------
    predictions : output of the last layer.
    caches : dict with ``'a0'`` plus ``'z<k>'`` and ``'a<k>'`` for every layer,
        kept for use during back-propagation.
    """
    caches = {'a0': X}
    current = X
    for layer, activation in enumerate(activations, start=1):
        tag = str(layer)
        z, current = forward_prop(params['W' + tag], current, params['b' + tag], activation)
        caches['z' + tag] = z
        caches['a' + tag] = current
    return current, caches

In [9]:
# Read the training CSV as a float array, skipping the header row.
# Column 0 is kept as the raw targets, every remaining column as the features
# (presumably digit label + flattened pixels for this MNIST file — confirm).
data = np.loadtxt('/kaggle/input/mnist-in-csv/mnist_train.csv', delimiter=',', skiprows=1)
Y_train_raw, X_train = data[:, 0], data[:, 1:]

In [10]:
# verified!!
def convert_Y(Y_raw, num_classes=10):
    """One-hot encode a vector of class labels.

    Parameters
    ----------
    Y_raw : array-like of shape (n_samples,)
        Class labels (ints, or floats holding whole numbers).
    num_classes : int, default 10
        Number of columns in the encoding; labels are matched against
        ``0 .. num_classes - 1``.

    Returns
    -------
    numpy.ndarray of shape (n_samples, num_classes), dtype float
        Row ``i`` has a 1 in column ``Y_raw[i]`` and 0 elsewhere. A label
        outside ``0 .. num_classes - 1`` yields an all-zero row.
    """
    labels = np.asarray(Y_raw).reshape(-1)
    # Broadcasting a column of labels against the row of class ids gives the
    # whole encoding in one comparison instead of one pass per class.
    return (labels[:, None] == np.arange(num_classes)).astype(float)

In [11]:
# input: da[l], z[l], a[l - 1]
def back_proplite(da, z, a, activation, m):
    """Back-propagate through a single layer, given the gradient at its output.

    Parameters
    ----------
    da : gradient with respect to this layer's activation.
    z : this layer's pre-activation.
    a : activation of the previous layer (the input to this layer).
    activation : ``'relu'`` uses the ReLU derivative; anything else uses the
        sigmoid derivative.
    m : divisor applied to the weight and bias gradients.

    Returns
    -------
    (dz, dW, db) : gradient w.r.t. ``z``, and the ``1/m``-scaled gradients
        w.r.t. this layer's weights and bias (bias summed over axis 1).
    """
    slope = drelu(z) if activation == 'relu' else dsigmoid(z)
    dz = slope * da
    scale = 1 / m
    dW = scale * np.dot(dz, a.T)
    db = scale * np.sum(dz, axis=1, keepdims=True)
    return dz, dW, db

In [12]:
# input: w[l + 1], dz[l + 1], z[l], a[l - 1]
# returns: da[l], dz[l], dw[l], db[l]
def back_prop(W, dz, z, a, activation, m):
    """Back-propagate from the layer above into this layer.

    ``W`` and ``dz`` belong to the layer above; they are combined as
    ``W.T . dz`` to obtain the gradient at this layer's activation, which is
    then pushed through this layer by ``back_proplite`` together with this
    layer's ``z``, the previous layer's activation ``a``, the activation name
    and ``m``.

    Returns
    -------
    (dz, dW, db) for this layer, as produced by ``back_proplite``.
    """
    upstream = np.dot(W.T, dz)
    layer_dz, layer_dW, layer_db = back_proplite(upstream, z, a, activation, m)
    return layer_dz, layer_dW, layer_db

In [13]:
# input: predictions, caches (z and a), params (w and b), Y, activations
# output: grads
def compute_grads(predictions, caches, params, Y, activations):
    """Back-propagate through the whole network and collect parameter gradients.

    Parameters
    ----------
    predictions : output of the last layer, same shape as ``Y``.
    caches : dict with ``'a0'`` and ``'z<k>'`` / ``'a<k>'`` for every layer,
        as returned by the forward pass.
    params : dict with ``'W<k>'`` / ``'b<k>'`` for every layer.
    Y : targets; ``Y.shape[1]`` is used as the number of examples ``m``.
    activations : one activation name per layer, in layer order.

    Returns
    -------
    dict with ``'dW<k>'`` and ``'db<k>'`` for every layer ``k``.

    Notes
    -----
    With a sigmoid output unit and the cross-entropy loss, the factor
    ``a * (1 - a)`` from the sigmoid derivative cancels against the loss
    derivative, so the gradient at the output pre-activation is exactly
    ``predictions - Y``. Multiplying ``predictions - Y`` by the sigmoid
    derivative again would instead give the squared-error gradient, which
    does not match a cross-entropy loss and shrinks towards zero whenever an
    output saturates.
    """
    m = Y.shape[1]
    grads = {}
    num_layers = len(activations)
    a_prev = caches['a' + str(num_layers - 1)]
    error = predictions - Y

    if activations[num_layers - 1] == 'relu':
        # No cancellation applies for a ReLU output: treat the error as the
        # gradient at the activation and apply the ReLU derivative.
        dz, dW, db = back_proplite(error, caches['z' + str(num_layers)], a_prev, 'relu', m)
    else:
        # Sigmoid output + cross-entropy: dz is the raw error (see Notes).
        dz = error
        dW = 1/m * np.dot(dz, a_prev.T)
        db = 1/m * np.sum(dz, axis = 1, keepdims = True)
    grads['dW' + str(num_layers)] = dW
    grads['db' + str(num_layers)] = db

    for l in range(num_layers - 1, 0, -1):
        # Layer l needs W and dz of layer l + 1, its own z, and the
        # activation of layer l - 1 (e.g. l = 1 uses W2, dz2, z1, a0).
        dz, dW, db = back_prop(params['W' + str(l + 1)], dz, caches['z' + str(l)], caches['a' + str(l - 1)], activations[l - 1], m)
        grads['dW' + str(l)] = dW
        grads['db' + str(l)] = db
    return grads

In [14]:
# changes parameters based on gradient
def alter_parameters(params, grads, learning_rate):
    """Apply one gradient-descent step to every weight matrix and bias vector.

    Parameters
    ----------
    params : dict with ``'W<k>'`` / ``'b<k>'`` per layer; the layer count is
        taken as ``len(params) // 2``. The arrays are updated in place.
    grads : dict with the matching ``'dW<k>'`` / ``'db<k>'`` entries.
    learning_rate : step size multiplying each gradient.

    Returns
    -------
    The same ``params`` dict, after the update.
    """
    num_layers = len(params) // 2
    for l in range(1, num_layers + 1):
        params['W' + str(l)] -= learning_rate * grads['dW' + str(l)]
        params['b' + str(l)] -= learning_rate * grads['db' + str(l)]
    return params

In [15]:
# verified? I don't know how to do any better
def compute_loss(predictions, Y):
    """Mean binary cross-entropy between ``predictions`` and ``Y``.

    Parameters
    ----------
    predictions : array with the same shape as ``Y``, values expected in [0, 1].
    Y : 2-D array of targets.

    Returns
    -------
    Scalar: ``-(Y * log(p) + (1 - Y) * log(1 - p))`` averaged over every entry
    of ``Y`` (i.e. divided by ``Y.shape[0] * Y.shape[1]``).

    Notes
    -----
    The terms are combined element-wise. A matrix product of ``Y`` with
    ``log(predictions).T`` would produce a square matrix whose off-diagonal
    entries pair the targets of one row with the predictions of another, and
    summing it would count those unrelated pairs as well.
    """
    n = Y.shape[0]
    m = Y.shape[1]
    # Keep probabilities strictly inside (0, 1) so log() never returns -inf
    # (and 0 * -inf never turns the whole loss into NaN).
    eps = 1e-12
    p = np.clip(predictions, eps, 1 - eps)
    loss_array = -(Y * np.log(p) + (1 - Y) * np.log(1 - p))
    return np.sum(loss_array) / (m * n)

In [16]:
# verified!!
def minmax_normalize(X):
    """Rescale each column of ``X`` to the range [0, 1], modifying ``X`` in place.

    Every column has its minimum subtracted and is then divided by its
    remaining range. A column whose range is zero (all values equal) is only
    shifted, so it ends up all zeros.

    Returns
    -------
    The same array object ``X`` that was passed in.
    """
    n_columns = X.shape[1]
    for index in range(n_columns):
        column = X[:, index]  # for an ndarray this is a view, so edits write through to X
        column -= np.min(column)
        spread = np.ptp(column)
        if spread != 0:
            column /= spread
    return X

In [17]:
# verified!!
def minibatches(X, Y, size):
    """Split ``X`` and ``Y`` row-wise into consecutive mini-batches.

    Parameters
    ----------
    X : 2-D array, one example per row.
    Y : 2-D array with one row per row of ``X``.
    size : int
        Number of rows per batch. The last batch is shorter when the number
        of rows is not a multiple of ``size``.

    Returns
    -------
    X_batches : list of feature batches, each passed through
        ``minmax_normalize`` (so scaling statistics are per batch).
    Y_batches : list of the matching target batches.
    n_batches : int, equal to ``len(X_batches)``.

    Notes
    -----
    * No empty batch is produced when the row count is an exact multiple of
      ``size``; an empty batch would make every later ``1 / m`` computation
      divide by zero.
    * The trailing partial batch is normalized like every other batch, so all
      batches reach the network on the same scale.
    * NOTE(review): the batches are slices of ``X``; if ``minmax_normalize``
      works in place, the caller's ``X`` is modified as a side effect — confirm
      this is acceptable.
    """
    X_batches = []
    Y_batches = []
    n_samples = X.shape[0]
    # Stepping by `size` yields every full batch plus, when present, the
    # shorter remainder; nothing is yielded past the last row.
    for start in range(0, n_samples, size):
        end = start + size
        X_batches.append(minmax_normalize(X[start:end, :]))
        Y_batches.append(Y[start:end, :])
    return X_batches, Y_batches, len(X_batches)

In [18]:
# verified!!
def test_accuracy(predictions, Y):
    converted_predictions = np.argmax(predictions, axis = 0)
    converted_Y = np.argmax(Y, axis = 0)
    truth = np.where(converted_predictions == converted_Y, 1, 0)
    accuracy = np.squeeze(np.sum(truth)) / truth.shape[0]
    return accuracy

In [19]:
def train_model(X_train, Y_train, layer_sizes, activations, learning_rate, epochs, batch_size):
    """Train the network with mini-batch gradient descent and return its parameters.

    Workflow: initialise random weights, split the data into mini-batches,
    then for every batch of every epoch run the forward pass, report loss and
    accuracy, compute the gradients and update the parameters.

    Parameters
    ----------
    X_train : 2-D array of features, one example per row.
    Y_train : targets. A 1-D array is treated as raw class labels and one-hot
        encoded with ``convert_Y``; a 2-D array is used as-is (assumed to be
        already encoded, one row per example).
    layer_sizes : sequence of int, or None
        Units per layer including the input layer. ``None`` selects
        ``[784, 16, 16, 16, 10]``.
    activations : sequence of str, or None
        One activation name per non-input layer. ``None`` selects
        ``['relu', 'relu', 'relu', 'sigmoid']``.
    learning_rate : gradient-descent step size.
    epochs : number of passes over all batches.
    batch_size : rows per mini-batch.

    Returns
    -------
    dict of trained parameters (``'W<k>'`` / ``'b<k>'``).

    Raises
    ------
    ValueError
        If ``activations`` does not contain exactly one entry per non-input
        layer of ``layer_sizes``.
    """
    # Honour the caller's architecture; fall back to the notebook's defaults
    # only when nothing was supplied.
    if layer_sizes is None:
        layer_sizes = [784, 16, 16, 16, 10]
    if activations is None:
        activations = ['relu', 'relu', 'relu', 'sigmoid']
    if len(activations) != len(layer_sizes) - 1:
        raise ValueError('activations must have one entry per non-input layer')

    # Use the targets that were passed in rather than a notebook-level global.
    Y_train = np.asarray(Y_train)
    if Y_train.ndim == 1:
        Y_train = convert_Y(Y_train)

    params = initialize_random_weights(layer_sizes)
    X_batches, Y_batches, n_batches = minibatches(X_train, Y_train, batch_size)

    for _ in range(epochs):
        for j in range(n_batches):
            X_batch = X_batches[j]
            Y_batch = Y_batches[j]
            if X_batch.shape[0] == 0:
                # A batch without rows would make every 1/m term divide by zero.
                continue
            # The network works column-wise (one example per column), hence .T
            predictions, caches = predict(params, X_batch.T, activations)
            print('loss: ' + str(compute_loss(predictions, Y_batch.T)))
            print(test_accuracy(predictions, Y_batch.T))
            grads = compute_grads(predictions, caches, params, Y_batch.T, activations)
            params = alter_parameters(params, grads, learning_rate)

    return params
    