In [76]:
from sklearn.datasets import load_digits
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pickle

In [2]:
# Load the scikit-learn digits bunch, then pull out the feature matrix and the labels.
digits = load_digits()
X, y = digits['data'], digits['target']

In [3]:
# Hold out 25% of the samples for testing. The fixed seed makes the split, and
# therefore every accuracy figure computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [4]:
def get_random_weight(incoming_conn, outgoing_conn, epsilon_init=0.12):
    """Create a randomly initialised weight matrix for one network layer.

    Parameters
    ----------
    incoming_conn : int
        Number of units feeding into the layer (bias excluded).
    outgoing_conn : int
        Number of units in the layer being fed.
    epsilon_init : float, optional
        Half-width of the sampling interval; weights are drawn uniformly
        from ``[-epsilon_init, epsilon_init)``. Defaults to 0.12.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(outgoing_conn, incoming_conn + 1)``; the extra
        column holds the weights applied to the bias unit.
    """
    # np.random.rand samples from [0, 1); scale and shift to [-eps, eps).
    weight = np.random.rand(outgoing_conn, incoming_conn + 1) * 2 * epsilon_init - epsilon_init
    return weight

In [5]:
def sigmoid(z):
    """Apply the logistic function ``1 / (1 + e**-z)`` to ``z``.

    Works on scalars and, element-wise, on NumPy arrays.
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator

In [6]:
def sigmoid_gradient(z):
    """Return the derivative of the logistic function evaluated at ``z``.

    Uses the identity ``s'(z) = s(z) * (1 - s(z))`` where ``s`` is ``sigmoid``.
    """
    activation = sigmoid(z)
    return activation * (1 - activation)

In [7]:
def get_predictions(X, theta1, theta2):
    """Forward-propagate ``X`` through the two-layer network.

    A column of ones (the bias unit) is prepended to the inputs and to the
    hidden activations before the corresponding weight matrix is applied.

    Returns the sigmoid activations of the output layer, one row per row of ``X``.
    """
    n_samples = X.shape[0]
    bias = np.ones((n_samples, 1))

    inputs = np.hstack((bias, X))
    hidden = sigmoid(inputs @ theta1.T)
    hidden_with_bias = np.hstack((bias, hidden))
    return sigmoid(hidden_with_bias @ theta2.T)

In [13]:
def vectorize_targets(y, num_labels=10):
    """One-hot encode integer class labels.

    Parameters
    ----------
    y : array-like of int, shape (m,)
        Class labels; each value is used as a column index.
    num_labels : int, optional
        Number of classes, i.e. the width of the encoded matrix. Defaults to 10.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(m, num_labels)`` where row ``i`` is all zeros
        except for a 1 in column ``y[i]``.
    """
    labels = np.asarray(y)
    m = labels.shape[0]
    Y = np.zeros((m, num_labels))
    if m == 0:
        # An empty input has no integer dtype to index with; return the empty matrix.
        return Y
    # Set one cell per row in a single vectorised assignment.
    Y[np.arange(m), labels] = 1
    return Y

In [17]:
def cost_function(X, Y, theta1, theta2, num_labels, lambda_=0.3):
    """Regularised cross-entropy cost of the two-layer network.

    Parameters
    ----------
    X : array, shape (m, n_features)
        Input samples.
    Y : array, shape (m, n_outputs)
        One-hot encoded targets, matching the shape of the network output.
    theta1, theta2 : arrays
        Weight matrices; column 0 of each holds the bias weights.
    num_labels : int
        Unused; kept so existing positional callers keep working.
    lambda_ : float, optional
        L2 regularisation strength. Defaults to 0.3.

    Returns
    -------
    float
        Mean cross-entropy over the ``m`` samples plus the L2 penalty on
        the non-bias weights.
    """
    m = X.shape[0]
    H = get_predictions(X, theta1, theta2)

    # A saturated sigmoid returns exactly 0.0 or 1.0, which makes log() produce
    # -inf and 0 * -inf = NaN. Clipping keeps the cost finite.
    eps = np.finfo(float).eps
    H = np.clip(H, eps, 1 - eps)

    cost = np.sum((-Y * np.log(H)) - ((1 - Y) * np.log(1 - H))) / m

    # Bias weights (column 0) are excluded from the penalty.
    penalty = (lambda_ / (2 * m)) * (np.sum(theta1[:, 1:]**2) + np.sum(theta2[:, 1:]**2))
    return cost + penalty

In [18]:
def get_gradients(X, Y, theta1, theta2, lambda_):
    """Back-propagate through the two-layer network.

    Runs a forward pass on ``X``, propagates the output error ``H - Y``
    backwards, and returns the regularised gradients of the cost with
    respect to ``theta1`` and ``theta2`` (bias columns are not regularised).

    Returns a ``(theta1_grad, theta2_grad)`` tuple with the same shapes as
    ``theta1`` and ``theta2``.
    """
    n_samples = X.shape[0]
    bias = np.ones((n_samples, 1))

    # Forward pass, keeping the intermediates needed by the backward pass.
    inputs = np.hstack((bias, X))
    hidden_pre = inputs @ theta1.T
    hidden = np.hstack((bias, sigmoid(hidden_pre)))
    outputs = sigmoid(hidden @ theta2.T)

    # Backward pass: output error first, then the hidden-layer error.
    output_error = outputs - Y
    hidden_error = (output_error @ theta2) * sigmoid_gradient(np.hstack((bias, hidden_pre)))
    # Drop the column that belongs to the bias unit.
    hidden_error = hidden_error[:, 1:]

    accumulated1 = hidden_error.T @ inputs
    accumulated2 = output_error.T @ hidden

    def regularised(accumulated, theta):
        # Replace the bias column with zeros so it receives no penalty.
        shrink = np.hstack((np.zeros((theta.shape[0], 1)), theta[:, 1:]))
        return (accumulated / n_samples) + (lambda_ / n_samples) * shrink

    return regularised(accumulated1, theta1), regularised(accumulated2, theta2)

In [68]:
# Train the network on the digits training split with batch gradient descent.
hidden_layer_size = 120
num_labels = 10
lambda_ = 1      # regularisation strength, shared by the cost and the gradients
alpha = 0.03     # learning rate
n_iterations = 500

theta1 = get_random_weight(X.shape[1], hidden_layer_size)
theta2 = get_random_weight(hidden_layer_size, num_labels)
Y = vectorize_targets(y_train)
cost_history = []

for i in range(n_iterations):
    # The fifth positional argument of cost_function is num_labels, so lambda_
    # must be passed by keyword. A bare `1` in that slot left the cost at its
    # default lambda_=0.3 while the gradients were regularised with 1.
    cost = cost_function(X_train, Y, theta1, theta2, num_labels, lambda_=lambda_)
    cost_history.append(cost)
    theta1_grad, theta2_grad = get_gradients(X_train, Y, theta1, theta2, lambda_)
    theta1 = theta1 - alpha * theta1_grad
    theta2 = theta2 - alpha * theta2_grad

In [93]:
# The predicted class is the index of the most activated output unit in each row.
test_activations = get_predictions(X_test, theta1, theta2)
predictions_test = np.argmax(test_activations, axis=1)

In [94]:
# Share of test samples whose predicted class matches the label, as a percentage.
test_accuracy = np.count_nonzero(predictions_test == y_test) * 100 / len(y_test)
print("Percentage of right predictions in test set: {}".format(test_accuracy))

Percentage of right predictions in test test: 94.22222222222223


In [95]:
# Same prediction rule, applied to the training rows.
train_activations = get_predictions(X_train, theta1, theta2)
predictions_train = np.argmax(train_activations, axis=1)

In [96]:
# Count the matching predictions, then report them as a percentage of the training rows.
train_hits = sum(predictions_train == y_train)
print("Percentage of right predictions in train set: {}".format((train_hits * 100) / len(y_train)))

Percentage of right predictions in train set: 95.02598366740905


In [97]:
# Load the pickled data set. NOTE: unpickling can execute arbitrary code, so
# this file must come from a trusted source.
with open('./train_data.pickle', 'rb') as pickle_file:
    train_data = pickle.load(pickle_file)

In [98]:
# Rows 1797..1846 of the pickled data are used as the "unseen" evaluation set.
# One shared slice keeps features and targets aligned. Previously the features
# stopped at 1847 and the targets at -1, which only lines up when the data set
# has exactly 1848 rows.
# NOTE(review): 1797 is presumably the size of the scikit-learn digits set,
# with the extra rows appended after it. Confirm against how the pickle was built.
unseen_rows = slice(1797, 1847)
test = train_data['features'][unseen_rows]
test_target = train_data['targets'][unseen_rows]

In [99]:
# Classify the held-back rows with the weights trained on the digits split.
unseen_activations = get_predictions(test, theta1, theta2)
predictions_unseen_test = np.argmax(unseen_activations, axis=1)

In [100]:
# Count the matching predictions, then report them as a percentage of the unseen rows.
unseen_hits = sum(predictions_unseen_test == test_target)
print("Percentage of right predictions in unseen data set: {}".format((unseen_hits * 100) / len(test_target)))

Percentage of right predictions in unseen data set: 6.0


In [146]:
# Split the pickled data set 75/25. The fixed seed keeps the split reproducible.
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(
    train_data['features'], train_data['targets'], test_size=0.25, random_state=42
)

In [178]:
# Retrain from scratch on the whole pickled data set with a smaller hidden layer.
# NOTE(review): `test` / `test_target` are sliced from this same `train_data`,
# so accuracy later reported on them is measured on rows the model was trained on.
first_hidden_layer_size = 80
num_labels = 10
lambda_ = 1      # regularisation strength, shared by the cost and the gradients
alpha = 0.03     # learning rate

theta1 = get_random_weight(train_data['features'].shape[1], first_hidden_layer_size)
theta2 = get_random_weight(first_hidden_layer_size, num_labels)
Y = vectorize_targets(train_data['targets'])
cost_history = []

for i in range(500):
    # The fifth positional argument of cost_function is num_labels, so lambda_
    # must be passed by keyword. A bare `1` in that slot left the cost at its
    # default lambda_=0.3 while the gradients were regularised with 1.
    cost = cost_function(train_data['features'], Y, theta1, theta2, num_labels, lambda_=lambda_)
    cost_history.append(cost)
    theta1_grad, theta2_grad = get_gradients(train_data['features'], Y, theta1, theta2, lambda_)
    theta1 = theta1 - alpha * theta1_grad
    theta2 = theta2 - alpha * theta2_grad

In [179]:
# Re-evaluate the retrained weights on the three data sets used above.
predictions_unseen_test = np.argmax(get_predictions(test, theta1, theta2), axis=1)
predictions_test = np.argmax(get_predictions(X_test, theta1, theta2), axis=1)
predictions_train = np.argmax(get_predictions(X_train, theta1, theta2), axis=1)

# One report line per data set. The second label previously read "test test".
for set_name, predicted, actual in (
    ("unseen data set", predictions_unseen_test, test_target),
    ("test set", predictions_test, y_test),
    ("train set", predictions_train, y_train),
):
    print("Percentage of right predictions in {}: {}".format(set_name, (sum(predicted == actual) * 100) / len(actual)))

Percentage of right predictions in unseen data set: 68.0
Percentage of right predictions in test test: 98.0
Percentage of right predictions in train set: 96.21380846325167


### Run a cross validation to find optimal parameters

In [180]:
def get_optional_thetas(X, y, alpha, hidden_layer_node_count):
    """Train the two-layer network on ``(X, y)`` and return its weights.

    Runs 500 iterations of batch gradient descent with regularisation
    strength 1, starting from random weights.

    Parameters
    ----------
    X : array, shape (m, n_features)
        Training samples.
    y : array-like of int, shape (m,)
        Class labels in ``0..9``.
    alpha : float
        Learning rate.
    hidden_layer_node_count : int
        Number of units in the hidden layer.

    Returns
    -------
    (theta1, theta2) : tuple of numpy.ndarray
        Trained weight matrices for the hidden and output layers.
    """
    num_labels = 10
    lambda_ = 1
    n_iterations = 500

    # The input-layer width is the number of columns (features), not rows.
    feature_count = X.shape[1]
    theta1 = get_random_weight(feature_count, hidden_layer_node_count)
    theta2 = get_random_weight(hidden_layer_node_count, num_labels)
    Y = vectorize_targets(y, num_labels)

    for _ in range(n_iterations):
        # Use the X passed in, not a notebook-level data set, so the gradients
        # always match Y row for row.
        theta1_grad, theta2_grad = get_gradients(X, Y, theta1, theta2, lambda_)
        theta1 = theta1 - alpha * theta1_grad
        theta2 = theta2 - alpha * theta2_grad

    return theta1, theta2

In [195]:
def get_right_prediction_percentage(X, y, theta1, theta2):
    """Return the percentage of rows in ``X`` whose predicted class equals ``y``.

    The predicted class of a row is the index of its largest output activation.
    """
    activations = get_predictions(X, theta1, theta2)
    predicted = np.argmax(activations, axis=1)
    hits = sum(predicted == y)
    return hits * 100 / len(y)

In [210]:
def get_optimum_parameters(X, y, parameters: dict):
    """Grid-search the learning rate and hidden-layer size.

    Trains one network per combination of ``parameters['alpha_values']`` and
    ``parameters['hidden_layer_node_counts']`` and scores each with
    ``get_right_prediction_percentage`` on ``(X, y)``.

    NOTE(review): the score is computed on the same ``(X, y)`` that is handed
    to the trainer, so it is not a held-out estimate.

    Returns
    -------
    dict
        The entry with the highest score, with keys ``'alpha'``,
        ``'hidden_layer_node_count'`` and ``'percentage'``. On ties the first
        combination tried wins.

    Raises
    ------
    ValueError
        If the grid is empty, so no combination was evaluated.
    """
    results = []

    for alpha in parameters['alpha_values']:
        for hidden_layer_node_count in parameters['hidden_layer_node_counts']:
            theta1, theta2 = get_optional_thetas(X, y, alpha, hidden_layer_node_count)
            results.append({
                'alpha': alpha,
                'hidden_layer_node_count': hidden_layer_node_count,
                'percentage': get_right_prediction_percentage(X, y, theta1, theta2)
            })

    # Pick from `results`; the previous code indexed an undefined name `params`.
    return max(results, key=lambda result: result['percentage'])

In [211]:
# Search grid: candidate learning rates and hidden-layer sizes (60, 65, ..., 95).
parameters = dict(
    alpha_values=[0.03, 0.01, 0.1, 0.3, 1],
    hidden_layer_node_counts=range(60, 100, 5),
)

In [212]:
# Rebind X and y to the pickled data set for the parameter search below.
X = train_data['features']
y = train_data['targets']

In [213]:
# Run the grid search; this trains one network per grid combination.
optimum_parameters = get_optimum_parameters(X, y, parameters=parameters)

  


In [214]:
# Display the value returned by get_optimum_parameters.
optimum_parameters

{'alpha': 0.3, 'hidden_layer_node_count': 95, 'percentage': 99.51298701298701}