<h1><center>Homemade implementation of Extension 3 - Neural Net with Nesterov momentum</center></h1>

### Imports

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import numpy.random as r
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### Get and scale the data

In [2]:
# Load the scikit-learn digits dataset and standardise its feature matrix.
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split below, so test-set statistics influence the scaling — consider fitting
# on the training portion only.
digits = load_digits()
y_all = digits.target
X_all = StandardScaler().fit_transform(digits.data)

### Make train/test split

In [3]:
# 80% of the samples go to training; random_state is fixed so the split is
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y_all,
    train_size=0.8,
    random_state=0,
)

### One-hot encoding

In [4]:
def convert_y_to_vect(y, n_classes=10):
    """One-hot encode a sequence of integer class labels.

    Parameters
    ----------
    y : sequence of int
        Class labels; each label is used as a column index.
    n_classes : int, optional
        Number of columns in the encoding. Defaults to 10 (the digits task).

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(y), n_classes)`` with a single 1 per row,
        placed in the column given by that row's label.
    """
    labels = np.asarray(y)
    y_vect = np.zeros((len(labels), n_classes))
    # An empty input converts to a float array, which cannot be used as an
    # index; there is nothing to set in that case anyway.
    if len(labels):
        y_vect[np.arange(len(labels)), labels] = 1
    return y_vect

In [5]:
# One-hot encode the digit labels of both splits.
y_v_train, y_v_test = convert_y_to_vect(y_train), convert_y_to_vect(y_test)

### Get the spam data

In [6]:
# Read the tab-separated SMS file (no header row) from the working directory
# and map the text labels to integers: spam -> 1, ham -> 0.
df = pd.read_table('SMSSpamCollection', sep='\t', header=None, names=['label', 'sms_message'])
df['label'] = df.label.map({'spam': 1, 'ham': 0})
df_train_msgs, df_test_msgs, df_ytrain, df_ytest = train_test_split(
    df['sms_message'], df['label'], random_state=0
)

# Binary bag-of-words features; the vocabulary is fitted on the training
# messages only and then applied to both splits.
# CountVectorizer is imported in the notebook's import cell.
vectorizer = CountVectorizer(binary=True, stop_words='english')
vectorizer.fit(df_train_msgs)
X_train_msgs = vectorizer.transform(df_train_msgs).toarray()
X_test_msgs = vectorizer.transform(df_test_msgs).toarray()
y_train_msgs = df_ytrain.to_numpy()
y_test_msgs = df_ytest.to_numpy()

In [7]:
def convert_y_to_vect_spam(y):
    """One-hot encode labels into two columns.

    Returns a float array of shape ``(len(y), 2)`` in which row ``k`` has a 1
    in the column indexed by ``y[k]`` and 0 elsewhere.
    """
    n_samples = len(y)
    encoded = np.zeros((n_samples, 2))
    for row in range(n_samples):
        encoded[row][y[row]] = 1
    return encoded

In [8]:
# One-hot encode the spam/ham labels of both splits.
y_v_train_msgs, y_v_test_msgs = (
    convert_y_to_vect_spam(y_train_msgs),
    convert_y_to_vect_spam(y_test_msgs),
)

### Neural network helper functions

In [9]:
def tanh(z):
    """Activation function: element-wise hyperbolic tangent of ``z``."""
    activated = np.tanh(z)
    return activated

def tanh_deriv(z):
    """Derivative of the tanh activation at ``z``: ``1 - tanh(z)**2``."""
    activation = tanh(z)
    return 1 - activation * activation

def setup_and_init_weights(nn_structure):
    """Create randomly initialised parameters and zeroed velocity buffers.

    Parameters
    ----------
    nn_structure : sequence of int
        Number of units in each layer, input layer first.

    Returns
    -------
    tuple of dict
        ``(W, b, prev_vel_w, vel_w, prev_vel_b, vel_b)``, each keyed by layer
        index ``l = 1 .. len(nn_structure) - 1``. ``W[l]`` has shape
        ``(nn_structure[l], nn_structure[l-1])`` and ``b[l]`` has shape
        ``(nn_structure[l],)``; the velocity dicts hold zero arrays of the
        matching shapes, each one a distinct array.
    """
    W = {}
    b = {}
    prev_vel_w = {}
    vel_w = {}
    prev_vel_b = {}
    vel_b = {}
    for l in range(1, len(nn_structure)):
        fan_out, fan_in = nn_structure[l], nn_structure[l-1]
        # NOTE(review): the factor 4 on the Xavier/Glorot bound is usually
        # quoted for sigmoid units, while this network uses tanh — confirm
        # the scale is intended.
        xavier_range = 4 * np.sqrt(6 / (fan_out + fan_in))
        W[l] = r.uniform(-xavier_range, xavier_range, (fan_out, fan_in))
        b[l] = r.random_sample((fan_out,))  # uniform in [0, 1)
        # Every buffer gets its own array so an in-place update of one can
        # never leak into another.
        prev_vel_w[l] = np.zeros((fan_out, fan_in))
        vel_w[l] = np.zeros((fan_out, fan_in))
        prev_vel_b[l] = np.zeros((fan_out,))
        vel_b[l] = np.zeros((fan_out,))
    return W, b, prev_vel_w, vel_w, prev_vel_b, vel_b

def init_tri_values(nn_structure):
    """Return zeroed gradient accumulators ``(tri_W, tri_b)`` for every layer.

    Both dicts are keyed by layer index ``l = 1 .. len(nn_structure) - 1``;
    ``tri_W[l]`` has shape ``(nn_structure[l], nn_structure[l-1])`` and
    ``tri_b[l]`` has shape ``(nn_structure[l],)``.
    """
    layer_ids = range(1, len(nn_structure))
    tri_W = {
        layer: np.zeros((nn_structure[layer], nn_structure[layer - 1]))
        for layer in layer_ids
    }
    tri_b = {layer: np.zeros((nn_structure[layer],)) for layer in layer_ids}
    return tri_W, tri_b

def feed_forward(x, W, b):
    """Run one input through the network and record every layer's values.

    Returns ``(a, z)``: ``a`` maps layer index to its activation, starting
    with ``a[1] = x``; ``z`` maps layer index to its pre-activation and
    therefore starts at key 2.
    """
    activations = {1: x}
    pre_activations = {}
    n_weight_layers = len(W)
    for layer in range(1, n_weight_layers + 1):
        nxt = layer + 1
        # z^(l+1) = W^(l) a^(l) + b^(l)
        pre_activations[nxt] = W[layer].dot(activations[layer]) + b[layer]
        # a^(l+1) = f(z^(l+1))
        activations[nxt] = tanh(pre_activations[nxt])
    return activations, pre_activations

def calculate_out_layer_delta(y, a_out, z_out):
    """Output-layer delta: ``-(y - a_out)`` scaled by the tanh derivative at ``z_out``."""
    residual = y - a_out
    slope = tanh_deriv(z_out)
    return -residual * slope

def calculate_hidden_delta(delta_plus_1, w_l, z_l):
    """Hidden-layer delta.

    Multiplies the next layer's delta by the transpose of ``w_l``, then scales
    element-wise by the tanh derivative at ``z_l``.
    """
    back_projected = np.transpose(w_l).dot(delta_plus_1)
    return back_projected * tanh_deriv(z_l)

def predict_y(W, b, X, n_layers):
    """Predict a class index for every row of ``X``.

    Each row is fed through the network and the index of the largest
    activation in layer ``n_layers`` is taken as the prediction. The result is
    a float array of shape ``(X.shape[0],)``.
    """
    n_samples = X.shape[0]
    winners = [
        np.argmax(feed_forward(X[row, :], W, b)[0][n_layers])
        for row in range(n_samples)
    ]
    return np.array(winners, dtype=float)

### Backpropagation

In [10]:
# Layer sizes: the input width and the number of output classes are read from
# the data (64 features, 10 one-hot columns); 30 hidden units in between.
nn_structure = [X_train.shape[1], 30, y_v_train.shape[1]]

In [11]:
def train_nn(nn_structure, X, y, iter_num=3000, alpha=0.25, lamb=0, momentum=False, mu=0.9):
    """Train the network with full-batch gradient descent.

    Parameters
    ----------
    nn_structure : sequence of int
        Number of units per layer, input layer first.
    X : array
        Training inputs; row ``n`` (``X[n, :]``) is one sample.
    y : array
        Training targets; row ``n`` (``y[n, :]``) is compared with the output
        layer's activation.
    iter_num : int
        Number of passes over the whole training set.
    alpha : float
        Learning rate.
    lamb : float
        Weight-decay coefficient; ``lamb * W[l]`` is added to the weight
        gradient (biases are not decayed).
    momentum : bool
        If True, apply the Nesterov-style velocity update; otherwise plain
        gradient descent.
    mu : float
        Momentum coefficient used when ``momentum`` is True.

    Returns
    -------
    (W, b) : tuple of dict
        Trained weights and biases keyed by layer index.

    Raises
    ------
    ValueError
        If ``y`` is empty (the averaged gradient would divide by zero).
    """
    N = len(y)
    if N == 0:
        raise ValueError("train_nn requires at least one training sample")

    W, b, prev_vel_w, vel_w, prev_vel_b, vel_b = setup_and_init_weights(nn_structure)
    n_layers = len(nn_structure)

    for _ in range(iter_num):
        # Gradient accumulators are reset at the start of every pass.
        tri_W, tri_b = init_tri_values(nn_structure)
        for n in range(N):
            delta = {}
            a, z = feed_forward(X[n, :], W, b)
            # Back-propagate from the output layer down to layer 1.
            for l in range(n_layers, 0, -1):
                if l == n_layers:
                    delta[l] = calculate_out_layer_delta(y[n, :], a[l], z[l])
                else:
                    if l > 1:
                        delta[l] = calculate_hidden_delta(delta[l+1], W[l], z[l])
                    # Gradient of W[l] for this sample: delta^(l+1) (a^(l))^T
                    tri_W[l] += np.outer(delta[l+1], a[l])
                    tri_b[l] += delta[l+1]

        for l in range(n_layers - 1, 0, -1):
            grad_w = 1.0/N * tri_W[l] + lamb * W[l]
            grad_b = 1.0/N * tri_b[l]
            if momentum:
                # Keep the old velocity, take a momentum step on the velocity,
                # then move the parameters by -mu * v_prev + (1 + mu) * v.
                prev_vel_w[l] = vel_w[l]
                vel_w[l] = mu * vel_w[l] - alpha * grad_w
                W[l] += -mu * prev_vel_w[l] + (1 + mu) * vel_w[l]

                prev_vel_b[l] = vel_b[l]
                vel_b[l] = mu * vel_b[l] - alpha * grad_b
                b[l] += -mu * prev_vel_b[l] + (1 + mu) * vel_b[l]
            else:
                W[l] += -alpha * grad_w
                b[l] += -alpha * grad_b
    return W, b

# Evaluate my homemade implementation's accuracy

### i) Check the accuracy of my neural network with Nesterov momentum on the digits data

In [12]:
# Seed NumPy's global RNG so the random weight initialisation, and therefore
# the reported accuracy, is the same on every run.
r.seed(0)
mome_w, mome_b = train_nn(nn_structure, X_train, y_v_train, iter_num=3000, alpha=0.1, lamb=0.001, momentum=True)
# The output layer's index equals the number of layers in the structure.
mome_y_pred = predict_y(mome_w, mome_b, X_test, len(nn_structure))
print(f"The accuracy of my implementation of NN with Nesterov momentum on the digits data is {100 * accuracy_score(y_test, mome_y_pred)}%")

The accuracy of my implementation of NN with Nesterov momentum on the digits data is 97.5%


### ii) Check the accuracy of my neural network with Nesterov momentum on the spam data

In [13]:
nn_structure_spam = [
    X_train_msgs.shape[1],  # input layer: one unit per feature column
    30,                     # hidden layer
    2,                      # output layer: one unit per one-hot column
]

In [14]:
# Seed NumPy's global RNG so the random weight initialisation, and therefore
# the reported accuracy, is the same on every run.
r.seed(0)
mome_w_spam, mome_b_spam = train_nn(nn_structure_spam, X_train_msgs, y_v_train_msgs, iter_num=100, alpha=0.1, lamb=0.001, momentum=True)
# The output layer's index equals the number of layers in the structure.
mome_y_pred_spam = predict_y(mome_w_spam, mome_b_spam, X_test_msgs, len(nn_structure_spam))
print(f"The accuracy of my implementation of NN with Nesterov momentum is {100 * accuracy_score(y_test_msgs, mome_y_pred_spam)}%")

The accuracy of my implementation of NN with Nesterov momentum is 98.27709978463747%
