In [1]:
import numpy as np
import scipy.special
import random
import matplotlib.pyplot
%matplotlib inline

# neural network class definition
# 3 layer neural network
class neuralNetwork:
    """Three-layer (input, hidden, output) feed-forward network with sigmoid units.

    The link weights live in two matrices:
      * ``wih`` -- shape (hidden_nodes, input_nodes), input -> hidden links
      * ``who`` -- shape (output_nodes, hidden_nodes), hidden -> output links

    ``train`` performs one forward pass and one weight update per call;
    ``query`` performs the forward pass only.
    """

    # initialize the neural network
    def __init__(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        """Store the layer sizes and learning rate and draw the initial weights.

        Args:
            input_nodes: number of nodes in the input layer.
            hidden_nodes: number of nodes in the hidden layer.
            output_nodes: number of nodes in the output layer.
            learning_rate: factor applied to every weight update in ``train``.
        """
        # set the number of nodes in each input, hidden and output layer
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        # get the same weights each time for testing purposes
        # NOTE: this re-seeds NumPy's *global* random state on every construction
        np.random.seed(42)

        # weights between input layer and the hidden layer,
        # drawn from a normal distribution with std-dev 1/sqrt(input_nodes)
        self.wih = np.random.normal(0.0, pow(self.input_nodes, -0.5), (self.hidden_nodes, self.input_nodes))

        # weights between the hidden layer and the output layer,
        # drawn from a normal distribution with std-dev 1/sqrt(hidden_nodes)
        self.who = np.random.normal(0.0, pow(self.hidden_nodes, -0.5), (self.output_nodes, self.hidden_nodes))

        # set the learning rate
        self.learning_rate = learning_rate

        # our activation function is the sigmoid function
        self.activation_function = scipy.special.expit

    def _forward(self, inputs):
        """Propagate a column of inputs; return (hidden_outputs, final_outputs)."""
        # signals into, then emerging from, the hidden layer
        hidden_outputs = self.activation_function(np.dot(self.wih, inputs))
        # signals into, then emerging from, the output layer
        final_outputs = self.activation_function(np.dot(self.who, hidden_outputs))
        return hidden_outputs, final_outputs

    # train the neural network
    def train(self, inputs_list, targets_list):
        """Run one training step on a single record, updating ``who`` and ``wih`` in place.

        Args:
            inputs_list: sequence of ``input_nodes`` input values.
            targets_list: sequence of ``output_nodes`` target values.
        """
        # convert the lists to 2d column arrays
        inputs = np.array(inputs_list, ndmin=2).T
        targets = np.array(targets_list, ndmin=2).T

        hidden_outputs, final_outputs = self._forward(inputs)

        # error is the (target - actual program output)
        node_errors = targets - final_outputs

        # hidden layer error is the output errors, split by the weights, recombined at the hidden nodes
        # (computed before who is modified below, so it uses the pre-update weights)
        hidden_errors = np.dot(self.who.T, node_errors)

        # update the weights for the links between the hidden and output layers
        self.who += self.learning_rate * np.dot((node_errors * final_outputs * (1.0 - final_outputs)), np.transpose(hidden_outputs))
        # update the weights for the links between the input and hidden layers
        self.wih += self.learning_rate * np.dot((hidden_errors * hidden_outputs * (1.0 - hidden_outputs)), np.transpose(inputs))

    # query the neural network
    def query(self, inputs_list):
        """Return the output-layer activations for one record as a 2d column array.

        Args:
            inputs_list: sequence of ``input_nodes`` input values.
        """
        # convert input list to 2d column array
        inputs = np.array(inputs_list, ndmin=2).T
        _, final_outputs = self._forward(inputs)
        return final_outputs

    def get_wih(self):
        """Return the input -> hidden weight matrix (the live array, not a copy)."""
        return self.wih

    def get_who(self):
        """Return the hidden -> output weight matrix (the live array, not a copy)."""
        return self.who

    # Save the weights to a file
    def save_weights(self, wih_filename, who_filename):
        """Write ``wih`` and ``who`` to disk with ``np.save``.

        ``np.save`` appends ``.npy`` to a filename that does not already end with it.
        """
        np.save(wih_filename, self.wih)
        np.save(who_filename, self.who)

    @staticmethod
    def _read_weights(filename, expected_shape, label):
        """Load one weight matrix and check that it matches ``expected_shape``."""
        import os  # local import: only needed for the filename fallback below

        path = filename
        if isinstance(filename, (str, os.PathLike)):
            path = os.fspath(filename)
            # np.save() adds ".npy" when the name lacks it; mirror that here so
            # save_weights(name, ...) followed by load_weights(name, ...) round-trips.
            if isinstance(path, str) and not path.endswith(".npy") and not os.path.exists(path):
                path = path + ".npy"

        # force a float dtype so the in-place "+=" updates in train() cannot fail on casting
        weights = np.asarray(np.load(path), dtype=float)
        if weights.shape != expected_shape:
            raise ValueError(
                f"{label} loaded from {filename!r} has shape {weights.shape}, expected {expected_shape}"
            )
        return weights

    # Load weights from a file
    def load_weights(self, wih_filename, who_filename):
        """Replace ``wih`` and ``who`` with matrices previously written by ``save_weights``.

        Both files are read and validated before either attribute is assigned, so a
        failure leaves the current weights untouched.

        Raises:
            ValueError: if a loaded matrix does not match this network's layer sizes.
        """
        wih = self._read_weights(wih_filename, (self.hidden_nodes, self.input_nodes), "wih")
        who = self._read_weights(who_filename, (self.output_nodes, self.hidden_nodes), "who")
        self.wih = wih
        self.who = who

    def __repr__(self):
        """Summarise the layer sizes and learning rate."""
        return f"number of input nodes: {self.input_nodes}, number of hidden nodes: {self.hidden_nodes}, number of output nodes: {self.output_nodes}. learning rate: {self.learning_rate}."


In [2]:
# network topology: node counts for the input, hidden and output layers
input_nodes, hidden_nodes, output_nodes = 784, 200, 10

# factor applied to every weight update
learning_rate = 0.01

# build the network from the settings above
n = neuralNetwork(
    input_nodes=input_nodes,
    hidden_nodes=hidden_nodes,
    output_nodes=output_nodes,
    learning_rate=learning_rate,
)

In [3]:
# first number in each record is the label (the actual label of the image)
# the subsequent 784 numbers (28x28 pixel image) are the 0-255 color values
# load the mnist training data CSV file into a list
# (the with-block closes the file even if reading raises)
with open("mnist_train.csv", 'r') as training_data_file:
    training_data_list = training_data_file.readlines()  # list of strings, one per record

In [None]:
# the training data set has 60,000 records and the test data set has 10,000 records
# train the neural network: every epoch is one full pass over the training records
epochs = 5

for e in range(epochs):
    # go through all records in the training data set
    for record in training_data_list:
        # split the record by the ',' commas
        all_values = record.split(',')
        # scale and shift the 0-255 pixel values into the range 0.01 to 1.00
        # (np.asfarray was removed in NumPy 2.0; np.asarray(..., dtype=float) is the replacement)
        inputs = (np.asarray(all_values[1:], dtype=float) / 255.0 * 0.99) + 0.01
        # create the target output values (all 0.01, except the desired label which is 0.99)
        targets = np.zeros(output_nodes) + 0.01
        # all_values[0] is the target label for this record
        targets[int(all_values[0])] = 0.99
        n.train(inputs, targets)

In [None]:
# load the mnist test data CSV file into a list
# (the with-block closes the file even if reading raises)
with open("mnist_test.csv", 'r') as test_data_file:
    test_data_list = test_data_file.readlines()  # list of strings, one per record

In [None]:
# test the neural network
# scorecard for how well the network performs, initially empty
scorecard = []

# go through all the records in the test data set
for record in test_data_list:
    # split the record by the ',' commas
    all_values = record.split(',')
    # the correct answer is the first value
    correct_label = int(all_values[0])
    # scale and shift the inputs
    # (np.asfarray was removed in NumPy 2.0; np.asarray(..., dtype=float) is the replacement)
    inputs = (np.asarray(all_values[1:], dtype=float) / 255.0 * .99) + 0.01
    # query the network
    outputs = n.query(inputs)
    # index with the highest value corresponds to the label
    label = np.argmax(outputs)
    # record 1 for a correct answer, 0 for an incorrect one
    scorecard.append(1 if label == correct_label else 0)

In [None]:
# calculate the performance score, the fraction of correct answers
# (scorecard holds 1 for each correct answer and 0 otherwise, so its sum is the number correct)
scorecard_array = np.asarray(scorecard)
print("performance =", scorecard_array.sum() / len(scorecard))

In [None]:
# show the trained input -> hidden weight matrix
print(n.get_wih())

In [None]:
# show the trained hidden -> output weight matrix
print(n.get_who())

In [None]:
# persist both weight matrices so a new network instance can reuse them without retraining
n.save_weights('wih.npy', 'who.npy')

In [None]:
# network topology: node counts for the input, hidden and output layers
input_nodes, hidden_nodes, output_nodes = 784, 200, 10

# factor applied to every weight update
learning_rate = 0.01

# build a second network with the same settings, then swap in the saved weights
new_nn = neuralNetwork(
    input_nodes=input_nodes,
    hidden_nodes=hidden_nodes,
    output_nodes=output_nodes,
    learning_rate=learning_rate,
)
new_nn.load_weights('wih.npy', 'who.npy')


In [None]:
# test the neural network with the new_nn instance
# scorecard for how well the network performs, initially empty
scorecard = []

# go through all the records in the test data set
for record in test_data_list:
    # split the record by the ',' commas
    all_values = record.split(',')
    # correct answer is first value
    correct_label = int(all_values[0])
    # scale and shift the inputs
    # (np.asfarray was removed in NumPy 2.0; np.asarray(..., dtype=float) is the replacement)
    inputs = (np.asarray(all_values[1:], dtype=float) / 255.0 * 0.99) + 0.01
    # query the network using new_nn
    outputs = new_nn.query(inputs)
    # the index of the highest value corresponds to the label
    label = np.argmax(outputs)
    # add 1 to the scorecard when the network's answer matches the correct answer, else 0
    scorecard.append(1 if label == correct_label else 0)

# calculate the performance score, the fraction of correct answers
scorecard_array = np.asarray(scorecard)
print ("performance = ", scorecard_array.sum() / scorecard_array.size)


In [None]:
# read the two saved weight matrices straight from disk (independent of any network instance)
wih_loaded = np.load('wih.npy')
who_loaded = np.load('who.npy')

#print the weight matrices
print("Weights Input-Hidden (wih):\n", wih_loaded)
print("\nWeights Hidden-Output (who):\n", who_loaded)

In [None]:
#plot the image
# NOTE: all_values is not defined in this cell; it is the record left behind by the most
# recently executed loop over the data, so run one of those loops first
# (np.asfarray was removed in NumPy 2.0; np.asarray(..., dtype=float) is the replacement)
image_array = np.asarray(all_values[1:], dtype=float).reshape((28,28))
matplotlib.pyplot.imshow(image_array, cmap='Greys', interpolation='None')

In [None]:
# example calculations using the diagram below

# 3-layer neural net
# 3 input nodes, 3 hidden nodes, 3 output nodes


<div>
    <img src="https://i.imgur.com/Fvj7t8A.png" width="400"/>
</div>

In [None]:
'''
we encode the weights into matrices.
the weights between the input/hidden layer in the above graph are encoded in the following way

             [w_1,1, w_2,1, w_3,1]
weights_ih = [w_1,2, w_2,2, w_3,2]
             [w_1,3, w_2,3, w_3,3]      
eg

the weights between the input/hidden layer in the above graph are:

[w_1,1 = 0.9,  w_2,1 = 0.3, w_3,1 = 0.4]
[w_1,2 = 0.2,  w_2,2 = 0.8, w_3,2 = 0.2]
[w_1,3 = 0.1,  w_2,3 = 0.5, w_3,3 = 0.6]

which is represented in the code below, along with the inputs/weights_ho
(note: not all of the weights are shown in the diagram)
'''
# inputs is a 3x1 column vector; each weight matrix is 3x3
inputs = np.array([[0.9], [0.1], [0.8]])
weights_ih = np.array([[0.9, 0.3, 0.4], [0.2, 0.8, 0.2], [0.1, 0.5, 0.6]])
weights_ho = np.array([[0.3, 0.7, 0.5], [0.6, 0.5, 0.2], [0.8, 0.1, 0.9]])

In [None]:
# hidden layer input calculation: matrix product of the input->hidden weights and the input column
hidden_inputs = np.dot(weights_ih,inputs)

<div>
    <img src="https://i.imgur.com/6E4M5kK.png" width="400"/>
</div>


In [None]:
# display the hidden-layer input signals computed above
hidden_inputs

<div>
    <img src="https://i.imgur.com/SaFSRNZ.png" width="400"/>
</div>

In [None]:
# calculating hidden_output

In [None]:
def activation_function(x):
    """Sigmoid activation: apply scipy.special.expit to x."""
    return scipy.special.expit(x)

In [None]:
# signals emerging from the hidden layer: the sigmoid applied to each hidden-layer input
hidden_outputs = activation_function(hidden_inputs)

In [None]:
# display the hidden-layer outputs computed above
hidden_outputs

<div>
    <img src="https://i.imgur.com/UvfNuFy.png" width="400"/>
</div>

In [None]:
# calculating the output layer inputs

In [None]:
# signals into the output layer: matrix product of the hidden->output weights and the hidden outputs
output_layer_inputs = np.dot(weights_ho,hidden_outputs )

In [None]:
# display the output-layer input signals computed above
output_layer_inputs

In [None]:
# calculating the output layer's output: the sigmoid applied to each output-layer input
output_layer_outputs = activation_function(output_layer_inputs)
# bare last expression so the notebook displays the result
output_layer_outputs

<div>
    <img src="https://i.imgur.com/h9ifRY5.png" width="400"/>
</div>

In [None]:
# using the multivariate chain rule, we obtain the gradient descent algorithms
# we dropped the constant 2 term

In [None]:
# learning rate for the worked example below; this rebinds the global that was 0.01
# for the MNIST network, and update_who / update_wih read this global when called
learning_rate = 0.1

In [None]:
def update_who(who, node_errors, final_outputs, hidden_outputs):
    """Apply one gradient-descent step to the hidden -> output weights.

    The step is learning_rate * (node_errors * final_outputs * (1 - final_outputs)) . hidden_outputs^T,
    where learning_rate is read from the notebook's global namespace.
    who is modified in place and the same array is returned.
    """
    scaled_errors = node_errors * final_outputs
    gradient_term = scaled_errors * (1.0 - final_outputs)
    weight_step = learning_rate * np.dot(gradient_term, np.transpose(hidden_outputs))
    who += weight_step
    return who

In [None]:
def update_wih(hidden_errors, hidden_outputs, inputs, wih):
    """Apply one gradient-descent step to the input -> hidden weights.

    The step is learning_rate * (hidden_errors * hidden_outputs * (1 - hidden_outputs)) . inputs^T,
    where learning_rate is read from the notebook's global namespace.
    wih is modified in place and the same array is returned.
    """
    scaled_errors = hidden_errors * hidden_outputs
    gradient_term = scaled_errors * (1.0 - hidden_outputs)
    weight_step = learning_rate * np.dot(gradient_term, np.transpose(inputs))
    wih += weight_step
    return wih

In [None]:
# let's consider a simpler neural network to see how gradient descent is used to update the weight who_11
# first we will need to see how to calculate the error terms, which are node_errors and hidden_errors
# we will only need the node_errors term for our example, but we also show how to obtain hidden_errors for updating wih
# the node_errors and hidden_errors terms are used to insert into the gradient descent algorithms for updating who and wih

<div>
    <img src="https://i.imgur.com/2iCQMhS.png" width="500"/>
</div>

In [None]:
# node_errors is calculated as (target - actual). this is the term that appears in
# the gradient descent algorithm for updating the weights who.
# note: this is distinct from calculating the loss; we are merely getting the term that appears in the gradient descent algorithm for who
# in this example the calculation has already been done for us
# (2x1 column vector: one error value per row)
node_errors = np.array([[0.8], [0.5]])

In [None]:
# to calculate the hidden_errors term that appears in the gradient descent algorithm for wih we do what is seen below
# first we have to obtain the who matrix which is:
# (each entry is a weight divided by a sum of two weights; the matrix evaluates to [[0.4, 0.6], [0.2, 0.8]])
who = np.array([[2.0/(2.0+3.0), 3.0/(3.0+2.0)], [1.0/(1.0+4.0), 4.0/(4.0+1.0)]])

<div>
    <img src="https://i.imgur.com/0lObJPw.png" width="400"/>
</div>


In [None]:
# next we transpose the who matrix and multiply it by the node errors
hidden_errors = np.dot(who.T, node_errors)

In [None]:
# these are the terms that appear in the gradient descent algorithm for calculating wih
# (with the who and node_errors defined above this evaluates to [[0.42], [0.88]])
hidden_errors

In [None]:
# in this example we now have the node_error and hidden_error to insert into the gradient descent algorithms
# note, the actual code drops the denominator/normalization terms that appear in the who matrix, whereas the handwritten example above includes it
# from henceforth we will also drop the normalization 
# now that we have node_errors, let's compute the updated weight who_1,1, which is currently equal to 2.0

In [None]:
# the relevant node error for who_1,1 is 0.8 (the first row of node_errors)
node_error = 0.8

In [None]:
# before we move further we will review some background material: the book's derivation of the gradient descent algorithm is included below:

<div>
    <img src="https://i.imgur.com/bDAuYGs.png" width="400"/>
</div>


In [None]:
# and putting it all together with the learning rate (where we drop the constant term 2), we have:

<div>
    <img src="https://i.imgur.com/37P0suW.png" width="400"/>
</div>


In [None]:
# the old weights are adjusted by the negative of the error's slope
# we want to decrease the weight if we have a positive slope
# and increase it if we have a negative slope
# also note that dE/dwjk itself carries a negative sign
# and there is a separate negative sign in front of the learning rate
# these two negative signs cancel each other out, so neither appears in
# the gradient descent update rules

In [None]:
# (target - actual) errors for the two output nodes of the simplified network
node_errors = np.array([[0.8], [0.5]])
learning_rate = 0.1

# un-normalized hidden->output weights and hidden-layer outputs for the simplified network.
# these must be assigned BEFORE the forward pass below: otherwise final_inputs is computed
# from the stale `who` / `hidden_outputs` left by earlier cells, whose shapes do not match.
who = np.array([[2.0, 3.0], [1.0, 4.0]])
hidden_outputs = np.array([[0.4], [0.5]])

# forward pass through the output layer: weighted sum, then sigmoid
final_inputs = np.dot(who, hidden_outputs)
final_outputs = activation_function(final_inputs).reshape(2,1)

In [None]:
# display the output-node errors used in the weight update
node_errors

In [None]:
# display the output-layer activations used in the weight update
final_outputs 

In [None]:
# display the final_outputs * (1 - final_outputs) factor that multiplies node_errors in the who update
final_outputs * (1-final_outputs)

In [None]:
# apply one gradient-descent step to the hidden->output weights
# NOTE: "+=" modifies who in place, so re-running this cell applies the update again
who += learning_rate * np.dot((node_errors * final_outputs * (1.0 - final_outputs)), np.transpose(hidden_outputs))

In [None]:
# our updated who_11 weight is below (row 0, column 0 of who)
who[0][0]

In [None]:
# the full who matrix after the update, including the rest of the updated weights
who