# Intro to Neural Networks with PyTorch

> Deep Learning NanoDegree

---

In [1]:
# === Imports === #
import numpy as np
import pandas as pd

---

## Perceptrons as logical operators

### AND Perceptron

What are the weights and bias for the AND perceptron?

Set the weights (weight1, weight2) and bias (bias) to values that will correctly determine the AND operation as shown above.
More than one set of values will work!

In [5]:
# TODO: Set weight1, weight2, and bias
weight1 = 1.0
weight2 = 1.0
bias = -2

# DON'T CHANGE ANYTHING BELOW
# Inputs and outputs
test_inputs = [(0, 0), (0, 1), (1, 0), (1, 1)]
correct_outputs = [False, False, False, True]
outputs = []

# Generate and check output
for test_input, correct_output in zip(test_inputs, correct_outputs):
    linear_combination = weight1 * test_input[0] + weight2 * test_input[1] + bias
    output = int(linear_combination >= 0)
    is_correct_string = 'Yes' if output == correct_output else 'No'
    outputs.append([test_input[0], test_input[1], linear_combination, output, is_correct_string])

# Print output
num_wrong = len([output[4] for output in outputs if output[4] == 'No'])
output_frame = pd.DataFrame(outputs, columns=['Input 1', '  Input 2', '  Linear Combination', '  Activation Output', '  Is Correct'])
if not num_wrong:
    print('Nice!  You got it all correct.\n')
else:
    print('You got {} wrong.  Keep trying!\n'.format(num_wrong))
print(output_frame.to_string(index=False))


Nice!  You got it all correct.

 Input 1    Input 2    Linear Combination    Activation Output   Is Correct
       0          0                  -2.0                    0          Yes
       0          1                  -1.0                    0          Yes
       1          0                  -1.0                    0          Yes
       1          1                   0.0                    1          Yes


### NOT Perceptron

Unlike the other perceptrons we looked at, the NOT operation only cares about one input. The operation returns a 0 if the input is 1 and a 1 if it's a 0. The other inputs to the perceptron are ignored.

In this quiz, you'll set the weights (weight1, weight2) and bias bias to the values that calculate the NOT operation on the second input and ignores the first input.


In [None]:
# TODO: Set weight1, weight2, and bias
weight1 = 0.0
weight2 = -1.0
bias = 0.0


# DON'T CHANGE ANYTHING BELOW
# Inputs and outputs
test_inputs = [(0, 0), (0, 1), (1, 0), (1, 1)]
correct_outputs = [True, False, True, False]
outputs = []

# Generate and check output
for test_input, correct_output in zip(test_inputs, correct_outputs):
    linear_combination = weight1 * test_input[0] + weight2 * test_input[1] + bias
    output = int(linear_combination >= 0)
    is_correct_string = 'Yes' if output == correct_output else 'No'
    outputs.append([test_input[0], test_input[1], linear_combination, output, is_correct_string])

# Print output
num_wrong = len([output[4] for output in outputs if output[4] == 'No'])
output_frame = pd.DataFrame(outputs, columns=['Input 1', '  Input 2', '  Linear Combination', '  Activation Output', '  Is Correct'])
if not num_wrong:
    print('Nice!  You got it all correct.\n')
else:
    print('You got {} wrong.  Keep trying!\n'.format(num_wrong))
print(output_frame.to_string(index=False))

---

## Gradient Descent

In [None]:
# === Implementing Gradient Descent - data_prep.py === #

import numpy as np
import pandas as pd

admissions = pd.read_csv('binary.csv')

# Make dummy variables for rank
data = pd.concat([admissions, pd.get_dummies(admissions['rank'], prefix='rank')], axis=1)
data = data.drop('rank', axis=1)

# Standarize features
for field in ['gre', 'gpa']:
    mean, std = data[field].mean(), data[field].std()
    data.loc[:,field] = (data[field]-mean)/std
    
# Split off random 10% of the data for testing
np.random.seed(42)
sample = np.random.choice(data.index, size=int(len(data)*0.9), replace=False)
data, test_data = data.ix[sample], data.drop(sample)

# Split into features and targets
features, targets = data.drop('admit', axis=1), data['admit']
features_test, targets_test = test_data.drop('admit', axis=1), test_data['admit']

In [None]:
# === Implementing Gradient Descent - gradient.py === #

import numpy as np
from data_prep import features, targets, features_test, targets_test


def sigmoid(x):
    """
    Calculate sigmoid
    """
    return 1 / (1 + np.exp(-x))

# TODO: We haven't provided the sigmoid_prime function like we did in
#       the previous lesson to encourage you to come up with a more
#       efficient solution. If you need a hint, check out the comments
#       in solution.py from the previous lecture.

# Use to same seed to make debugging easier
np.random.seed(42)

n_records, n_features = features.shape
last_loss = None

# Initialize weights
weights = np.random.normal(scale=1 / n_features**.5, size=n_features)

# Neural Network hyperparameters
epochs = 1000
learnrate = 0.5

for e in range(epochs):
    del_w = np.zeros(weights.shape)
    for x, y in zip(features.values, targets):
        # Loop through all records, x is the input, y is the target

        # Note: We haven't included the h variable from the previous
        #       lesson. You can add it if you want, or you can calculate
        #       the h together with the output

        # Calculate the output
        output = sigmoid(np.dot(x, weights))

        # Calculate the error
        # error = y * np.log(output) - (1 - y) * np.log(1 - output)
        error = y - output

        # Calculate the error term
        #   Notice we calulate f'(h) here instead of defining a separate
        #   sigmoid_prime function. This just makes it faster because we
        #   can re-use the result of the sigmoid function stored in
        #   the output variable
        error_term = (y - output) * output * (1 - output)

        # TODO: Calculate the change in weights for this sample
        #       and add it to the total weight change
        del_w += error_term * x

    # TODO: Update weights using the learning rate and the average change in weights
    weights += learnrate * del_w / n_records

    # Printing out the mean square error on the training set
    if e % (epochs / 10) == 0:
        out = sigmoid(np.dot(features, weights))
        loss = np.mean((out - targets) ** 2)
        if last_loss and last_loss < loss:
            print("Train loss: ", loss, "  WARNING - Loss Increasing")
        else:
            print("Train loss: ", loss)
        last_loss = loss


# Calculate accuracy on test data
tes_out = sigmoid(np.dot(features_test, weights))
predictions = tes_out > 0.5
accuracy = np.mean(predictions == targets_test)
print("Prediction accuracy: {:.3f}".format(accuracy))

---

### Multilayer perceptrons

In [9]:
# === Initialize weights with numpy === #
import numpy as np

# Number of records and input units
features = np.array([[0.49671415, -0.1382643 ,  0.64768854]]).T
n_records, n_inputs = features.shape

In [10]:
features

array([[ 0.49671415],
       [-0.1382643 ],
       [ 0.64768854]])

In [5]:
# Number of hidden units
n_hidden = 2
weights_input_to_hidden = np.random.normal(0, n_inputs ** -0.5, size=(n_inputs, n_hidden))

In [None]:
# Multiply the inputs (row vector) by the weights
# dot product of the inputs with each column in the weights matrix
hidden_inputs = np.dot(inputs, weights_input_to_hidden)

In [None]:
# Other method of making a column vector out of a row vector
features[:, None]

> Implement a forward pass through a 4x3x2 network with sigmoid activation functions for both layers

1. Calculate input to the hidden layer
2. Calculate output from hidden layer
3. Calculate input to the output layer
4. Calculate the output of the network

In [14]:
import numpy as np

def sigmoid(x):
    """Calculate sigmoid"""
    return 1/(1+np.exp(-x))

# Network size
N_input = 4
N_hidden = 3
N_output = 2

np.random.seed(42)
# Make some fake data
X = np.random.randn(4)

weights_input_to_hidden = np.random.normal(0, scale=0.1, size=(N_input, N_hidden))
weights_hidden_to_output = np.random.normal(0, scale=0.1, size=(N_hidden, N_output))

# TODO: Make a forward pass through the network

# Multiply inputs by weights
hidden_layer_in = np.dot(X, weights_input_to_hidden)
hidden_layer_out = sigmoid(hidden_layer_in)

print('Hidden-layer Output:')
print(hidden_layer_out)

output_layer_in = np.dot(hidden_layer_out, weights_hidden_to_output)
output_layer_out = sigmoid(output_layer_in)

print('Output-layer Output:')
print(output_layer_out)

Hidden-layer Output:
[0.41492192 0.42604313 0.5002434 ]
Output-layer Output:
[0.49815196 0.48539772]


---

## Backpropagation

> Implement the code to calculate one backpropagation update step for two sets of weights.

1. Calculate the network's output error
2. Calculate the output layer's error term
3. Use backpropagation to calculate the hidden layer's error term
4. Calculate change in weights that result from propagating the errors back through the network

In [28]:
import numpy as np


def sigmoid(x):
    """
    Calculate sigmoid
    """
    return 1 / (1 + np.exp(-x))


x = np.array([0.5, 0.1, -0.2])
target = 0.6
learnrate = 0.5

weights_input_hidden = np.array([[0.5, -0.6],
                                 [0.1, -0.2],
                                 [0.1, 0.7]])

weights_hidden_output = np.array([0.1, -0.3])

## Forward pass
hidden_layer_input = np.dot(x, weights_input_hidden)
hidden_layer_output = sigmoid(hidden_layer_input)

output_layer_in = np.dot(hidden_layer_output, weights_hidden_output)
output = sigmoid(output_layer_in)

# === Backwards pass === #

# Calculate output error
error = target - output

# Calculate error term for output layer
output_error_term = error * output * (1 - output)

# Calculate error term for hidden layer
# hidden_error_term = [sum(w * output_error_term * (1 - hidden_layer_output)) for w in weights_hidden_output]  # Incorrect
hidden_error_term = np.dot(output_error_term, weights_hidden_output) * hidden_layer_output * (1 - hidden_layer_output)

# Calculate change in weights for hidden layer to output layer
# del_w += lr * error_term * x
delta_w_h_o = learnrate * output_error_term * hidden_layer_output

# Calculate change in weights for input layer to hidden layer
delta_w_i_h = learnrate * hidden_error_term * x[:, None]

print('Change in weights for hidden layer to output layer:')
print(delta_w_h_o)
print('Change in weights for input layer to hidden layer:')
print(delta_w_i_h)

Change in weights for hidden layer to output layer:
[0.00804047 0.00555918]
Change in weights for input layer to hidden layer:
[[ 1.77005547e-04 -5.11178506e-04]
 [ 3.54011093e-05 -1.02235701e-04]
 [-7.08022187e-05  2.04471402e-04]]


In [19]:
print(output_error_term)
print(hidden_error_term)

0.028730669543515018
[ 0.00070802 -0.00204471]


In [20]:
# Hidden error term
np.dot(output_error_term, weights_hidden_output) * hidden_layer_output * (1 - hidden_layer_output)

array([ 0.00070802, -0.00204471])

In [21]:
# Delta W: hidden -> output
learnrate * output_error_term * hidden_layer_output

array([0.00804047, 0.00555918])

In [22]:
# Delta W: input -> hidden
learnrate * hidden_error_term * x[:, None]

array([[ 1.77005547e-04, -5.11178506e-04],
       [ 3.54011093e-05, -1.02235701e-04],
       [-7.08022187e-05,  2.04471402e-04]])

---

### Another Backpropagation exercise

Implement the backprop algorithm for a network trained on the graduate school admission data.

1. Implement the forward pass
2. Implement the backpropagation algorithm
3. Update the weights

In [2]:
# === data_prep.py === #
import numpy as np
import pandas as pd

admissions = pd.read_csv('binary.csv')

# Make dummy variables for rank
# data = pd.concat([admissions, pd.get_dummies(admissions['rank'], prefix='rank')], axis=1)
# data = data.drop('rank', axis=1)
data = pd.get_dummies(admissions, prefix="rank")

# Standarize features
for field in ['gre', 'gpa']:
    mean, std = data[field].mean(), data[field].std()
    data.loc[:,field] = (data[field]-mean)/std
    
# Split off random 10% of the data for testing
np.random.seed(21)
sample = np.random.choice(data.index, size=int(len(data)*0.9), replace=False)
data, test_data = data.iloc[sample], data.drop(sample)

# Split into features and targets
features, targets = data.drop('admit', axis=1), data['admit']
features_test, targets_test = test_data.drop('admit', axis=1), test_data['admit']

In [4]:
data.shape

(360, 4)

In [8]:
features.shape

(360, 3)

In [19]:
# === backprop.py === #
# import numpy as np
# from data_prep import features, targets, features_test, targets_test

np.random.seed(21)

def sigmoid(x):
    """Calculate sigmoid"""
    return 1 / (1 + np.exp(-x))

# Hyperparameters
n_hidden = 2  # number of hidden units
epochs = 900
learnrate = 0.005

n_records, n_features = features.shape
last_loss = None
# Initialize weights
weights_input_hidden = np.random.normal(scale=1 / n_features ** .5,
                                        size=(n_features, n_hidden))
weights_hidden_output = np.random.normal(scale=1 / n_features ** .5,
                                         size=n_hidden)

for e in range(epochs):
    del_w_input_hidden = np.zeros(weights_input_hidden.shape)
    del_w_hidden_output = np.zeros(weights_hidden_output.shape)
    for x, y in zip(features.values, targets):
        # === Forward pass === #
        # TODO: Calculate the output
        hidden_input = np.dot(x, weights_input_hidden)
        hidden_output = sigmoid(hidden_input)
        
        output_input = np.dot(hidden_output, weights_hidden_output)
        output = sigmoid(output_input)

        # === Backward pass === #
        # TODO: Calculate the network's prediction error
        error = y - output

        # TODO: Calculate error term for the output unit
        output_error_term = error * output * (1 - output)

        ## Propagate errors to hidden layer

        # TODO: Calculate the hidden layer's contribution to the error
        hidden_error = np.dot(output_error_term, weights_hidden_output)
        
        # TODO: Calculate the error term for the hidden layer
        hidden_error_term = hidden_error * hidden_output * (1 - hidden_output)
        
        # TODO: Update the change in weights
        del_w_hidden_output += output_error_term * hidden_output
        del_w_input_hidden += hidden_error_term * x[:, None]

    # TODO: Update weights  (don't forget to division by n_records or number of samples)
    weights_input_hidden +=  (learnrate * del_w_input_hidden) / features.shape[0]
    weights_hidden_output += (learnrate * del_w_hidden_output) / features.shape[0]

    # Printing out the mean square error on the training set
    if e % (epochs / 10) == 0:
        hidden_output = sigmoid(np.dot(x, weights_input_hidden))
        out = sigmoid(np.dot(hidden_output,
                             weights_hidden_output))
        loss = np.mean((out - targets) ** 2)

        if last_loss and last_loss < loss:
            print("Train loss: ", loss, "  WARNING - Loss Increasing")
        else:
            print("Train loss: ", loss)
        last_loss = loss

# Calculate accuracy on test data
hidden = sigmoid(np.dot(features_test, weights_input_hidden))
out = sigmoid(np.dot(hidden, weights_hidden_output))
predictions = out > 0.5
accuracy = np.mean(predictions == targets_test)
print("Prediction accuracy: {:.3f}".format(accuracy))

Train loss:  0.24118192880283867
Train loss:  0.24062575601203856
Train loss:  0.2400814383582655
Train loss:  0.23954853271352386
Train loss:  0.2390266287606974
Train loss:  0.23851534658239512
Train loss:  0.23801433439211203
Train loss:  0.23752326640053162
Train loss:  0.2370418408101549
Train loss:  0.23656977793179068
Prediction accuracy: 0.650


In [15]:
weights_input_hidden

array([[-0.03000454, -0.06420119],
       [ 0.60147895, -0.725581  ],
       [ 0.43036219, -0.98787204]])