## 2-layer neural network
* Dataset used: planar data generated manually
* This data is not linearly separable, so a linear classifier is not enough and we use a neural network (NN)

In [1]:
# imports 
import numpy as np 
import matplotlib.pyplot as plt 
from testCases_v2 import *
import sklearn 
import sklearn.datasets
import sklearn.linear_model
from planar_utils import plot_decision_boundary, sigmoid, load_planar_dataset, load_extra_datasets

%matplotlib inline 

# Seed NumPy's global random number generator so repeated runs give the same results.
# Note: initialize_parameters and nn_model each reseed the generator when they are called.
np.random.seed(1)

## Description of neural network we will be using 

* It will be a 2 layer network, with one hidden layer and one output layer 
* Hidden layer will use Tanh activation and output layer will use sigmoid activation 
* Hidden layer will have 4 hidden units and output layer will have one unit.

## Basic network flow 
1. Define the neural network 
2. Initialize the parameters
3. Loop: 
    - Implement forward propagation 
    - Compute loss 
    - Implement backward propagation to get the gradients 
    - Update parameters (gradient descent)


## Supporting functions for NN

In [2]:
# layer size finding function 

def layer_sizes(X, Y):
    """
    Read the input- and output-layer widths off the data matrices.

    Arguments:
    X : input dataset of shape (input size, number of examples)
    Y : labels of shape (output size, number of examples)

    Returns:
    (n_x, n_y) : tuple with the size of the input layer and the size of
                 the output layer
    """
    # The first axis of each matrix is the layer width; the second axis
    # indexes the examples.
    return X.shape[0], Y.shape[0]

In [3]:
# Initialize the parameters 

def initialize_parameters(n_x, n_h, n_y):
    """
    Build the starting weights and biases of the 2-layer network.

    Arguments:
    n_x : size of the input layer
    n_h : size of the hidden layer
    n_y : size of the output layer

    Returns:
    parameters : dictionary containing W1, b1, W2, b2
    """
    # Reseeds NumPy's global generator on every call, so the same layer
    # sizes always produce the same initial weights.
    np.random.seed(2)

    # Weights are random draws scaled by 0.01; biases start at zero.
    # The dict literal is evaluated top to bottom, so W1 is drawn from the
    # random stream before W2.
    parameters = {
        "W1": np.random.randn(n_h, n_x) * 0.01,
        "b1": np.zeros((n_h, 1)),
        "W2": np.random.randn(n_y, n_h) * 0.01,
        "b2": np.zeros((n_y, 1)),
    }

    # Shape check points.
    expected_shapes = {
        "W1": (n_h, n_x),
        "b1": (n_h, 1),
        "W2": (n_y, n_h),
        "b2": (n_y, 1),
    }
    for key, shape in expected_shapes.items():
        assert parameters[key].shape == shape

    return parameters

In [4]:
# forward propagation 

def forward_propagation(X, parameters):
    """
    Run one forward pass through the 2-layer network.

    Arguments:
    X : input data of size (n_x, m)
    parameters : dictionary with W1, b1, W2, b2 (output of the
                 initialization function)

    Returns:
    A2 : sigmoid activation of the output layer, one row per output unit
         and one column per example
    cache : dictionary with the intermediate values Z1, A1, Z2 and A2
    """
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]

    # Hidden layer: linear step followed by tanh.
    Z1 = np.dot(W1, X) + b1
    A1 = np.tanh(Z1)
    # Output layer: linear step followed by sigmoid.
    Z2 = np.dot(W2, A1) + b2
    A2 = sigmoid(Z2)

    # The output has one row per output unit (rows of W2) and one column per
    # example. Checking against W2 rather than a literal 1 keeps this
    # consistent with initialize_parameters, which accepts any n_y.
    assert A2.shape == (W2.shape[0], X.shape[1])

    cache = {
        "Z1": Z1,
        "A1": A1,
        "Z2": Z2,
        "A2": A2
    }
    return A2, cache
    

In [5]:
# cost calculation 

def compute_cost(A2, Y):
    """
    Cross-entropy cost averaged over the examples.

    Arguments:
    A2 : output of final layer
    Y : True labels, with one column per example

    Returns:
    cost : total cost, as a Python float
    """
    m = Y.shape[1]

    # Keep the probabilities strictly inside (0, 1). An output of exactly
    # 0.0 or 1.0 would make np.log return -inf, and -inf * 0 is nan, so even
    # a perfectly confident correct prediction would yield a nan cost.
    eps = 1e-15
    A2 = np.clip(A2, eps, 1 - eps)

    # cross entropy loss
    loss = np.multiply(np.log(A2), Y) + np.multiply(np.log(1 - A2), (1 - Y))
    cost = (-1 / m) * np.sum(loss)

    cost = float(np.squeeze(cost))

    assert isinstance(cost, float)

    return cost

In [6]:
# back prop

def backward_propagation(parameters, cache, X, Y):
    """
    Compute the gradients of the cost with respect to W1, b1, W2 and b2.

    Arguments:
    parameters : parameters dictionary; only W2 is read here
    cache : dictionary holding the forward-pass activations A1 and A2
    X : input data, one example per column
    Y : input labels

    Returns:
    grads : dictionary with the gradients dW1, db1, dW2 and db2
    """
    m = X.shape[1]
    # Only W2 is needed: it carries the output error back to the hidden layer.
    W2 = parameters["W2"]
    # A1, A2 from cache
    A1 = cache["A1"]
    A2 = cache["A2"]

    # Output layer gradients.
    dZ2 = A2 - Y
    dW2 = (1 / m) * np.dot(dZ2, A1.T)
    db2 = (1 / m) * np.sum(dZ2, axis=1, keepdims=True)

    # Hidden layer gradients; (1 - A1**2) is the derivative of tanh written
    # in terms of its output A1.
    dZ1 = np.dot(W2.T, dZ2) * (1 - np.power(A1, 2))
    dW1 = (1 / m) * np.dot(dZ1, X.T)
    db1 = (1 / m) * np.sum(dZ1, axis=1, keepdims=True)

    grads = {
        "dW1": dW1,
        "db1": db1,
        "dW2": dW2,
        "db2": db2
    }
    return grads

In [7]:
# updating parameters 

def update_parameters(parameters, grads, learning_rate = 1.2):
    """
    Apply one gradient-descent step to every weight and bias.

    Arguments:
    parameters : dictionary with W1, b1, W2, b2
    grads : dictionary with the gradients dW1, db1, dW2, db2
    learning_rate : network learning rate

    Returns:
    a new dictionary holding the updated W1, b1, W2, b2
    """
    # Update rule: value <- value - learning_rate * gradient, where the
    # gradient of parameter "W1" is stored under "dW1", and so on.
    return {
        name: parameters[name] - learning_rate * grads["d" + name]
        for name in ("W1", "b1", "W2", "b2")
    }

In [8]:
# The model 

def nn_model(X, Y, n_h, num_iterations = 1000, print_cost=False, learning_rate=1.2):
    """
    Train the 2-layer network with batch gradient descent.

    Arguments:
    X : input (n_x, m)
    Y : true label
    n_h : number of hidden units
    num_iterations : number of gradient-descent iterations, user defined
    print_cost : if True, print the cost every 1000 iterations
    learning_rate : step size passed to update_parameters (default 1.2,
                    the value update_parameters uses by default)

    Returns:
    parameters : dictionary with the learned W1, b1, W2, b2
    """
    # Kept from the original; initialize_parameters reseeds the generator
    # itself before drawing the weights.
    np.random.seed(3)
    n_x, n_y = layer_sizes(X, Y)

    # initialize the parameters
    parameters = initialize_parameters(n_x, n_h, n_y)

    for i in range(num_iterations):
        # forward prop
        A2, cache = forward_propagation(X, parameters)
        # cost calculation
        cost = compute_cost(A2, Y)
        # back prop
        grads = backward_propagation(parameters, cache, X, Y)
        # gradient descent
        parameters = update_parameters(parameters, grads, learning_rate=learning_rate)

        if print_cost and i % 1000 == 0:
            print("Cost after iteration :",i, cost)

    return parameters

In [9]:
# prediction 

def predict(parameters, X):
    """
    Classify examples with the learned parameters.

    Arguments:
    parameters : the parameters that we learn from nn_model
    X : new input examples

    Returns:
    predictions : boolean array, True where the network output is above 0.5
    """
    # Only the output activation is needed; the cache is discarded.
    probabilities, _ = forward_propagation(X, parameters)
    return probabilities > 0.5

In [22]:
# Running on planar data
X, Y = load_planar_dataset()
parameters = nn_model(X, Y, n_h = 4, num_iterations = 10000, print_cost=True)

# Print accuracy
predictions = predict(parameters, X)
# Percentage of examples whose thresholded prediction equals the label.
# np.mean returns a scalar, which avoids calling float() on a (1, 1) array
# (deprecated since NumPy 1.25). '%d' still truncates to a whole percent.
accuracy = np.mean(predictions == Y) * 100
print ('Accuracy: %d' % accuracy + '%')

Cost after iteration : 0 0.6930480201239823
Cost after iteration : 1000 0.28808329356901846
Cost after iteration : 2000 0.2543854940732458
Cost after iteration : 3000 0.23386415038952207
Cost after iteration : 4000 0.22679248744854014
Cost after iteration : 5000 0.22264427549299023
Cost after iteration : 6000 0.21973140404281322
Cost after iteration : 7000 0.21750365405131294
Cost after iteration : 8000 0.21950577641358185
Cost after iteration : 9000 0.21862107625156163
Accuracy: 90%


# Conclusion
* The cost decreases overall as training progresses (with a small uptick between iterations 7000 and 8000), so the model generally gets better with more iterations
* With only four hidden units we boosted our accuracy to 90 percent.
* We can experiment with number of hidden units 

In [None]:
# Experiment with the number of hidden units (the network keeps a single hidden layer)

hidden_layers_sizes = [1,2,3,4,5,20, 50]
for n_h in hidden_layers_sizes:
    parameters = nn_model(X, Y, n_h, num_iterations=5000, print_cost=True)
    predictions = predict(parameters, X)
    # Percentage of training examples classified correctly. np.mean returns a
    # scalar, which avoids calling float() on a (1, 1) array (deprecated since
    # NumPy 1.25). '%d' still truncates to a whole percent.
    accuracy = np.mean(predictions == Y) * 100
    print ('Accuracy: %d' % accuracy + '%')

Cost after iteration : 0 0.6931475438458984
Cost after iteration : 1000 0.6366208325642342
Cost after iteration : 2000 0.6347568331836081
Cost after iteration : 3000 0.6338139675205536
Cost after iteration : 4000 0.6332045174926639
Accuracy: 67%
Cost after iteration : 0 0.6931162485343118
Cost after iteration : 1000 0.5823245569184224
Cost after iteration : 2000 0.5789483062264426
Cost after iteration : 3000 0.5772913743998501
Cost after iteration : 4000 0.5761901410265899
Accuracy: 67%
Cost after iteration : 0 0.6931142222248914
Cost after iteration : 1000 0.28550159668379077
Cost after iteration : 2000 0.2730627923489794
Cost after iteration : 3000 0.2663671337276788
Cost after iteration : 4000 0.2620671615366344
Accuracy: 90%
Cost after iteration : 0 0.6930480201239823
Cost after iteration : 1000 0.28808329356901846
Cost after iteration : 2000 0.2543854940732458
Cost after iteration : 3000 0.23386415038952207
Cost after iteration : 4000 0.22679248744854014
Accuracy: 90%
Cost after i

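# Observations from hidden unit tuning

* For 1 and 2 hidden units the accuracy is 67%
* For 3 and 4 hidden units the accuracy shoots up to 90%
* For 5 hidden units the accuracy is the highest, at 91%
* Beyond that the accuracy drops back to 90% even though the cost has decreased, which suggests that the model has started to overfit the data. Note that this accuracy is measured on the training data itself, so confirming overfitting would require evaluating on held-out data.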
