In [263]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [264]:
df = pd.read_csv('exam1_train.csv', index_col = 0)

### Define Functions

In [265]:
def split_train_test(X, test_ratio):
    np.random.seed(1)
    shuffled_indices = np.random.permutation(len(X))
    test_set_size = int(len(X) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return X.iloc[train_indices], X.iloc[test_indices]

In [266]:
y_train = df.iloc[:,400]
x_train = df.iloc[:,0:400]

In [267]:
def one_hot_encoding(mat):
    mat = mat.as_matrix()
    labels = []
    for e in mat:
        if e not in labels:
            labels.append(e)
    labels.sort()
    
    enc = np.zeros(shape=(len(mat),len(labels)), dtype=int)
    #print result[0,0]
    for key, val in enumerate(mat):
        enc[key][labels.index(val)] = 1
    return enc

In [268]:
y_train_encoded = one_hot_encoding(y_train)

In [269]:
y_train_encoded = pd.DataFrame(y_train_encoded)

In [270]:
def sigmoid(z):
    A = 1. / (1 + np.exp(-z))
#     z_temp = z
    return A

In [271]:
def relu(x):
    A = np.maximum(0,x)
    
    return A

In [272]:
def forward_propagation(X, parameters):
    W1 = parameters['W1']
    b1 = parameters['b1']
    W2 = parameters['W2']
    b2 = parameters['b2']
    W3 = parameters['W3']
    b3 = parameters['b3']
    
    Z1 = W1.dot(X) + b1
    A1 = relu(Z1)
    Z2 = W2.dot(A1) + b2

    A2 = relu(Z2)
    
    Z3 = W3.dot(A2) + b3
    A3 = sigmoid(Z3)
    
    assert(A2.shape == (W2.shape[0], A1.shape[1]))
    
    cache_variables = {"Z1": Z1, "A1": A1, "Z2": Z2, "A2": A2, "Z3": Z3, "A3":A3}

    return A3, cache_variables

In [273]:
def forward_propagation_dropout_regularization(X, parameters, keep_prob):
    
    np.random.seed(1)
    
    
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    W3 = parameters["W3"]
    b3 = parameters["b3"]
    
    
    Z1 = W1.dot(X) + b1
    A1 = relu(Z1)
             
    D1 = np.random.rand(A1.shape[0], A1.shape[1])     
    D1 = D1 < keep_prob                      
    A1 = A1 * D1                                      
    A1 = A1 / keep_prob                               
    
    Z2 = W2.dot(A1) + b2
    A2 = relu(Z2)
    
    D2 = np.random.rand(A2.shape[0], A2.shape[1])     
    D2 = D2 < keep_prob                                   
    A2 = A2 * D2                                    
    A2 = A2 / keep_prob                               
    
    Z3 = W3.dot(A2) + b3
    A3 = sigmoid(Z3)
    
    cache_variables = (Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3, b3)
    
    return A3, cache_variables

In [274]:
def backward_propagation_dropout_regularization(X, Y, cache, keep_prob):
   
    m = X.shape[1]
    (Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3, b3) = cache
    
    dZ3 = A3 - Y
    dZ3 = np.array(dZ3)
    dW3 = 1. / m * np.dot(dZ3, A2.T)
    db3 = 1. / m * np.sum(dZ3, axis=1, keepdims=True)
    dA2 = np.dot(W3.T, dZ3)
    
    dA2 = dA2 * D2              
    dA2 = dA2 / keep_prob              
    
    dZ2 = np.multiply(dA2, np.int64(A2 > 0))
    dW2 = 1. / m * np.dot(dZ2, A1.T)
    dZ2 = np.array(dZ2)
    db2 = 1. / m * np.sum(dZ2, axis=1, keepdims=True)
    
    dA1 = np.dot(W2.T, dZ2)
    
    dA1 = dA1 * D1             
    dA1 = dA1 / keep_prob             
    
    dZ1 = np.multiply(dA1, np.int64(A1 > 0))
    dW1 = 1. / m * np.dot(dZ1, X.T)
    dZ1 = np.array(dZ1)
    db1 = 1. / m * np.sum(dZ1, axis=1, keepdims=True)
    
    gradients = {"dZ3": dZ3, "dW3": dW3, "db3": db3,"dA2": dA2, "dZ2": dZ2, "dW2": dW2, "db2": db2, "dA1": dA1, "dZ1": dZ1, "dW1": dW1, "db1": db1}
    
    return gradients

In [275]:
def softmax(x):
    exp_scores = np.array(np.exp(x))
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    y_predict = np.array(np.argmax(probs, axis=0))
    return y_predict

### Split the data
- Since the data is already splitted when you provided to us, and for the sake of better accuracy, I don't use the split data function to split the training set but use the whole training set to train and test with the whole test set. 

### Initialize Parameters

In [276]:
def initialize_parameters(layer_size):    
    np.random.seed(1)
    parameters = {}
    L = len(layer_size) 
    for l in range(1, L):
        parameters['W' + str(l)] = np.random.randn(layer_size[l], layer_size[l-1]) / np.sqrt(layer_size[l-1])
        parameters['b' + str(l)] = np.zeros((layer_size[l], 1))
        
    return parameters

### Deep Neural Network Model

In [277]:
def calculate_cost(A3, Y):
    
    m = Y.shape[1]
    
    logprobs = np.log(A3) * Y + (1 - Y) * np.log(1 - A3)
    cost = - np.nansum(logprobs) / m
    return cost

- The function below equals to gradient_descent function

In [278]:
def update_parameters(parameters, gradients, learning_rate):
    
    dW1 = gradients['dW1']
    db1 = gradients['db1']
    dW2 = gradients['dW2']
    db2 = gradients['db2']
    dW3 = gradients['dW3']
    db3 = gradients['db3']
   
    parameters["W1"] = parameters["W1"] - 0.9 * dW1
    parameters["b1"] = parameters["b1"] - 0.9 * db1
    parameters["W2"] = parameters["W2"] - 0.9 * dW2
    parameters["b2"] = parameters["b2"] - 0.9 * db2
    parameters["W3"] = parameters["W3"] - learning_rate * dW3
    parameters["b3"] = parameters["b3"] - learning_rate * db3
    return parameters

#### Define fit function for the model

In [279]:
def fit(X, Y, learning_rate = 0.3, num_iterations = 8000, n_h_1=390, n_h_2=390, print_cost = True, lambd = 0, keep_prob = 1):
    grads = {}
    costs = []                           
    m = X.shape[1]                        
    layers_sizes = [X.shape[0], n_h_1, n_h_2, 10]
    
    
    parameters = initialize_parameters(layers_sizes)
    

    for i in range(0, num_iterations):
        if keep_prob < 1:
            a3, cache = forward_propagation_dropout_regularization(X, parameters, keep_prob)
            cost = calculate_cost(a3, Y)
            
        assert(lambd == 0 or keep_prob == 1)   
        if keep_prob < 1:
            gradients = backward_propagation_dropout_regularization(X, Y, cache, keep_prob)
        
        parameters = update_parameters(parameters, gradients, learning_rate)
        
        if print_cost and i % 1000 == 0:
            print("Cost after {} iterations: {}".format(i, cost))
        if print_cost and i % 1000 == 0:
            costs.append(cost)
    
    return parameters, costs

#### Prediction function for the model

In [280]:
def predict(parameters, X):
    A3, cache = forward_propagation(X, parameters)
    #print cache[10].shape
    y_predict = softmax(cache["Z3"])
    return y_predict

### Optimized model_fit or Optimization
- In this optimization, the accuracy for the test set in my machine (Macbook Pro 13, Intel i5 Dual Cores, 2.5GHz) is __90.8%__

In [281]:
parameters, costs = fit(x_train.T, y_train_encoded.T, keep_prob=0.89, n_h_1 = 150, n_h_2 = 100, learning_rate=1.12, num_iterations=7000)

Cost after 0 iterations: 6.98663719482


  """


Cost after 1000 iterations: 0.00284356224986
Cost after 2000 iterations: 0.000937536490227
Cost after 3000 iterations: 0.000524399421809
Cost after 4000 iterations: 0.000353530959333
Cost after 5000 iterations: 0.000262536046163
Cost after 6000 iterations: 0.000206823446946


### Make Predictions 

In [283]:
predictions = predict(parameters, x_train.T)

In [284]:
correct = [1 if a == b else 0 for (a, b) in zip(np.array(y_train), predictions)]  
accuracy = (float(sum(map(int, correct))) / float(len(correct)))  
print 'Optimized train set accuracy = {0}%'.format(accuracy * 100)

Optimized train set accuracy = 95.3142857143%


In [285]:
df1 = pd.read_csv('exam1_test.csv', index_col=0)

In [286]:
x_test = df1.iloc[:,0:400]
y_test = df1.iloc[:,400]

In [287]:
predictions1 = predict(parameters, x_test.T)
correct = [1 if a == b else 0 for (a, b) in zip(y_test, predictions1)]  
accuracy = (float(sum(map(int, correct))) / float(len(correct)))  
print 'Optimized test set accuracy = {0}%'.format(accuracy * 100)

Optimized test set accuracy = 90.8%


## Warning: Please Let the Optimization above finish running and then run this. 
### Other Optimization
- In this optimization, I will just apply dropout regularization for layer #2, the accuracy for the test set in my machine (Macbook Pro 13, Intel i5 Dual Cores, 2.5GHz) is __90.8%__

In [293]:
def forward_propagation_dropout_regularization_layer2(X, parameters, keep_prob):
    
    np.random.seed(1)
    
    
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    W3 = parameters["W3"]
    b3 = parameters["b3"]
    
    
    Z1 = W1.dot(X) + b1
    A1 = relu(Z1)
             
    D1 = np.random.rand(A1.shape[0], A1.shape[1])     
#     D1 = D1 < keep_prob                      
#     A1 = A1 * D1                                      
#     A1 = A1 / keep_prob                               
    
    Z2 = W2.dot(A1) + b2
    A2 = relu(Z2)
    
    D2 = np.random.rand(A2.shape[0], A2.shape[1])     
    D2 = D2 < keep_prob                                   
    A2 = A2 * D2                                    
    A2 = A2 / keep_prob                               
    
    Z3 = W3.dot(A2) + b3
    A3 = sigmoid(Z3)
    
    cache_variables = (Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3, b3)
    
    return A3, cache_variables

In [294]:
def backward_propagation_dropout_regularization_layer2(X, Y, cache, keep_prob):
   
    m = X.shape[1]
    (Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3, b3) = cache
    
    dZ3 = A3 - Y
    dZ3 = np.array(dZ3)
    dW3 = 1. / m * np.dot(dZ3, A2.T)
    db3 = 1. / m * np.sum(dZ3, axis=1, keepdims=True)
    dA2 = np.dot(W3.T, dZ3)
    
    dA2 = dA2 * D2              
    dA2 = dA2 / keep_prob              
    
    dZ2 = np.multiply(dA2, np.int64(A2 > 0))
    dW2 = 1. / m * np.dot(dZ2, A1.T)
    dZ2 = np.array(dZ2)
    db2 = 1. / m * np.sum(dZ2, axis=1, keepdims=True)
    
    dA1 = np.dot(W2.T, dZ2)
    
#     dA1 = dA1 * D1             
#     dA1 = dA1 / keep_prob             
    
    dZ1 = np.multiply(dA1, np.int64(A1 > 0))
    dW1 = 1. / m * np.dot(dZ1, X.T)
    dZ1 = np.array(dZ1)
    db1 = 1. / m * np.sum(dZ1, axis=1, keepdims=True)
    
    gradients = {"dZ3": dZ3, "dW3": dW3, "db3": db3,"dA2": dA2,
                 "dZ2": dZ2, "dW2": dW2, "db2": db2, "dA1": dA1, 
                 "dZ1": dZ1, "dW1": dW1, "db1": db1}
    
    return gradients

In [295]:
def fit_reg_layer2(X, Y, learning_rate = 0.3, num_iterations = 8000, n_h_1=390, n_h_2=390, print_cost = True, lambd = 0, keep_prob = 1):
    grads = {}
    costs = []                           
    m = X.shape[1]                        
    layers_sizes = [X.shape[0], n_h_1, n_h_2, 10]
    
    
    parameters = initialize_parameters(layers_sizes)
    

    for i in range(0, num_iterations):
        if keep_prob < 1:
            a3, cache = forward_propagation_dropout_regularization_layer2(X, parameters, keep_prob)
            cost = calculate_cost(a3, Y)
            
        assert(lambd == 0 or keep_prob == 1)   
        if keep_prob < 1:
            gradients = backward_propagation_dropout_regularization_layer2(X, Y, cache, keep_prob)
        
        parameters = update_parameters(parameters, gradients, learning_rate)
        
        if print_cost and i % 1000 == 0:
            print("Cost after {} iterations: {}".format(i, cost))
        if print_cost and i % 1000 == 0:
            costs.append(cost)
    
    return parameters, costs

In [296]:
parameters1, costs1 = fit_reg_layer2(x_train.T, y_train_encoded.T, keep_prob=0.89, n_h_1 = 150, n_h_2 = 20, learning_rate=1.12, num_iterations=7000)

Cost after 0 iterations: 7.01204288618


  """


Cost after 1000 iterations: 0.00924788541855
Cost after 2000 iterations: 0.00288899908325
Cost after 3000 iterations: 0.00138091592438
Cost after 4000 iterations: 0.000850495800032
Cost after 5000 iterations: 0.000598966141862
Cost after 6000 iterations: 0.000455474906956


In [297]:
predictions2 = predict(parameters1, x_train.T)
correct = [1 if a == b else 0 for (a, b) in zip(np.array(y_train), predictions2)]  
accuracy = (float(sum(map(int, correct))) / float(len(correct)))  
print 'Optimized train set accuracy = {0}%'.format(accuracy * 100)

Optimized train set accuracy = 98.3428571429%


In [298]:
predictions3 = predict(parameters1, x_test.T)
correct = [1 if a == b else 0 for (a, b) in zip(y_test, predictions3)]  
accuracy = (float(sum(map(int, correct))) / float(len(correct)))  
print 'Optimized test set accuracy = {0}%'.format(accuracy * 100)

Optimized test set accuracy = 90.8%
