# L-Layers Generalized Neural Network

## Table of Contents

1. Import/Prep Test Dataset
2. Create Activation functions (Forward and Backward)
3. Individual Functions
    - Initializing Parameters
    - Forward Propogation
    - Compute Cost
    - Backward Propogation
    - Gradients
    - Update Parameters
9. Step-by-Step

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import h5py

## Import/Prep Test Dataset

In [2]:
def load_data():
    train_dataset = h5py.File('datasets/train_catvnoncat.h5', "r")
    train_set_x_orig = np.array(train_dataset["train_set_x"][:]) # your train set features
    train_set_y_orig = np.array(train_dataset["train_set_y"][:]) # your train set labels

    test_dataset = h5py.File('datasets/test_catvnoncat.h5', "r")
    test_set_x_orig = np.array(test_dataset["test_set_x"][:]) # your test set features
    test_set_y_orig = np.array(test_dataset["test_set_y"][:]) # your test set labels

    classes = np.array(test_dataset["list_classes"][:]) # the list of classes
    
    train_set_y_orig = train_set_y_orig.reshape((1, train_set_y_orig.shape[0]))
    test_set_y_orig = test_set_y_orig.reshape((1, test_set_y_orig.shape[0]))
    
    return train_set_x_orig, train_set_y_orig, test_set_x_orig, test_set_y_orig, classes

In [3]:
train_x_orig, train_y, test_x_orig, test_y, classes = load_data()
print (train_x_orig.shape)
print (train_y.shape)
print (test_x_orig.shape)
print (test_y.shape)
print (train_y.shape)
print (classes.shape)

(209, 64, 64, 3)
(1, 209)
(50, 64, 64, 3)
(1, 50)
(1, 209)
(2,)


Flatten the training and testing input sets and normalize them

In [4]:
train_x = train_x_orig.reshape(train_x_orig.shape[0],-1).T/255
test_x = test_x_orig.reshape(test_x_orig.shape[0],-1).T/255
print (train_x.shape)
print (test_x.shape)

(12288, 209)
(12288, 50)


## Create Activation Functions

### Sigmoid

In [5]:
def sigmoid(x):
    return 1/(1+np.exp(-x)), x
sigmoid(3)

def sigmoid_backward(dA, cache):
    s = 1/(1+np.exp(-cache))
    return dA*s*(1-s)

### RELU

In [6]:
def relu(x):
    return np.maximum(0, x), x
def relu_backward(dA, cache):
    Z = cache
    dZ = np.array(dA, copy=True)
    dZ[Z<=0] = 0
    return dZ

## Individual Functions

### Parameters

In [7]:
layer_dims = [12288, 20, 7, 5, 1]
def par_deep(layer_dims):
    np.random.seed(1)
    parameters = {}
    L = len(layer_dims)
    for l in range(1,L):
        parameters.update({
                            'W{0}'.format(l) : np.random.randn(layer_dims[l], layer_dims[l-1])/np.sqrt(layer_dims[l-1]),
                            'b{0}'.format(l) : np.zeros((layer_dims[l], 1))
                          })
    return parameters

In [8]:
layer_dims = [12288, 20, 7, 5, 1]
[l for l in range(1,len(layer_dims))]

[1, 2, 3, 4]

### Linear Forward Calculations

In [9]:
def l_forward(A, W, b):
    return W.dot(A)+b, (A,W,b)

### Forward Activation

In [10]:
def lact_forward(A_prev, W, b, activation = 'sigmoid'):
    Z, l_cache = l_forward(A_prev, W, b)
    if activation == 'relu':
        A, a_cache = relu(Z)
        return A, (l_cache, a_cache)
    A, a_cache = sigmoid(Z)
    return A, (l_cache, a_cache)

### Forward Propogation

In [11]:
def L_forward(X, parameters):
    caches = []
    A = X
    L = len(parameters)//2
    for l in range(1, L):
        A_prev = A
        A, cache = lact_forward(A_prev, parameters['W{0}'.format(l)], parameters['b{0}'.format(l)], 'relu')
        caches.append(cache)
    AL, cache = lact_forward(A, parameters["W{0}".format(L)], parameters["b{0}".format(L)], 'sigmoid')
    caches.append(cache)
    return AL, caches

### Compute Cost

In [12]:
def cost(AL, Y):
    return np.squeeze((1./Y.shape[1])*(-np.dot(Y, np.log(AL).T)- np.dot(1-Y, np.log(1-AL).T)))

### Linear Backward Calculations

In [13]:
def l_backward(dZ, cache):
    A_prev, W, b = cache
    m = A_prev.shape[1]
    dW = 1./m * np.dot(dZ, A_prev.T)
    db = 1./m * np.sum(dZ, axis = 1, keepdims = True)
    dA_prev = np.dot(W.T, dZ)
    return dA_prev, dW, db

### Backward Activation

In [14]:
def lact_backward(dA, cache, activation = 'relu'):
    l_cache, a_cache = cache
    if activation == 'sigmoid':
        dZ = sigmoid_backward(dA, a_cache)
    elif activation == 'relu':
        dZ = relu_backward(dA, a_cache)
    dA_prev, dW, db = l_backward(dZ, l_cache)
    return dA_prev, dW, db

### Backward Propogation

In [15]:
def L_backward(AL, Y, caches):
    grads = {}
    L = len(caches)
    m = AL.shape[1]
    Y = Y.reshape(AL.shape)
    dAL = - (np.divide(Y,AL) - np.divide(1-Y, 1-AL))
    current_cache = caches[L-1]
    grads["dA{0}".format(L)], grads['dW{0}'.format(L)], grads['db{0}'.format(L)] = lact_backward(dAL, current_cache, 'sigmoid')
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        grads['dA{0}'.format(l+1)], grads['dW{0}'.format(l+1)], grads['db{0}'.format(l+1)] = lact_backward(grads['dA{0}'.format(l+2)], 
                                                                                                     current_cache)
    return grads

### Update Parameters

In [16]:
def update_par(parameters, grads, learning_rate):
    L = len(parameters)//2
    for l in range(L):
        parameters['W{0}'.format(l+1)] = parameters['W{0}'.format(l+1)] - learning_rate*grads['dW{0}'.format(l+1)]
        parameters['b{0}'.format(l+1)] = parameters['b{0}'.format(l+1)] - learning_rate*grads['db{0}'.format(l+1)]
    return parameters

### Prediction

In [17]:
def predict(X, y, parameters):
    probas, cache = L_forward(X, parameters)
    p = np.vectorize(lambda x:1 if x>0.5 else 0)(probas)
    accuracy = np.sum(np.vectorize(lambda x:1 if x==0 else 0)(p-y), axis=1)/y.shape[1]
    #print("Accuracy: "  + str(accuracy))
    return accuracy, p, probas

## Step-by-Step

In [18]:
layer_dims

[12288, 20, 7, 5, 1]

### Parameters Setting

In [19]:
parameters = par_deep(layer_dims)
print (parameters)

{'W1': array([[ 0.01465338, -0.00551871, -0.00476469, ..., -0.00475605,
        -0.00343108,  0.00856474],
       [ 0.00910437,  0.00207385, -0.0059909 , ...,  0.00622328,
        -0.0044052 ,  0.00187287],
       [-0.00321458, -0.00176345,  0.00574466, ...,  0.00742212,
        -0.00094203, -0.00593549],
       ..., 
       [ 0.00157639, -0.0011742 ,  0.01656117, ..., -0.00832292,
        -0.00744053, -0.00138343],
       [-0.00031528, -0.00376196, -0.00771457, ...,  0.02027339,
        -0.00481399, -0.00026661],
       [-0.01293895, -0.0100192 ,  0.00655218, ...,  0.01983405,
         0.01387618,  0.00673519]]), 'b1': array([[ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.]]), 'W2': array([[ 0.20259018, -0.3085417 , -0.37746612, -0.04024358,  0.05940078,
         0.

### Linear Outputs of the first Layer
$$W^{[1]T}A^{[0]}+b^{[1]}$$
*where*
$$A^{[0]} = X$$
*Output*
$$Z^{[1]}$$

In [20]:
l_forward(train_x, parameters['W1'], parameters['b1'])[0]

array([[ 0.07196659,  0.29407045,  0.52046231, ...,  0.38338442,
         0.19791222,  0.15667228],
       [-0.43149218, -0.34794355,  0.00957635, ..., -0.0440066 ,
         0.17248276, -0.14145979],
       [ 0.31849711,  0.11535033,  0.33366745, ...,  0.01670034,
         0.09194017,  0.11541118],
       ..., 
       [-0.07870905,  0.03190587, -0.01356782, ...,  0.18634333,
         0.18820044,  0.17551744],
       [-0.47167149, -0.15582351, -0.71908939, ..., -0.60416858,
         0.07156221, -0.30375025],
       [ 0.22106564,  0.29806307,  0.36277627, ...,  0.58466116,
         0.1014699 ,  0.02667976]])

*and*
$$A^{[0]}, W^{[1]}, b^{[1]}$$

In [21]:
l_forward(train_x, parameters['W1'], parameters['b1'])[1]

(array([[ 0.06666667,  0.76862745,  0.32156863, ...,  0.56078431,
          0.08627451,  0.03137255],
        [ 0.12156863,  0.75294118,  0.27843137, ...,  0.60784314,
          0.09411765,  0.10980392],
        [ 0.21960784,  0.74509804,  0.26666667, ...,  0.64705882,
          0.09019608,  0.20784314],
        ..., 
        [ 0.        ,  0.32156863,  0.54117647, ...,  0.33333333,
          0.01568627,  0.        ],
        [ 0.        ,  0.31372549,  0.55294118, ...,  0.41960784,
          0.01960784,  0.        ],
        [ 0.        ,  0.31764706,  0.55686275, ...,  0.58431373,
          0.        ,  0.        ]]),
 array([[ 0.01465338, -0.00551871, -0.00476469, ..., -0.00475605,
         -0.00343108,  0.00856474],
        [ 0.00910437,  0.00207385, -0.0059909 , ...,  0.00622328,
         -0.0044052 ,  0.00187287],
        [-0.00321458, -0.00176345,  0.00574466, ...,  0.00742212,
         -0.00094203, -0.00593549],
        ..., 
        [ 0.00157639, -0.0011742 ,  0.01656117, ...,

### Linear Activations of the First Layer

Linear Output of the first layer, activated with RELU (**RE**ctified **L**inear **U**nit)
$$relu(W^{[l]T}A^{[l-1]}b^{[l]})$$
**Output**
$$A^{[l]}, ((A^{[l-1]}, W^{[l]}, b^{[l]}), Z^{[l]})$$

Example: $$A^{[1]}$$

In [22]:
lact_forward(train_x, parameters['W1'], parameters['b1'], 'relu')[0]

array([[ 0.07196659,  0.29407045,  0.52046231, ...,  0.38338442,
         0.19791222,  0.15667228],
       [ 0.        ,  0.        ,  0.00957635, ...,  0.        ,
         0.17248276,  0.        ],
       [ 0.31849711,  0.11535033,  0.33366745, ...,  0.01670034,
         0.09194017,  0.11541118],
       ..., 
       [ 0.        ,  0.03190587,  0.        , ...,  0.18634333,
         0.18820044,  0.17551744],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.07156221,  0.        ],
       [ 0.22106564,  0.29806307,  0.36277627, ...,  0.58466116,
         0.1014699 ,  0.02667976]])

### Complete Forward Propogation

All L-1 layers are activated using RELU while the final layer(outcome) is activated using sigmoid(binary classification).
*Output*
$$A^{[l]}$$

In [23]:
AL, cache = L_forward(train_x, parameters)
len(cache)

4

### Compute Costs

In [24]:
cost(AL, train_y)

array(0.7717493284237686)

## Derivative outputs of the last layer

Each `cache` element houses...
$$A^{[l]}, ((A^{[l-1]}, W^{[l]}, b^{[l]}), Z^{[l]})$$
*where*

$A^{[l]}$ : Layer $l$'s activated output

$A^{[l-1]}$ : Previous layer's activated output.  $A^{[0]}$ corresponds to the input layer, $X$

$W^{[l]}$ : Corresponding weights for the layer $l$

$b^{[l]}$ : Biases

$Z^{[l]}$ : Linear outputs of layer $l$

Let's print out the $Z^{[L]}$

In [25]:
l, a = cache[-1] # the last layer
a

array([[ 0.23682965,  0.3617703 ,  0.4547016 ,  0.11587638,  0.53696866,
         0.33363595,  0.22705512,  0.00654325,  0.25821207,  0.18041241,
         0.44291608,  0.47733644,  0.4013233 ,  0.56109576,  0.36690789,
         0.85804303,  0.36050484,  0.73619789,  0.41969672,  0.29960153,
         0.42308701,  0.30968265,  0.59063732,  0.66340383,  0.66475275,
         0.13592181,  0.4935429 ,  0.3435607 ,  0.32336825,  0.39839019,
         0.28137059,  0.65873112,  0.33357562,  0.69655044,  0.37610349,
         0.21004222,  0.23689138,  0.21534942,  0.07174055,  0.48913821,
         0.45587072,  0.31859619,  0.56083865,  0.50423751,  0.2747355 ,
         0.21311506,  0.59085843,  0.3384595 ,  0.41909391,  0.30677565,
         0.6126749 ,  0.35049577,  0.44611779,  0.48544772,  0.27416875,
         0.50024744,  0.31504915,  0.61115568,  0.175417  ,  0.19048603,
         0.23223954,  0.36397181,  0.38315242,  0.30036867,  0.46848748,
         0.22486395,  0.46274062,  0.18505195,  0.4

Since this is the final layer, we cannot derive the $dAL$ from the next layer since there is 
For this layer, we used a $\sigma(x)$ activation, so we must use the derivation.

In [26]:
dAL = - (np.divide(train_y, AL)-np.divide(1-train_y, 1-AL))
dAL

array([[ 2.26722523,  2.43586908, -1.63463731,  2.12285706,  2.71081293,
         2.39603482,  2.25489904, -1.99347811,  2.29461334,  2.19771121,
         2.55724164, -1.62043375,  2.49380014, -1.5705835 , -1.69287346,
         3.35854057,  2.4340532 ,  3.08798167,  2.52150004, -1.74111348,
         2.52666712,  2.3629925 ,  2.80513851,  2.94138927, -1.5144007 ,
        -1.87291088,  2.63810962, -1.70924043,  2.3817741 , -1.6714    ,
         2.32494452,  2.93233887,  2.3959506 ,  3.00681811,  2.45659787,
         2.23373015,  2.26730345,  2.2402952 , -1.93077235,  2.63091011,
         2.57754639, -1.72716913, -1.57073022,  2.65572256,  2.3161825 ,
         2.23752704,  2.80553768, -1.71286765,  2.52058314,  2.35903604,
        -1.5418994 ,  2.41977125,  2.56223547,  2.62490235, -1.76020378,
         2.64912928, -1.72975301, -1.54272329,  2.19174307, -1.82655731,
        -1.7927562 , -1.69491078,  2.4669016 ,  2.35035655,  2.597576  ,
         2.25215235,  2.58842129,  2.20328094, -1.6

The `sigmoid_backward`, or derivative of the sigmoid function, will take a the $dAL$ and the activation_cache, `a`, which is also the $Z^{[L]}$.

In [27]:
sigmoid_backward(dAL, a)

array([[ 0.55893222,  0.58946891, -0.3882435 ,  0.52893672,  0.63110697,
         0.58264379,  0.55652116, -0.49836419,  0.56419673,  0.54498116,
         0.60895365, -0.38288128,  0.59900556, -0.36329396, -0.40928839,
         0.70225162,  0.58916264,  0.67616388,  0.60341067, -0.4256549 ,
         0.60422171,  0.5768078 ,  0.64351136,  0.6600246 , -0.33967278,
        -0.46607176,  0.62094069, -0.4149448 ,  0.5801449 , -0.40169918,
         0.56988221,  0.65897529,  0.58262912,  0.66742252,  0.59293297,
         0.55231835,  0.55894744,  0.55363025, -0.48207255,  0.6199034 ,
         0.61203414, -0.4210179 , -0.36335343,  0.62345464,  0.56825509,
         0.55307803,  0.64356209, -0.41618373,  0.60326641,  0.57609804,
        -0.35144926,  0.5867378 ,  0.60971581,  0.61903345, -0.43188396,
         0.62251748, -0.42188278, -0.35179562,  0.54374214, -0.45252197,
        -0.44219967, -0.40999844,  0.5946332 ,  0.57453264,  0.6150257 ,
         0.5559803 ,  0.61366413,  0.54613142, -0.3

The result of sigmoid_backward is then inserted to the linear_backward function with the linear cache which gives you the...
$$dA^{[L-1]}, dW^{[L]}, db^{[L]}$$

Example : 
$$dA^{[L-1]}$$

In [28]:
dA_prev, dW, db = l_backward(sigmoid_backward(dAL, a), l)
dA_prev

array([[-0.21476162, -0.2264949 ,  0.14917695, ..., -0.21695443,
        -0.20805406, -0.20979245],
       [-0.01103975, -0.0116429 ,  0.00766839, ..., -0.01115248,
        -0.01069495, -0.01078432],
       [ 0.27991914,  0.29521224, -0.19443643, ...,  0.28277724,
         0.27117655,  0.27344236],
       [ 0.50173315,  0.52914482, -0.34851209, ...,  0.50685607,
         0.48606274,  0.49012402],
       [ 0.01045101,  0.01102199, -0.00725944, ...,  0.01055772,
         0.01012459,  0.01020919]])

...The gradients for the current layer, $dW^{[L]}, db^{[L]}$, and the derivative of the activation for the previous layer, $dA^{[L-1]}$, which is used in conjunction with the activation_cache for that layer then inserted in to the proper backward_activation function, which in this case, all $L-1$ (in between) layers are activated with RELU.  The result is then $dZ^{[L-1]}$, and the cycle continues until we reach the first layer. 

**Note** that this entire process happens nested inside the linear_activation_backward functions which performs the specific backward_activation calculation which gets put inside the linear_backward function.  That function is called the `lact_backward`.  Below, the equivalency and convenience is shown printing the $dW^{[L]}$ using both methods.

In [29]:
print(lact_backward(dAL, cache[-1], 'sigmoid')[1])
print(l_backward(sigmoid_backward(dAL, a), l)[1])

[[ 0.02203238  0.00799533  0.05356974  0.08891859  0.        ]]
[[ 0.02203238  0.00799533  0.05356974  0.08891859  0.        ]]


Now we do the full backward propogation to all the layers.  This results in a dictionary of gradients.  Our `layer_dims` was `[12288, 20, 7, 5, 1]`, so we have 3 hidden-layers and 1 output-layer and 1 input-layer.  We should have the following gradients for the hidden layers.
\begin{bmatrix}
dA^{[4]} & dA^{[3]} & dA^{[2]} & dA^{[1]} \\
dW^{[4]} & dW^{[3]} & dW^{[2]} & dW^{[1]} \\
db^{[4]} & db^{[3]} & db^{[2]} & db^{[1]}
\end{bmatrix}

**Note** $dA^{[4]}$ is equivalent to the last layer, $dA^{[L]}$

In [30]:
g = L_backward(AL, train_y, cache)
g.keys()

dict_keys(['dA4', 'dW4', 'db4', 'dA3', 'dW3', 'db3', 'dA2', 'dW2', 'db2', 'dA1', 'dW1', 'db1'])

Example : 
$$ dA^{[4]} $$

In [34]:
g['dA4']

array([[-0.21476162, -0.2264949 ,  0.14917695, ..., -0.21695443,
        -0.20805406, -0.20979245],
       [-0.01103975, -0.0116429 ,  0.00766839, ..., -0.01115248,
        -0.01069495, -0.01078432],
       [ 0.27991914,  0.29521224, -0.19443643, ...,  0.28277724,
         0.27117655,  0.27344236],
       [ 0.50173315,  0.52914482, -0.34851209, ...,  0.50685607,
         0.48606274,  0.49012402],
       [ 0.01045101,  0.01102199, -0.00725944, ...,  0.01055772,
         0.01012459,  0.01020919]])

Now we use the `update_parameters` function to iteratively update our old parameters.

In [35]:
print ("Old W1 was")
print (parameters['W1'])
print ("Now it is...")
print(update_par(parameters, g, 0.01)['W1'])

Old W1 was
[[ 0.01443107 -0.00578819 -0.00498028 ..., -0.00496957 -0.00367704
   0.008408  ]
 [ 0.00923991  0.00223314 -0.00587055 ...,  0.00635394 -0.00425329
   0.0019733 ]
 [-0.00326254 -0.00181919  0.00569773 ...,  0.00733889 -0.00102616
  -0.00599855]
 ..., 
 [ 0.00135389 -0.00144156  0.0163466  ..., -0.00851114 -0.0076693
  -0.0015267 ]
 [-0.00032176 -0.00376954 -0.00772468 ...,  0.02027105 -0.00481998
  -0.00027055]
 [-0.01280684 -0.00985645  0.00668177 ...,  0.01995729  0.01401994
   0.00683061]]
Now it is...
[[ 0.01420875 -0.00605767 -0.00519586 ..., -0.00518309 -0.00392299
   0.00825126]
 [ 0.00937544  0.00239243 -0.00575019 ...,  0.0064846  -0.00410139
   0.00207374]
 [-0.00331051 -0.00187493  0.00565079 ...,  0.00725566 -0.00111028
  -0.0060616 ]
 ..., 
 [ 0.00113138 -0.00170892  0.01613202 ..., -0.00869936 -0.00789808
  -0.00166996]
 [-0.00032824 -0.00377712 -0.00773478 ...,  0.02026871 -0.00482596
  -0.00027449]
 [-0.01267472 -0.00969371  0.00681137 ...,  0.02008054  0.01

In [36]:
def L_model(X, Y, layer_dims, learning_rate = 0.0075, iterations = 3000, print_cost=False, plot=False):
    np.random.seed(1)
    costs = []
    ac = []
    parameters = par_deep(layer_dims)
    for i in range(iterations):
        AL, caches = L_forward(X, parameters)
        c = cost(AL, Y)
        grads = L_backward(AL, Y, caches)
        parameters = update_par(parameters, grads, learning_rate)
        if print_cost and i%100==0:
            print ("Cost after iteration {0}: {1}".format(i, c))
            costs.append(c)
            accuracy = predict(X, Y, parameters)[0]
            print (accuracy)
            ac.append(accuracy)
    # plot the cost
    if plot:
        plt.plot(np.squeeze(costs))
        plt.ylabel('cost')
        plt.xlabel('iterations (per tens)')
        plt.title("Learning rate =" + str(learning_rate))
        plt.show()
        plt.plot(ac)
        plt.ylabel('Accuracy Score')
        plt.xlabel('iterations (per tens)')
        plt.title("Learning rate =" + str(learning_rate))
        plt.show()
    return parameters

In [37]:
p = L_model(train_x, train_y, 
            learning_rate = 0.0075, 
            layer_dims = [12288,25, 20, 15, 10, 5, 1], 
            iterations = 3000, 
            print_cost = True)

Cost after iteration 0: 0.6812016076456814
[ 0.65550239]
Cost after iteration 100: 0.5822890145429783
[ 0.65550239]
Cost after iteration 200: 0.5177450012151461
[ 0.65550239]
Cost after iteration 300: 0.42934243552094764
[ 0.76555024]
Cost after iteration 400: 0.44323696356161496
[ 0.70813397]
Cost after iteration 500: 0.5379355986044362
[ 0.93779904]
Cost after iteration 600: 0.27654314252570855
[ 0.89952153]
Cost after iteration 700: 0.24328195786987822
[ 0.91866029]
Cost after iteration 800: 0.31925010023564165
[ 0.93779904]
Cost after iteration 900: 0.179015662541538
[ 0.96650718]
Cost after iteration 1000: 0.15177952063998984
[ 0.9569378]
Cost after iteration 1100: 0.44147412354313653
[ 0.84210526]
Cost after iteration 1200: 0.6889495955461816
[ 0.53588517]
Cost after iteration 1300: 0.5167874680985126
[ 0.86124402]
Cost after iteration 1400: 0.4517956917837608
[ 0.89952153]
Cost after iteration 1500: 0.3802982242515906
[ 0.89952153]
Cost after iteration 1600: 0.6269628155292167
[

## Multi-class testing

In [55]:
three_class = np.random.randint(0,2, (3,209))
for i in range(three_class.shape[1]):
    if three_class[0][i]==1:
        three_class[1][i]=0
        three_class[2][i]=0
    elif three_class[0][i]==0 and three_class[1][i]==0:
        three_class[2][i]=1
    elif three_class[0][i]==0 and three_class[2][i]==0:
        three_class[2][i]=1
print (np.apply_over_axes(np.sum, three_class, 1))
print (three_class)
p = L_model(train_x, three_class, 
            learning_rate = 0.0075, 
            layer_dims = [12288,25, 20, 15, 10, 5, 3], 
            iterations = 1000, 
            print_cost = True)

[[104]
 [ 57]
 [105]]
[[0 1 1 0 1 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 0 1 1 0 0 1 1 1 0 1
  1 1 1 1 0 0 1 1 1 1 0 0 1 1 0 0 0 1 1 1 0 0 1 1 0 0 1 0 0 0 0 0 0 0 1 1 0
  0 1 1 1 0 1 0 1 0 1 1 1 1 0 0 1 1 1 1 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1
  0 1 1 1 1 0 0 1 0 1 1 1 1 0 0 0 1 0 1 0 1 1 1 0 0 0 0 1 0 0 1 1 1 0 1 1 1
  1 1 0 0 1 1 1 0 1 0 0 0 1 0 1 1 1 1 1 0 1 0 0 1 0 1 0 0 1 1 1 0 1 0 0 0 0
  0 1 0 1 1 0 1 0 1 0 0 1 1 0 1 1 0 0 0 1 1 0 0 0]
 [1 0 0 0 0 0 1 1 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0 1 1 1 0 0 0 1 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 1 1 0 0 0
  1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 1 1 1 0 0 0
  1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0
  0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 1 0 1 1 0 1 0 1 0 0 0 0 1 0 1 0 0 0
  1 0 1 0 0 1 0 0 0 1 1 0 0 1 0 0 1 1 0 0 0 0 1 0]
 [1 0 0 1 0 1 1 1 1 0 0 1 1 1 0 1 1 1 0 0 1 1 0 1 1 1 1 1 0 0 1 1 0 0 0 1 0
  0 0 0 0 1 1 0 0 0 0 1 1 0 0 1 1 1 0 0 

In [68]:
preds = predict(train_x, three_class, p)[1]
or_preds = predict(train_x, train_y, parameters)[1]
probs = predict(train_x, three_class, p)[2]
print (probs)
np.argmax(probs, axis = 0)

[[  8.00617291e-03   9.89059023e-01   9.93333977e-01   4.67096752e-02
    9.99927192e-01   6.67682515e-02   5.55016020e-02   1.03602499e-02
    6.42008343e-03   9.99170315e-01   9.97944155e-01   1.87350354e-01
    8.35935659e-02   8.40985961e-03   9.98125490e-01   1.73422762e-02
    8.44285016e-03   6.44454513e-03   9.75934064e-01   9.88703868e-01
    1.51070846e-02   6.35050889e-02   9.87506679e-01   5.69781502e-03
    4.74495677e-02   9.32314123e-02   4.27209221e-02   2.17368680e-03
    9.98390543e-01   9.99630838e-01   6.37345148e-02   6.86502369e-03
    9.97519982e-01   9.49038644e-01   9.99522703e-01   3.36471724e-02
    9.96254559e-01   9.98777577e-01   9.50457753e-01   9.38624176e-01
    9.99842209e-01   1.71620694e-02   9.84324204e-03   9.99933675e-01
    9.97922630e-01   9.77413493e-01   9.99606238e-01   1.63327009e-02
    3.83026711e-03   9.91344536e-01   9.16613633e-01   5.99547110e-03
    2.29425087e-02   4.92278269e-01   9.98426842e-01   9.96118786e-01
    9.87189670e-01  

array([2, 0, 0, 2, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 0, 0, 2, 2, 0,
       2, 2, 2, 2, 2, 0, 0, 2, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0,
       0, 2, 2, 0, 0, 2, 2, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 0, 2, 2, 2, 2, 2,
       2, 2, 0, 0, 2, 2, 0, 0, 0, 2, 0, 2, 0, 2, 0, 0, 0, 0, 2, 2, 0, 0, 0,
       0, 2, 0, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 0, 0, 0,
       0, 2, 2, 0, 2, 0, 0, 0, 0, 2, 2, 2, 0, 2, 0, 2, 0, 0, 0, 2, 2, 2, 2,
       0, 2, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 2, 0, 2, 2, 2, 0,
       2, 0, 0, 0, 0, 0, 2, 0, 2, 2, 0, 2, 0, 2, 2, 0, 0, 0, 2, 0, 2, 2, 2,
       2, 2, 0, 2, 0, 0, 2, 0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 2, 2, 2, 0, 0, 2,
       2, 2], dtype=int64)

In [402]:
or_probs = L_forward(train_x, parameters)[0]
three_probs = L_forward(train_x, p)[0]
print(three_probs[0][9], three_probs[1][9])
def onezero(x):
    l = []
    if np.max(x)>=0.5:
        for i in x:
            if i==np.max(x):
                l.append(1)
            else:
                l.append(0)
    else:
        l = [0]*len(x)
    return l
            

0.35796235544 0.489590674873


In [403]:
print(numpy.apply_along_axis(onezero,0,three_probs))
print(three_class)

[[0 0 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 0 1 1 1 0 1 1 0 0 1
  0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 1 1 1 1 0 1 0 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0
  0 1 0 1 1 0 1 1 0 0 0 0 0 1 0 1 1 0 0 1 1 1 1 1 0 0 1 0 0 1 1 0 1 1 1 0 0
  1 0 1 0 0 1 0 1 0 1 1 0 0 0 1 0 1 0 1 1 1 1 1 0 1 1 0 0 1 1 1 1 1 1 1 1 1
  1 0 1 1 1 1 0 0 0 1 0 1 0 1 0 1 0 1 0 0 0 1 1 1 1 0 1 1 0 0 1 0 0 1 1 0 1
  1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 0 1 1 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]]
[[0 0 0 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 0 0 0 0 0 1 1 0 0 1 1 1 0 1 1 0 0 1
  0 1 1 0 0 1 1 0 0 1 0 0 0 1 0 1 1 1 0 0 0 0 0 1 0 0 0 0 1 0

In [401]:
[0]*3

[0, 0, 0]

In [3]:
1080/128*3000

25312.5