In [1]:
import numpy as np

In [15]:
''' create datasets '''

def create_dataset(col_num):
    '''Build a noisy XOR dataset with one sample per column.

    Args:
        col_num: number of samples (columns) to generate.

    Returns:
        A tuple (X_noise, Y). X_noise has shape (2, col_num) and holds the
        random binary inputs with Gaussian noise added. Y has shape
        (1, col_num) and is 1 where exactly one of the two clean inputs is 1,
        otherwise 0.
    '''
    # clean binary inputs: two features per sample, each 0 or 1
    bits = np.random.randint(2, size=(2, col_num))

    # the column sum is 0, 1 or 2; only a sum of exactly 1 keeps the label 1 (XOR)
    column_sums = bits.sum(axis=0, keepdims=True)
    labels = column_sums * (column_sums == 1)

    # standard-normal noise scaled down by 20 (standard deviation 0.05)
    jitter = np.random.randn(2, col_num) / 20

    return bits + jitter, labels


''' initialize parameters -- W, b '''
def initialize_parameters(layer_dims):
    '''Create the initial weights and biases for every layer of the network.

    Args:
        layer_dims: sequence of layer sizes, input layer first and output
            layer last (for example (2, 5, 1)). Needs at least two entries.

    Returns:
        dict with keys 'W1', 'b1', ..., 'W{L-1}', 'b{L-1}' where
        L = len(layer_dims). 'W{i}' has shape (layer_dims[i], layer_dims[i-1])
        and 'b{i}' is a zero array of shape (layer_dims[i], 1).

    Raises:
        ValueError: if layer_dims has fewer than two entries.
    '''
    L = len(layer_dims)
    if L < 2:
        raise ValueError('layer_dims needs at least an input and an output layer')

    parameters = {}

    # hidden layers (1 .. L-2) feed a relu, so scale the weights with the relu heuristic
    for i in range(1, L - 1):
        parameters[f'W{i}'] = np.random.randn(layer_dims[i], layer_dims[i-1]) * relu(None, heuristic=layer_dims[i-1])
        parameters[f'b{i}'] = np.zeros((layer_dims[i], 1))

    # the output layer (L-1) feeds a sigmoid, so use the sigmoid heuristic instead;
    # it is indexed explicitly rather than through the loop variable, which is
    # undefined when there are no hidden layers to iterate over
    out = L - 1
    parameters[f'W{out}'] = np.random.randn(layer_dims[out], layer_dims[out-1]) * sigmoid(None, heuristic=layer_dims[out-1])
    parameters[f'b{out}'] = np.zeros((layer_dims[out], 1))

    return parameters


''' define activation function (sigmoid) and its derivative '''
def sigmoid(F, derivative=False, heuristic=False):
    '''Sigmoid activation, its derivative, or its weight-initialization scale.

    Args:
        F: the pre-activation Z in the default mode, or the activation A
            when derivative is truthy. Ignored in heuristic mode.
        derivative: when truthy, return F * (1 - F). Takes priority over
            heuristic.
        heuristic: when truthy, it is used as the divisor and
            sqrt(1 / heuristic) is returned.

    Returns:
        The value selected by the flags above; with no flag set,
        1 / (1 + exp(-F)).
    '''
    # derivative mode: F already holds the sigmoid output A
    if derivative:
        complement = 1 - F
        return F * complement

    # initialization scale customized to the sigmoid function
    if heuristic:
        return np.sqrt(1 / heuristic)

    # plain sigmoid of the pre-activation Z
    return 1 / (1 + np.exp(-F))
    
def relu(F, derivative=False, heuristic=False):
    '''ReLU activation, its derivative, or its weight-initialization scale.

    Args:
        F: the pre-activation Z (used in the default and derivative modes;
            ignored in heuristic mode).
        derivative: when truthy, return 1 where F > 0 and 0 elsewhere.
            Takes priority over heuristic.
        heuristic: when truthy, it is used as the divisor and
            sqrt(2 / heuristic) is returned.

    Returns:
        The value selected by the flags above; with no flag set, F with its
        non-positive entries zeroed out.
    '''
    # derivative mode: integer mask of the strictly positive entries
    if derivative:
        return (F > 0) * 1

    # initialization scale customized to the relu function
    if heuristic:
        return np.sqrt(2 / heuristic)

    # plain relu: multiply by the boolean mask so non-positive entries become zero
    positive = F > 0
    return F * positive
    

''' 1. forward propagation function - calculate pre-activation fn (Z) & activation fn (A) ''' 
    
def forward_pass(X, parameters, layer_nums):
    '''Run the network forward and record every intermediate value.

    Args:
        X: input array, stored unchanged as 'A0'.
        parameters: dict holding 'W{i}' and 'b{i}' for each layer i.
        layer_nums: sequence of layer sizes; only its length is used here.

    Returns:
        dict with 'A0' plus 'Z{i}' (pre-activation) and 'A{i}' (activation)
        for i = 1 .. len(layer_nums) - 1. Hidden layers use relu, the last
        layer uses sigmoid.
    '''
    last = len(layer_nums) - 1
    cache = {'A0': X}

    def affine(idx):
        # Z = W . A_prev + b for layer idx
        return np.dot(parameters[f'W{idx}'], cache[f'A{idx-1}']) + parameters[f'b{idx}']

    # hidden layers: relu activation
    for idx in range(1, last):
        z_hidden = affine(idx)
        cache[f'Z{idx}'] = z_hidden
        cache[f'A{idx}'] = relu(z_hidden)

    # output layer: sigmoid activation
    z_out = affine(last)
    cache[f'Z{last}'] = z_out
    cache[f'A{last}'] = sigmoid(z_out)

    return cache


''' 2. calculate cost '''
def cost(A, Y, eps=1e-15):
    '''Average binary cross-entropy between predictions A and labels Y.

    Args:
        A: predicted probabilities, same shape as Y.
        Y: labels (0 or 1); the number of samples is taken from Y.shape[1].
        eps: predictions are clipped into [eps, 1 - eps] before the
            logarithm, so a saturated prediction of exactly 0 or 1 gives a
            finite cost instead of nan/inf. Predictions already inside that
            interval are not changed by the clipping.

    Returns:
        The cost J as a scalar.
    '''
    m = Y.shape[1]

    # keep log() away from 0: 0 * log(0) would otherwise evaluate to nan
    A_safe = np.clip(A, eps, 1 - eps)

    J = - np.sum(Y * np.log(A_safe) + (1 - Y) * np.log(1 - A_safe)) / m

    return J


''' 3. backward propagation function - calculate dW & db from dA & dZ '''

# backward non_linear function to calculate dA & dZ
def backward_pass(cache, parameters, Y, layer_dims):
    '''Compute the gradients of the cost with respect to every W and b.

    Args:
        cache: dict of 'A{i}' / 'Z{i}' values produced by the forward pass.
        parameters: dict holding the 'W{i}' weights.
        Y: labels; the number of samples is taken from Y.shape[1].
        layer_dims: sequence of layer sizes; only its length is used here.

    Returns:
        dict with 'dW{i}' and 'db{i}' for i = len(layer_dims) - 1 down to 1.
    '''
    n_samples = Y.shape[1]
    top = len(layer_dims) - 1
    grads = {}

    def record(idx, delta):
        # gradients of layer idx from its dZ, averaged over the samples
        grads[f'dW{idx}'] = np.dot(delta, cache[f'A{idx-1}'].T) / n_samples
        grads[f'db{idx}'] = np.sum(delta, axis=1, keepdims=True) / n_samples

    # output layer: dZ is taken directly as (A - Y), so the sigmoid
    # derivative does not have to be evaluated separately
    delta = cache[f'A{top}'] - Y
    record(top, delta)

    # hidden layers, from the last hidden layer back to layer 1:
    # push dZ back through the next layer's weights, then through relu
    for idx in reversed(range(1, top)):
        upstream = np.dot(parameters[f'W{idx+1}'].T, delta)
        delta = upstream * relu(cache[f'Z{idx}'], derivative=True)
        record(idx, delta)

    return grads


''' 4. update parameters - W & b'''

# update parameters W & b using the gradients that were calculated from the backward pass
def update_parameters(parameters, grads, learning_rate, layer_dims):
    '''Apply one gradient-descent step to every W and b.

    Args:
        parameters: dict of 'W{i}' / 'b{i}' arrays; updated in place.
        grads: dict of matching 'dW{i}' / 'db{i}' gradients.
        learning_rate: step size multiplied into each gradient.
        layer_dims: sequence of layer sizes; only its length is used here.

    Returns:
        The same parameters dict that was passed in.
    '''
    for layer in range(1, len(layer_dims)):
        # same rule for weights and biases: value -= learning_rate * gradient
        for kind in ('W', 'b'):
            parameters[f'{kind}{layer}'] -= learning_rate * grads[f'd{kind}{layer}']

    return parameters

In [16]:
''' training the model '''
# Hyper-parameters. col_num, layer_dims, X, Y and parameters are module-level
# names that the later evaluation cell passes to predict().
col_num = 10000          # number of training samples (columns of X and Y)
layer_dims = 2, 5, 1     # tuple of layer sizes: 2 inputs, 5 hidden units, 1 output
learning_rate = 0.1      # gradient-descent step size
iteration = 50000        # number of full-batch gradient-descent steps

# NOTE(review): no random seed is set in this cell, so the dataset, the initial
# weights and the costs printed below will differ from run to run.
X, Y = create_dataset(col_num)
parameters = initialize_parameters(layer_dims)

# every iteration uses the whole dataset (X, Y) at once
for i in range(iteration):
    
    # 1. forward propagation
    cache = forward_pass(X, parameters, layer_dims)
    # 2. cost function
    # evaluated on the output-layer activation, before this iteration's update
    J = cost(cache[f'A{len(layer_dims)-1}'], Y)
    # 3. backward propagation
    grads = backward_pass(cache, parameters, Y, layer_dims)
    # 4. update parameters
    parameters = update_parameters(parameters, grads, learning_rate, layer_dims)
    
    # report the cost every 5000 iterations
    if i % 5000 == 0:
        print(f'cost{i}: {J}')

cost0: 0.9142763543194594
cost5000: 0.001379920320682325
cost10000: 0.0006036301993199402
cost15000: 0.0003765649114484425
cost20000: 0.0002704438416725135
cost25000: 0.0002095812832933118
cost30000: 0.00017035274916517716
cost35000: 0.00014306940874573944
cost40000: 0.00012305186381462166
cost45000: 0.0001077705853611852


In [11]:
''' calculate accuracy '''

def predict(col_num, X, Y, parameters, layer_nums):
    '''Print the classification accuracy on the train set and on a new test set.

    Args:
        col_num: number of samples to generate for the test dataset.
        X: training inputs.
        Y: training labels.
        parameters: dict of trained 'W{i}' / 'b{i}' arrays.
        layer_nums: sequence of layer sizes of the network; its length
            selects the output activation in the forward-pass cache.

    Returns:
        None. The two accuracies are printed as percentages.
    '''
    # create test dataset
    X_test, Y_test = create_dataset(col_num)

    # key of the output-layer activation, derived from the layer_nums argument
    # rather than from a module-level variable
    output_key = f'A{len(layer_nums) - 1}'

    def accuracy_percent(inputs, labels):
        # threshold the output probabilities at 0.5 and compare with the labels
        probabilities = forward_pass(inputs, parameters, layer_nums)[output_key]
        predictions = (probabilities > 0.5) * 1
        return np.average((predictions == labels) * 1) * 100

    # train accuracy is computed first, then test accuracy
    accuracy_train = accuracy_percent(X, Y)
    accuracy_test = accuracy_percent(X_test, Y_test)

    # print out the results
    print(f'accuracy_train: {accuracy_train}%')
    print(f'accuracy_test: {accuracy_test}%')

    return

In [14]:
# report accuracy on the training data and on a newly generated test set of col_num samples
predict(col_num, X, Y, parameters, layer_dims)

accuracy_train: 99.98%
accuracy_test: 99.94%
