In [127]:
import numpy as np

In [128]:
#printer function
def print_dim(name,input):
    print(f'shape of {name}: {input.shape}')

def print_arr(name,input):
    print(f'array of {name}: {input}')

In [129]:
#functions
def lin_transform(input, W, b):
    output = np.dot(input,W) + b
    return output

def relu(x):
    return np.maximum(0,x)

def relu_deriv(x):
    return (x>0).astype(float)

def softmax(x):
    exp_x = np.exp(x- np.max(x, axis =1, keepdims = True))
    return exp_x / np.sum(exp_x, axis =1, keepdims = True)

def softmax_deriv(y_true, y_pred):
    return y_pred - y_true

def y_pred_allocation(x):
    return np.argmax(x)

def forward(input, hidden_W, hidden_b, output_W, output_b):
    hidden_input = lin_transform(input, hidden_W, hidden_b)
    relu_output = relu(hidden_input)
    softmax_input = lin_transform(relu_output, output_W, output_b)
    softmax_output = softmax(softmax_input)
    print_dim('hidden_input', hidden_input)
    print_dim('relu_output', relu_output)
    print_dim('softmax_input', softmax_input)
    print_dim('softmax_output', softmax_output)
    print('')
    print_arr('softmax_input', softmax_input)
    print_arr('softmax_output', softmax_output)

    return softmax_output, relu_output

def backward(input,y_true, y_pred, hidden_output, hidden_W, hidden_b, output_W, output_b):
    # loss in output layer
    loss_output = softmax_deriv(y_true, y_pred)

    # Gradients of the output layer
    grad_output_W = np.dot(hidden_output.T, loss_output)
    grad_output_b = np.sum(loss_output, axis=0)
    
    # loss in hidden layer
    loss_hidden = np.dot(loss_output, output_W.T) * relu_deriv(hidden_output)

    # Gradients of the hidden layer
    grad_hidden_W = np.dot(input.T, loss_hidden)
    grad_hidden_b = np.sum(loss_hidden, axis=0)

    print_dim('loss_output',loss_output)
    print_dim('grad_hidden_W',grad_hidden_W)
    print_dim('grad_hidden_b',grad_hidden_b)
    print('')
    print_dim('loss_hidden',loss_hidden)
    print_dim('grad_output_W',grad_output_W)
    print_dim('grad_output_b',grad_output_b)
    
    return grad_hidden_W, grad_hidden_b, grad_output_W, grad_output_b

def update(lr, hidden_W, hidden_b, output_W, output_b, grad_hidden_W, grad_hidden_b, grad_output_W, grad_output_b):
    hidden_W -= lr * grad_hidden_W
    hidden_b -= lr * grad_hidden_b
    output_W -= lr * grad_output_W
    output_b -= lr * grad_output_b
    print_dim("hidden_W", hidden_W)
    print_dim("hidden_b",hidden_b)
    print_dim("output_W",output_W)
    print_dim("output_b",output_b)

    return hidden_W, hidden_b, output_W, output_b,

def predict(softmax_output):
    y_pred = np.argmax(softmax_output, axis = 1)
    
    return y_pred

# Example usage

In [130]:
#initialize dataset
count_data = 2
n_data = 10 # length of each data
n_hidden = 5 # width of hidden layer
n_output = 2 # width of output layer

# initialize each layer
input = np.ones((count_data,n_data))
hidden_W = np.random.randn(n_data,n_hidden)
hidden_b = np.zeros(n_hidden,)
output_W = np.random.randn(n_hidden,n_output)
output_b = np.zeros(n_output)

y_true = np.array([[1,0],[0,1]])

print_dim("input",input)
print_dim("hidden_W", hidden_W)
print_dim("hidden_b",hidden_b)
print_dim("output_W",output_W)
print_dim("output_b",output_b)
print_dim('y true', y)
print('')
print_arr('hidden_W', hidden_W)
print_arr('output_W', output_W)

shape of input: (2, 10)
shape of hidden_W: (10, 5)
shape of hidden_b: (5,)
shape of output_W: (5, 2)
shape of output_b: (2,)
shape of y true: (2, 2)

array of hidden_W: [[-0.14848118  1.17751219 -1.03127701 -0.0772543   0.52297342]
 [ 0.92566197 -0.52334616  1.28815022 -0.0834638   0.90277481]
 [-0.82047439  1.185998    1.31734853  0.75150415 -1.32165343]
 [-0.04466684  0.51872513  0.95785976  0.65656255  1.01318212]
 [ 0.60996038  0.98028128  0.02317266 -0.8461365   0.21649808]
 [-1.67645635  1.4349832  -0.90422134 -0.17637378 -0.65783895]
 [ 1.48905172 -1.48027333  0.36390227  1.0280659  -0.11066521]
 [-1.55171384  1.78703477 -2.34232048 -1.39481407  0.13092026]
 [ 0.54745995  0.66286395 -0.7036196   0.07264728 -0.33473137]
 [-1.2195222   0.87494506  0.3630566   0.09251444 -0.58696307]]
array of output_W: [[ 0.18810901 -0.18789522]
 [ 1.06556972 -0.17555015]
 [ 0.12755784 -0.96823612]
 [-0.44129954  0.66660278]
 [-0.29150912  0.80823897]]


In [131]:
#run forward
softmax_output, relu_output = forward(input,hidden_W, hidden_b, output_W, output_b)


shape of hidden_input: (2, 5)
shape of relu_output: (2, 5)
shape of softmax_input: (2, 2)
shape of softmax_output: (2, 2)

array of softmax_input: [[ 7.04245096 -1.14641821]
 [ 7.04245096 -1.14641821]]
array of softmax_output: [[9.99722349e-01 2.77650658e-04]
 [9.99722349e-01 2.77650658e-04]]


In [132]:
y_pred = predict(softmax_output)
print(y_pred)

[0 0]


In [133]:
#implement backward
grad_hidden_W, grad_hidden_b, grad_output_W, grad_output_b = backward(input,y_true, y_pred, relu_output, hidden_W, hidden_b, output_W, output_b)

shape of loss_output: (2, 2)
shape of grad_hidden_W: (10, 5)
shape of grad_hidden_b: (5,)

shape of loss_hidden: (2, 5)
shape of grad_output_W: (5, 2)
shape of grad_output_b: (2,)


In [134]:
#implement update
hidden_W, hidden_b, output_W, output = update(0.1, hidden_W, hidden_b, output_W, output_b, grad_hidden_W, grad_hidden_b, grad_output_W, grad_output_b)
print('')
print_arr('hidden_W', hidden_W)
print_arr('output_W', output_W)

shape of hidden_W: (10, 5)
shape of hidden_b: (5,)
shape of output_W: (5, 2)
shape of output_b: (2,)

array of hidden_W: [[-0.14848118  1.26651414 -1.03127701 -0.05472398  0.52297342]
 [ 0.92566197 -0.4343442   1.28815022 -0.06093347  0.90277481]
 [-0.82047439  1.27499996  1.31734853  0.77403448 -1.32165343]
 [-0.04466684  0.60772709  0.95785976  0.67909288  1.01318212]
 [ 0.60996038  1.06928323  0.02317266 -0.82360618  0.21649808]
 [-1.67645635  1.52398516 -0.90422134 -0.15384345 -0.65783895]
 [ 1.48905172 -1.39127137  0.36390227  1.05059623 -0.11066521]
 [-1.55171384  1.87603673 -2.34232048 -1.37228375  0.13092026]
 [ 0.54745995  0.75186591 -0.7036196   0.0951776  -0.33473137]
 [-1.2195222   0.96394702  0.3630566   0.11504476 -0.58696307]]
array of output_W: [[ 0.18810901 -0.18789522]
 [ 1.72744213  0.48632226]
 [ 0.12755784 -0.96823612]
 [-0.43897435  0.66892796]
 [-0.29150912  0.80823897]]
