In [573]:
import numpy as np

#printer function
def print_dim(name,input):
    print(f'shape of {name}: {input.shape}')

def print_arr(name,input):
    print(f'array of {name}: {input}')

In [574]:
#functions
def lin_transform(input, W, b):
    output = np.dot(input,W) + b
    return output

def relu(x):
    return np.maximum(0,x)

def relu_deriv(x):
    return (x>0).astype(float)

def softmax(x):
    exp_x = np.exp(x- np.max(x, axis =1, keepdims = True))
    return exp_x / np.sum(exp_x, axis =1, keepdims = True)

def softmax_deriv(y_true, y_pred):
    return y_pred - y_true

def y_pred_allocation(x):
    return np.argmax(x)

def loss(y_true, softmax_output):
    n_class = softmax_output.shape[1]
    n_samples = softmax_output.shape[0]
    y_true_ohe = np.zeros((n_samples, n_class))
    y_true = y_true.flatten().astype(int)
    y_true_ohe[np.arange(n_samples),y_true] = 1

    return softmax_output-y_true_ohe

def forward(input, hidden_W, hidden_b, output_W, output_b):
    hidden_input = lin_transform(input, hidden_W, hidden_b)
    relu_output = relu(hidden_input)
    softmax_input = lin_transform(relu_output, output_W, output_b)
    softmax_output = softmax(softmax_input)
    print('FORWARD:')
    print_dim('hidden_input', hidden_input)
    print_dim('relu_output', relu_output)
    print_dim('softmax_input', softmax_input)
    print_dim('softmax_output', softmax_output)
    print('')
    #print_arr('softmax_input', softmax_input)
    #print_arr('softmax_output', softmax_output)
    #print('')

    return softmax_output, relu_output

def backward(input,y_true, softmax_output, hidden_output, hidden_W, hidden_b, output_W, output_b):
    # loss in output layer
    loss_output = loss(y_true, softmax_output)

    # Gradients of the output layer
    grad_output_W = np.dot(hidden_output.T, loss_output)
    grad_output_b = np.sum(loss_output, axis=0)
    
    # loss in hidden layer
    loss_hidden = np.dot(loss_output, output_W.T) * relu_deriv(hidden_output)

    # Gradients of the hidden layer
    grad_hidden_W = np.dot(input.T, loss_hidden)
    grad_hidden_b = np.sum(loss_hidden, axis=0)

    print('BACKWARD:')
    print_dim('y_true', y_true)
    print_dim('softmax_output', softmax_output)
    print_dim('loss_output',loss_output)
    print_dim('grad_output_W',grad_output_W)
    print_dim('grad_output_b',grad_output_b)

    print('')
    print_dim('loss_hidden',loss_hidden)
    print_dim('grad_hidden_W',grad_hidden_W)
    print_dim('grad_hidden_b',grad_hidden_b)    
    print('')
    
    return grad_hidden_W, grad_hidden_b, grad_output_W, grad_output_b

def update(lr, hidden_W, hidden_b, output_W, output_b, grad_hidden_W, grad_hidden_b, grad_output_W, grad_output_b):
    hidden_W -= lr * grad_hidden_W
    hidden_b -= lr * grad_hidden_b
    output_W -= lr * grad_output_W
    output_b -= lr * grad_output_b
    print('UPDATE:')
    print_dim("hidden_W", hidden_W)
    print_dim("hidden_b",hidden_b)
    print_dim("output_W",output_W)
    print_dim("output_b",output_b)
    print('')

    return hidden_W, hidden_b, output_W, output_b

def predict(softmax_output):
    y_pred = np.argmax(softmax_output, axis = 1)
    
    return y_pred

In [575]:
#initialize dataset
count_data = 1
n_class = 2
n_data = 5 # length of each data
n_hidden = 3 # width of hidden layer
n_output = n_class # width of output layer

# initialize each layer
input = np.ones((count_data,n_data))
hidden_W = np.random.randn(n_data,n_hidden)
hidden_b = np.zeros(n_hidden,)
output_W = np.random.randn(n_hidden,n_output)
output_b = np.zeros(n_output)

y_true = np.random.randint(0,n_class, count_data)

print_dim("input",input)
print_dim("hidden_W", hidden_W)
print_dim("hidden_b",hidden_b)
print_dim("output_W",output_W)
print_dim("output_b",output_b)
print_dim('y true', y_true)
print('')
print_arr('hidden_W', hidden_W)
print_arr('output_W', output_W)
print_arr('y_true', y_true)

shape of input: (1, 5)
shape of hidden_W: (5, 3)
shape of hidden_b: (3,)
shape of output_W: (3, 2)
shape of output_b: (2,)
shape of y true: (1,)

array of hidden_W: [[-0.29757411 -1.56526315  0.51642671]
 [-1.43666905  0.35057554 -0.63461515]
 [-0.31406078 -0.17363556 -0.69292359]
 [-2.54358023  0.81889583  0.58924507]
 [-0.4299372   1.18089037  0.26863935]]
array of output_W: [[ 0.01071466  1.08207792]
 [-1.26716354 -0.64267912]
 [ 0.4719933  -1.50544545]]
array of y_true: [1]


In [576]:
#forward
softmax_output, relu_output = forward(input,hidden_W, hidden_b, output_W, output_b)

#backward
grad_hidden_W, grad_hidden_b, grad_output_W, grad_output_b = backward(input,y_true, softmax_output, relu_output, 
                                                                      hidden_W, hidden_b, output_W, output_b)

#implement update
hidden_W, hidden_b, output_W, output = update(0.1, hidden_W, hidden_b, output_W, output_b, 
                                              grad_hidden_W, grad_hidden_b, grad_output_W, grad_output_b)

FORWARD:
shape of hidden_input: (1, 3)
shape of relu_output: (1, 3)
shape of softmax_input: (1, 2)
shape of softmax_output: (1, 2)

BACKWARD:
shape of y_true: (1,)
shape of softmax_output: (1, 2)
shape of loss_output: (1, 2)
shape of grad_output_W: (3, 2)
shape of grad_output_b: (2,)

shape of loss_hidden: (1, 3)
shape of grad_hidden_W: (5, 3)
shape of grad_hidden_b: (3,)

UPDATE:
shape of hidden_W: (5, 3)
shape of hidden_b: (3,)
shape of output_W: (3, 2)
shape of output_b: (2,)



In [577]:
y_pred = predict(softmax_output)
print(y_pred)

[1]


In [578]:
#step by step


#FORWARD

#hidden layer
lin_transform_hidden = np.dot(input,hidden_W) + hidden_b 

print('lin transform in hidden layer:')
print_dim('input', input)
print_dim('hidden_W', hidden_W)
print_dim('hidden_b', hidden_b)
print_dim('lin_transform_hidden', lin_transform_hidden)
print('')

relu_output = relu(lin_transform_hidden) #relu activation
print('relu output in hidden layer:')
print_dim('relu_output', relu_output)
print('')

#softmax
lin_transform_softmax = np.dot(relu_output,output_W) + output_b # lin transform at softmax
print('lin transform in softmax layer:')
print_dim('input', relu_output)
print_dim('hidden_W', output_W)
print_dim('hidden_b', output_b)
print_dim('lin_transform_softmax', lin_transform_softmax)
print('')

np_max_output = np.max(lin_transform_softmax, axis = -1 , keepdims = True) 
softmax_numerator = np.exp(lin_transform_softmax - np_max_output)
softmax_denominator = np.sum(np_max_output, axis = -1, keepdims = True)
softmax_output = softmax_numerator / softmax_denominator
print('softmax calculation:')
print_dim('np_max_output', np_max_output)
print_dim('relu_output', relu_output)
print_dim('softmax_numerator', softmax_numerator)
print_dim('softmax_denominator', softmax_denominator)
print_dim('softmax_output', softmax_output)
print('')

#BACKWARD

# softmax loss
softmax_loss = loss(y_true,softmax_output) 
print('softmax loss calculation:')
print_dim('y_true', y_true)
print_dim('softmax_output', softmax_output)
print_dim('softmax_loss', softmax_loss)
print('')

#softmax grad calculation
softmax_grad_W = np.dot(relu_output.T, softmax_loss) #output_grad_W
softmax_grad_b = np.sum(softmax_loss, axis =0) #output_grad_b
print('softmax grad calculation:')
print_dim('relu_output.T', relu_output.T)
print_dim('softmax_loss', softmax_loss)
print_dim('softmax_grad_W', softmax_grad_W)
print_dim('softmax_grad_b', softmax_grad_b)
print('')

#relu loss

relu_loss = np.dot(softmax_loss, output_W.T) * relu_deriv(relu_output)# loss relu
print_dim('softmax_loss', softmax_loss)
print_dim('output_W.T', output_W.T)
print_dim('relu_output', relu_output)
print('relu loss calculation:')
print_dim('relu_loss', relu_loss)
print('')

# relu grad calculation
hidden_grad_W = np.dot(input.T, relu_loss) #hidden_grad_W
hidden_grad_b = np.sum(relu_loss, axis =0) #hidden_grad_b
print('relu grad calculation:')
print_dim('input.T', input.T)
print_dim('relu_loss', relu_loss)
print_dim('hidden_grad_W', hidden_grad_W)
print_dim('hidden_grad_b', hidden_grad_b)
print('')

#update calculation
lr = 0.1
hidden_W -= lr * grad_hidden_W
hidden_b -= lr * grad_hidden_b
output_W -= lr * grad_output_W
output_b -= lr * grad_output_b
print('update calculation:')
print_dim('hidden_W', hidden_W)
print_dim('hidden_b', hidden_b)
print_dim('output_W', output_W)
print_dim('output_b', output_b)
print('')

lin transform in hidden layer:
shape of input: (1, 5)
shape of hidden_W: (5, 3)
shape of hidden_b: (3,)
shape of lin_transform_hidden: (1, 3)

relu output in hidden layer:
shape of relu_output: (1, 3)

lin transform in softmax layer:
shape of input: (1, 3)
shape of hidden_W: (3, 2)
shape of hidden_b: (2,)
shape of lin_transform_softmax: (1, 2)

softmax calculation:
shape of np_max_output: (1, 1)
shape of relu_output: (1, 3)
shape of softmax_numerator: (1, 2)
shape of softmax_denominator: (1, 1)
shape of softmax_output: (1, 2)

softmax loss calculation:
shape of y_true: (1,)
shape of softmax_output: (1, 2)
shape of softmax_loss: (1, 2)

softmax grad calculation:
shape of relu_output.T: (3, 1)
shape of softmax_loss: (1, 2)
shape of softmax_grad_W: (3, 2)
shape of softmax_grad_b: (2,)

shape of softmax_loss: (1, 2)
shape of output_W.T: (2, 3)
shape of relu_output: (1, 3)
relu loss calculation:
shape of relu_loss: (1, 3)

relu grad calculation:
shape of input.T: (5, 1)
shape of relu_loss: 

In [579]:
#BN implementation

def bn_forward(x, gamma, beta, eps):
    N, D = x.shape
    mean = 1./N * np.sum(input, axis = 0)
    input_mean = input - mean
    var = 1./N * np.sum(input_mean ** 2, axis = 0)
    sqrtvar = np.sqrt(var + eps)
    ivar = 1./sqrtvar
    input_hat = input_mean * ivar
    gammax = gamma * input_hat
    output = gammax + beta
    cache = (input_hat, gamma, input_mean, ivar, sqrtvar, var, eps)
    print('BN FORWARD')
    print_dim('input', input)
    print_dim('gamma', gamma)
    print_dim('beta', beta)
    print_dim('mean', mean)
    print_dim('input_mean', input_mean)
    print_dim('var', var)
    print_dim('sqrtvar', sqrtvar)
    print_dim('ivar', ivar)
    print_dim('input_hat', input_hat)
    print_dim('gammax', gammax)
    print_dim('output', output)
    return output, cache

def bn_backward(dout, cache):
    x_hat, gamma, x_mean, ivar, sqrtvar, var, eps = cache
    N,D = dout.shape

    dbeta = np.sum(dout, axis = 0)
    dgammax = dout
    dgamma = np.sum(dgammax*x_hat, axis = 0)
    dxhat = dgammax * gamma
    divar = np.sum(dxhat * x_mean, axis = 0)
    dxmu1 = dxhat * ivar
    dsqrtvar = -1. / (sqrtvar**2) * divar
    dvar = 0.5 * 1./np.sqrt(var+eps) * dsqrtvar
    dsq = 1./N * np.ones((N,D))*dvar
    dxmu2 = 2*x_mean * dsq
    dx1 = (dxmu1 + dxmu2)
    dmu = -1 * np.sum(dxmu1+dxmu2, axis = 0)
    dx2 = 1./N * np.ones((N,D)) * dmu
    dx = dx1 + dx2

    print('BN BACKWARD')
    print_dim('dbeta', input)
    print_dim('dgammax', dgammax)
    print_dim('xhat', x_hat)
    print_dim('dgamma', dgamma)
    return dx, dgamma, dbeta

In [630]:
matrix = np.array([[1,2,3,4,5],[6,7,8,9,10]])
matrix_mean = np.mean(matrix, axis=1, keepdims=True)
matrix_var = np.var(matrix, axis=1, keepdims=True)
matrix_subtracted = matrix - matrix_mean
matrix_std = np.sqrt(matrix_var)
matrix_hat = matrix_subtracted / matrix_std

gamma_softmax = np.ones((matrix_hat.shape))
beta_softmax = np.zeros(matrix.shape[-1])
output = matrix_hat * gamma_softmax + beta_softmax

print(matrix_mean)
print(matrix_var)
print(output)

[[3.]
 [8.]]
[[2.]
 [2.]]
[[-1.41421356 -0.70710678  0.          0.70710678  1.41421356]
 [-1.41421356 -0.70710678  0.          0.70710678  1.41421356]]


In [659]:
x = np.array([[1,2,3,4,5],[6,7,8,9,10]])
#FORWARD
N,D = matrix.shape

gamma = np.ones((x.shape))
print('gamma')
print(gamma)
beta = np.zeros(x.shape[-1])
eps = 1e-9

mean = 1./D * np.sum(x, axis = 1, keepdims = True)
x_mean = x - mean
var = 1./D * np.sum(x_mean ** 2, axis = 1, keepdims = True)
sqrtvar = np.sqrt(var + eps)
ivar = 1./sqrtvar
x_hat = x_mean * ivar
print('x_hat')
print(x_hat)
gammax = gamma * x_hat
output = gammax + beta
print(output)



cache = (x_hat, gamma, x_mean, ivar, sqrtvar, var, eps)
#BACKWARDS
x_hat, gamma, x_mean, ivar, sqrtvar, var, eps = cache

N,D = output.shape

dbeta = np.sum(output, axis = 1)
dgammax = output

dgamma = np.sum(dgammax*x_hat, axis = 0)
print('dgamma')
print(dgamma)

dxhat = dgammax * gamma
print('dxhat')
print(dxhat)
divar = np.sum(dxhat * x_mean, axis = 0)
dxmu1 = dxhat * ivar
dsqrtvar = -1. / (sqrtvar**2) * divar
dvar = 0.5 * 1./np.sqrt(var+eps) * dsqrtvar
dsq = 1./N * np.ones((N,D))*dvar
dxmu2 = 2*x_mean * dsq
dx1 = (dxmu1 + dxmu2)
dmu = -1 * np.sum(dxmu1+dxmu2, axis = 0)
dx2 = 1./N * np.ones((N,D)) * dmu
dx = dx1 + dx2


gamma
[[1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]]
x_hat
[[-1.41421356 -0.70710678  0.          0.70710678  1.41421356]
 [-1.41421356 -0.70710678  0.          0.70710678  1.41421356]]
[[-1.41421356 -0.70710678  0.          0.70710678  1.41421356]
 [-1.41421356 -0.70710678  0.          0.70710678  1.41421356]]
dgamma
[4. 1. 0. 1. 4.]
dxhat
[[-1.41421356 -0.70710678  0.          0.70710678  1.41421356]
 [-1.41421356 -0.70710678  0.          0.70710678  1.41421356]]


In [586]:

eps = 1e-9
lin_input = lin_transform(relu_output, output_W, output_b)
print_dim('lin_input', lin_input)

gamma_softmax = np.ones((lin_input.shape))
beta_softmax = np.zeros((lin_input.shape[-1]))
print_dim('gamma', gamma_softmax)
print_dim('beta', beta_softmax)

matrix_mean = np.mean(matrix, axis=1, keepdims=True)
matrix_var = np.var(matrix, axis=1, keepdims=True)
matrix_subtracted = matrix - matrix_mean
matrix_std = np.sqrt(matrix_var)
matrix_hat = matrix_subtracted / matrix_std




shape of lin_input: (3, 2)
shape of mean: (2,)


ValueError: operands could not be broadcast together with shapes (2,2) (3,2) 

In [661]:
#forward

gamma_relu = np.ones((hidden_W.shape[-1],hidden_W.shape[-1]))
beta_relu = np.zeros((hidden_b.shape))
eps = 1e-9

#implementation of BN forward on relu layer
def forward_relu(input, hidden_W, hidden_b, gamma_relu, beta_relu):
    lin_input = lin_transform(input, hidden_W, hidden_b)
    relu_input, cache = bn_forward(lin_input, gamma_relu, beta_relu, eps) #BN INSERT
    relu_output = relu(relu_input)
    print('FORWARD RELU:')
    print_dim('input', input)
    print_dim('hidden_W', hidden_W)
    print_dim('hidden_b', hidden_b)
    print_dim('lin_input', lin_input)
    print_dim('hidden_input', relu_input)
    print_dim('gamma_relu', gamma_relu)
    print_dim('beta_relu', beta_relu)
    print_dim('relu_output', relu_output)
    print('')

    return relu_output, cache

relu_output, relu_cache = forward_relu(input,hidden_W, hidden_b, gamma_relu, beta_relu)

gamma_softmax = np.ones((output_W.shape[-1],output_W.shape[-1]))
beta_softmax = np.zeros((output_b.shape))

def forward_softmax(input, output_W, output_b, gamma_softmax, beta_softmax):
    lin_input = lin_transform(input, output_W, output_b)
    softmax_input, cache = bn_forward(lin_input, gamma_softmax, beta_softmax, eps) #BN INSERT
    softmax_output = softmax(softmax_input)
    print('FORWARD SOFTMAX:')
    print_dim('input', input)
    print_dim('output_W', output_W)
    print_dim('output_b', output_b)
    print_dim('lin_input', lin_input)
    print_dim('softmax_input', softmax_input)
    print_dim('gamma_softmax', gamma_softmax)
    print_dim('beta_softmax', beta_softmax)
    print_dim('softmax_input', softmax_input)
    print_dim('softmax_output', softmax_output)
    print('')

    return softmax_output, cache

softmax_output, softmax_cache = forward_softmax(relu_output, output_W, output_b, gamma_softmax, beta_softmax)

BN FORWARD
shape of input: (1, 3)
shape of gamma: (3, 3)
shape of beta: (3,)
shape of mean: (1,)
shape of input_mean: (1, 3)
shape of var: (1,)
shape of sqrtvar: (1,)
shape of ivar: (1,)
shape of input_hat: (1, 3)
shape of gammax: (3, 3)
shape of output: (3, 3)
FORWARD RELU:
shape of input: (1, 5)
shape of hidden_W: (5, 3)
shape of hidden_b: (3,)
shape of lin_input: (1, 3)
shape of hidden_input: (3, 3)
shape of gamma_relu: (3, 3)
shape of beta_relu: (3,)
shape of relu_output: (3, 3)



ValueError: operands could not be broadcast together with shapes (3,2) (3,) 

In [None]:
#backward

def backward_softmax(y_true, softmax_output, hidden_output, output_W, output_b, softmax_cache):
    # loss in output layer
    loss_output = loss(y_true, softmax_output)

    # Gradients of the output layer
    grad_output_W = np.dot(hidden_output.T, loss_output)
    grad_output_b = np.sum(loss_output, axis=0)
    
    #propagate through BN layer
    bn_input = np.dot(loss_output, output_W.T)
    dx, dgamma, dbeta = bn_backward(bn_input, softmax_cache)

    print('BACKWARD:')
    print_dim('y_true', y_true)
    print_dim('softmax_output', softmax_output)
    print_dim('loss_output',loss_output)
    print_dim('grad_output_W',grad_output_W)
    print_dim('grad_output_b',grad_output_b)
    print_dim('dx',dx)
    print_dim('dgamma',dgamma)
    print_dim('dbeta',dbeta)
    print('')

    return  dx, grad_output_W, grad_output_b, dgamma, dbeta

dx, grad_output_W, grad_output_b, dgamma, dbeta = backward_softmax(y_true, softmax_output, relu_output, output_W, output_b, softmax_cache)

BN BACKWARD
shape of dbeta: (10, 5)
shape of dgammax: (10, 3)
shape of xhat: (10, 3)
shape of dgamma: (3,)
BACKWARD:
shape of y_true: (10,)
shape of softmax_output: (10, 2)
shape of loss_output: (10, 2)
shape of grad_output_W: (3, 2)
shape of grad_output_b: (2,)
shape of dx: (10, 3)
shape of dgamma: (3,)
shape of dbeta: (3,)

