# XOR RELU forward propagation
> A programming introduc

- toc: true 
- badges: true
- comments: true
- author: Nipun Batra
- categories: [ML]

In [108]:
import autograd.numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [109]:
X = np.array([[0, 0],
             [0, 1],
             [1, 0],
             [1, 1]
             ])

y = np.array([[0], [1], [1], [0]])

In [110]:
X.shape, y.shape

((4, 2), (4, 1))

In [111]:
N, N_0 = X.shape
N, N_2 = y.shape
N_1 = 2

In [112]:
W = [np.array([[1, 1], [1, 1]]), np.array([[1, -2]])]

b = [np.array([[0], [-1]]), np.array([[0]])]
B = []


In [113]:
A = [X]
A.extend([None]*(len(W)))
Z = [None]*(len(W)+1)

assert(A[0].shape==(N, N_0))


In [114]:
for i in range(1, len(W)+1):
    print(i)
    Z[i] = A[i-1]@(W[i-1].T) + b[i-1].T
    A[i] = Z[i]
    A[i][A[i]<0] = 0
    print(Z[i], A[i])

1
[[0 0]
 [1 0]
 [1 0]
 [2 1]] [[0 0]
 [1 0]
 [1 0]
 [2 1]]
2
[[0]
 [1]
 [1]
 [0]] [[0]
 [1]
 [1]
 [0]]


In [115]:
len(A), len(Z)

(3, 3)

Excellent, now let us start from random weight initialisations and use backprop to come to our result

In [116]:
shapes = [X.shape[1], 2, 1]
activations = ['sigmoid','sigmoid']


def sigmoid(z):
    return 1./(1+np.exp(-z))

def relu(z):
    temp = z.copy()
    temp[temp<0] = 0
    return temp

activation_func = {'sigmoid':sigmoid, 'relu':relu}

N, N0 = X.shape
N1 = 2
N2 = 1

W = [None]*(len(shapes)-1)
b = [None]*(len(shapes)-1)

np.random.seed(0)

for i in range(1, len(shapes)):
    W[i-1] = np.random.randn(shapes[i], shapes[i-1])
    b[i-1] = np.random.randn(shapes[i], 1)
    
Z = [None]*(len(W)+1)

A = [X]
A.extend([None]*(len(W)))

In [117]:
A

[array([[0, 0],
        [0, 1],
        [1, 0],
        [1, 1]]), None, None]

In [118]:
for i in range(1, len(W)+1):
    print(i)
    Z[i] = A[i-1]@(W[i-1].T) + b[i-1].T
    A[i] = activation_func[activations[i-1]](Z[i])
    print(Z[i], A[i])

loss = (-y.T@np.log(A[2]) - (1-y).T@np.log(1-A[2])).squeeze()
print(loss)

1
[[ 1.86755799e+00 -9.77277880e-01]
 [ 2.26771520e+00  1.26361532e+00]
 [ 3.63161034e+00  1.46010423e-03]
 [ 4.03176754e+00  2.24235330e+00]] [[0.86617546 0.27343225]
 [0.9061677  0.77964784]
 [0.97420925 0.50036503]
 [0.98256638 0.9039889 ]]
2
[[0.67833848]
 [0.63971526]
 [0.74663222]
 [0.69348085]] [[0.66336776]
 [0.65468909]
 [0.67844443]
 [0.66674081]]
2.9991464995409807


In [174]:
epochs = 500
alpha =1

del_Z = [None]*(len(W)+1)
del_A = [None]*(len(W)+1)
del_W = [None]*(len(W))
del_b = [None]*(len(W))

for iteration in range(epochs):
    
    for i in range(1, len(W)+1):
        Z[i] = A[i-1]@(W[i-1].T) + b[i-1].T
        A[i] = activation_func[activations[i-1]](Z[i])

        y_hat = A[2]
        loss = (-y.T@np.log(y_hat) - (1-y).T@np.log(1-y_hat)).squeeze()
        print(i, loss)
        
    del_A[2] = -np.multiply(y, A[2]) + np.multiply((1-y), (1-A[2]))
    del_Z[2] = A[2]-y
    del_W[1] = (A[2]-y).T@A[1]
    del_b[1] = (del_Z[2].sum(axis=0)).reshape(-1, 1)
    del_A[1] = del_Z[2]@W[1]
    del_Z[1]  = np.multiply(del_A[1], sigmoid(Z[1])*(1-sigmoid(Z[1])))
    del_W[0] = del_Z[1].T@A[0]
    del_b[0] = (del_Z[1].sum(axis=0)).reshape(-1, 1)
    
    for i in range(0, len(shapes)-1):
        W[i] = W[i] - alpha*del_W[i]
        b[i] = b[i] - alpha*del_b[i]
        #print(W, b)
        
    

1 1.900917288811307
2 1.8990782101635078
1 1.8990782101635078
2 1.895087705042622
1 1.895087705042622
2 1.8905198878127458
1 1.8905198878127458
2 1.8852422633948591
1 1.8852422633948591
2 1.8790824426514607
1 1.8790824426514607
2 1.87181365673781
1 1.87181365673781
2 1.8631340070326043
1 1.8631340070326043
2 1.852637598542766
1 1.852637598542766
2 1.8397740259314084
1 1.8397740259314084
2 1.8237947995594221
1 1.8237947995594221
2 1.803687539628661
1 1.803687539628661
2 1.7781150772260836
1 1.7781150772260836
2 1.7454104412325124
1 1.7454104412325124
2 1.7037598684781243
1 1.7037598684781243
2 1.6518020587184286
1 1.6518020587184286
2 1.5898145191893995
1 1.5898145191893995
2 1.5209483016687049
1 1.5209483016687049
2 1.4507542405683878
1 1.4507542405683878
2 1.38405887020842
1 1.38405887020842
2 1.3222689315400031
1 1.3222689315400031
2 1.2641685598460128
1 1.2641685598460128
2 1.2081654145958272
1 1.2081654145958272
2 1.1533532272707079
1 1.1533532272707079
2 1.0994873114762564
1 1.099

2 0.02568952280709917
1 0.02568952280709917
2 0.02562003271769778
1 0.02562003271769778
2 0.02555091586015333
1 0.02555091586015333
2 0.025482169232176718
1 0.025482169232176718
2 0.02541378986368143
1 0.02541378986368143
2 0.02534577481634894
1 0.02534577481634894
2 0.025278121183205646
1 0.025278121183205646
2 0.025210826088203554
1 0.025210826088203554
2 0.02514388668580942
1 0.02514388668580942
2 0.02507730016060092
1 0.02507730016060092
2 0.025011063726865713
1 0.025011063726865713
2 0.02494517462821215
1 0.02494517462821215
2 0.02487963013718157
1 0.02487963013718157
2 0.024814427554869952
1 0.024814427554869952
2 0.024749564210552556
1 0.024749564210552556
2 0.024685037461317512
1 0.024685037461317512
2 0.024620844691702577
1 0.024620844691702577
2 0.02455698331333964
1 0.02455698331333964
2 0.02449345076460327
1 0.02449345076460327
2 0.02443024451026531
1 0.02443024451026531
2 0.024367362041154243
1 0.024367362041154243
2 0.02430480087382166
1 0.02430480087382166
2 0.0242425585

In [175]:
A[2]

array([[0.0058137 ],
       [0.99552735],
       [0.99552913],
       [0.00493209]])

In [121]:
y

array([[0],
       [1],
       [1],
       [0]])

In [122]:
W

[array([[1.76405235, 0.40015721],
        [0.97873798, 2.2408932 ]]), array([[ 0.95008842, -0.15135721]])]

In [123]:
b

[array([[ 1.86755799],
        [-0.97727788]]), array([[-0.10321885]])]

In [124]:
def tanh(x):                 # Define a function
    y = np.exp(-2.0 * x)
    return (1.0 - y) / (1.0 + y)

In [125]:
from autograd import elementwise_grad as egrad
from autograd import grad

In [126]:
grad_tanh = grad(tanh)
grad_sigmoid = grad(sigmoid)

In [127]:
grad_tanh(np.array([1.]))

array([0.41997434])

In [128]:
grad_sigmoid(1.)

0.19661193324148188

In [129]:
sigmoid(1.)*(1-sigmoid(1.))

0.19661193324148185

In [136]:
def objective(W, b):
    for i in range(1, len(W)+1):
        print(i)
        Z[i] = A[i-1]@(W[i-1].T) + b[i-1].T
        A[i] = activation_func[activations[i-1]](Z[i])
        print(Z[i], A[i])

    y_hat = A[2]
    loss = (-y.T@np.log(y_hat) - (1-y).T@np.log(1-y_hat)).squeeze()
    return loss
    

In [140]:
grad_objective = grad(objective, argnum=[0, 1])

In [141]:
grad_objective(W, b)

1
Autograd ArrayBox with value [[ 1.86755799e+00 -9.77277880e-01]
 [ 2.26771520e+00  1.26361532e+00]
 [ 3.63161034e+00  1.46010423e-03]
 [ 4.03176754e+00  2.24235330e+00]] Autograd ArrayBox with value [[0.86617546 0.27343225]
 [0.9061677  0.77964784]
 [0.97420925 0.50036503]
 [0.98256638 0.9039889 ]]
2
Autograd ArrayBox with value [[0.67833848]
 [0.63971526]
 [0.74663222]
 [0.69348085]] Autograd ArrayBox with value [[0.66336776]
 [0.65468909]
 [0.67844443]
 [0.66674081]]


([array([[ 0.003175  , -0.01704455],
         [ 0.00340863,  0.00022022]]), array([[0.60353799, 0.35399637]])],
 [array([[ 0.04833612],
         [-0.00755961]]), array([[0.6632421]])])

In [143]:
t  = (A[2]-y).T@A[1]

In [146]:
print(t)

Autograd ArrayBox with value [[0.60353799 0.35399637]]


In [149]:
print((A[2]-y).sum(axis=0).reshape(-1, 1))

Autograd ArrayBox with value [[0.6632421]]


### Both the above seem correct

In [154]:
dela = (A[2]-y)@W[1]

In [156]:
delz = np.multiply(dela, sigmoid(Z[1])*(1-sigmoid(Z[1])))

In [157]:
print(delz)

Autograd ArrayBox with value [[ 0.07305669 -0.01994726]
 [-0.02789557  0.00897903]
 [-0.00767602  0.01216743]
 [ 0.01085102 -0.0087588 ]]


In [168]:
print(delz.T@A[0])

Autograd ArrayBox with value [[ 0.003175   -0.01704455]
 [ 0.00340863  0.00022022]]
