In [325]:
import numpy as np
import pandas as pd
from Linear import mse, init_params


### Basic Gradient Checking

In [326]:

# Read the toy data set and turn the DataFrame into a plain ndarray.
data = np.array(pd.read_csv('data/random1.csv'))

# First column goes to X_train, second column to Y_train. Both end up as
# single-row matrices (one column per CSV row).
X_train = data[:, 0:1].T
Y_train = data[:, 1].reshape(1, -1)

# In a top-to-bottom run, init_params is the one imported from Linear here.
w, b = init_params()

def forward(x, w):
    """Return the linear prediction ``np.dot(w, x)``; no bias term is added.

    Args:
        x: Input array, used as the second operand of ``np.dot``.
        w: Weight array, used as the first operand of ``np.dot``.

    Returns:
        The product ``np.dot(w, x)``.
    """
    return np.dot(w, x)

# One forward pass over the full training matrix; J holds the model output
# for every sample (it is the prediction, not a cost value).
J = forward(X_train, w)
print(f"Shape of J: {J.shape}")
print(f"First val of J: {J[0, 0]}")

Shape of J: (1, 127)
First val of J: 2.92028892055268


In [327]:
def backwards(x, y, pred):
    """Return the analytic gradient used by the linear gradient check.

    The function hands ``x`` back unchanged: for a prediction of the form
    ``w * x`` the derivative with respect to ``w`` is ``x`` itself.

    Args:
        x: Input array; returned as the gradient.
        y: Unused. Kept so the call signature stays the same.
        pred: Unused. Kept so the call signature stays the same.

    Returns:
        The very same object that was passed in as ``x``.
    """
    return x


In [328]:
# Analytic gradient of the prediction with respect to w, one entry per sample.
grad = backwards(X_train, Y_train, J)
print(f"First val of DW: {grad[0,0]}")

First val of DW: 1.5


![alt text](<Screenshot 2024-06-07 at 9.15.13 AM.png>)

In [329]:
# Central-difference check of the analytic gradient `grad` computed above.
eps = 1e-7

# Forward passes with the weight shifted by +eps and by -eps.
wplus, wminus = w + eps, w - eps
jplus, jminus = forward(X_train, wplus), forward(X_train, wminus)

# Numerical estimate of d(pred)/dw for every sample.
gradapprox = (jplus - jminus) / (2 * eps)

# Relative error: ||grad - gradapprox|| / (||grad|| + ||gradapprox||).
difference_num = np.linalg.norm(grad - gradapprox, ord=2)
graddifference_denum = np.linalg.norm(grad, ord=2)
gradapproxdifference_denum = np.linalg.norm(gradapprox, ord=2)
difference = difference_num / (graddifference_denum + gradapproxdifference_denum)

# Spot-check one entry of each gradient, then report the relative error.
print(grad[0, 10])
print(gradapprox[0,10])

print('\nAvg Difference : {:.20f}'.format(difference))


11.4
11.400000001771105

Avg Difference : 0.00000000039643172910


### Neural Network Gradient Checking

In [330]:
from nn import init_params, leaky_relu, leaky_relu_deriv, softmax, one_hot, cce


In [331]:
# Read the Fashion-MNIST training CSV and convert it to a plain ndarray.
data = np.array(pd.read_csv('data/fashion-mnist_train.csv'))

# Column 0 becomes Y_train (a single-row matrix); columns 1..784 are divided
# by 255 and transposed so that each column of X_train is one CSV row.
# Presumably column 0 is the class label and the rest are pixel values.
X_train = data[:, 1:785].T / 255
Y_train = data[:, 0].reshape(1, -1)

# NOTE: init_params now refers to the version imported from nn, which shadows
# the one imported from Linear at the top of the notebook.
w1, b1, w2, b2 = init_params()

In [332]:
def forward_n(Y, X, w1, b1, w2, b2):
    """Run the two-layer network forward and evaluate its loss.

    Pipeline: affine layer -> ``leaky_relu`` -> affine layer -> ``softmax``,
    then ``cce`` between the one-hot encoding of ``Y`` and the softmax output.

    Args:
        Y: Labels, passed to ``one_hot``.
        X: Input matrix, right operand of ``np.dot(w1, X)``.
        w1, b1: Weights and bias of the first layer.
        w2, b2: Weights and bias of the second layer.

    Returns:
        The tuple ``(z1, a1, z2, a2, w1, b1, w2, b2, one_hot_y, loss)``.
        The four parameters are handed back unchanged.
    """
    targets = one_hot(Y)

    # Hidden layer.
    hidden_pre = np.dot(w1, X) + b1
    hidden_act = leaky_relu(hidden_pre)

    # Output layer.
    output_pre = np.dot(w2, hidden_act) + b2
    probs = softmax(output_pre)

    return (hidden_pre, hidden_act, output_pre, probs,
            w1, b1, w2, b2, targets, cce(targets, probs))

# Baseline forward pass with the unperturbed parameters. The activations and
# the one-hot labels are kept because the backward pass needs them.
(z1, a1, z2, a2,
 w1, b1, w2, b2,
 one_hot_y, loss) = forward_n(Y_train, X_train, w1, b1, w2, b2)
print(loss)

def backward_n(x, one_hot_y, w2, a2, a1, z1):
    """Backpropagate through the two-layer network.

    Weight and bias gradients are divided by ``one_hot_y.shape[1]``; the
    ``dz`` terms are returned undivided. Using ``a2 - one_hot_y`` as the
    output-layer error presumably relies on a softmax output paired with a
    categorical cross-entropy loss.

    Args:
        x: Network input, used for the first-layer weight gradient.
        one_hot_y: One-hot targets; its second dimension is the divisor.
        w2: Second-layer weights, used to push the error back one layer.
        a2: Network output.
        a1: Hidden-layer activations.
        z1: Hidden-layer pre-activations, passed to ``leaky_relu_deriv``.

    Returns:
        The tuple ``(dz1, db1, dw1, dz2, db2, dw2)``.
    """
    batch = one_hot_y.shape[1]

    # Output layer.
    delta_out = a2 - one_hot_y
    grad_w2 = np.dot(delta_out, a1.T) / batch
    grad_b2 = np.sum(delta_out, axis=1, keepdims=True) / batch

    # Hidden layer: propagate the error through w2 and the activation slope.
    delta_hidden = np.dot(w2.T, delta_out) * leaky_relu_deriv(z1)
    grad_w1 = np.dot(delta_hidden, x.T) / batch
    grad_b1 = np.sum(delta_hidden, axis=1, keepdims=True) / batch

    return delta_hidden, grad_b1, grad_w1, delta_out, grad_b2, grad_w2

# Analytic gradients from the baseline forward pass.
(dz1, db1, dw1,
 dz2, db2, dw2) = backward_n(X_train, one_hot_y, w2, a2, a1, z1)

# Each weight gradient should have the same shape as its weight matrix.
print(dw1.shape)
print(dw2.shape)


2.302157472675863
(64, 784)
(10, 64)


### Checking DW2

In [333]:

# Numerically verify one entry of dw2 with a central difference.

# Perturbed copies of the weights. w1plus / w1minus are created here as well
# because the DW1 check further down starts from them.
w1plus = w1.copy()
w1minus = w1.copy()
w2plus = w2.copy()
w2minus = w2.copy()

eps = 1e-7

# Nudge a single entry of w2 by +eps and by -eps.
w2plus[0, 7] += eps
w2minus[0, 7] -= eps

# Only the loss (last element of the returned tuple) is needed. Taking just
# that element leaves the baseline z1, a1, z2, a2 and one_hot_y untouched
# instead of overwriting them with the activations of the perturbed network.
lossplus = forward_n(Y_train, X_train, w1, b1, w2plus, b2)[-1]
lossminus = forward_n(Y_train, X_train, w1, b1, w2minus, b2)[-1]

gradapprox = (lossplus - lossminus) / (2 * eps)

print(gradapprox)
print(dw2[0,7])

# Relative error: |analytic - numeric| / max(|analytic|, |numeric|).
num = np.linalg.norm(dw2[0, 7] - gradapprox)
denom = np.maximum(np.linalg.norm(dw2[0, 7]), np.linalg.norm(gradapprox))
diff = num / denom

print('{:.20f}'.format(diff))

0.0037808267627781333
0.0037808228651304778
0.00000103089824002336


### Checking DW1

In [334]:
# Numerically verify one entry of dw1 with a central difference.

# Start from fresh copies of w1 so that this cell is idempotent. Perturbing
# arrays created in an earlier cell in place would add another eps on every
# re-run while the divisor below stayed at 2 * eps.
w1plus = w1.copy()
w1minus = w1.copy()
w1plus[0, 7] += eps
w1minus[0, 7] -= eps

# Keep only the loss so the baseline activations are not overwritten with
# the ones from the perturbed network.
lossplus = forward_n(Y_train, X_train, w1plus, b1, w2, b2)[-1]
lossminus = forward_n(Y_train, X_train, w1minus, b1, w2, b2)[-1]

gradapprox = (lossplus - lossminus) / (2 * eps)

# Relative error: |analytic - numeric| / max(|analytic|, |numeric|).
# NOTE(review): this gradient entry is tiny compared with the loss itself, so
# float64 round-off in (lossplus - lossminus) may dominate at eps = 1e-7; a
# somewhat larger step could give a tighter match. Confirm before treating
# the printed relative error as a backprop bug.
num = np.linalg.norm(dw1[0, 7] - gradapprox)
denom = np.maximum(np.linalg.norm(dw1[0, 7]), np.linalg.norm(gradapprox))
diff = num / denom

print('{:.20f}'.format(gradapprox))
print('{:.20f}'.format(dw1[0,7]))
print('{:.20f}'.format(diff))

0.00002517319686035080
0.00002517492079837439
0.00006847838916336178
