In [None]:
# | hide
# Notebook tooling cell: the `hide` directive above is an nbdev marker for
# cells that should not appear in the rendered documentation.
import nbdev
from nbdev.showdoc import *

# Ask nbdev to export the notebooks' `# | export` cells to the Python package.
# NOTE(review): nbdev projects usually run this in the last cell -- confirm the
# placement at the top of the notebook is intentional.
nbdev.nbdev_export()

# Tidy Utils


In [None]:
# | default_exp utils

In [None]:
# | export
from tidygrad.tensor import Tensor
import numpy as np

In [None]:
# | export


def grad_check(nn, inputs, params: tuple = (), eps=1e-5, n=1000, tol=1e-2):
    """Compare stored gradients against forward finite differences.

    Parameters are visited in reverse order. For each one, up to `n` randomly
    chosen entries are nudged by `eps`, the loss is re-evaluated, and the
    resulting slope is compared with the value already held in `p.grad`.
    Entries whose stored gradient is at most 1e-9 in magnitude are skipped.
    Every perturbation is undone, so `p.data` is left unchanged on return.

    Args:
        nn: Callable invoked as `nn(inputs, params)`. The object it returns
            must expose the scalar loss as `.data`.
        inputs: Passed through to `nn` untouched.
        params: Parameters to check. Each needs `.data` and `.grad` arrays of
            matching size and a `.name`; `.grad` must already be populated.
        eps: Step size of the finite difference.
        n: Maximum number of entries sampled per parameter.
        tol: Largest accepted relative error (default 1e-2, i.e. 1%).

    Raises:
        ValueError: If the largest relative error of any parameter exceeds
            `tol` or is NaN.
    """
    for p in reversed(params):
        # `.flat` always writes through to `p.data`; `reshape(-1)` would
        # silently hand back a copy when the array is not contiguous.
        data_flat = p.data.flat
        grad_view = p.grad.reshape(-1)
        slow_grad_view = np.zeros_like(grad_view)

        indices = np.random.choice(np.arange(grad_view.size), size=min(n, grad_view.size), replace=False)
        # Drop entries with a (near-)zero stored gradient: a relative error
        # is meaningless for them.
        indices = indices[np.abs(grad_view[indices]) > 1e-9]
        if indices.size == 0:
            print(f"No non-zero gradient entries sampled for {p.name}, skipping")
            continue

        # Each perturbation is reverted before the next one, so the
        # unperturbed loss is identical for all entries: evaluate it once.
        base_loss = nn(inputs, params).data

        for idx in indices:
            old_val = data_flat[idx]
            data_flat[idx] = old_val + eps
            try:
                loss_plus_h = nn(inputs, params)
            finally:
                # Restore the parameter even if the forward pass blows up.
                data_flat[idx] = old_val

            slow_grad_view[idx] = (loss_plus_h.data - base_loss) / eps

        slow_sampled = slow_grad_view[indices]
        grad_sampled = grad_view[indices]
        # Normalise by the larger *magnitude*. A signed maximum would pick the
        # smaller magnitude for negative gradients and could even be zero.
        # The filter above guarantees scale > 1e-9.
        scale = np.maximum(np.abs(slow_sampled), np.abs(grad_sampled))
        max_grad_diff = np.max(np.abs(slow_sampled - grad_sampled) / scale)

        print(f"Max gradient difference for {p.name}: {max_grad_diff*100}%")
        # `not (<=)` rather than `>` so that a NaN error fails the check too.
        if not max_grad_diff <= tol:
            raise ValueError(
                f"Gradient check failed for {p.name}: Max error: {max_grad_diff}"
            )

In [None]:
from tidygrad.tensor import Tensor

# Seed NumPy's global generator so the random data below -- and the entries
# grad_check later samples through np.random.choice -- are identical on every
# Restart & Run All.
np.random.seed(0)

x = Tensor(np.random.randn(32, 28 * 28), "X")
# One-hot targets: each of the 32 rows gets a single 1 in a random column.
y_onehot = np.zeros((32, 10))
y_onehot[np.arange(32), np.random.choice(10, 32)] = 1
y = Tensor(y_onehot, "y")

# Weights of the 784 -> 100 -> 10 network used in this check.
w1 = Tensor(np.random.randn(28 * 28, 100), "w1")
b1 = Tensor(np.random.randn(100), "b1")
w2 = Tensor(np.random.randn(100, 10), "w2")


def NN(inputs, params: tuple, debug=None):
    """Two-layer network with a summed squared-error loss.

    Computes `sigmoid(x @ w1 + b1) @ w2`, subtracts `y`, squares the
    difference element-wise and sums everything into a single loss tensor.

    Args:
        inputs: Pair `(x, y)` of input and target tensors.
        params: Triple `(w1, b1, w2)` of weight tensors.
        debug: Optional list. When given, the tuple
            `(z1, a1, z2, diff, l, loss)` of intermediates is appended to it.
            Nothing is recorded when omitted, so repeated calls (for example
            from a gradient check) do not accumulate tensors in a shared
            default list.

    Returns:
        The loss tensor.
    """
    x, y = inputs
    w1, b1, w2 = params

    # Hidden layer
    z1 = x.mmul(w1, "tmp").add(b1, "z1")
    a1 = z1.sigmoid()

    # Output layer (no bias, no activation)
    z2 = a1.mmul(w2)

    # Summed squared error
    diff = z2.sub(y, "diff")
    sq_err = diff.mul(diff, "l")
    loss = sq_err.sum("loss")

    if debug is not None:
        debug.append((z1, a1, z2, diff, sq_err, loss))

    return loss


# One forward pass that records its intermediates in `debug`, followed by
# backward(). grad_check reads each parameter's .grad, so the backward pass
# has to happen before the comparison.
debug = []
inputs, params = (x, y), (w1, b1, w2)

loss = NN(inputs=inputs, params=params, debug=debug)
loss.backward()

grad_check(NN, inputs, params)

Max gradient difference for w2: 0.027053536586292802%
Max gradient difference for b1: 0.008331793822180928%
Max gradient difference for w1: 0.1767389452647976%
