In [None]:
# | default_exp utils.grad_check
# | hide
import nbdev

# Run nbdev's export step from inside the notebook.
# NOTE(review): presumably this writes the `# | export` cells to the module named by
# `default_exp` above — confirm against the nbdev version in use.
nbdev.nbdev_export()
# Wildcard import of nbdev's doc-rendering helpers; nothing from it is used in the visible cells.
from nbdev.showdoc import *

### Tidy Utils


In [None]:
# | export
import numpy as np
import tidygrad as tg

In [None]:
# | export
def grad_check(func, inputs, params: tuple = (), eps=1e-5, n=1000, verbose=False):
    """Check already-computed gradients against forward finite differences.

    For every tensor in `params` (visited in reverse order) up to `n` randomly
    chosen elements are nudged by `eps`, one at a time, and the numerical slope
    `(func(x + eps) - func(x)) / eps` is compared with the matching entry of the
    tensor's `.grad`. The gradients must already be populated before this function
    is called; it only reads `.grad`, it never runs a backward pass itself.

    Elements whose numerical gradient is within `eps` of zero are not compared
    (the fractional difference is meaningless there). A parameter for which no
    sampled element qualifies is skipped with a message.

    Args:
        func: Callable `func(inputs, params)` returning an object whose `.data`
            holds the loss (expected to be a scalar). It has to be deterministic,
            otherwise the finite differences are noise.
        inputs: Passed through to `func` unchanged.
        params: Tensors to check. Each must expose `.data`, `.grad` and `.name`.
            `.data` is perturbed in place and always restored.
        eps: Finite-difference step; also the threshold under which a numerical
            gradient is treated as zero.
        n: Maximum number of elements sampled per parameter.
        verbose: Print the raw numbers for every sampled element.

    Raises:
        ValueError: If, for any parameter, the largest fractional difference
            exceeds 1% or is NaN, or if a parameter's `.data` cannot be viewed as
            a flat array without copying (perturbations would then have no effect).
    """
    # (name, max fractional difference) of every parameter that failed, so the final
    # error names the real offenders rather than whichever parameter was visited last.
    failures = []

    with tg.no_grad():
        for p in reversed(params):
            # Work on flat views so that sampling random elements is a 1-D problem.
            data_view = p.data.reshape(-1)
            grad_view = p.grad.reshape(-1)

            # reshape() silently returns a copy for non-contiguous arrays; writing to a
            # copy would leave p.data untouched and make every numerical gradient 0.
            if data_view.size and not np.may_share_memory(data_view, p.data):
                raise ValueError(f"Cannot perturb {p.name} in place: its data is not contiguous")

            # Force C order so the flat view below is guaranteed to alias slow_grad.
            slow_grad = np.zeros_like(p.grad, order="C")
            slow_grad_view = slow_grad.reshape(-1)

            indices = np.random.choice(np.arange(grad_view.size), size=min(n, grad_view.size), replace=False)
            good_indices = []

            # The unperturbed loss is the same for every element, so evaluate it once
            # per parameter (and not at all when there is nothing to sample).
            loss = func(inputs, params) if indices.size else None

            for idx in indices:
                old_val = data_view[idx]
                data_view[idx] = old_val + eps
                try:
                    loss_plus_h = func(inputs, params)
                finally:
                    # Always undo the perturbation, even if func raises.
                    data_view[idx] = old_val

                slow_grad_view[idx] = (loss_plus_h.data - loss.data) / eps

                if verbose:
                    print(
                        f"{idx}: loss_plus_h: {loss_plus_h.data}, loss: {loss.data}, diff: {loss_plus_h.data - loss.data}, grad: {grad_view[idx]}, slow_grad: {slow_grad_view[idx]}"
                    )

                # Keep elements with a numerical gradient above eps. Written as a negated
                # `<=` so that NaN/inf slopes are kept too and end up failing the check
                # instead of being silently dropped.
                if not (abs(slow_grad_view[idx]) <= eps):
                    good_indices.append(idx)

            if not good_indices:
                print(f"Skipping {p.name} because all gradients are zero")
                continue

            # A zero analytical gradient paired with a non-zero numerical one gives inf,
            # which is a genuine failure; just keep NumPy from warning about the division.
            with np.errstate(divide="ignore", invalid="ignore"):
                differences = (slow_grad_view[good_indices] - grad_view[good_indices]) / grad_view[good_indices]

            max_grad_diff = np.max(np.abs(differences))
            print(f"Max fractional gradient difference for {p.name}: {max_grad_diff*100:.4f}%")
            # Negated `<=` so that a NaN maximum counts as a failure.
            if not (max_grad_diff <= 1e-2):
                failures.append((p.name, max_grad_diff))
                print("Failed!")
                print("Slow grad: ", slow_grad)
                print("Fast grad: ", p.grad)
                print("Differences: ", differences)

    if failures:
        details = ", ".join(f"{name}: Max error: {err*100:.4f}" for name, err in failures)
        raise ValueError(f"Gradient check failed for {details}")

In [None]:
from lovely_numpy import Lo
from tidygrad import Tensor

In [None]:
# Scratch check of output scale: push a random (32, 784) batch through two random linear
# maps (784 -> 100 with a bias, then 100 -> 10), both weight matrices scaled by 0.1, and
# let `Lo` summarise the resulting (32, 10) array. No seed is set, so the numbers vary per run.
Lo((np.random.randn(32, 28 * 28) @ (np.random.randn(28 * 28, 100) * 0.1) + np.random.randn(100)) @ (np.random.randn(100, 10) * 0.1))

array[32, 10] n=320 (2.5Kb) x∈[-7.871, 6.829] μ=-0.001 σ=2.903

In [None]:
# Seed counter: the next cell increments it and seeds NumPy with the new value,
# so re-running that cell alone tries a fresh seed each time.
i = 0

In [None]:
# Advance the seed on every execution of this cell (relies on `i` from the previous cell),
# so each run draws different data and weights while staying reproducible for a given `i`.
i += 1
np.random.seed(i)

# Random input batch: 32 rows of 28 * 28 = 784 features.
x = Tensor(np.random.randn(32, 28 * 28), "X")
# One-hot targets: each of the 32 rows gets a single 1 in a randomly chosen one of 10 columns.
y = np.zeros((32, 10))
y[np.arange(32), np.random.choice(10, 32)] = 1
y = Tensor(y, "y")

# Trainable parameters of the two-layer test network; the weight matrices are scaled by 0.1.
w1 = Tensor(np.random.randn(28 * 28, 100) * 0.1, "w1", requires_grad=True)
b1 = Tensor(np.random.randn(100), "b1", requires_grad=True)
w2 = Tensor(np.random.randn(100, 10) * 0.1, "w2", requires_grad=True)

def NN(inputs, params: tuple):
    """Loss of a small two-layer network, in the `func(inputs, params)` form `grad_check` expects.

    Args:
        inputs: Pair `(x, y)` — the input tensor and the target tensor.
        params: Triple `(w1, b1, w2)` — first-layer weights and bias, second-layer weights.

    Returns:
        The negated `tg.BCE_loss` of the network output against `y`, reduced with `.sum("loss")`.
    """
    features, targets = inputs
    hidden_weights, hidden_bias, output_weights = params

    # First layer: x.mmul(w1) plus b1, squashed with a sigmoid.
    hidden_pre = features.mmul(hidden_weights, "tmp").add(hidden_bias, "z1")
    hidden_act = tg.sigmoid(hidden_pre)

    # Second layer has no bias; its output goes straight into the BCE loss.
    output = hidden_act.mmul(output_weights)
    return -tg.BCE_loss(output, targets).sum("loss")

# NOTE(review): `debug` is not read anywhere in the visible code — looks like a leftover.
debug = []
# Forward pass on the tensors created above.
loss = NN(inputs=(x, y), params=(w1, b1, w2))

# Backward pass; presumably this fills `.grad` on w1, b1 and w2, which grad_check reads.
loss.backward()

# The check itself is currently disabled; the "Max fractional gradient difference" lines
# recorded as this cell's output come from a run in which it was enabled.
# grad_check(NN, (x, y), (w1, b1, w2))

Max fractional gradient difference for w2: 0.0011%
Max fractional gradient difference for b1: 0.0010%
Max fractional gradient difference for w1: 0.0159%
