In [133]:
import numpy as np

class Add:
    """Backward rule for elementwise addition ``out = a + b``.

    The forward pass relies on NumPy broadcasting, so the upstream gradient
    may have more axes (or larger axes) than an operand. It is summed back to
    each operand's shape before being accumulated.
    """

    @staticmethod
    def _unbroadcast(grad, shape):
        """Sum ``grad`` over the axes that broadcasting added or stretched.

        Args:
            grad: Upstream gradient with the (broadcast) output shape.
            shape: Shape of the operand the gradient is destined for.

        Returns:
            ``grad`` reduced so that it can be accumulated into an array of
            shape ``shape``.
        """
        grad = np.asarray(grad, dtype=float)
        extra = grad.ndim - len(shape)
        if extra > 0:
            # Leading axes that exist only because of broadcasting.
            grad = grad.sum(axis=tuple(range(extra)))
        if grad.ndim == len(shape):
            # Axes of size 1 in the operand that were stretched in the output.
            stretched = tuple(
                i for i, n in enumerate(shape) if n == 1 and grad.shape[i] != 1
            )
            if stretched:
                grad = grad.sum(axis=stretched, keepdims=True)
        return grad

    def backward(self, grad, inputs):
        """Accumulate ``grad`` into both operands' ``.grad`` in place.

        Args:
            grad: Gradient with respect to the sum.
            inputs: The two operands ``[a, b]``; each must expose ``.grad``.
        """
        a, b = inputs
        a.grad += self._unbroadcast(grad, np.shape(a.grad))
        b.grad += self._unbroadcast(grad, np.shape(b.grad))

class Mul:
    """Backward rule for elementwise multiplication ``out = a * b``.

    The forward pass relies on NumPy broadcasting (including scalar operands),
    so each local gradient is summed back to its operand's shape before being
    accumulated.
    """

    @staticmethod
    def _unbroadcast(grad, shape):
        """Sum ``grad`` over the axes that broadcasting added or stretched.

        Args:
            grad: Gradient with the (broadcast) output shape.
            shape: Shape of the operand the gradient is destined for.

        Returns:
            ``grad`` reduced so that it can be accumulated into an array of
            shape ``shape``.
        """
        grad = np.asarray(grad, dtype=float)
        extra = grad.ndim - len(shape)
        if extra > 0:
            # Leading axes that exist only because of broadcasting.
            grad = grad.sum(axis=tuple(range(extra)))
        if grad.ndim == len(shape):
            # Axes of size 1 in the operand that were stretched in the output.
            stretched = tuple(
                i for i, n in enumerate(shape) if n == 1 and grad.shape[i] != 1
            )
            if stretched:
                grad = grad.sum(axis=stretched, keepdims=True)
        return grad

    def backward(self, grad, inputs):
        """Accumulate the product-rule gradients into ``a.grad`` and ``b.grad``.

        Args:
            grad: Gradient with respect to the product.
            inputs: The two operands ``[a, b]``; each must expose ``.data``
                and ``.grad``.
        """
        a, b = inputs
        # d(a*b)/da = b and d(a*b)/db = a; multiply by the upstream gradient
        # first and only then reduce, so a scalar operand receives the sum of
        # the elementwise products.
        grad_a = self._unbroadcast(grad * b.data, np.shape(a.data))
        grad_b = self._unbroadcast(grad * a.data, np.shape(b.data))
        a.grad += grad_a
        b.grad += grad_b
        
        

class Sub:
    """Backward rule for elementwise subtraction ``out = a - b``.

    The forward pass relies on NumPy broadcasting, so the upstream gradient
    is summed back to each operand's shape before being accumulated.
    """

    @staticmethod
    def _unbroadcast(grad, shape):
        """Sum ``grad`` over the axes that broadcasting added or stretched.

        Args:
            grad: Upstream gradient with the (broadcast) output shape.
            shape: Shape of the operand the gradient is destined for.

        Returns:
            ``grad`` reduced so that it can be accumulated into an array of
            shape ``shape``.
        """
        grad = np.asarray(grad, dtype=float)
        extra = grad.ndim - len(shape)
        if extra > 0:
            # Leading axes that exist only because of broadcasting.
            grad = grad.sum(axis=tuple(range(extra)))
        if grad.ndim == len(shape):
            # Axes of size 1 in the operand that were stretched in the output.
            stretched = tuple(
                i for i, n in enumerate(shape) if n == 1 and grad.shape[i] != 1
            )
            if stretched:
                grad = grad.sum(axis=stretched, keepdims=True)
        return grad

    def backward(self, grad, inputs):
        """Add ``grad`` to ``a.grad`` and subtract it from ``b.grad`` in place.

        Args:
            grad: Gradient with respect to the difference.
            inputs: The two operands ``[a, b]``; each must expose ``.grad``.
        """
        a, b = inputs
        a.grad += self._unbroadcast(grad, np.shape(a.grad))
        b.grad -= self._unbroadcast(grad, np.shape(b.grad))

class MatMul:
    """Backward rule for ``out = A @ B`` with NumPy ``matmul`` semantics.

    Supports broadcast batch dimensions (including batch axes of size 1) and
    1-D operands, which ``matmul`` temporarily promotes to matrices.
    """

    @staticmethod
    def _unbroadcast(grad, shape):
        """Sum ``grad`` over the batch axes that broadcasting added or stretched.

        Args:
            grad: Gradient carrying the broadcast batch shape.
            shape: Shape of the operand the gradient is destined for.

        Returns:
            ``grad`` reduced to ``shape``.
        """
        extra = grad.ndim - len(shape)
        if extra > 0:
            # Leading batch axes the operand did not have at all.
            grad = grad.sum(axis=tuple(range(extra)))
        if grad.ndim == len(shape):
            # Batch axes of size 1 in the operand that were stretched.
            stretched = tuple(
                i for i, n in enumerate(shape) if n == 1 and grad.shape[i] != 1
            )
            if stretched:
                grad = grad.sum(axis=stretched, keepdims=True)
        return grad

    def backward(self, grad, inputs):
        """Accumulate ``dL/dA = grad @ B^T`` and ``dL/dB = A^T @ grad``.

        Args:
            grad: Gradient with respect to the matmul result.
            inputs: The two operands ``[A, B]``; each must expose ``.data``
                and ``.grad``.
        """
        A, B = inputs
        a_data = np.asarray(A.data)
        b_data = np.asarray(B.data)
        grad = np.asarray(grad, dtype=float)

        # matmul drops the axis it added for a 1-D operand; restore it on the
        # gradient so the matrix formulas below apply unchanged.
        if b_data.ndim == 1:
            b_data = b_data[:, np.newaxis]
            grad = grad[..., np.newaxis]
        if a_data.ndim == 1:
            a_data = a_data[np.newaxis, :]
            grad = np.expand_dims(grad, -2)

        grad_a = grad @ np.swapaxes(b_data, -1, -2)
        grad_b = np.swapaxes(a_data, -1, -2) @ grad

        # Reduce broadcast batch axes, then undo any 1-D promotion.
        grad_a = self._unbroadcast(grad_a, a_data.shape).reshape(np.shape(A.data))
        grad_b = self._unbroadcast(grad_b, b_data.shape).reshape(np.shape(B.data))
        A.grad += grad_a
        B.grad += grad_b
    

class tensor:
    """Minimal NumPy-backed tensor with reverse-mode automatic differentiation.

    Attributes:
        data: The value, stored as a float ``numpy.ndarray``.
        requires_grad: Whether gradients are propagated through this tensor.
        grad: Accumulated gradient, same shape as ``data``, initially zeros.
        op: Object whose ``backward(grad, inputs)`` pushes this tensor's
            gradient to its inputs, or ``None`` for tensors not produced by
            an operation.
        is_leaf: ``True`` for tensors created directly by the user.
        inputs: The operand tensors this tensor was computed from.
        backprop_done: Set once ``backward`` has visited this tensor; a
            visited tensor is never propagated through again.
    """

    def __init__(self, data, requires_grad=False, is_leaf=True):
        self.data = np.array(data, dtype=float)
        self.requires_grad = requires_grad
        self.grad = np.zeros(self.data.shape, dtype=float)
        self.op = None
        self.is_leaf = is_leaf
        self.inputs = []
        self.backprop_done = False

    @staticmethod
    def _as_tensor(value):
        """Return ``value`` unchanged if it is a tensor, else wrap it as a constant."""
        return value if isinstance(value, tensor) else tensor(value)

    def _topological_order(self):
        """Collect the not-yet-visited graph below ``self`` in dependency order.

        Every tensor reached is marked ``backprop_done``. Only tensors that
        need propagation (``requires_grad`` and an ``op``) are returned, each
        placed after all of its inputs. The walk is iterative so deep graphs
        do not hit the recursion limit.
        """
        order = []
        stack = [(self, False)]
        while stack:
            node, inputs_emitted = stack.pop()
            if inputs_emitted:
                order.append(node)
                continue
            if node.backprop_done:
                continue
            node.backprop_done = True
            if not node.requires_grad or node.op is None:
                continue
            # Revisit this node only after everything below it is emitted.
            stack.append((node, True))
            stack.extend((child, False) for child in node.inputs)
        return order

    def backward(self, grad=None):
        """Back-propagate from this tensor through the graph that produced it.

        Tensors are processed in reverse topological order, so a tensor that
        feeds several consumers receives all of its gradient contributions
        before it propagates to its own inputs.

        Args:
            grad: Seed gradient added to ``self.grad`` before propagating.
                When ``None``, whatever is already in ``self.grad`` is
                propagated.
        """
        if self.backprop_done:
            return
        if grad is not None and self.requires_grad:
            self.grad += grad
        for node in reversed(self._topological_order()):
            node.op.backward(node.grad, node.inputs)

    def get_ones(self):
        """Return an array of ones with the same shape as ``data``."""
        return np.ones(self.data.shape, dtype=float)

    def __mul__(self, other):
        """Elementwise product; ``other`` may be a tensor, number or array."""
        other = self._as_tensor(other)
        requires_grad = self.requires_grad or other.requires_grad
        out = tensor(self.data * other.data, requires_grad, is_leaf=False)
        out.op = Mul()
        out.inputs = [self, other]
        return out

    def __matmul__(self, other):
        """Matrix product; ``other`` may be a tensor or an array."""
        other = self._as_tensor(other)
        requires_grad = self.requires_grad or other.requires_grad
        out = tensor(self.data @ other.data, requires_grad, is_leaf=False)
        out.op = MatMul()
        out.inputs = [self, other]
        return out

    def __add__(self, other):
        """Elementwise sum; ``other`` may be a tensor, number or array."""
        other = self._as_tensor(other)
        requires_grad = self.requires_grad or other.requires_grad
        out = tensor(self.data + other.data, requires_grad, is_leaf=False)
        out.op = Add()
        out.inputs = [self, other]
        return out

    def __sub__(self, other):
        """Elementwise difference; ``other`` may be a tensor, number or array."""
        other = self._as_tensor(other)
        requires_grad = self.requires_grad or other.requires_grad
        out = tensor(self.data - other.data, requires_grad, is_leaf=False)
        out.op = Sub()
        out.inputs = [self, other]
        return out

In [56]:
# scalar multiply test

In [57]:
a = tensor(2.0, True)

In [58]:
b = tensor(3.0, True)

In [59]:
c = tensor(4.0, True)

In [60]:
# Build a small scalar graph in which a and b each feed two separate products.
d = a * b * c  # evaluated left to right as (a * b) * c
e = a * b  # a second a * b node, distinct from the one inside d
f = e + d  # f = a*b + a*b*c

In [61]:
f.backward(1)

In [62]:
a.grad

array(15.)

In [63]:
b.grad

array(10.)

In [64]:
c.grad

array(6.)

In [65]:
# Reference run: rebuild the same scalar graph with PyTorch to compare gradients.
# Note that a, b, c are rebound to torch tensors from here on.
import torch
a = torch.tensor(2.0, requires_grad=True)
b = torch.tensor(3.0, requires_grad=True)
c = torch.tensor(4.0, requires_grad=True)
d = a * b * c
e = a * b
f = e + d
f.backward()
a.grad

tensor(15.)

In [66]:
b.grad

tensor(10.)

In [67]:
# arbitrary-dimension matrix multiplication / add test

In [68]:
a = tensor([[[[2.0, 3.0], [3.0, 4.0], [3.0, 4.0]],
           [[4.0, 6.0], [2.0, 7.0], [3.0, 4.0]],
           [[1.0, 2.0], [3.0, 5.0], [3.0, 4.0]]],
           [[[2.0, 3.0], [3.0, 4.0], [3.0, 4.0]],
           [[4.0, 6.0], [2.0, 7.0], [3.0, 4.0]],
           [[1.0, 2.0], [3.0, 5.0], [3.0, 4.0]]]], True)

In [69]:
a.data.shape

(2, 3, 3, 2)

In [70]:
b = tensor(np.array([[1.0],[2.0]]), True)

In [71]:
b.data.shape

(2, 1)

In [72]:
c = a @ b

In [73]:
c.data.shape

(2, 3, 3, 1)

In [74]:
d = tensor(np.array([[1.0, 2.0]]), True)
d.data.shape

(1, 2)

In [75]:
e = d @ b
e.data.shape

(1, 1)

In [76]:
# c is used twice (in c @ e and in the add), so c.grad receives two contributions.
f = c @ e + c
f.data.shape

(2, 3, 3, 1)

In [77]:
f.backward(f.get_ones())

In [78]:
a.grad

array([[[[ 6., 12.],
         [ 6., 12.],
         [ 6., 12.]],

        [[ 6., 12.],
         [ 6., 12.],
         [ 6., 12.]],

        [[ 6., 12.],
         [ 6., 12.],
         [ 6., 12.]]],


       [[[ 6., 12.],
         [ 6., 12.],
         [ 6., 12.]],

        [[ 6., 12.],
         [ 6., 12.],
         [ 6., 12.]],

        [[ 6., 12.],
         [ 6., 12.],
         [ 6., 12.]]]])

In [79]:
b.grad

array([[492.],
       [876.]])

In [80]:
c.grad

array([[[[6.],
         [6.],
         [6.]],

        [[6.],
         [6.],
         [6.]],

        [[6.],
         [6.],
         [6.]]],


       [[[6.],
         [6.],
         [6.]],

        [[6.],
         [6.],
         [6.]],

        [[6.],
         [6.],
         [6.]]]])

In [81]:
d.grad

array([[204., 408.]])

In [82]:
e.grad

array([[204.]])

In [83]:
import torch
# Reference run: the same batched matmul/add graph in PyTorch.
a = torch.tensor(
    [[[[2.0, 3.0], [3.0, 4.0], [3.0, 4.0]],
       [[4.0, 6.0], [2.0, 7.0], [3.0, 4.0]],
       [[1.0, 2.0], [3.0, 5.0], [3.0, 4.0]]],
       [[[2.0, 3.0], [3.0, 4.0], [3.0, 4.0]],
       [[4.0, 6.0], [2.0, 7.0], [3.0, 4.0]],
       [[1.0, 2.0], [3.0, 5.0], [3.0, 4.0]]]], requires_grad=True)
b = torch.tensor([[1.0],[2.0]], requires_grad=True)
c = a @ b
d = torch.tensor([[1.0, 2.0]], requires_grad=True)
e = d @ b
f = c @ e + c
# Ask PyTorch to keep .grad on every tensor so each one can be inspected below.
a.retain_grad()
b.retain_grad()
c.retain_grad()
d.retain_grad()
e.retain_grad()
f.retain_grad()

In [84]:
f.backward(gradient=torch.ones_like(f))

In [85]:
a.grad

tensor([[[[ 6., 12.],
          [ 6., 12.],
          [ 6., 12.]],

         [[ 6., 12.],
          [ 6., 12.],
          [ 6., 12.]],

         [[ 6., 12.],
          [ 6., 12.],
          [ 6., 12.]]],


        [[[ 6., 12.],
          [ 6., 12.],
          [ 6., 12.]],

         [[ 6., 12.],
          [ 6., 12.],
          [ 6., 12.]],

         [[ 6., 12.],
          [ 6., 12.],
          [ 6., 12.]]]])

In [86]:
b.grad

tensor([[492.],
        [876.]])

In [87]:
c.grad

tensor([[[[6.],
          [6.],
          [6.]],

         [[6.],
          [6.],
          [6.]],

         [[6.],
          [6.],
          [6.]]],


        [[[6.],
          [6.],
          [6.]],

         [[6.],
          [6.],
          [6.]],

         [[6.],
          [6.],
          [6.]]]])

In [88]:
d.grad

tensor([[204., 408.]])

In [89]:
e.grad

tensor([[204.]])

In [None]:
# matrix-scalar multiplication / add test

In [135]:
a = tensor(2.0, True)
b = tensor(np.array([[1, 2],[3, 4]]), True)

In [136]:
a.data.shape == ()

True

In [137]:
b.data

array([[1., 2.],
       [3., 4.]])

In [138]:
c = a * b

In [139]:
c.data

array([[2., 4.],
       [6., 8.]])

In [140]:
c.backward(c.get_ones())

In [141]:
a.grad

array(10.)

In [142]:
b.grad

array([[2., 2.],
       [2., 2.]])

In [146]:
# Reference run: scalar * matrix in PyTorch, seeded with an all-ones gradient.
a = torch.tensor(2.0, requires_grad=True)
b = torch.tensor([[1.0, 2.0],[3.0, 4.0]], requires_grad=True)
c = a * b
# Ask PyTorch to keep .grad on each tensor so it can be inspected below.
a.retain_grad()
b.retain_grad()
c.retain_grad()
c.backward(gradient=torch.ones_like(c))

In [147]:
a.grad

tensor(10.)

In [148]:
b.grad

tensor([[2., 2.],
        [2., 2.]])