In [None]:
# Goal: implement the _very_ basic functionality of PyTorch from scratch (cheating slightly by using NumPy for the array math)

# Heavily inspired by:
# - https://github.com/karpathy/micrograd
# - https://github.com/geohot/tinygrad

In [1]:
# import torch 
# import torchvision
# import torch.nn as nn
import numpy as np
# import torchvision.transforms as transforms

In [2]:
# pytorch

# Create tensors.
x = torch.tensor(15., requires_grad=True)
print(x)
w = torch.tensor(10., requires_grad=True)
b = torch.tensor(900., requires_grad=True)

# Build a computational graph.
y = w * x + b * 100    # y = 2 * x + 3
print(y)

# Compute gradients.
# Computes the sum of gradients of given tensors with respect to graph leaves.
y.backward()

# Print out the gradients.
# Computes and returns the sum of gradients of outputs with respect to the inputs.
print(x.grad)    # x.grad = 2 = dy/dx = w
print(w.grad)    # w.grad = 1 = dy/dw = x
print(b.grad)    # b.grad = 1 = dy/db = 1

NameError: name 'torch' is not defined

In [60]:
# not pytorch

class Tensor:
    def __init__(self, data, children=()):
        self.data = np.array(data, dtype=np.float32)
        self.children = children
        self.grad = 1
        self.op = None
        
    def __mul__(self, other):
        op = Multiply
        output = op.forward(self, other)
        output.op = op
        return output
    
    def __add__(self, other):
        op = Add
        output = op.forward(self, other)
        output.op = op
        return output
    
    def __sub__(self, other):
        op = Subtract
        output = op.forward(self, other)
        output.op = op
        return output
    
    def __pow__(self, other):
        op = Power
        output = op.forward(self, other)
        output.op = op
        return output
    
    def mean(self):
        op = Mean
        output = op.forward(self)
        output.op = op
        return output

    def backward(self):
        print(self.op)
        if self.op is not None:
            children_grads = self.op.backward(self, *self.children)
            for node, grad in zip(self.children, children_grads):
                print(node.data)
                node.grad = grad
        for node in self.children:
            node.backward()
            

class Multiply:
    def forward(a, b):
        return Tensor(np.matmul(a.data, b.data), [a, b])
    
    def backward(parent, a, b):
        return [b.data * parent.grad, a.data * parent.grad]
    
class Add:
    def forward(a, b):
        return Tensor(np.add(a.data, b.data), [a, b])
    
    def backward(parent, a, b):
        return [parent.grad, parent.grad]
    
class Subtract:
    def forward(a, b):
        return Tensor(np.subtract(a.data, b.data), [a, b])
    
    def backward(parent, a, b):
        return [parent.grad, parent.grad]
    
class Power:
    def forward(a, b):
        return Tensor(np.power(a.data, b), [a])
    
    def backward(parent, a):
        return [parent.grad]
    
class Mean:
    def forward(a):
        return Tensor(np.mean(a.data), [a])
    
    def backward(parent, a):
        return [parent.grad]
    
        

x = Tensor([15])
w = Tensor([10])
b = Tensor([900])
print(f'x: {x.data}')
print(f'w: {w.data}')
print(f'b: {b.data}')
print(f'')

y = w * x + b
print(f'y: {y.data}')

y.backward()

print(x.grad)
print(w.grad)
print(b.grad)

x: [15.]
w: [10.]
b: [900.]

y: [1050.]
<class '__main__.Add'>
150.0
[900.]
<class '__main__.Multiply'>
[10.]
[15.]
None
None
None
[10.]
[15.]
1


In [62]:
# TODO: make random
x = Tensor([[-0.7837,  0.3945],
            [-0.1218,  0.6905]])

y = Tensor([[-1.2883, -2.0291],
            [ 0.0665, -0.4150]])

# nn building blocks
# start with linear layer

class Linear:
    def __init__(self, in_dim, out_dim):
        self.in_dim = in_dim
        self.out_dim = out_dim
        # TODO: make random and transpose weight
        self.weight = Tensor(np.transpose([[-0.1367,  0.4447], [ 0.0321, -0.2277]]))
        self.bias = Tensor([-0.6949, -0.1522])
        
    def __call__(self, x):
        return x * self.weight + self.bias
    

linear = Linear(2, 2)
print ('w: ', linear.weight.data)
print ('b: ', linear.bias.data)

pred = linear(x)
print("pred linear: ", pred.data)

# loss function

class MSELoss:
    def __init__(self):
        pass
    def __call__(self, pred, true):
        # TODO: add ops to Tensor
        return ((true - pred)**2).mean()

criterion = MSELoss()
    
loss = criterion(pred, y)

print("loss: ", loss.data)

loss.backward()

print ('dL/dw: ', linear.weight.grad) 
print ('dL/db: ', linear.bias.grad)

# optimizer

class SGD:
    def __init__(self, params, lr=0.001):
        self.params = params
    # todo zero grad function
    def step(self):
        for param in self.params:
            param.data -= param.grad * lr
            
    def zero_grad(self):
        for param in self.params:
            param.grad = 0
    
    
optimizer = SGD([linear.weight, linear.bias], lr=0.01)



w:  [[-0.1367  0.0321]
 [ 0.4447 -0.2277]]
b:  [-0.6949 -0.1522]
pred linear:  [[-0.41233402 -0.2671844 ]
 [-0.37118456 -0.3133366 ]]
loss:  1.0183916
<class '__main__.Mean'>
[[0.76731646 3.1043465 ]
 [0.19156778 0.01033544]]
<class '__main__.Power'>
[[-0.875966   -1.7619156 ]
 [ 0.43768457 -0.10166338]]
<class '__main__.Subtract'>
[[-1.2883 -2.0291]
 [ 0.0665 -0.415 ]]
[[-0.41233402 -0.2671844 ]
 [-0.37118456 -0.3133366 ]]
None
<class '__main__.Add'>
[[ 0.28256595 -0.11498442]
 [ 0.32371542 -0.16113663]]
[-0.6949 -0.1522]
<class '__main__.Multiply'>
[[-0.7837  0.3945]
 [-0.1218  0.6905]]
[[-0.1367  0.0321]
 [ 0.4447 -0.2277]]
None
None
None
dL/dw:  [[-0.7837  0.3945]
 [-0.1218  0.6905]]
dL/db:  1
