In [9]:
import math
import numpy as np
import matplotlib.pyplot as plt
import torch
import random

---

### Defining the Value class for carrying data and gradients

In [10]:
class Value:
    """A scalar node in a dynamically built computation graph.

    Every arithmetic operation on a ``Value`` returns a new ``Value`` that
    remembers its operands (``_prev``), the operation that produced it
    (``_op``) and a closure (``_backward``) that applies the chain rule for
    that single operation. Calling :meth:`backward` on a result node then
    propagates gradients to every node it was computed from.

    Args:
        data: The scalar held by this node.
        _children: Operand nodes this value was computed from (internal).
        _op: Short tag naming the producing operation (internal).
        label: Optional human-readable name for the node.
    """

    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        # d(root)/d(self); filled in by backward().
        self.grad = 0.0

        # Leaf nodes have nothing to propagate, hence the no-op default.
        self._backward = lambda: None

        self._prev = set(_children)
        self._op = _op
        self.label = label

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        """Return ``self + other``; plain numbers are wrapped in a ``Value``."""
        other = other if isinstance(other, Value) else Value(other)

        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            # Addition passes the upstream gradient through unchanged. ``+=``
            # accumulates so a node used several times sums its contributions.
            self.grad += 1 * out.grad
            other.grad += 1 * out.grad

        out._backward = _backward
        return out

    def __mul__(self, other):
        """Return ``self * other``; plain numbers are wrapped in a ``Value``."""
        other = other if isinstance(other, Value) else Value(other)

        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            # Product rule: each operand's local derivative is the other operand.
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward
        return out

    def __pow__(self, other):
        """Return ``self ** other`` for a constant int/float exponent."""
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"
        out = Value(self.data**other, (self, ), f'**{other}')

        def _backward():
            # Power rule: d(x**k)/dx = k * x**(k - 1).
            self.grad += other * (self.data ** (other - 1)) * out.grad
        out._backward = _backward
        return out

    def __rmul__(self, other):
        """Reflected multiplication, so ``2 * value`` works."""
        return self * other

    def __radd__(self, other):
        """Reflected addition, so ``2 + value`` (and ``sum`` starting at 0) works."""
        return self + other

    def __truediv__(self, other):
        """Return ``self / other``, expressed as ``self * other**-1``."""
        return self * other**-1

    def __rtruediv__(self, other):
        """Reflected division, so ``2 / value`` works."""
        return (self**-1) * other

    def __neg__(self):
        """Return ``-self``, expressed as ``self * -1``."""
        return self * -1

    def __sub__(self, other):
        """Return ``self - other``, expressed as ``self + (-other)``."""
        return self + (-other)

    def __rsub__(self, other):
        """Reflected subtraction, so ``2 - value`` works."""
        return (-self) + other

    def tanh(self):
        """Return the hyperbolic tangent of this value as a new node."""
        x = self.data
        # math.tanh saturates to +/-1.0 for large |x|; the explicit
        # (e^{2x} - 1) / (e^{2x} + 1) form raises OverflowError there.
        t = math.tanh(x)
        out = Value(t, (self, ), 'tanh')

        def _backward():
            # d tanh(x)/dx = 1 - tanh(x)**2
            self.grad += (1 - t**2) * out.grad
        out._backward = _backward
        return out

    def exp(self):
        """Return ``e ** self`` as a new node."""
        x = self.data
        out = Value(math.exp(x), (self, ), 'exp')

        def _backward():
            # d e^x/dx = e^x, which is exactly the forward result.
            self.grad += out.data * out.grad
        out._backward = _backward
        return out

    def backward(self):
        """Back-propagate from this node through the graph that produced it.

        Sets ``self.grad`` to 1.0 and then runs every node's ``_backward``
        closure in reverse topological order. Gradients are accumulated with
        ``+=``, so callers that reuse nodes across several backward passes
        must reset their ``grad`` to 0.0 beforehand.
        """
        # Post-order depth-first traversal with an explicit stack, so graph
        # depth is not bounded by Python's recursion limit.
        topo = []
        visited = {self}
        stack = [(self, iter(self._prev))]
        while stack:
            node, children = stack[-1]
            for child in children:
                if child not in visited:
                    visited.add(child)
                    stack.append((child, iter(child._prev)))
                    break
            else:
                # All operands of ``node`` are already emitted.
                topo.append(node)
                stack.pop()

        self.grad = 1.0

        for node in reversed(topo):
            node._backward()

---

### Defining the variables and operations with PyTorch

In [11]:
# Leaf tensors for the single-neuron example. Each one is cast to float64 with
# .double() and then flagged so autograd tracks operations on it.
x1 = torch.Tensor([2.0]).double().requires_grad_(True)
x2 = torch.Tensor([0.0]).double().requires_grad_(True)
w1 = torch.Tensor([-3.0]).double().requires_grad_(True)
w2 = torch.Tensor([1.0]).double().requires_grad_(True)
b = torch.Tensor([6.8813735870195432]).double().requires_grad_(True)

# Forward pass: weighted sum of the inputs plus bias, squashed by tanh.
n = x1*w1 + x2*w2 + b
o = n.tanh()

print('o:', o.item())

# Backward pass: populates .grad on every leaf tensor above.
o.backward()

print('-----')
for tag, leaf in (('x2:', x2), ('w2:', w2), ('x1:', x1), ('w1:', w1)):
    print(tag, leaf.grad.item())

o: 0.7071066904050358
-----
x2: 0.5000001283844369
w2: 0.0
x1: -1.5000003851533106
w1: 1.0000002567688737


---

### Building the Multilayer Perceptron in Micrograd

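<img src="https://cs231n.github.io/assets/nn1/neural_net2.jpeg" alt="Diagram of a fully connected neural network: an input layer, two hidden layers and an output layer" width="300">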

In [12]:
class Neuron:
    """A single tanh unit with ``nin`` weights and one bias.

    Weights and bias are ``Value`` objects initialised uniformly at random
    in [-1, 1].
    """

    def __init__(self, nin):
        weights = []
        for _ in range(nin):
            weights.append(Value(random.uniform(-1, 1)))
        self.w = weights
        self.b = Value(random.uniform(-1, 1))

    def __call__(self, x):
        """Return ``tanh(w . x + b)`` for the input sequence ``x``."""
        # Start from the bias and add one weighted input at a time.
        total = self.b
        for weight, feature in zip(self.w, x):
            total = total + weight * feature
        return total.tanh()

    def parameters(self):
        """Return the trainable values: all weights followed by the bias."""
        return [*self.w, self.b]
    
class Layer:
    """A group of ``nout`` neurons that each read the same ``nin`` inputs."""

    def __init__(self, nin, nout):
        self.neurons = []
        for _ in range(nout):
            self.neurons.append(Neuron(nin))

    def __call__(self, x):
        """Evaluate every neuron on ``x``.

        Returns the bare output when the layer has exactly one neuron,
        otherwise the list of outputs in neuron order.
        """
        activations = []
        for neuron in self.neurons:
            activations.append(neuron(x))
        if len(activations) == 1:
            return activations[0]
        return activations

    def parameters(self):
        """Return the parameters of all neurons as one flat list."""
        collected = []
        for neuron in self.neurons:
            collected.extend(neuron.parameters())
        return collected
    
class MLP:
    """A multilayer perceptron: a chain of fully connected ``Layer`` objects.

    Args:
        nin: Number of input features.
        nouts: Sizes of the successive layers; any iterable of ints
            (list, tuple, ...) is accepted.
    """

    def __init__(self, nin, nouts):
        # list(...) lets callers pass a tuple or other iterable, not only a list.
        size = [nin] + list(nouts)
        self.layers = [Layer(size[i], size[i+1]) for i in range(len(size) - 1)]

    def __call__(self, x):
        """Feed ``x`` through every layer in order and return the final output.

        The return value is whatever the last layer returns: a single value
        for a one-neuron output layer, otherwise a list.
        """
        last = len(self.layers) - 1
        for index, layer in enumerate(self.layers):
            x = layer(x)
            # A one-neuron layer returns a bare value rather than a list;
            # re-wrap it so the next layer still receives a sequence of inputs.
            if index < last and not isinstance(x, (list, tuple)):
                x = [x]
        return x

    def parameters(self):
        """Return the parameters of all layers as one flat list."""
        return [p for layer in self.layers for p in layer.parameters()]

---

### Gradient Descent and Backpropagation to train the NN

In [13]:
# Seed Python's RNG so the randomly initialised weights -- and therefore the
# outputs of this cell and the training run -- are the same on every
# Restart & Run All.
random.seed(42)

# Example input with 3 features and an MLP with 3 layers (4, 4 and 1 neurons).
x = [2.0, 3.0, -1.0]
n = MLP(3, [4, 4, 1])
n(x)

Value(data=0.42253489344413125)

In [14]:
# Tiny dataset: four input rows of three features each (xs) and, in the same
# order, the desired target output for every row (ys).

xs = [
    [2.0, 3.0, -1.0],
    [3.0, -1.0, 0.5],
    [0.5, 1.0, 1.0],
    [1.0, 1.0, -1.0],
]

ys = [
    1.0,
    -1.0,
    -1.0,
    1.0,
]

In [15]:
N_STEPS = 100         # number of gradient-descent iterations
LEARNING_RATE = 0.3   # step size applied to every parameter update

# The parameter objects themselves never change during training (only their
# .data and .grad fields do), so collect them once instead of on every pass.
params = n.parameters()

for k in range(N_STEPS):

    # forward pass: sum of squared errors over the whole dataset
    ypredicted = [n(x) for x in xs]
    loss = sum((yout - ygroundtruth)**2 for ygroundtruth, yout in zip(ys, ypredicted))

    # backward pass
    for param in params:
        param.grad = 0.0 # backward() accumulates with +=, so clear stale gradients first
    loss.backward()

    # gradient descent step: move each parameter against its gradient
    for param in params:
        param.data -= LEARNING_RATE * param.grad

    print(k, loss.data)

0 2.697487297842995
1 0.09543770738285644
2 0.00017535479404302045
3 0.00017458339603074272
4 0.00017382036801488293
5 0.0001730655693350376
6 0.00017231886249556707
7 0.00017158011307656572
8 0.00017084918964786676
9 0.00017012596368586578
10 0.0001694103094931994
11 0.00016870210412101
12 0.00016800122729381895
13 0.00016730756133685466
14 0.00016662099110571947
15 0.00016594140391837728
16 0.00016526868948931898
17 0.00016460273986580604
18 0.00016394344936620267
19 0.0001632907145202177
20 0.00016264443401105177
21 0.0001620045086193602
22 0.0001613708411689566
23 0.00016074333647419223
24 0.00016012190128898872
25 0.00015950644425740882
26 0.00015889687586575425
27 0.00015829310839609928
28 0.00015769505588125434
29 0.00015710263406106838
30 0.00015651576034003413
31 0.0001559343537461801
32 0.0001553583348911496
33 0.0001547876259314771
34 0.00015422215053098723
35 0.00015366183382430952
36 0.00015310660238143187
37 0.0001525563841732796
38 0.00015201110853831047
39 0.00015147070

In [16]:
# Predictions from the forward pass of the last training iteration, i.e.
# computed just before that iteration's parameter update was applied.
ypredicted

[Value(data=0.992472370792177),
 Value(data=-0.9962121084853451),
 Value(data=-0.9969530813240761),
 Value(data=0.9932433035092674)]

The targets were [1.0, -1.0, -1.0, 1.0], and every prediction above is within 0.01 of its target, so the network has fit this tiny dataset well.