# WS_follow_along_micrograd
# WESmith 06/01/23
## follow along with Karpathy video https://www.youtube.com/watch?v=VMj-3S1tku0

In [None]:
import math
import numpy as np
from graphviz import Digraph
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def f(x):
    """Evaluate the quadratic 3x^2 - 4x + 5 at x.

    Only arithmetic operators are applied to x, so anything supporting
    them (plain numbers, numpy arrays) can be passed.
    """
    squared_term = 3 * x**2
    linear_term = 4 * x
    return squared_term - linear_term + 5

In [None]:
# evaluate the quadratic at x = 3.0
f(3.0)

In [None]:
# sample f on a uniform grid over [-5, 5) with step 0.25 and plot the curve
xs = np.arange(-5, 5, 0.25)
ys = f(xs)  # f only uses arithmetic operators, so it applies elementwise to the array
plt.plot(xs, ys)
plt.grid()

In [None]:
def diff(f, x, h=0.0001):  # WS
    """Estimate the slope of f at x with a one-sided (forward) difference.

    f is called at x + h and at x; the change in value is divided by the
    step h.
    """
    rise = f(x + h) - f(x)
    return rise / h

In [None]:
# forward-difference slopes of f at x = 3, -3 and 2/3 using a very small step
h = 1e-8
diff(f, 3, h=h), diff(f, -3, h=h), diff(f, 2/3, h=h)

In [None]:
# a small multi-input expression d = a*b + c evaluated with plain floats
h = 0.0001  # step size; not used within this cell
a =  2.0
b = -3.0
c = 10.0
d = a * b + c
print(d)

In [None]:
class Value:
    """Scalar that remembers how it was computed so gradients can be backpropagated.

    Every arithmetic operation returns a new Value whose `_prev` holds the
    operand Values and whose `_backward` closure adds this node's gradient
    contribution to those operands (chain rule). Calling `backward()` on a
    result node fills in `grad` for every node it depends on.

    Attributes:
        data: the wrapped number.
        grad: accumulated derivative of the node `backward()` was called on
            with respect to this value; starts at 0.0 and is only ever
            added to by this class, never reset.
        label: optional display name.
    """

    def __init__(self, data, _children=(), _op='', label=''):
        self.data  = data
        self.grad  = 0.0
        self._backward = lambda: None  # leaf nodes have nothing to propagate
        self._prev = set(_children)    # operand nodes (duplicates collapse)
        self._op   = _op               # symbol of the op that produced this node
        self.label = label

    def __repr__(self):
        return f"Value(data={self.data}, grad={self.grad})"

    def __add__(self, other):
        # WS wrap plain numbers so Value + number works
        other = other if isinstance(other, Value) else Value(other)
        out   = Value(self.data + other.data, (self, other), '+')
        def _backward():
            # WS note: gradients must be accumulated (+=) to handle a variable
            # that appears more than once in the graph (discussed in the video
            # at 1:25:40): the multivariate chain rule sums over all paths
            self.grad  += 1.0 * out.grad
            other.grad += 1.0 * out.grad
        out._backward   = _backward
        return out

    def __pow__(self, other):
        # only constant exponents: the exponent is not a node in the graph
        assert isinstance(other, (int, float)), 'only supporting int/float powers for now'
        out = Value(self.data**other, (self, ), f'**{other}')
        def _backward():
            # d/dx x**k = k * x**(k - 1)
            self.grad += other * self.data**(other - 1) * out.grad
        out._backward  = _backward
        return out

    def __mul__(self, other):
        # WS wrap plain numbers so Value * number works
        other = other if isinstance(other, Value) else Value(other)
        out   = Value(self.data * other.data, (self, other), '*')
        def _backward():
            # each factor's local derivative is the other factor
            self.grad  += other.data * out.grad
            other.grad +=  self.data * out.grad
        out._backward   = _backward
        return out

    def __radd__(self, other): # other + self
        return self + other

    def __rmul__(self, other): # other * self
        return self * other

    def __truediv__(self, other): # self / other
        return self * other**(-1)

    def __rtruediv__(self, other): # other / self
        return other * self**(-1)

    def __neg__(self): # -self
        return self * -1

    def __sub__(self, other): # self - other
        return self + (-other)

    def __rsub__(self, other): # other - self
        return other + (-self)

    def relu(self):
        """Return max(0, self) as a new node."""
        out = Value(0 if self.data < 0 else self.data, (self, ), 'ReLU')
        def _backward():
            # gradient passes through only where the output is positive
            self.grad += (out.data > 0) * out.grad
        out._backward  = _backward
        return out

    def tanh(self):
        """Return tanh(self) as a new node."""
        # math.tanh saturates to +/-1.0 for large |x|, whereas evaluating
        # (e^(2x) - 1) / (e^(2x) + 1) directly makes math.exp raise
        # OverflowError once 2x exceeds the float range
        t = math.tanh(self.data)
        out = Value(t, (self, ), 'tanh')
        # WS note: the closure captures 'self' (the child whose grad is set),
        # 't' (the forward result) and 'out' (the parent node), so everything
        # the backward step needs is encapsulated in one python function
        def _backward():
            # d/dx tanh(x) = 1 - tanh(x)**2
            self.grad += (1 - t**2) * out.grad
        out._backward  = _backward
        return out

    def exp(self):
        """Return e**self as a new node."""
        x = self.data
        out = Value(math.exp(x), (self, ), 'exp')
        def _backward():
            # d/dx e**x = e**x, which is the forward result itself
            self.grad += out.data * out.grad
        out._backward  = _backward
        return out

    def backward(self): # WS this is called on the final node
        """Backpropagate from this node, accumulating `grad` on every node it depends on.

        Sets this node's grad to 1.0 and then runs each node's `_backward`
        in reverse topological order. Existing grads on other nodes are not
        cleared first, so repeated calls accumulate.
        """
        # Post-order depth-first walk with an explicit stack instead of
        # recursion, so graph depth is not limited by the interpreter's
        # recursion limit. Each stack entry pairs a node with an iterator
        # over its not-yet-examined children.
        topo    = []
        visited = {self}
        stack   = [(self, iter(self._prev))]
        while stack:
            node, children = stack[-1]
            for child in children:
                if child not in visited:
                    visited.add(child)
                    stack.append((child, iter(child._prev)))
                    break  # descend into this child before the remaining ones
            else:
                # every child is done: the node may now enter the ordering
                topo.append(node)
                stack.pop()

        self.grad = 1.0
        for node in reversed(topo): # WS calculate all the gradients
            node._backward()


In [None]:
def trace(root):
    """Collect every node and edge reachable from root through `_prev`.

    Returns:
        (nodes, edges): `nodes` is the set of reachable objects including
        root; `edges` is a set of (child, parent) pairs, one for each
        child found in a parent's `_prev`.
    """
    nodes, edges = set(), set()
    # explicit stack rather than recursion, so deep graphs do not run into
    # the interpreter's recursion limit
    pending = [root]
    while pending:
        v = pending.pop()
        if v in nodes:
            continue
        nodes.add(v)
        for child in v._prev:
            edges.add((child, v))
            pending.append(child)
    return nodes, edges

def draw_dot(root):
    """Build a left-to-right graphviz Digraph of the graph ending at root.

    Each value becomes a record-shaped node showing its label, data and
    grad. A value produced by an operation additionally gets a small node
    for that operation, placed between the operands and the value.
    """
    graph = Digraph(format='svg', graph_attr={'rankdir': 'LR'}) # left to right
    all_nodes, all_edges = trace(root)

    for value in all_nodes:
        value_id = str(id(value)) # WS id() gives a name unique to this object
        record = '{%s | data %.4f | grad %.4f}' % (value.label, value.data, value.grad)
        graph.node(name=value_id, label=record, shape='record')
        if not value._op:
            continue  # leaf value: no operation node to add
        # operation node, named after the value it produces, feeding into it
        op_id = value_id + value._op
        graph.node(name=op_id, label=value._op)
        graph.edge(op_id, value_id)

    for src, dst in all_edges:
        # operands point at the operation node of the value they produce
        graph.edge(str(id(src)), str(id(dst)) + dst._op)

    return graph

In [None]:
# exercise division, exp, and mixed Value/number arithmetic
a = Value(2.0)
b = Value(4.0)
a / b, a.exp(), a + 1, a * 2, 3 * a  # __rmul__ handles integer * Value

In [None]:
# subtraction is composed from __add__ and __neg__; draw the resulting graph
c = a - b
draw_dot(c)

In [None]:
# division is composed from __mul__ and __pow__ with exponent -1; draw the resulting graph
d = a/b
draw_dot(d)

In [None]:
# build a small labelled graph by hand: L = (a*b + c) * f
# note: this rebinds the name f, which earlier referred to the quadratic function
a = Value( 2.0, label='a')
b = Value(-3.0, label='b')
c = Value(10.0, label='c')
e = a * b; e.label='e'
d = e + c; d.label='d'
f = Value(-2.0, label='f')
L = d * f; L.label='L'
L.grad = 1 # root node: dL/dL = 1, set by hand here without calling backward()
a + b, a * b, d

In [None]:
# a + b and a * b dispatch to these methods
a.__add__(b), a.__mul__(b)  # equivalent

In [None]:
# inspect d's operand nodes, the op that produced it, and the container holding the operands (a set)
d._prev, d._op, type(d._prev)

In [None]:
# render the hand-built graph ending at L
draw_dot(L)

In [None]:
# plot tanh on this cell's own grid over [-5, 5) with step 0.2, so the cell
# does not depend on a global left behind by an earlier cell
x = np.arange(-5, 5, 0.2)
plt.plot(x, np.tanh(x)); plt.grid()

In [None]:
# a single two-input neuron built by hand: o = tanh(x1*w1 + x2*w2 + b)
# inputs
x1 = Value( 2.0, label='x1')
x2 = Value( 0.0, label='x2')
# weights
w1 = Value(-3.0, label='w1')
w2 = Value( 1.0, label='w2')
# bias
b  = Value( 6.8813736, label='b')
# label every intermediate so the drawn graph is readable
x1w1 = x1 * w1; x1w1.label = 'x1*w1'
x2w2 = x2 * w2; x2w2.label = 'x2*w2'
x1w1x2w2 = x1w1 + x2w2; x1w1x2w2.label = 'x1*w1 + x2*w2'
n    = x1w1x2w2 + b; n.label = 'n'
o    = n.tanh(); o.label = 'o'

In [None]:
# visualize the neuron's graph
draw_dot(o)

In [None]:
# backpropagate from o, accumulating .grad on every node o depends on
o.backward()

In [None]:
# build a topological graph
topo    = []
visited = set()
def build_topo(v):
    if v not in visited:
        visited.add(v)
        for child in v._prev:
            build_topo(child)
        topo.append(v)
build_topo(o)

In [None]:
# walk the ordering from the output back toward the leaves, listing each node's children
for k in reversed(topo): # WS
    print('{}: {}'.format(k.label, k.data))
    for j in k._prev:
        print('   child: {}'.format(j.label))

In [None]:
# the same Value is used as both operands of one addition:
# both contributions accumulate into a.grad
a = Value(3.0, label='a')
b = a + a
b.backward()
draw_dot(b)

In [None]:
# a and b each feed both d and e, so their gradients sum the contributions from the two paths
a = Value(-2.0, label='a')
b = Value( 3.0, label='b')
d = a * b; d.label = 'd'
e = a + b; e.label = 'e'
f = d * e; f.label = 'f'
f.backward()
draw_dot(f)

In [None]:
# the same two-input neuron, with tanh composed from exp, subtraction and
# division so that those operations are exercised by backward()
x1 = Value( 2.0, label='x1')
x2 = Value( 0.0, label='x2')
w1 = Value(-3.0, label='w1')
w2 = Value( 1.0, label='w2')
b  = Value( 6.8813736, label='b')
x1w1 = x1 * w1; x1w1.label = 'x1*w1'
x2w2 = x2 * w2; x2w2.label = 'x2*w2'
x1w1x2w2 = x1w1 + x2w2; x1w1x2w2.label = 'x1*w1 + x2*w2'
n    = x1w1x2w2 + b; n.label = 'n'
# explicitly implement tanh now: tanh(n) = (e^(2n) - 1) / (e^(2n) + 1)
e    = (2*n).exp();     e.label = 'e'
o    = (e - 1)/(e + 1); o.label = 'o'
o.backward()
draw_dot(o)

In [None]:
import torch

In [None]:
# The same neuron in PyTorch. The tensors are created directly in double
# precision: building them with torch.Tensor(...) first would store the
# values as float32 and round 6.8813736 before any later upcast, so the
# result would no longer match the float64 arithmetic used by Value.
x1 = torch.tensor([ 2.0], dtype=torch.double, requires_grad=True)
x2 = torch.tensor([ 0.0], dtype=torch.double, requires_grad=True)
w1 = torch.tensor([-3.0], dtype=torch.double, requires_grad=True)
w2 = torch.tensor([ 1.0], dtype=torch.double, requires_grad=True)
b  = torch.tensor([6.8813736], dtype=torch.double, requires_grad=True)
n  = x1 * w1 + x2 * w2 + b
o  = torch.tanh(n)

In [None]:
# forward-pass output as a plain Python number
print(o.data.item())

In [None]:
# backpropagate from o, populating .grad on the tensors that require gradients
o.backward()

In [None]:
# gradient of o with respect to each input and weight
print('x2', x2.grad.item())
print('w2', w2.grad.item())
print('x1', x1.grad.item())
print('w1', w1.grad.item())

In [None]:
from numpy import random

In [None]:
class Module: # to keep similarity to pytorch
    """Base class for objects that own trainable parameters.

    Subclasses override `parameters()`; `zero_grad()` then works for them
    unchanged.
    """

    def zero_grad(self):
        """Reset the grad of every parameter before a new backward pass."""
        for p in self.parameters():
            # float zero, matching the 0.0 a Value starts with, so grad
            # keeps one type throughout
            p.grad = 0.0

    def parameters(self):
        """Return the list of trainable parameters (none for the base class)."""
        return []

class Neuron(Module):
    """One unit: a weighted sum of its inputs plus a bias, optionally passed through tanh."""

    def __init__(self, nin, nonlin=True):
        self.nin = nin # WS added
        # nin weights first, then the bias, each drawn uniformly from [-1, 1)
        weights = []
        for _ in range(nin):
            weights.append(Value(random.uniform(-1, 1)))
        self.w = weights
        self.b = Value(random.uniform(-1, 1))
        self.nonlin = nonlin # whether tanh is applied to the output

    def __call__(self, x):
        # w . x + b, accumulated starting from the bias;
        # zip stops at the shorter of the weights and the inputs
        act = self.b
        for wi, xi in zip(self.w, x):
            act = act + wi * xi
        if self.nonlin:
            return act.tanh()
        return act

    def parameters(self):
        # weights followed by the bias, as one flat list
        return [*self.w, self.b]

    def __repr__(self): # WS added
        return f"{'Tanh' if self.nonlin else 'Linear'}Neuron({len(self.w)})"

class Layer(Module):
    """A group of neurons that all receive the same input."""

    def __init__(self, nin, nout, **kwargs):
        self.nout = nout  # WS added
        # extra keyword arguments are forwarded to every Neuron
        self.neurons = [Neuron(nin, **kwargs) for _ in range(nout)]

    def __call__(self, x):
        outputs = [neuron(x) for neuron in self.neurons]
        # a single-neuron layer returns the bare output rather than a 1-element list
        if len(outputs) == 1:
            return outputs[0]
        return outputs

    def parameters(self):
        # flatten the per-neuron parameter lists, keeping neuron order
        return [p for neuron in self.neurons for p in neuron.parameters()]

    def __repr__(self): # WS added
        return f"Layer of [{', '.join(str(n) for n in self.neurons)}]"

class MLP(Module):
    """Multi-layer perceptron: layers applied in sequence, the last one without tanh."""

    def __init__(self, nin, nouts): # WS nouts is a list
        self.nin = nin  # WS added
        sz = [nin] + nouts
        last = len(nouts) - 1
        # consecutive pairs of sz give each layer's (inputs, outputs);
        # every layer except the final one applies the nonlinearity
        self.layers = [
            Layer(n_in, n_out, nonlin=(i != last))
            for i, (n_in, n_out) in enumerate(zip(sz, sz[1:]))
        ]

    def __call__(self, x):
        # each layer's output is the next layer's input
        for layer in self.layers:
            x = layer(x)
        return x

    def parameters(self):
        params = []
        for layer in self.layers:
            params.extend(layer.parameters())
        return params

    def __repr__(self): # WS added
        return f"MLP of [{', '.join(str(layer) for layer in self.layers)}]"


In [None]:
# WS example of double-list comprehension
# Without list comp
data = [[1,2,3],[7,8,9]]
out1 = []
for k in data:
    for j in k:
        out1.append(j)
# with list comp: the for-clauses read left to right in the same order as the nested loops above
out2 = [j for k in data for j in k]
out1, out2

In [None]:
# forward pass of a 3-input MLP with layer sizes 4, 4, 1 on a single example
x = [2.0, 3.0, -1.0]
n = MLP(3, [4, 4, 1])
n(x)

In [None]:
# total number of parameters (weights and biases) in the network
len(n.parameters())

In [None]:
# tiny data set: one target in ys for each input vector in xs
xs = [[2.0, 3.0, -1.0], 
      [3.0, -1.0, 0.5],
      [0.5, 1.0, 1.0],
      [1.0, 1.0, -1.0]] # 4 input vectors
ys = [1.0, -1.0, -1.0, 1.0] # 4 desired targets

In [None]:
# forward pass on every input, then the sum of squared errors against the targets
ypred = [n(x) for x in xs]
loss = sum((yout - ygt)**2 for ygt, yout in zip(ys, ypred))
loss

In [None]:
# backpropagate from the loss through the whole network
loss.backward()

In [None]:
# accumulated gradient on the first weight of the first neuron in the first layer
n.layers[0].neurons[0].w[0].grad

In [None]:
# current value of the first weight of the first neuron in the first layer
n.layers[0].neurons[0].w[0].data

In [None]:
# draw the full graph behind the loss: every operation of all four forward passes
draw_dot(loss) # really huge

In [None]:
# start over with a freshly initialized network
n = MLP(3, [4, 4, 1])

In [None]:
# 50 steps of plain gradient descent on the sum-of-squared-errors loss
for k in range(50):

    # forward pass
    ypred = [n(x) for x in xs]
    loss = sum((yout - ygt)**2 for ygt, yout in zip(ys, ypred))

    # backward pass: grads accumulate across passes, so clear them first
    n.zero_grad()
    loss.backward()

    # step each parameter against its gradient with learning rate 0.1
    for p in n.parameters():
        p.data -= 0.1 * p.grad

    #print(k, loss.data)
print(loss)
    

In [None]:
# predictions of the network on the training inputs after the loop above
[n(x) for x in xs]