# Lab2: Backpropagation handmade

In [106]:
# 1. Build the class for math expressions
# 2. Build some basic operations
# 3. Build computation graph
# 4. Build some activations
# 5. Build backpropagation

In [107]:
import numpy as np
from collections import defaultdict

import math
import matplotlib.pyplot as plt

In [108]:
def print_testpr(l):
    for i in l:
        print(i)

In [109]:
from graphlib import TopologicalSorter
class Parameter:
    def __init__(self, value: float, name: str) -> None:
        self._value = value
        self._name = name

        self._grad = 0.0
        self._backward = lambda: None
        
        self.parameters = dict()
        self.parameters[id(self)] = self
        self._graph = defaultdict(set)
        self._graph[id(self)] = set()

    def __repr__(self) -> str:
        return f"Parameter {self._name} = {self._value}; dL/d[{self._name}] = {self._grad}"

    def __mul__(self, other):
        result = Parameter(
            self._value * other._value,
            f'{self._name} * {other._name}'
        )
        
        result.parameters.update(self.parameters)
        result.parameters.update(other.parameters)
        result.parameters[id(result)] = result
        
        result._graph.update(self._graph)
        result._graph.update(other._graph)
        result._graph[id(result)].add(id(self))
        result._graph[id(result)].add(id(other))

        def _backward():
            self._grad += other._value * result._grad #dL / dself
            other._grad += self._value * result._grad # dL / dother

        result._backward = _backward

        return result

    def __add__(self, other):
        result = Parameter(
            self._value + other._value,
            f'[{self._name} + {other._name}]'
        )
        
        result.parameters.update(self.parameters)
        result.parameters.update(other.parameters)
        result.parameters[id(result)] = result
        
        result._graph.update(self._graph)
        result._graph.update(other._graph)
        result._graph[id(result)].add(id(self))
        result._graph[id(result)].add(id(other))

        def _backward():
            self._grad += 1.0 * result._grad  #dL / dself
            other._grad += 1.0 * result._grad # dL / dother

        result._backward = _backward

        return result

    def sigmoid(self):
        # f(x) = 1 / (1 + exp(self._value))
        # f'(x) = f(x) * (1 - f(x))

        val = 1.0 / (1.0 + math.exp(-self._value))

        result = Parameter(
            val,
            f"σ({self._name})"
        )
        
        result.parameters.update(self.parameters)
        result.parameters[id(result)] = result
        
        result._graph.update(self._graph)
        result._graph[id(result)].add(id(self))
        
        def _backward():
            self._grad = result._grad * val * (1 - val)

        result._backward = _backward

        return result

    def _topological_sort(self):
        in_degree = {node: 0 for node in self._graph}
        for node in self._graph:
            for neighbor in self._graph[node]:
                in_degree[neighbor] += 1

        queue = [node for node in self._graph if in_degree[node] == 0]
        result = []

        while queue:
            current_node = queue.pop(0)
            result.append(current_node)

            for neighbor in self._graph[current_node]:
                in_degree[neighbor] -= 1
                if in_degree[neighbor] == 0:
                    queue.append(neighbor)

        return result

    def backward(self):
        queue = self._topological_sort()
        print(queue)
        self._grad = 1
        for i in queue:
            print(i)
            self.parameters[i]._backward()
            
            


    def SiLU(self):
        # LAB 2 TASK 2
        # f(x) = x*sigma(x)
        # f'(x) = sigma(x) + x*sigma(x)*(1-sigma(x))
        val = self._value * self.sigmoid()._value

        result = Parameter(
            val,
            f"{self._name}*σ({self._name})"
        )
        
        result.parameters.update(self.parameters)
        result.parameters[id(result)] = result
        
        result._graph.update(self._graph)
        result._graph[id(result)].add(id(self))
        
        def _backward():
            self._grad = result._grad * ( self.sigmoid()._value + self._value*self.sigmoid()._value*(1-self.sigmoid()._value) )

        result._backward = _backward

        return result
        

    def ReLU(self):
        # LAB 2 TASK 2
        # f(x) = max(0, x)
        # f'(x) = z>0
        val = max(0, self._value)

        result = Parameter(
            val,
            f"max(0, {self._name})"
        )
        
        result.parameters.update(self.parameters)
        result.parameters[id(result)] = result
        
        result._graph.update(self._graph)
        result._graph[id(result)].add(id(self))
        
        def _backward():
            self._grad = result._grad * ( self._value > 0 )

        result._backward = _backward

        return result

In [110]:
def sgd(parameters: list, learning_rate = 0.3): 
    for j in parameters:
        j._value -= learning_rate * j._grad        
    return parameters

## Task 1

### Test 1

In [111]:
a = Parameter(3.0, 'a')
b = Parameter(2.0, 'b')
c = Parameter(5.0, 'c')
d = Parameter(5.0, 'd')

In [112]:
print(a)
print(b)
print(c)
print(d)

Parameter a = 3.0; dL/d[a] = 0.0
Parameter b = 2.0; dL/d[b] = 0.0
Parameter c = 5.0; dL/d[c] = 0.0
Parameter d = 5.0; dL/d[d] = 0.0


In [113]:
u = a * b
v = u + c
L = v * d

In [114]:
L.backward()

[2430023952528, 2430122163920, 2430143405136, 2430143487568, 2430122446672, 2430104104656, 2430121001552]
2430023952528
2430122163920
2430143405136
2430143487568
2430122446672
2430104104656
2430121001552


In [115]:
l1 = [ L, v, d, u, c, a, b]
print_testpr(l1)

Parameter [a * b + c] * d = 55.0; dL/d[[a * b + c] * d] = 1
Parameter [a * b + c] = 11.0; dL/d[[a * b + c]] = 5.0
Parameter d = 5.0; dL/d[d] = 11.0
Parameter a * b = 6.0; dL/d[a * b] = 5.0
Parameter c = 5.0; dL/d[c] = 5.0
Parameter a = 3.0; dL/d[a] = 10.0
Parameter b = 2.0; dL/d[b] = 15.0


### Test 2

In [116]:
y1 = Parameter(3.0, 'y1')
y2 = Parameter(2.0, 'y2')
c1 = Parameter(7.0, 'c1')
c2 = Parameter(5.0, 'c2')

In [117]:
y1c1 = y1*c1
y2c2y1 = y2*c2*y1
our = y1c1 + y2c2y1

In [118]:
our.backward()

[2430143201232, 2430143190544, 2430143188624, 2430143185168, 2430143187088, 2430143185744, 2430143185680, 2430143200080]
2430143201232
2430143190544
2430143188624
2430143185168
2430143187088
2430143185744
2430143185680
2430143200080


In [119]:
test = [our, y2c2y1, y1c1, y1, y2, c1, c2]
print_testpr(test)

Parameter [y1 * c1 + y2 * c2 * y1] = 51.0; dL/d[[y1 * c1 + y2 * c2 * y1]] = 1
Parameter y2 * c2 * y1 = 30.0; dL/d[y2 * c2 * y1] = 1.0
Parameter y1 * c1 = 21.0; dL/d[y1 * c1] = 1.0
Parameter y1 = 3.0; dL/d[y1] = 17.0
Parameter y2 = 2.0; dL/d[y2] = 15.0
Parameter c1 = 7.0; dL/d[c1] = 3.0
Parameter c2 = 5.0; dL/d[c2] = 6.0


#### Test 3

In [120]:
x1 = Parameter(3.0, 'x1')
x2 = Parameter(4.0, 'x2')

w1 = Parameter(1.0, 'w1')
w2 = Parameter(2.0, 'w2')

x1w1 = x1 * w1
x2w2 = x2 * w2
xw = x1w1 + x2w2
out = xw.sigmoid()

In [121]:
out.backward()

[2430120895376, 2430120976272, 2430143200208, 2430143195984, 2430143197968, 2430143194512, 2430143194960, 2430143199184]
2430120895376
2430120976272
2430143200208
2430143195984
2430143197968
2430143194512
2430143194960
2430143199184


In [122]:
l2 = [out, xw, x2w2, x1w1, x1, x2, w1, w2]
print_testpr(l2)

Parameter σ([x1 * w1 + x2 * w2]) = 0.999983298578152; dL/d[σ([x1 * w1 + x2 * w2])] = 1
Parameter [x1 * w1 + x2 * w2] = 11.0; dL/d[[x1 * w1 + x2 * w2]] = 1.670114291046157e-05
Parameter x2 * w2 = 8.0; dL/d[x2 * w2] = 1.670114291046157e-05
Parameter x1 * w1 = 3.0; dL/d[x1 * w1] = 1.670114291046157e-05
Parameter x1 = 3.0; dL/d[x1] = 1.670114291046157e-05
Parameter x2 = 4.0; dL/d[x2] = 3.340228582092314e-05
Parameter w1 = 1.0; dL/d[w1] = 5.010342873138471e-05
Parameter w2 = 2.0; dL/d[w2] = 6.680457164184628e-05


#### Test 4

In [123]:
xx = x1*x1
xx.backward()

[2430143100432, 2430143197968]
2430143100432
2430143197968


In [124]:
xx, x1

(Parameter x1 * x1 = 9.0; dL/d[x1 * x1] = 1,
 Parameter x1 = 3.0; dL/d[x1] = 6.00001670114291)

#### Test 5

In [125]:
x = Parameter(4, "x")
y = Parameter(5, "y")
Lout = x*(x*x+y)

In [126]:
Lout.backward()

[2430143514832, 2430143100176, 2430143095312, 2430143088912, 2430143186000]
2430143514832
2430143100176
2430143095312
2430143088912
2430143186000


In [127]:
test2 = [Lout, x, y]
print_testpr(test2)

Parameter x * [x * x + y] = 84; dL/d[x * [x * x + y]] = 1
Parameter x = 4; dL/d[x] = 53.0
Parameter y = 5; dL/d[y] = 4.0


#### Test 6

In [128]:
x = Parameter(4,"x")
y = Parameter(5,"y")
L = (x*x)*(x*x*y)
L.backward()
print_testpr([L, xx, x,y])

[2430143525840, 2430143519504, 2430143522832, 2430143521232, 2430143518160, 2430143188432]
2430143525840
2430143519504
2430143522832
2430143521232
2430143518160
2430143188432
Parameter x * x * x * x * y = 1280; dL/d[x * x * x * x * y] = 1
Parameter x1 * x1 = 9.0; dL/d[x1 * x1] = 1
Parameter x = 4; dL/d[x] = 1280.0
Parameter y = 5; dL/d[y] = 256.0


In [129]:
x = Parameter(4.0, 'x')
y = Parameter(4.0, 'y')
w = Parameter(4.0, 'w')

u1 = x + y
u2 = u1 * x
u3 = u2 + w
L = u1 + u3
L.backward()
print_testpr([x,y])

[2430143523152, 2430143522000, 2430143522320, 2430143526288, 2430143521744, 2430143526544, 2430143518800]
2430143523152
2430143522000
2430143522320
2430143526288
2430143521744
2430143526544
2430143518800
Parameter x = 4.0; dL/d[x] = 13.0
Parameter y = 4.0; dL/d[y] = 5.0


In [130]:
import torch
x = torch.tensor(4., requires_grad=True)
y = torch.tensor(4., requires_grad=True)
w = torch.tensor(4., requires_grad=True)
u1 = x + y
u2 = u1 * x
u3 = u2 + w
L = u1 + u3
L.backward()
x.grad, y.grad

(tensor(13.), tensor(5.))

## Task 2-3

In [131]:
N = 20

In [132]:
x = Parameter(3.0, 'x')
b = Parameter(1.0, 'b')

w1 = Parameter(1.0, 'w1')
w2 = Parameter(2.0, 'w2')
parameters = [w1, w2, b]
L = (w2*(w1*x+b).ReLU()+b).ReLU()
print("Before:")
print(L)
print_testpr(parameters)
for i in range(N):
    L.backward()
    sgd(parameters)
    L = (w2*(w1*x+b).ReLU()+b).ReLU()
print("After:")
print(L)
print_testpr(parameters)

Before:
Parameter max(0, [w2 * max(0, [w1 * x + b]) + b]) = 9.0; dL/d[max(0, [w2 * max(0, [w1 * x + b]) + b])] = 0.0
Parameter w1 = 1.0; dL/d[w1] = 0.0
Parameter w2 = 2.0; dL/d[w2] = 0.0
Parameter b = 1.0; dL/d[b] = 0.0
[2430143290768, 2430143293328, 2430143283600, 2429364125584, 2430143297552, 2430143297872, 2430143290000, 2429364118928, 2429364123856, 2430143516752]
2430143290768
2430143293328
2430143283600
2429364125584
2430143297552
2430143297872
2430143290000
2429364118928
2429364123856
2430143516752
[2430143641936, 2430143636368, 2430143637776, 2429364125584, 2430143639888, 2430143637264, 2430143641872, 2429364118928, 2429364123856, 2430143516752]
2430143641936
2430143636368
2430143637776
2429364125584
2430143639888
2430143637264
2430143641872
2429364118928
2429364123856
2430143516752
[2430143283600, 2430143297872, 2430143290768, 2429364125584, 2430143635984, 2430143633808, 2430143640400, 2429364118928, 2429364123856, 2430143516752]
2430143283600
2430143297872
2430143290768
24293