In [1]:
import numpy as np
from collections import defaultdict
from typing import List

In [2]:
class Parameter:
    """Scalar node of a computation graph with reverse-mode differentiation.

    Every arithmetic operation or activation returns a new ``Parameter`` that
    remembers its operands. Calling ``backward`` on a result walks those
    records and accumulates ``d(result)/d(node)`` into each node's ``_grad``.

    Gradients are accumulated with ``+=`` and never reset by this class, so
    the caller must zero ``_grad`` between independent backward passes.
    """

    def __init__(self, value: float, name: str) -> None:
        """Create a leaf node holding ``value`` and labelled ``name``."""
        self._value = value
        self._name = name

        # Accumulated derivative of the differentiated result w.r.t. this node.
        self._grad = 0.0
        # Pushes this node's gradient to its operands; a leaf has none.
        self._backward = lambda: None
        # id -> node, for this node and every node it was computed from.
        self._parameters = {id(self): self}
        # id -> list of operand ids, for the same set of nodes.
        self._graph = defaultdict(list)

    def __repr__(self) -> str:
        return f"Parameter {self._name} = {self._value}; dL/d[{self._name}] = {self._grad}"

    def _link(self, parents, backward) -> None:
        """Record ``parents`` as this node's operands and install ``backward``.

        Merges the bookkeeping of every operand into this node so that the
        node alone is enough to traverse the whole expression it came from.
        """
        for parent in parents:
            self._parameters.update(parent._parameters)
        for parent in parents:
            self._graph.update(parent._graph)
        self._graph[id(self)].extend(id(parent) for parent in parents)
        self._backward = backward

    def _topologicalSortUtil(self, v, visited, stack):
        """Depth-first visit from node id ``v``.

        Marks ``v`` in ``visited`` and appends it to ``stack`` only after all
        of its operands, i.e. ``stack`` is filled in post-order.
        """
        visited[v] = True

        for i in self._graph[v]:
            if not visited[i]:
                self._topologicalSortUtil(i, visited, stack)

        stack.append(v)

    def _topologicalSort(self):
        """Return the ids of all known nodes, each one before its operands."""
        visited = {p: False for p in self._parameters}
        stack = []

        for i in self._parameters:
            if not visited[i]:
                self._topologicalSortUtil(i, visited, stack)

        # Reversing the post-order once yields the same sequence as
        # prepending each node, without the quadratic list shifting.
        stack.reverse()
        return stack

    def backward(self):
        """Back-propagate from this node, treating it as the final result.

        Sets this node's gradient to 1 and then runs the backward step of
        every node in topological order, so a node's gradient is complete
        before it is passed on to its operands.
        """
        self._grad = 1
        for k in self._topologicalSort():
            self._parameters[k]._backward()

    def __add__(self, other: 'Parameter') -> 'Parameter':
        """Return ``self + other`` as a new node."""
        result = Parameter(
            self._value + other._value,
            f'[{self._name} + {other._name}]'
        )

        def _backward():
            self._grad += 1.0 * result._grad   # d(result)/d(self) = 1
            other._grad += 1.0 * result._grad  # d(result)/d(other) = 1

        result._link((self, other), _backward)
        return result

    def __mul__(self, other: 'Parameter') -> 'Parameter':
        """Return ``self * other`` as a new node."""
        result = Parameter(
            self._value * other._value,
            f'{self._name} * {other._name}'
        )

        def _backward():
            self._grad += other._value * result._grad  # d(result)/d(self)
            other._grad += self._value * result._grad  # d(result)/d(other)

        result._link((self, other), _backward)
        return result

    def sigmoid(self) -> 'Parameter':
        """Return the logistic sigmoid of this node as a new node.

        f(x) = 1 / (1 + exp(-x)),  f'(x) = f(x) * (1 - f(x))
        """
        val = 1.0 / (1.0 + np.exp(-self._value))

        result = Parameter(
            val,
            f"σ({self._name})"
        )

        def _backward():
            self._grad += result._grad * val * (1 - val)

        result._link((self,), _backward)
        return result

    def relu(self) -> 'Parameter':
        """Return ``max(0, x)`` as a new node; the derivative at 0 is taken as 0."""
        result = Parameter(max(0, self._value), f'ReLU({self._name})')

        def _backward():
            self._grad += result._grad * (self._value > 0)

        # The closure belongs to the result: installing it on ``self`` would
        # replace the operand's own backward step and cut the chain there.
        result._link((self,), _backward)
        return result

    def softplus(self) -> 'Parameter':
        """Return ``log(1 + exp(x))`` as a new node.

        f'(x) = 1 / (1 + exp(-x)), the logistic sigmoid.
        """
        # logaddexp(0, x) == log(exp(0) + exp(x)) and does not overflow for large x.
        result = Parameter(np.logaddexp(0.0, self._value), f'Softplus({self._name})')

        def _backward():
            self._grad += result._grad * (1.0 / (1.0 + np.exp(-self._value)))

        result._link((self,), _backward)
        return result
    
        
def sgd(f, x: List['Parameter'], params: List['Parameter'] = None, lr=1e-2, steps=100, verbosity=3):
    """Minimise ``f(*x)`` with plain gradient descent.

    Each step rebuilds the expression by calling ``f(*x)``, back-propagates
    through it, moves every entry of ``params`` against its gradient, and
    resets that gradient to 0.

    Args:
        f: callable taking the entries of ``x`` positionally and returning an
            object with a ``backward()`` method.
        x: arguments passed to ``f`` on every step.
        params: the parameters to update and report; defaults to ``x``.
        lr: step size.
        steps: number of descent steps.
        verbosity: the parameters are printed every ``steps // verbosity``
            steps (at least every step), starting with step 0. A value of 0
            or less disables printing.
    """
    if params is None:
        params = x

    # Guard the interval: ``steps // verbosity`` is 0 when steps < verbosity,
    # and the division itself fails when verbosity is 0.
    report_every = max(1, steps // verbosity) if verbosity > 0 else None

    for s in range(steps):
        result = f(*x)
        result.backward()
        if report_every is not None and s % report_every == 0:
            for p in params:
                print(p)
        for p in params:
            p._value -= lr * p._grad
            p._grad = 0

In [3]:
# Objective: f(x) = (x + 1)(x + 2) = x^2 + 3x + 2, minimised at x = -1.5.
x = Parameter(6, 'x')
sgd(
    lambda p: (p + Parameter(1, '1')) * (p + Parameter(2, '2')),
    [x],
    [x],
    lr=1e-3,
    steps=10000,
)

Parameter x = 6; dL/d[x] = 15.0
Parameter x = -1.4905124216761014; dL/d[x] = 0.018975156647797142
Parameter x = -1.4999879981143407; dL/d[x] = 2.4003771318614753e-05
Parameter x = -1.4999999848174892; dL/d[x] = 3.036502160824739e-08


In [4]:
# Objective: f(x) = (x + x)(x + 1)x = 2x^3 + 2x^2, which has a local minimum at x = 0.
x = Parameter(6, 'x')
sgd(
    lambda p: (p + p) * (p + Parameter(1, '1')) * p,
    [x],
    [x],
    lr=1e-3,
    steps=10000,
)

Parameter x = 6; dL/d[x] = 240.0
Parameter x = 9.386130754670727e-07; dL/d[x] = 3.7544575878353234e-06
Parameter x = 1.4820435041903189e-12; dL/d[x] = 5.928174016774455e-12
Parameter x = 2.340108091993604e-18; dL/d[x] = 9.360432367974416e-18


In [5]:
# Objective: f(x) = x^2 + 6 - (ReLU(x) + x); for x > 0 this is x^2 - 2x + 6, minimised at x = 1.
x = Parameter(3, 'x')
sgd(
    lambda p: p * p + Parameter(6, '6') + Parameter(-1, '-1') * (p.relu() + p),
    [x],
    [x],
    lr=1e-3,
    steps=10000,
)

Parameter x = 3; dL/d[x] = 4.0
Parameter x = 1.0025300208863732; dL/d[x] = 0.0050600417727464375
Parameter x = 1.0000032005028432; dL/d[x] = 6.4010056863850195e-06
Parameter x = 1.0000000040486707; dL/d[x] = 8.097341464008423e-09


In [6]:
# Same objective, f(x) = x^2 + 6 - (ReLU(x) + x), started on the inactive side of the ReLU.
# For x <= 0 the slope is 2x - 1 < 0, so descent moves x into the positive branch.
x = Parameter(-3, 'x')
sgd(
    lambda p: p * p + Parameter(6, '6') + Parameter(-1, '-1') * (p.relu() + p),
    [x],
    [x],
    lr=1e-3,
    steps=10000,
)

Parameter x = -3; dL/d[x] = -7.0
Parameter x = 0.9911447655250428; dL/d[x] = -0.01771046894991435
Parameter x = 0.9999887980359132; dL/d[x] = -2.2403928173542198e-05
Parameter x = 0.9999999858293982; dL/d[x] = -2.834120360617476e-08
