In [2]:
import numpy as np
from collections import defaultdict

In [26]:
class Parameter:
    """Scalar node of a tiny reverse-mode automatic-differentiation graph.

    Every arithmetic operation returns a new ``Parameter`` that remembers
    which nodes it was computed from. Calling :meth:`backward` on the final
    node accumulates dL/d[node] into ``_grad`` of every node that took part
    in the computation.

    Attributes:
        _value: Numeric value held by the node.
        _name: Human-readable label used in ``repr``.
        _grad: Accumulated gradient of the output with respect to this node.
        _backward: Zero-argument callable that pushes this node's ``_grad``
            to the nodes it was computed from (a no-op for leaf nodes).
        _parameters: Maps ``id(node) -> node`` for this node and every node
            it depends on.
        _graph: Maps ``id(node) -> [id(input), ...]`` for the same nodes.
    """

    def __init__(self, value: float, name: str) -> None:
        """Create a leaf node holding ``value`` and labelled ``name``."""
        self._value = value
        self._name = name

        self._grad = 0.0
        self._backward = lambda: None
        self._parameters = {id(self): self}
        self._graph = defaultdict(list)

    def __repr__(self) -> str:
        return f"Parameter {self._name} = {self._value}; dL/d[{self._name}] = {self._grad}"

    def _record_op(self, inputs: tuple, backward) -> 'Parameter':
        """Register ``self`` as the result of an operation on ``inputs``.

        Merges the bookkeeping of every input into this node, adds the edges
        ``self -> input`` and installs ``backward`` as this node's backward
        step. Returns ``self`` so that operations can ``return`` the call.
        """
        for node in inputs:
            self._parameters.update(node._parameters)
        for node in inputs:
            self._graph.update(node._graph)
        self._graph[id(self)].extend(id(node) for node in inputs)
        self._backward = backward
        return self

    def _topologicalSortUtil(self, v: int, visited: dict, stack: list) -> None:
        """Depth-first walk from node id ``v``.

        Marks every newly reached node in ``visited`` and prepends the
        reached nodes to ``stack`` so that each node precedes all of its
        inputs. The walk uses an explicit work list instead of recursion, so
        the depth of the graph is not bounded by the interpreter recursion
        limit.
        """
        visited[v] = True
        finished = []  # node ids in post-order (inputs before their consumer)
        work = [(v, iter(self._graph[v]))]
        while work:
            node, pending_inputs = work[-1]
            for child in pending_inputs:
                if not visited[child]:
                    visited[child] = True
                    work.append((child, iter(self._graph[child])))
                    break
            else:
                # All inputs of ``node`` are handled: the node is finished.
                work.pop()
                finished.append(node)

        # Reversed post-order placed in front of earlier results is exactly
        # what one front-insertion per finished node would have produced.
        stack[:0] = reversed(finished)

    def _topologicalSort(self) -> list:
        """Return the ids of all known nodes, every node before its inputs."""
        visited = {p: False for p in self._parameters}
        stack = []

        for node_id in self._parameters.keys():
            if not visited[node_id]:
                self._topologicalSortUtil(node_id, visited, stack)

        return stack

    def backward(self) -> None:
        """Back-propagate from this node.

        Sets this node's gradient to 1 and then runs the backward step of
        every node in topological order, so a node's gradient is complete
        before it is pushed further down. Gradients are accumulated with
        ``+=`` and are not reset here.
        """
        self._grad = 1
        for node_id in self._topologicalSort():
            self._parameters[node_id]._backward()

    def __add__(self, other: 'Parameter') -> 'Parameter':
        """Return the node ``self + other``."""
        result = Parameter(
            self._value + other._value,
            f'[{self._name} + {other._name}]'
        )

        def _backward():
            self._grad += 1.0 * result._grad   # dL / dself
            other._grad += 1.0 * result._grad  # dL / dother

        return result._record_op((self, other), _backward)

    def __mul__(self, other: 'Parameter') -> 'Parameter':
        """Return the node ``self * other``."""
        result = Parameter(
            self._value * other._value,
            f'{self._name} * {other._name}'
        )

        def _backward():
            self._grad += other._value * result._grad  # dL / dself
            other._grad += self._value * result._grad  # dL / dother

        return result._record_op((self, other), _backward)

    def sigmoid(self) -> 'Parameter':
        """Return the node ``σ(self)``, the logistic sigmoid of this node."""
        # f(x) = 1 / (1 + exp(-x))
        # f'(x) = f(x) * (1 - f(x))
        val = 1.0 / (1.0 + np.exp(-self._value))

        result = Parameter(
            val,
            f"σ({self._name})"
        )

        def _backward():
            self._grad += result._grad * val * (1 - val)

        return result._record_op((self,), _backward)

    def relu(self) -> 'Parameter':
        """Return the node ``max(0, self)``."""
        result = Parameter(max(0, self._value), f'ReLU({self._name})')

        def _backward():
            # The derivative is 1 for positive inputs and 0 otherwise.
            if self._value > 0:
                self._grad += result._grad

        # The backward step belongs to the *result*; attaching it to ``self``
        # would overwrite the step that propagates into ``self``'s own inputs.
        return result._record_op((self,), _backward)

    def softplus(self) -> 'Parameter':
        """Return the node ``log(1 + exp(self))``."""
        # logaddexp(0, x) == log(exp(0) + exp(x)) and does not overflow for
        # large x the way log1p(exp(x)) does.
        result = Parameter(
            np.logaddexp(0.0, self._value),
            f'Softplus({self._name})'
        )

        def _backward():
            # d/dx softplus(x) = sigmoid(x), scaled by the upstream gradient.
            self._grad += result._grad / (1.0 + np.exp(-self._value))

        return result._record_op((self,), _backward)
    
        
def sgd(f, x: "list[Parameter]", params: "list[Parameter] | None" = None,
        lr=1e-2, steps=100, verbose=True):
    """Minimise ``f(*x)`` with plain gradient descent.

    Each step rebuilds the computation by calling ``f(*x)``, back-propagates
    from the returned node, and then updates every entry of ``params`` with
    ``value -= lr * grad`` before resetting its gradient to 0.

    Args:
        f: Callable that takes the entries of ``x`` as positional arguments
            and returns an object with a ``backward()`` method.
        x: Arguments passed to ``f`` on every step.
        params: Parameters to update. Defaults to ``x`` when omitted.
        lr: Learning rate.
        steps: Number of gradient steps.
        verbose: When true, print the gradient and the parameter before
            every update.
    """
    if params is None:
        # Without an explicit selection, optimise everything fed to ``f``.
        params = x

    for _ in range(steps):
        result = f(*x)
        result.backward()
        for p in params:
            if verbose:
                print(p._grad)
                print(p)
            p._value -= lr * p._grad
            # Gradients accumulate with ``+=``, so clear them for the next step.
            p._grad = 0

In [27]:
# Minimise (x + 1) * (x + 2) by gradient descent, starting from x = 6.
# The constants are rebuilt on every call so each step gets a fresh graph.
x = Parameter(6, 'x')
sgd(
    lambda p: (p + Parameter(1, '1')) * (p + Parameter(2, '2')),
    [x],
    [x],
    lr=1e-1,
    steps=10000,
)

15.0
Parameter x = 6; dL/d[x] = 15.0
12.0
Parameter x = 4.5; dL/d[x] = 12.0
9.6
Parameter x = 3.3; dL/d[x] = 9.6
7.68
Parameter x = 2.34; dL/d[x] = 7.68
6.144
Parameter x = 1.5719999999999998; dL/d[x] = 6.144
4.9152
Parameter x = 0.9575999999999998; dL/d[x] = 4.9152
3.9321599999999997
Parameter x = 0.46607999999999983; dL/d[x] = 3.9321599999999997
3.145728
Parameter x = 0.07286399999999982; dL/d[x] = 3.145728
2.5165823999999994
Parameter x = -0.24170880000000022; dL/d[x] = 2.5165823999999994
2.0132659199999994
Parameter x = -0.4933670400000002; dL/d[x] = 2.0132659199999994
1.6106127359999998
Parameter x = -0.6946936320000001; dL/d[x] = 1.6106127359999998
1.2884901888
Parameter x = -0.8557549056000001; dL/d[x] = 1.2884901888
1.03079215104
Parameter x = -0.9846039244800001; dL/d[x] = 1.03079215104
0.8246337208319998
Parameter x = -1.087683139584; dL/d[x] = 0.8246337208319998
0.6597069766655999
Parameter x = -1.1701465116672; dL/d[x] = 0.6597069766655999
0.52776558133248
Parameter x = -1.

In [28]:
# Show the optimised parameter; (x + 1) * (x + 2) has its minimum at x = -1.5.
x

Parameter x = -1.4999999999999996; dL/d[x] = 0