In [42]:
import math
import random

In [43]:
class Parameter:
    """Scalar node in a dynamically built computation graph (reverse-mode autodiff).

    Every arithmetic / activation method returns a new ``Parameter`` that
    remembers its operands in ``_prev`` and stores a ``_backward`` closure.
    That closure adds the node's contribution to each operand's ``_grad``
    (chain rule). Calling :meth:`backward` on an output node walks the graph
    and fills ``_grad`` on every node that the output depends on.

    Attributes:
        _value: the scalar held by this node.
        _name: label used in ``repr``; derived nodes get a name built from
            their operands' names.
        _grad: accumulated d(output)/d(this node); 0.0 until ``backward`` runs.
        _backward: zero-argument callable pushing ``_grad`` to the operands;
            a no-op for nodes created directly by the constructor.
        _prev: set of operand nodes this node was computed from.
    """

    def __init__(self, value: float, name: str, _prev=()) -> None:
        """Create a node holding ``value``; ``_prev`` lists the operand nodes, if any."""
        self._value = value
        self._name = name
        self._grad = 0.0
        self._backward = lambda: None
        self._prev = set(_prev)

    def __repr__(self) -> str:
        return f"Parameter {self._name} = {self._value}; dL/d[{self._name}] = {self._grad}"

    def zero_grad(self):
        """Reset this node's accumulated gradient to 0.0."""
        self._grad = 0.0

    def __mul__(self, other: 'Parameter') -> 'Parameter':
        """Return the node ``self * other``."""
        result = Parameter(
            self._value * other._value,
            f'({self._name} * {other._name})',
            (self, other)
        )

        def _backward():
            # d(a*b)/da = b and d(a*b)/db = a. Using "+=" on both operands also
            # gives the right answer (2a) when self and other are the same node.
            self._grad += other._value * result._grad
            other._grad += self._value * result._grad

        result._backward = _backward
        return result

    def __add__(self, other: 'Parameter') -> 'Parameter':
        """Return the node ``self + other``."""
        result = Parameter(
            self._value + other._value,
            f'({self._name} + {other._name})',
            (self, other)
        )

        def _backward():
            # Addition passes the upstream gradient through unchanged.
            self._grad += 1.0 * result._grad
            other._grad += 1.0 * result._grad

        result._backward = _backward
        return result

    def relu(self) -> 'Parameter':
        """Return the node ``max(0, self)``; the gradient at exactly 0 is taken as 0."""
        result_value = max(0.0, self._value)
        result = Parameter(
            result_value,
            f'relu({self._name})',
            (self,)
        )

        def _backward():
            self._grad += (1.0 if self._value > 0 else 0.0) * result._grad

        result._backward = _backward
        return result

    def sigmoid(self) -> 'Parameter':
        """Return the node ``1 / (1 + exp(-self))``.

        The value is computed with a sign-dependent formula so that ``exp`` is
        only ever called on a non-positive argument. The direct form
        ``1 / (1 + math.exp(-x))`` raises ``OverflowError`` for large negative
        ``x`` (for example ``x = -1000``).
        """
        x = self._value
        if x >= 0:
            sig_value = 1.0 / (1.0 + math.exp(-x))
        else:
            exp_x = math.exp(x)  # in (0, 1); may underflow to 0.0 but cannot overflow
            sig_value = exp_x / (1.0 + exp_x)

        result = Parameter(
            sig_value,
            f'sigmoid({self._name})',
            (self,)
        )

        def _backward():
            # sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x))
            self._grad += sig_value * (1 - sig_value) * result._grad

        result._backward = _backward
        return result

    def silu(self) -> 'Parameter':
        """Return the node ``self * sigmoid(self)`` with a closed-form backward pass."""
        sigmoid = self.sigmoid()
        result = self * sigmoid
        result._name = f'silu({self._name})'

        def _backward():
            # Closed-form derivative: s + x * s * (1 - s), with s = sigmoid(x).
            # This already contains the path through the intermediate sigmoid
            # node, so sigmoid._grad is deliberately left untouched: it stays
            # 0.0 during backward(), and the sigmoid node's own _backward then
            # adds nothing to self._grad (no double counting).
            self._grad += (sigmoid._value + self._value * sigmoid._value * (1 - sigmoid._value)) * result._grad

        # Replace the generic product rule installed by __mul__.
        result._backward = _backward
        return result

    def backward(self):
        """Compute d(self)/d(node) for every node reachable from ``self``.

        All reachable nodes (leaves included) have ``_grad`` reset to 0.0
        first, then ``self._grad`` is set to 1.0 and the ``_backward`` closures
        are run from the output towards the leaves.
        """
        # Post-order DFS with an explicit stack: operands are appended to
        # `topo` before the nodes that consume them. An explicit stack is used
        # instead of recursion so that deep graphs (long chains of operations)
        # do not hit Python's recursion limit.
        topo = []
        visited = {self}
        stack = [(self, iter(self._prev))]
        while stack:
            node, operands = stack[-1]
            for operand in operands:
                if operand not in visited:
                    visited.add(operand)
                    stack.append((operand, iter(operand._prev)))
                    break
            else:
                # Every operand of `node` has been emitted already.
                topo.append(node)
                stack.pop()

        # reset gradients
        for param in topo:
            param._grad = 0.0

        # set the gradient of the output
        self._grad = 1.0

        # propagate gradients: each node runs only after all of its consumers
        for param in reversed(topo):
            param._backward()

In [44]:
def sgd(parameters: list[Parameter], lr: float) -> None:
    """Apply one gradient-descent step in place.

    Each parameter's ``_value`` is moved against its current ``_grad`` by a
    step of size ``lr``. Gradients are read but not reset here.
    """
    for p in parameters:
        step = lr * p._grad
        p._value = p._value - step

def mse_loss(y_pred: Parameter, y_true: float) -> Parameter:
    """Return the squared error ``(y_pred - y_true) ** 2`` as a graph node.

    The subtraction is expressed as adding a constant node holding
    ``-y_true``, and the square as the residual multiplied by itself, so only
    ``Parameter.__add__`` and ``Parameter.__mul__`` are needed.
    """
    negated_target = Parameter(-y_true, '-y_true')
    residual = y_pred + negated_target
    return residual * residual

def train(model: LinearModel, x_train: list[float], y_train: list[float], lr: float = 0.1, epochs: int = 100):
    """Fit ``model`` with per-sample SGD on a squared-error loss.

    In every epoch each ``(x, y)`` pair is wrapped in a ``Parameter``, passed
    through ``model``, scored with ``mse_loss`` and back-propagated;
    ``model.w`` and ``model.b`` are then updated immediately via ``sgd``.
    After each epoch the mean per-sample loss is printed.

    Args:
        model: callable mapping a ``Parameter`` to a ``Parameter`` and exposing
            ``w`` and ``b`` ``Parameter`` attributes. (``LinearModel`` itself
            is defined outside this cell.)
        x_train: input values.
        y_train: target values, paired positionally with ``x_train``.
        lr: learning rate forwarded to ``sgd``.
        epochs: number of full passes over the data; nothing happens when
            this is 0 or negative.

    Raises:
        ValueError: if at least one epoch is requested but there is no
            ``(x, y)`` pair to train on.
    """
    # zip() stops at the shorter sequence, so this -- not len(x_train) -- is
    # the number of samples that contribute to total_loss in each epoch.
    n_samples = min(len(x_train), len(y_train))
    if epochs > 0 and n_samples == 0:
        raise ValueError("train() needs at least one (x, y) training pair")

    for epoch in range(epochs):
        total_loss = 0.0

        for x_val, y_val in zip(x_train, y_train):
            x_param = Parameter(x_val, 'x')
            y_pred = model(x_param)

            loss = mse_loss(y_pred, y_val)

            # backpropagate (also resets the gradients of every node in this
            # sample's graph before accumulating)
            loss.backward()

            # update parameters
            sgd([model.w, model.b], lr)

            # loss._value was computed before the update above
            total_loss += loss._value

        print(f"Epoch {epoch + 1}, Loss: {total_loss / n_samples}")

# Noise-free samples of the line y = 3x + 2 at the integers x = -10 .. 10.
x_train = list(range(-10, 11))
y_train = [2 + 3 * x for x in x_train]

# Fresh model instance (LinearModel is defined outside this cell).
model = LinearModel()

# Fit with per-sample SGD for 20 passes over the data.
train(model, x_train, y_train, epochs=20, lr=0.01)

# Report the fitted coefficients; the generating line has w = 3 and b = 2.
print(f"Learned w: {model.w._value}, b: {model.b._value}")

Epoch 1, Loss: 46.84976766430972
Epoch 2, Loss: 4.762173652918586
Epoch 3, Loss: 3.00929726116663
Epoch 4, Loss: 1.9016253219818784
Epoch 5, Loss: 1.2016688786008447
Epoch 6, Loss: 0.7593546831259383
Epoch 7, Loss: 0.47984893763469993
Epoch 8, Loss: 0.30322457748108844
Epoch 9, Loss: 0.191612687196531
Epoch 10, Loss: 0.12108326508251104
Epoch 11, Loss: 0.07651454242173733
Epoch 12, Loss: 0.04835082038808792
Epoch 13, Loss: 0.03055369290866978
Epoch 14, Loss: 0.01930739008902733
Epoch 15, Loss: 0.012200663047970548
Epoch 16, Loss: 0.007709803247551579
Epoch 17, Loss: 0.004871953752205555
Epoch 18, Loss: 0.0030786691438805364
Epoch 19, Loss: 0.0019454625761160394
Epoch 20, Loss: 0.0012293703734259876
Learned w: 3.002916668942686, b: 1.9690519687344459
