In [None]:
#
# Project:
#      PyTorch Dojo (https://github.com/wo3kie/ml-dojo)
#
# Author:
#      Lukasz Czerwinski (https://www.lukaszczerwinski.pl/)
#

$$ \text{Model} = M(W, b) = Wx+b $$
$$ \frac{\partial M}{\partial W} = x $$
$$ \frac{\partial M}{\partial b} = 1 $$
$$ \\[2em] $$
$$ \text{Sigmoid} = S(M) = \frac{e^M}{e^M+1} $$
$$ \frac{\partial S}{\partial M} = \frac{e^M}{(e^M+1)^2} = S(1-S) $$
$$ \\[2em] $$
$$ \text{Loss} = L(S) = -(y\ln(S)+(1-y)\ln(1-S)) $$
$$ \frac{\partial L}{\partial S} = -\Big(y \frac{1}{S} + (1-y) \frac{1}{1-S}(-1) \Big) = \frac{S-y}{S(1-S)} $$
$$ \frac{\partial L}{\partial W} = \frac{\partial L}{\partial S} \frac{\partial S}{\partial M} \frac{\partial M}{\partial W} = \frac{S-y}{S(1-S)} \, S(1-S) \, x = (S-y)x $$
$$ \frac{\partial L}{\partial b} = \frac{\partial L}{\partial S} \frac{\partial S}{\partial M} \frac{\partial M}{\partial b} = \frac{S-y}{S(1-S)} \, S(1-S) \, 1 = S-y $$

In [None]:
from torch import exp, rand, Tensor

import import_ipynb
from common import assert_eq, assert_ge, assert_lt, Patient, T # type: ignore

def _Linear(X: Tensor, w: Tensor, b: Tensor) -> Tensor:
    return X @ w.T + b


def _Model(X: Tensor, w: Tensor, b: Tensor) -> Tensor:
    """
    Model M(W, b) = Wx + b: forwards its arguments unchanged to `_Linear`.

    Parameters:
        X: Input features
        w: Weights
        b: Bias

    Returns:
        Whatever `_Linear(X, w, b)` returns (the pre-sigmoid values).
    """

    pre_activation = _Linear(X, w, b)
    return pre_activation


def _Sigmoid(m: Tensor) -> Tensor:
    return exp(m) / (exp(m) + 1)


def _BinaryCrossEntropy(S: Tensor, y: Tensor) -> Tensor:
    return -((y * S.log()) + ((1 - y) * (1 - S).log())).mean()


def _Loss(S: Tensor, y: Tensor) -> Tensor:
    """
    Loss L(S): forwards its arguments unchanged to `_BinaryCrossEntropy`.

    Parameters:
        S: Predicted probabilities
        y: Target values

    Returns:
        Whatever `_BinaryCrossEntropy(S, y)` returns.
    """

    loss = _BinaryCrossEntropy(S, y)
    return loss


def logistic_regression_sgd_gradient(X: Tensor, y: Tensor, epochs=2000, lr=0.1) -> tuple[float, callable]:
    """
    Perform logistic regression using gradient descent with manual gradient calculation.

    Every epoch uses all rows of `X` at once (there is no mini-batching or shuffling),
    and the gradients are the hand-derived expressions (S - y) x and (S - y) rather
    than values obtained from autograd.

    Parameters:
        X: Input features of shape (Samples, Features)
        y: Target values of shape (Samples, 1)
        epochs: Number of training epochs; 0 skips training and keeps the random initial parameters
        lr: Learning rate

    Returns:
        A tuple containing the loss of the final parameters on (X, y) and a prediction
        function that takes new input data and returns predicted probabilities.
    """

    (s, f) = X.shape

    w = rand(1, f)
    assert_eq(w.shape, (1, f))

    b = rand(1)
    assert_eq(b.shape, (1,))

    for _ in range(epochs):
        M = _Model(X, w, b)
        assert_eq(M.shape, (s, 1))

        S = _Sigmoid(M)
        assert_eq(S.shape, (s, 1))

        # Shared factor of both gradients: dL/dM = S - y.
        error = S - y

        #
        # NOTE(review): this is the *sum* of (S - y) x over the samples, whereas `dL_db`
        # below is a *mean*. The exact gradient of the mean loss would be
        # `error.T @ X / s`, so `w` effectively trains with a step `s` times larger
        # than `b`. Left unchanged because the default `lr` was presumably chosen
        # against this behaviour - confirm before normalizing.
        #
        dL_dw = error.T @ X
        assert_eq(dL_dw.shape, (1, f))

        #
        # When reducing over the samples, using `mean()` is more appropriate than `sum()` because
        # it normalizes by the number of samples, providing a more stable and comparable
        # value across different batch sizes.
        #
        dL_db = error.mean()
        assert_eq(dL_db.shape, ())

        w = w - lr * dL_dw
        b = b - lr * dL_db

    #
    # In the autograd version, computing the loss is required because it serves
    # as the root of the computational graph for backpropagation.
    #
    # In the manual-gradient version, the loss value is not needed for the weight update itself,
    # so it is computed once, after training, from the final parameters. This way the
    # reported loss describes the returned model, and `epochs=0` no longer fails on an
    # unassigned `L`.
    #
    L = _Loss(_Sigmoid(_Model(X, w, b)), y)

    return (L.item(), lambda x: _Sigmoid(_Model(x, w, b)))


def _test_logistic_regression_sgd_gradient(epochs=2000, lr=0.1) -> None:
    """
    Train on 80 `Patient(0.5)` samples, then check predictions on fresh samples.

    The model must return at least 0.5 for each of 10 `Patient(1.0)` samples and less
    than 0.5 for each of 10 `Patient(0.0)` samples. (Presumably the `Patient` argument
    controls how likely the sample is to be a positive case - confirm in `common`.)

    Parameters:
        epochs: Number of training epochs passed to `logistic_regression_sgd_gradient`
        lr: Learning rate passed to `logistic_regression_sgd_gradient`
    """

    samples = T([Patient(0.5).data for _ in range(80)])

    # The last column is the target; every column before it is a feature.
    features = samples[:, :-1]
    features[:, 0] /= 100 # Data scaling to make training numerically stable
    targets = samples[:, -1].unsqueeze(1)

    predict = logistic_regression_sgd_gradient(features, targets, epochs, lr)[1]

    # (Patient argument, assertion that must hold against the 0.5 threshold)
    expectations = ((1.0, assert_ge), (0.0, assert_lt))

    for (patient_arg, check) in expectations:
        for row in T([Patient(patient_arg).data for _ in range(10)]):
            row[0] /= 100 # The same data scaling as during training.
            check(predict(row[:-1]), T(0.5))


# Run the self-test only when this file is executed directly (its `__name__` is
# "__main__"); presumably this keeps the test from running when the notebook is
# imported as a module via `import_ipynb` - confirm.
if __name__ == "__main__":
    _test_logistic_regression_sgd_gradient()