## Setup Libraries

In [8]:
import numpy as np

## Bias term: prepend a column of ones to the inputs (for a better fit)

In [9]:
def add_ones(matrix):
    """Return ``matrix`` with a column of ones prepended.

    The layers in this file multiply inputs by a weight matrix without a
    separate bias vector, so the constant column gives the first layer's
    weights something to act as an intercept on.

    Args:
        matrix: 2-D array of shape (n_samples, n_features).

    Returns:
        Array of shape (n_samples, n_features + 1) whose first column is 1.0.
    """
    n_rows = matrix.shape[0]
    bias_column = np.ones((n_rows, 1))
    return np.concatenate((bias_column, matrix), axis=1)

## Some simple activation functions

In [10]:
class activation_func():
    """Common interface for the activation functions in this file.

    Subclasses override ``__call__`` (forward pass) and ``grad`` (derivative
    evaluated from the value cached by the last forward pass). The base
    implementations do nothing and return ``None``.
    """

    def __init__(self):
        # Cache slot that subclasses fill during the forward pass so grad()
        # can be evaluated later without being handed the input again.
        self.value = None

    def __call__(self):
        """Forward pass placeholder; returns ``None`` in the base class."""
        return None

    def grad(self):
        """Gradient placeholder; returns ``None`` in the base class."""
        return None

class linear(activation_func):
    """Identity activation: the output equals the input."""

    def __call__(self, input):
        # Keep a reference to the input so grad() knows which shape to emit.
        self.value = input
        return self.value

    def grad(self):
        """Return an array of 1.0 shaped like the last forward input."""
        cached_shape = self.value.shape
        return np.full(cached_shape, 1.0)

class relu(activation_func):
    """Rectified linear unit: element-wise max(input, 0)."""

    def __call__(self, input):
        rectified = np.maximum(input, 0)
        self.value = rectified  # Cached output; grad() is derived from it
        return rectified

    def grad(self):
        """Return 1.0 where the cached output is positive and 0.0 elsewhere."""
        positive_mask = np.greater(self.value, 0)
        return positive_mask.astype(float)

class sigmoid(activation_func):
    """Logistic sigmoid activation: f(x) = 1 / (1 + exp(-x))."""

    def __call__(self, input):
        """Apply the sigmoid element-wise, cache the result and return it."""
        # 1 / (1 + exp(-x)) == exp(-log(1 + exp(-x))). np.logaddexp(0, -x)
        # evaluates log(1 + exp(-x)) without forming exp(-x) directly, so
        # inputs below about -709 no longer overflow float64 (the direct
        # formula produced inf and an overflow RuntimeWarning there).
        self.value = np.exp(-np.logaddexp(0, -input))
        return self.value

    def grad(self):
        """Return f'(x) = f(x) * (1 - f(x)) using the cached forward output."""
        return self.value * (1 - self.value)

class tanh(activation_func):
    """Hyperbolic tangent activation."""

    def __call__(self, input):
        """Apply tanh element-wise, cache the result and return it."""
        # np.tanh saturates cleanly at +/-1. The explicit
        # (e^x - e^-x) / (e^x + e^-x) quotient overflowed to inf / inf = NaN
        # once |x| exceeded roughly 710, and evaluated four exponentials.
        self.value = np.tanh(input)
        return self.value

    def grad(self):
        """Return f'(x) = 1 - f(x)^2 using the cached forward output."""
        return 1 - self.value**2

## Loss function and Loss gradient

In [11]:
def MSE(y_true, y_pred):
    """Return half the mean squared error between targets and predictions.

    Computes ``sum((y_true - y_pred)**2) / (2 * n)`` with ``n = len(y_true)``;
    the sum runs over every element of the difference.
    """
    sample_count = len(y_true)
    squared_error = np.square(y_true - y_pred)
    return (.5 / sample_count) * np.sum(squared_error)

def grad_MSE(y_true, y_pred):
    """Return the element-wise residual ``y_pred - y_true``.

    This is the derivative of the squared-error term with respect to the
    prediction; no division by the sample count happens here (the callers in
    this file divide by the number of samples themselves).
    """
    residual = y_pred - y_true
    return residual

## Optimizer

In [12]:
def gradient_descent(y_true, X, weights, learning_rate=0.01, iterations=1000):
    """Fit linear weights with full-batch gradient descent.

    Args:
        y_true: Target values.
        X: Design matrix; predictions are ``np.dot(X, weights)``.
        weights: Starting weights. The caller's array is not modified in
            place; a new array is produced on every step.
        learning_rate: Step size applied to the averaged gradient.
        iterations: Number of update steps to run.

    Returns:
        The weights after ``iterations`` updates.
    """
    for _ in range(iterations):
        predictions = np.dot(X, weights)
        residual = grad_MSE(y_true, predictions)
        # Average the per-sample gradients over the batch.
        mean_gradient = np.dot(X.T, residual) / len(y_true)
        weights = weights - learning_rate * mean_gradient
    return weights

## Dense layer

In [13]:
class Dense():
    """Fully connected layer computing ``activation(inputs @ weights)``.

    There is no separate bias vector. Weights are created lazily on the first
    forward call, once the input width is known.
    """

    def __init__(self, num_nodes, activation=None, learning_rate=0.01):
        """Configure the layer.

        Args:
            num_nodes: Number of units, i.e. the width of this layer's output.
            activation: Activation object providing ``__call__`` and
                ``grad``; falls back to ``linear()`` when not given.
            learning_rate: Step size used by ``optimize_weights``.
        """
        self.num_nodes = num_nodes
        self.learning_rate = learning_rate
        self.activation = activation if activation else linear()
        self.weights = None  # Initialized during the first forward pass
        self.inputs = None   # Last inputs seen, kept for back-propagation
        self.z = None        # Pre-activation values
        self.a = None        # Layer output (post-activation values)

    def __call__(self, inputs):
        """Run the forward pass on a 2-D batch and return the activations."""
        if self.weights is None:
            # Uniform random values in [0, 1), shape (input width, num_nodes).
            n_features = inputs.shape[1]
            self.weights = np.random.rand(n_features, self.num_nodes)

        self.inputs = inputs
        pre_activation = np.dot(inputs, self.weights)
        self.z = pre_activation
        self.a = self.activation(pre_activation)
        return self.a

    def back_prop(self, da):
        """Back-propagate the gradient ``da`` taken w.r.t. this layer's output.

        Returns:
            Tuple ``(dW, dA_prev)``: the batch-averaged gradient for the
            weights, and the gradient w.r.t. this layer's inputs, which is
            handed on to the layer that precedes this one.
        """
        batch_size = self.inputs.shape[0]
        # Chain rule through the activation, using its cached forward value.
        local_grad = da * self.activation.grad()
        weight_grad = np.dot(self.inputs.T, local_grad) / batch_size
        input_grad = np.dot(local_grad, self.weights.T)
        return weight_grad, input_grad

    def optimize_weights(self, dW):
        """Take one gradient-descent step along ``dW``."""
        step = self.learning_rate * dW
        self.weights = self.weights - step

## Sequential Model

In [14]:
class Sequential_model():
    """Stack of layers trained with full-batch gradient descent on MSE.

    Layers run in list order during the forward pass and are visited in
    reverse during the backward pass. Each layer must be callable and provide
    ``back_prop(da) -> (dW, dA_prev)`` and ``optimize_weights(dW)``.
    """

    def __init__(self, layers=None):
        """Create the model, optionally from an existing list of layers."""
        # A fresh list per instance avoids sharing one mutable default.
        self.layers = layers if layers is not None else []
        self.optimizer = None  # Not used by any method of this class
        self.loss = None       # Not used by any method of this class

    def add(self, layer):
        """Append ``layer`` to the end of the stack."""
        self.layers.append(layer)

    def forward_pass(self, inputs):
        """Feed ``inputs`` through every layer in order; return the output."""
        outputs = inputs
        for layer in self.layers:
            outputs = layer(outputs)
        return outputs

    def backward_pass(self, loss_grad):
        """Back-propagate ``loss_grad`` and update each layer's weights.

        Each layer's gradients are computed before that layer's weights are
        updated, and the gradient it returns for its inputs is passed to the
        layer before it.
        """
        dA_prev = loss_grad
        for layer in reversed(self.layers):
            dW, dA_prev = layer.back_prop(dA_prev)
            layer.optimize_weights(dW)

    @staticmethod
    def _log_interval(epochs):
        """Return the number of epochs between two progress messages."""
        if epochs < 1:
            return 1  # No epoch will run; avoid log10 of a non-positive number
        # One order of magnitude below ``epochs``, but never below 1. The
        # unclamped exponent is -1 for epochs == 1, which produced an interval
        # of 0.1 that no epoch number is a multiple of, so nothing was logged.
        exponent = max(int(np.log10(epochs) - 1), 0)
        return 10 ** exponent

    def fit(self, X_train, y_train, epochs=5):
        """Train on the whole data set for ``epochs`` iterations.

        A column of ones is prepended to ``X_train`` before training. The loss
        recorded for an epoch is measured before that epoch's weight update.

        Args:
            X_train: 2-D array of input samples.
            y_train: Targets, shaped like the model output.
            epochs: Number of full-batch updates.

        Returns:
            Dict with key ``'loss'`` holding one loss value per epoch.
        """
        history = {'loss': []}
        inputs = add_ones(X_train)
        log_every = self._log_interval(epochs)  # Loop-invariant, computed once

        for epoch in range(epochs):
            outputs = self.forward_pass(inputs)

            loss = MSE(y_train, outputs)
            loss_grad = grad_MSE(y_train, outputs)
            history['loss'].append(loss)
            if (epoch + 1) % log_every == 0:
                print(f"Epoch: {epoch + 1} - Loss: {loss}")
            self.backward_pass(loss_grad)
        return history

    def predict(self, X_test):
        """Return model outputs for ``X_test`` (ones column added first)."""
        X_test = add_ones(X_test)
        return self.forward_pass(X_test)

## Test the model with simple regression task

In [15]:
np.random.seed(42)  # Fixed seed so the noise below is reproducible
X = np.linspace(-10, 10, 1000)[:, np.newaxis]  # 1000 samples as a column vector
# Ground truth y = 2x + 1, perturbed by Gaussian noise scaled by 0.01
y = (2 * X + 1) + 0.01 * np.random.randn(*X.shape)

In [16]:
# Two stacked layers with the default (linear) activation:
# 10 hidden units followed by a single output unit.
model = Sequential_model()
model.add(Dense(10))
model.add(Dense(1))

In [17]:
# Train for 1000 full-batch epochs; fit() prints the loss periodically and
# returns a dict with the per-epoch loss under the key 'loss'.
history = model.fit(X, y, epochs=1000)

Epoch: 100 - Loss: 8.379246534170667e-05
Epoch: 200 - Loss: 4.784615649233523e-05
Epoch: 300 - Loss: 4.783494837482222e-05
Epoch: 400 - Loss: 4.783494487927935e-05
Epoch: 500 - Loss: 4.783494487818901e-05
Epoch: 600 - Loss: 4.783494487818835e-05
Epoch: 700 - Loss: 4.7834944878188187e-05
Epoch: 800 - Loss: 4.7834944878188824e-05
Epoch: 900 - Loss: 4.7834944878188444e-05
Epoch: 1000 - Loss: 4.7834944878188234e-05


In [18]:
# Sanity check at x = 10: the noise-free ground truth y = 2x + 1 gives 21.
model.predict(np.array([[10]]))

array([[21.00079424]])