# From Scratch

In [1]:
import torch 
import numpy as np 
import torchinfo 
from torchinfo import summary 
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

In [2]:
def equation(input:torch.Tensor): # z = 5 * a^2 + 3 * b^2
    return torch.matmul(torch.mul(input, input), torch.tensor([5, 3], dtype=torch.float32))

num_of_data = 100000
num_of_train_data = int(num_of_data * 0.7)
num_of_val_data =int(num_of_data * 0.3)
inputs = torch.randn(num_of_data, 2)
outputs = equation(inputs).view(-1,1)

batch_size = 500 
number_of_batches = num_of_data//batch_size

In [55]:
# Adjusting the initial weights and biases accordingly
lower_limit, upper_limit = -4, 4

class DeepLearning():

    def __init__(self, n_inputs, n_hidden, n_outputs, lr = 0.01):
        self.lr = lr
        # Initilizaing empty layers         
        self.hidden_layer = (lower_limit - upper_limit) * torch.rand(n_hidden, n_inputs+1) + upper_limit # additional 1 is to account for the bias value
        self.output_layer = (lower_limit - upper_limit) * torch.rand(n_outputs, n_hidden+1) + upper_limit # additional 1 is to account for the bias value

    def activate(self, weights, inputs):
        return torch.matmul(inputs, torch.transpose(weights[:, :-1], 0, 1)) + weights[:, -1]

    def sigmoid(self, inputs):
        return 1.0 / (1.0 + np.exp(-inputs))

    def MSELoss(self, target, output):
        return torch.mean((target-output)**2)

    def forwardpropagate(self, inputs, targets):
        self.inputs = inputs
        self.target = targets
        out = self.activate(self.hidden_layer, self.inputs)
        out = self.sigmoid(out)
        self.hidden_nodes = out.clone().detach()
        out = self.activate(self.output_layer, out)
        self.forward_res = out
        return self.MSELoss(self.target, out).item()

    def backpropagate(self): # BackPropagation       

        # Accounting for the output_layer, from 10 nodes to one single output node
        dc_dw2 = torch.mul(2 * (self.forward_res-self.target), self.hidden_nodes)
        dc_db2 = 2 * (self.forward_res-self.target)
        dc_dW2 = torch.cat((dc_dw2, dc_db2), dim = 1)
        dc_dW2 = torch.mean(dc_dW2, dim = 0).view(1,-1)

        # Accouting for the hidden_layer, from the 2 input nodes to the 10 hidden nodes
        sigmoid_derivative = torch.matmul(torch.transpose(self.hidden_nodes, 0, 1), (1-self.hidden_nodes))
        dc_dy = 2*(self.forward_res-self.target)
        dy_dhidden= self.output_layer[:, :-1]
        dx_dw1 = self.inputs
        dc_dW1 = torch.matmul(torch.transpose(torch.matmul(torch.matmul(dc_dy, dy_dhidden), sigmoid_derivative), 0, 1), dx_dw1)
        dc_dW1 = torch.cat((dc_dW1, torch.mean((torch.transpose(torch.matmul(torch.matmul(dc_dy, dy_dhidden), sigmoid_derivative), 0, 1)), dim =1).view(-1,1)), dim=1)

        # Attempt to normalize the gradients to prevent exploding gradient issues (not as effective)
        # dc_dW1 = torch.nn.functional.normalize(dc_dW1, dim = 0)
        # dc_dW2 = torch.nn.functional.normalize(dc_dW2, dim = 0)

        # Attempt to clip the gradients (more effective option)
        gradient_clipping = 1
        dc_dW1 = torch.clip(dc_dW1, -gradient_clipping, gradient_clipping)
        dc_dW2 = torch.clip(dc_dW2, -gradient_clipping, gradient_clipping)

        # Optimizing
        self.output_layer -= self.lr * dc_dW2 
        self.hidden_layer -= self.lr * dc_dW1

In [56]:
X_train, y_train, X_val, y_val = inputs[:num_of_train_data], outputs[:num_of_train_data], inputs[num_of_train_data:], outputs[num_of_train_data:]
model = DeepLearning(2, 10, 1, lr = 0.001)

for epoch in range(50):
    print(model.hidden_layer)
    print("")
    print(model.output_layer)
    # Training Loop - with backpropagation to adjust weights and biases 
    training_loss, count = 0, 0
    for batch_end_idx in range(batch_size, num_of_train_data, batch_size):
        train_input, train_output = X_train[batch_end_idx-batch_size:batch_end_idx], y_train[batch_end_idx-batch_size:batch_end_idx]
        training_loss += model.forwardpropagate(train_input, train_output)
        model.backpropagate()
        count += 1
    training_loss /= count 

    # Validation Loop - without backpropagation 
    validation_loss, count = 0, 0
    for batch_end_idx in range(batch_size, num_of_val_data, batch_size):
        val_input, val_output = X_val[batch_end_idx-batch_size:batch_end_idx], y_val[batch_end_idx-batch_size:batch_end_idx]
        validation_loss += model.forwardpropagate(val_input, val_output)
        count += 1
    validation_loss /= count 

    print(f"====Epoch {epoch} | Training Loss : {training_loss} | Validation Loss : {validation_loss} ====")

tensor([[ 3.5609, -3.0996, -0.4426],
        [-1.4523,  0.7248,  1.1363],
        [-3.4581, -1.0862, -3.1795],
        [-3.4465, -2.3997,  1.7328],
        [ 3.6408, -0.8127, -2.0699],
        [ 1.9618,  1.8304, -2.8479],
        [-0.4747, -0.3939, -2.7220],
        [ 2.8967,  3.7411, -3.4946],
        [-2.0638,  0.4756,  0.5820],
        [ 1.8023, -0.6910,  2.8333]])

tensor([[ 0.4362,  3.8137, -3.1134, -2.7114,  0.5187,  3.4584, -1.0334, -3.8733,
         -3.2578,  2.1744,  3.3554]])
====Epoch 0 | Training Loss : 99.632407702988 | Validation Loss : 93.93368763034627 ====
tensor([[ 3.6998, -3.0286, -0.5816],
        [-1.5914,  0.6538,  1.2753],
        [-3.5971, -1.1572, -3.0405],
        [-3.5855, -2.4707,  1.8718],
        [ 3.7798, -0.7417, -2.2089],
        [ 2.0328,  1.8574, -2.9189],
        [-0.6137, -0.4649, -2.5830],
        [ 3.0077,  3.7921, -3.6056],
        [-2.2028,  0.4046,  0.7210],
        [ 1.9414, -0.6200,  2.6943]])

tensor([[ 0.5752,  3.9527, -2.9745, -2.5724,  0.