In [52]:
import torch as t
import numpy as np

# Reproducibility: seed the global RNG so that a fresh kernel produces the
# same data and the same initial weights on every run.
SEED = 0
t.manual_seed(SEED)

# Inputs
inputs = t.randn(4, 3)  # (4, 3): 4 samples, 3 features
outputs = t.tensor([-1, 1, 1, -1], dtype=t.float32).view(-1, 1)  # (4, 1) targets

# Parameters
n = 5                   # hidden width, shared by both hidden layers
feat = inputs.shape[1]  # 3 input features

# Layer 1: features -> hidden
W1 = t.randn(feat, n, requires_grad=True)  # (3, 5) 3 features, 5 neurons
B1 = t.randn(1, n, requires_grad=True)     # (1, 5) one bias per neuron

# Layer 2: hidden -> hidden. Both dimensions use `n` (not a literal 5) so the
# layer stays compatible with B2 and W3 if the hidden width is changed.
W2 = t.randn(n, n, requires_grad=True)     # (5, 5)
B2 = t.randn(1, n, requires_grad=True)     # (1, 5) one bias per neuron

# Output layer: hidden -> single output, no bias term
W3 = t.randn(n, 1, requires_grad=True)     # (5, 1) 5 neurons, 1 output

# Every trainable tensor, in the order used by the gradient-reset/update loops.
params = [W1, W2, W3, B1, B2]
print(W1.shape, W2.shape, W3.shape, B1.shape, B2.shape)

torch.Size([3, 5]) torch.Size([5, 5]) torch.Size([5, 1]) torch.Size([1, 5]) torch.Size([1, 5])


In [55]:
# Forward pass: three stacked linear layers with no activation in between.
L1 = t.matmul(inputs, W1) + B1  # (4,3) x (3,5) = (4,5); B1 broadcasts over the rows
L2 = t.matmul(L1, W2) + B2      # (4,5) x (5,5) = (4,5)
L3 = t.matmul(L2, W3)           # (4,5) x (5,1) = (4,1), one prediction per sample

# Mean squared error between the predictions and the targets.
loss = (L3 - outputs).pow(2).mean()
print("Loss:", loss.item())

# Discard gradients left by any earlier backward pass so they do not
# accumulate into the ones computed below.
for param in params:
    param.grad = None

# Backward pass
loss.backward()
print(L3)

Loss: 4971133140992.0
tensor([[ 3837761.7500],
        [  435424.8125],
        [ 1365491.5000],
        [-1761236.6250]], grad_fn=<MmBackward0>)


In [54]:
lr = 0.1  # gradient-descent step size

# One plain gradient-descent step on every parameter.
# The update runs under no_grad() instead of writing through `p.data`:
# the in-place change is still excluded from the autograd graph, but autograd's
# in-place version tracking remains intact.
# `p.grad` is read here, so it must already have been populated.
with t.no_grad():
    for p in params:
        p -= lr * p.grad

In [83]:
import torch as t
import numpy as np
from torchviz import make_dot

# Reproducibility: seed the global RNG so that a fresh kernel produces the
# same data and the same initial weights on every run.
SEED = 0
t.manual_seed(SEED)

# Inputs
inputs = t.randn(4, 3)  # (4, 3): 4 samples, 3 features
outputs = t.tensor([-1, 1, 1, -1], dtype=t.float32).view(-1, 1)  # (4, 1) targets

# Parameters
n = 5                   # hidden width, shared by both hidden layers
feat = inputs.shape[1]  # 3 input features

# Initialize weights more carefully: each weight matrix is divided by
# sqrt(fan_in) to shrink the initial weights. The scaled tensors carry no
# autograd history, so requires_grad_() alone turns them into trainable leaves
# (no clone()/detach() needed).
# W2 uses `n` for both dimensions so it stays compatible with B2 and W3 if the
# hidden width is changed.
W1 = (t.randn(feat, n) / np.sqrt(feat)).requires_grad_(True)  # (3, 5)
B1 = t.randn(1, n).requires_grad_(True)                       # (1, 5)
W2 = (t.randn(n, n) / np.sqrt(n)).requires_grad_(True)        # (5, 5)
B2 = t.randn(1, n).requires_grad_(True)                       # (1, 5)
W3 = (t.randn(n, 1) / np.sqrt(n)).requires_grad_(True)        # (5, 1)

# Every trainable tensor, in the order used by the gradient-reset/update loops.
params = [W1, W2, W3, B1, B2]

lr = 0.03  # Reduced learning rate

# Training loop: 15 steps of full-batch gradient descent on the MSE loss.
for i in range(15):
    # Forward pass through the three linear layers
    L1 = t.matmul(inputs, W1) + B1  # (4,3) x (3,5) = (4,5)
    L2 = t.matmul(L1, W2) + B2      # (4,5) x (5,5) = (4,5)
    L3 = t.matmul(L2, W3)           # (4,5) x (5,1) = (4,1)

    # Mean squared error between the predictions and the targets
    loss = (L3 - outputs).pow(2).mean()
    print(f"Step {i + 1}, Loss: {loss.item()}")

    # Reset gradients from the previous step; a parameter whose .grad is
    # still None has nothing to reset and is skipped.
    for param in params:
        if param.grad is None:
            continue
        param.grad.zero_()

    # Backward pass
    loss.backward()

    # Gradient-descent update, kept out of the autograd graph
    with t.no_grad():
        for param in params:
            param.sub_(lr * param.grad)

# Visualize the computational graph of `loss`. Parameters are passed under
# their variable names so the graph nodes are labelled "W1", "B1", ... instead
# of by their position in the `params` list.
named_params = {"W1": W1, "W2": W2, "W3": W3, "B1": B1, "B2": B2}
make_dot(loss, params=named_params).render("computational_graph", format="png")
print(L3)


Step 1, Loss: 0.7733758687973022
Step 2, Loss: 0.4250103235244751
Step 3, Loss: 0.32029587030410767
Step 4, Loss: 0.27598363161087036
Step 5, Loss: 0.24892061948776245
Step 6, Loss: 0.2269206941127777
Step 7, Loss: 0.20705828070640564
Step 8, Loss: 0.1885438710451126
Step 9, Loss: 0.1711898297071457
Step 10, Loss: 0.15495039522647858
Step 11, Loss: 0.13981766998767853
Step 12, Loss: 0.12578465044498444
Step 13, Loss: 0.11283937096595764
Step 14, Loss: 0.10096020251512527
Step 15, Loss: 0.09011775255203247
tensor([[-0.6389],
        [ 0.8076],
        [ 0.5691],
        [-0.9141]], grad_fn=<MmBackward0>)
