In [1]:
import numpy as np

In [2]:
# First-layer weights (2 x 4) and bias column (4 x 1).
W1 = np.array([
    [1., 0., -1., 0.],
    [0., 1., 0., -1.],
])
w10 = np.full((4, 1), -1.)
# Second-layer weights (4 x 2) and bias column (2 x 1).
W2 = np.array([[1., -1.]] * 4)
w20 = np.array([[0.], [2.]])

In [3]:
# Single input sample, stored as a 2 x 1 column vector.
x = np.array([[3.], [14.]])

In [4]:
def softmax(x):
    """Return the softmax of ``x``, normalised over all of its elements.

    The largest entry is subtracted before exponentiating. Softmax is
    invariant to a constant shift, so the result is unchanged, but
    ``np.exp`` can no longer overflow to ``inf`` (which would turn the
    normalisation into ``inf / inf = nan``) for large inputs.

    Parameters
    ----------
    x : array_like
        Input scores of any shape.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``x`` whose entries sum to 1
        (an empty input yields an empty array).
    """
    x = np.asarray(x)
    if x.size == 0:
        # np.max raises on empty input; there is nothing to normalise.
        return np.exp(x)
    e = np.exp(x - np.max(x))
    return e / np.sum(e)

In [5]:
# Hidden layer: affine map of the input, then an element-wise max with 0.
z1 = W1.T @ x + w10
a1 = np.maximum(0, z1)
a1

array([[ 2.],
       [13.],
       [ 0.],
       [ 0.]])

In [6]:
# Output layer: affine map of the hidden activations, then softmax.
z2 = W2.T @ a1 + w20
a2 = softmax(z2)
a2

array([[1.00000000e+00],
       [6.91440011e-13]])

In [7]:
import torch

In [8]:
def _as_leaf(values):
    """Copy ``values`` into a new leaf tensor that tracks gradients.

    Accepts either a NumPy array or an existing tensor. Going through
    ``as_tensor(...).detach().clone()`` always produces an independent copy
    with the input's dtype, so this cell can be re-run after the names below
    have already been rebound to tensors without triggering the
    ``torch.tensor(tensor)`` copy-construct warning.
    """
    return torch.as_tensor(values).detach().clone().requires_grad_(True)


# Rebind the parameters and the input as gradient-tracking tensors.
W1 = _as_leaf(W1)
w10 = _as_leaf(w10)
W2 = _as_leaf(W2)
w20 = _as_leaf(w20)
x = _as_leaf(x)
# Constant used as the lower bound in the element-wise maximum below.
zero = torch.tensor(0)

In [9]:
def softmax(x):
    """Return the softmax of tensor ``x``, normalised over all elements.

    This is the torch counterpart of the NumPy ``softmax`` and replaces it
    under the same name. The largest entry is subtracted before
    exponentiating so ``torch.exp`` cannot overflow to ``inf`` (which would
    make the result ``nan``). The shift is detached from the autograd graph:
    softmax is invariant to a constant shift, so gradients with respect to
    ``x`` are unaffected.

    Args:
        x: Tensor of scores, any shape.

    Returns:
        Tensor with the same shape as ``x`` whose entries sum to 1
        (an empty input yields an empty tensor).
    """
    if x.numel() == 0:
        # torch.max raises on empty input; there is nothing to normalise.
        return torch.exp(x)
    shift = torch.max(x).detach()
    e = torch.exp(x - shift)
    return e / torch.sum(e)

In [10]:
# Hidden layer: affine map of the input, then an element-wise max with zero.
z1 = torch.matmul(W1.T, x) + w10
a1 = torch.maximum(zero, z1)
# a1 is not a leaf tensor; ask autograd to keep its gradient after backward().
a1.retain_grad()
a1

tensor([[ 2.],
        [13.],
        [ 0.],
        [ 0.]], dtype=torch.float64, grad_fn=<MaximumBackward0>)

In [11]:
# Output layer pre-activation.
z2 = torch.matmul(W2.T, a1) + w20
# z2 is not a leaf tensor; ask autograd to keep its gradient after backward().
z2.retain_grad()
# Softmax over the pre-activations.
a2 = softmax(z2)
a2

tensor([[1.0000e+00],
        [6.9144e-13]], dtype=torch.float64, grad_fn=<DivBackward0>)

In [12]:
# Negative log of the softmax output at index 1.
target_prob = a2[1]
loss = -torch.log(target_prob)

In [13]:
# Backpropagate from the one-element loss. This populates .grad on the leaf
# tensors created with requires_grad=True and on the intermediates that
# called retain_grad() (a1 and z2).
loss.backward()

In [14]:
# Gradient of the loss with respect to the pre-softmax values z2.
z2.grad

tensor([[ 1.0000],
        [-1.0000]], dtype=torch.float64)

In [15]:
# Gradient of the loss with respect to the hidden activations a1.
a1.grad

tensor([[2.0000],
        [2.0000],
        [2.0000],
        [2.0000]], dtype=torch.float64)

In [16]:
# One gradient-descent step on W1 with step size 0.1.
step_W1 = 0.1 * W1.grad
W1n = W1 - step_W1
W1n

tensor([[ 0.4000, -0.6000, -1.0000,  0.0000],
        [-2.8000, -1.8000,  0.0000, -1.0000]], dtype=torch.float64,
       grad_fn=<SubBackward0>)

In [17]:
# One gradient-descent step on W2 with step size 0.1.
step_W2 = 0.1 * W2.grad
W2n = W2 - step_W2
W2n

tensor([[ 0.8000, -0.8000],
        [-0.3000,  0.3000],
        [ 1.0000, -1.0000],
        [ 1.0000, -1.0000]], dtype=torch.float64, grad_fn=<SubBackward0>)

In [18]:
# One gradient-descent step on the first-layer bias with step size 0.1.
step_w10 = 0.1 * w10.grad
w10n = w10 - step_w10
w10n

tensor([[-1.2000],
        [-1.2000],
        [-1.0000],
        [-1.0000]], dtype=torch.float64, grad_fn=<SubBackward0>)

In [19]:
# One gradient-descent step on the second-layer bias with step size 0.1.
step_w20 = 0.1 * w20.grad
w20n = w20 - step_w20
w20n

tensor([[-0.1000],
        [ 2.1000]], dtype=torch.float64, grad_fn=<SubBackward0>)