In [1]:
import torch

## Example

In [2]:
def func(x):
    """Evaluate the polynomial x**2 + 2*x + 1 (i.e. (x + 1)**2).

    Works element-wise on anything supporting ``**``, ``*`` and ``+``
    (plain numbers as well as tensors); its derivative is 2*x + 2.
    """
    return x ** 2 + 2 * x + 1

In [3]:
# Leaf tensor [0, 1, 2, 3, 4] as float32; gradient tracking is switched on
# in place so autograd records every operation applied to it.
x = torch.arange(5, dtype=torch.float32).requires_grad_()
x

tensor([0., 1., 2., 3., 4.], requires_grad=True)

In [4]:
# Forward pass: apply func to x and display the result.
y = func(x)
y

tensor([ 1.,  4.,  9., 16., 25.], grad_fn=<AddBackward0>)

In [5]:
# backward() without an explicit gradient argument needs a scalar, so reduce
# y to its sum first and backpropagate from that.
torch.sum(y).backward()

In [6]:
# Display the gradient stored on x.
x.grad

tensor([ 2.,  4.,  6.,  8., 10.])

In [7]:
# Analytic derivative of x**2 + 2*x + 1, for comparison with x.grad.
2 * x + 2

tensor([ 2.,  4.,  6.,  8., 10.], grad_fn=<AddBackward0>)

## Jacobian

In [12]:
inp = torch.eye(4, 5, requires_grad=True)
print(f"inp =\n{inp}")
out = (inp + 1).pow(2)

# `out` is not a scalar, so backward() needs an explicit upstream gradient;
# an all-ones tensor yields the vector-Jacobian product 2 * (inp + 1).
# retain_graph=True keeps the graph's saved buffers alive so that backward()
# can be called on `out` again; with the default (False) they are freed after
# the first backward pass and a second call would fail.
# Each call ADDS its result to inp.grad (gradients accumulate).
for label in ("First call", "\nSecond call", "\nThird call"):
    out.backward(torch.ones_like(out), retain_graph=True)
    print(f"{label}\n{inp.grad}")

# Reset the accumulated gradient in place; the next backward pass then
# reports the gradient of a single call again.
inp.grad.zero_()
out.backward(torch.ones_like(out), retain_graph=True)
print(f"\nCall after zeroing gradients\n{inp.grad}")

inp =
tensor([[1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0.]], requires_grad=True)
First call
tensor([[4., 2., 2., 2., 2.],
        [2., 4., 2., 2., 2.],
        [2., 2., 4., 2., 2.],
        [2., 2., 2., 4., 2.]])

Second call
tensor([[8., 4., 4., 4., 4.],
        [4., 8., 4., 4., 4.],
        [4., 4., 8., 4., 4.],
        [4., 4., 4., 8., 4.]])

Third call
tensor([[12.,  6.,  6.,  6.,  6.],
        [ 6., 12.,  6.,  6.,  6.],
        [ 6.,  6., 12.,  6.,  6.],
        [ 6.,  6.,  6., 12.,  6.]])

Call after zeroing gradients
tensor([[4., 2., 2., 2., 2.],
        [2., 4., 2., 2., 2.],
        [2., 2., 4., 2., 2.],
        [2., 2., 2., 4., 2.]])


In [9]:
# Analytic derivative of (inp + 1) ** 2, for comparison with inp.grad.
2*(inp + 1)

tensor([[4., 2., 2., 2., 2.],
        [2., 4., 2., 2., 2.],
        [2., 2., 4., 2., 2.],
        [2., 2., 2., 4., 2.]], grad_fn=<MulBackward0>)

## Case

$$ \text{BCE} = -\left(y \log(p) + (1-y) \log(1-p)\right) $$

In [514]:
# Fix the RNG so the random input and parameters are reproducible.
torch.manual_seed(42)
x = torch.rand(1, 5)   # input tensor
y = torch.zeros(1, 1)  # expected output

# Trainable parameters of a single linear unit.
w = torch.randn(5, 1, requires_grad=True)
b = torch.randn(1, requires_grad=True)

# Forward pass: logit -> sigmoid probability -> binary cross-entropy.
z = x @ w + b
p = torch.sigmoid(z)
loss = torch.nn.functional.binary_cross_entropy(p, y)

# Backward pass populates w.grad and b.grad.
loss.backward()

In [515]:
# Display the loss tensor.
loss

tensor(0.7104, grad_fn=<BinaryCrossEntropyBackward0>)

In [516]:
# Binary cross-entropy written out by hand, for comparison with the loss
# returned by torch.nn.functional.binary_cross_entropy.
-torch.mean(y * torch.log(p) + (1 - y) * torch.log(1 - p))

tensor(0.7104, grad_fn=<NegBackward0>)

In [517]:
# Display the gradient stored on w.
w.grad

tensor([[0.4487],
        [0.4653],
        [0.1947],
        [0.4878],
        [0.1986]])

In [518]:
# Display the gradient stored on b.
b.grad

tensor([0.5085])

$$
\frac{\partial \text{loss}}{\partial p} = -\frac{y}{p} + \frac{1-y}{1-p}
$$

$$\frac{\partial p}{\partial z} = p \cdot (1-p)$$

$$\frac{\partial z}{\partial w} = x^T, \qquad \frac{\partial z}{\partial b} = 1$$

In [521]:
# Chain rule by hand, one factor per line.
gl = -(y / p - (1 - y) / (1 - p))  # d(loss)/dp for binary cross-entropy
gp = (1 - p) * p                   # dp/dz, derivative of the sigmoid
gz = gp * gl                       # d(loss)/dz
gw = x.T                           # dz/dw

# d(loss)/dw = x^T @ d(loss)/dz
dw = gw @ gz
dw

tensor([[0.4487],
        [0.4653],
        [0.1947],
        [0.4878],
        [0.1986]], grad_fn=<MmBackward0>)

In [520]:
# Sum d(loss)/dz over the first dimension to get the manual gradient for b.
db = gz.sum(dim=0)
db

tensor([0.5085], grad_fn=<SumBackward1>)

参考资料：

[Automatic Differentiation with torch.autograd — PyTorch Tutorials 2.1.1+cu121 documentation](https://pytorch.org/tutorials/beginner/basics/autogradqs_tutorial.html)