In [1]:
import torch
import numpy as np

## 正向传播与反向传播

这一部分主要包含下面的内容:

- 正向传播
- 非叶子节点梯度的保存, 使用retain_grad
- 反向传播

### 正向传播

In [5]:
def forwrad(x, y, w1, w2):
    """Forward pass of a two-layer sigmoid network with a squared-error loss.

    Computes z_1 = w1 @ x, y_1 = sigmoid(z_1), z_2 = w2 @ y_1,
    y_2 = sigmoid(z_2) and loss = 0.5 * sum((y_2 - y) ** 2).
    Every intermediate tensor, including the loss, is marked with
    retain_grad() so that its .grad is kept after a later backward() call.

    Args:
        x: input matrix, multiplied on the right of w1.
        y: target, subtracted elementwise from y_2.
        w1: first-layer weight matrix.
        w2: second-layer weight matrix.

    Returns:
        The tuple (loss, z_1, y_1, z_2, y_2).
    """
    # Layer 1: linear map followed by sigmoid.
    z_1 = w1.mm(x)
    y_1 = z_1.sigmoid()

    # Layer 2: linear map followed by sigmoid.
    z_2 = w2.mm(y_1)
    y_2 = z_2.sigmoid()

    # Half of the summed squared error against the target.
    loss = 0.5 * (y_2 - y).pow(2).sum()

    # These are all non-leaf tensors; ask autograd to keep their gradients.
    for node in (z_1, y_1, z_2, y_2, loss):
        node.retain_grad()

    return loss, z_1, y_1, z_2, y_2


# Test code: fixed inputs and weights for a single forward pass.
# x is the input and y the target; w1 and w2 are created with
# requires_grad=True so autograd tracks gradients for them.
x = torch.tensor([[1.0]])
y = torch.tensor([[1.0], [0.0]])
w1 = torch.tensor([[1.0], [2.0]], requires_grad=True)
w2 = torch.tensor([[3.0, 4.0], [5.0, 6.0]], requires_grad=True)
# Alternative second-layer weights to experiment with:
# w2 = torch.tensor([[3.0, 1.0], [1.0, 6.0]], requires_grad=True)

# Forward pass: keep the loss and every intermediate tensor for inspection.
loss, z_1, y_1, z_2, y_2 = forwrad(x, y, w1, w2)

In [6]:
# Show the loss and every intermediate tensor returned by the forward pass.
print(' loss: {} \n z_1: {} \n y_1: {} \n z_2: {} \n y_2: {}'.format(loss, z_1, y_1, z_2, y_2))

 loss: 0.49987438321113586 
 z_1: tensor([[1.],
        [2.]], grad_fn=<MmBackward>) 
 y_1: tensor([[0.7311],
        [0.8808]], grad_fn=<SigmoidBackward>) 
 z_2: tensor([[5.7164],
        [8.9401]], grad_fn=<MmBackward>) 
 y_2: tensor([[0.9967],
        [0.9999]], grad_fn=<SigmoidBackward>)


### 反向传播

In [7]:
# Run autograd from the scalar loss, then show the gradients accumulated
# on the two weight tensors.
loss.backward()  # back-propagation: compute the gradients
print('w1的梯度, {}'.format(w1.grad))
print('w2的梯度, {}'.format(w2.grad))

w1的梯度, tensor([[1.2243e-04],
        [7.8005e-05]])
w2的梯度, tensor([[-7.8431e-06, -9.4496e-06],
        [ 9.5752e-05,  1.1536e-04]])


### 查看各节点的梯度

$$loss.grad = \frac{\partial loss}{\partial loss} = 1$$

In [8]:
loss.grad

tensor(1.)

$$y_2.grad = \frac{\partial loss}{\partial y_2}$$

In [9]:
y_2.grad

tensor([[-0.0033],
        [ 0.9999]])

$$z_2.grad = \frac{\partial loss}{\partial z_2} = \frac{\partial loss}{\partial y_2} \cdot \frac{\partial y_2}{\partial z_2}$$

In [10]:
z_2.grad

tensor([[-1.0728e-05],
        [ 1.3098e-04]])

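后面的节点计算方法类似, 同样使用链式法则 (sigmoid 的导数为 $\frac{\partial y}{\partial z} = y \odot (1 - y)$, 其中 $\odot$ 表示逐元素相乘):

$$w_2.grad = z_2.grad \cdot y_1^T, \quad y_1.grad = w_2^T \cdot z_2.grad$$

$$z_1.grad = y_1.grad \odot y_1 \odot (1 - y_1), \quad w_1.grad = z_1.grad \cdot x^T$$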

In [11]:
w2.grad

tensor([[-7.8431e-06, -9.4496e-06],
        [ 9.5752e-05,  1.1536e-04]])

In [15]:
# Gradient of w2 from the hand-derived formula: dL/dw2 = delta_2 * y_1^T,
# where delta_2 = (y_2 - y) * sigmoid(z_2) * (1 - sigmoid(z_2)).
# Broadcasting the column delta_2 against the row y_1.T forms this outer
# product for any layer width, rather than concatenating y_1 with itself,
# which only works for exactly two output units.
((y_2 - y) * torch.sigmoid(z_2) * (1 - torch.sigmoid(z_2))) * y_1.T

tensor([[-7.8431e-06, -9.4496e-06],
        [ 9.5752e-05,  1.1536e-04]], grad_fn=<MulBackward0>)

In [12]:
y_1.grad

tensor([[0.0006],
        [0.0007]])

In [21]:
# Gradient of y_1 by hand: push z_2's gradient back through the second
# linear layer, i.e. w2^T times dL/dz_2.
w2.T.mm(z_2.grad)

tensor([[0.0006],
        [0.0007]], grad_fn=<MmBackward>)

In [13]:
z_1.grad

tensor([[1.2243e-04],
        [7.8005e-05]])

In [14]:
w1.grad

tensor([[1.2243e-04],
        [7.8005e-05]])

## 使用Hook

In [24]:
# A hook can be used to capture the gradients of intermediate variables.
grads = {}  # maps a node name to the gradient its hook received


def save_grad(name):
    """Return a hook that stores the gradient it is called with under `name`.

    The hook writes into the module-level `grads` dict and returns None.
    """
    def _record(incoming):
        grads.update({name: incoming})
    return _record

In [25]:
def forwrad(x, y, w1, w2):
    """Forward pass of the two-layer sigmoid network, with gradient hooks.

    Computes z_1 = w1 @ x, y_1 = sigmoid(z_1), z_2 = w2 @ y_1,
    y_2 = sigmoid(z_2) and loss = 0.5 * sum((y_2 - y) ** 2), then registers
    a save_grad hook on every intermediate tensor (and on the loss) under
    the keys 'z_1', 'y_1', 'z_2', 'y_2' and 'loss'.

    Args:
        x: input matrix, multiplied on the right of w1.
        y: target, subtracted elementwise from y_2.
        w1: first-layer weight matrix.
        w2: second-layer weight matrix.

    Returns:
        The tuple (loss, z_1, y_1, z_2, y_2).
    """
    # Layer 1: linear map followed by sigmoid.
    z_1 = w1.mm(x)
    y_1 = z_1.sigmoid()

    # Layer 2: linear map followed by sigmoid.
    z_2 = w2.mm(y_1)
    y_2 = z_2.sigmoid()

    # Half of the summed squared error against the target.
    loss = 0.5 * (y_2 - y).pow(2).sum()

    # Attach one hook per intermediate node, keyed by the node's name.
    hooked_nodes = (
        ('z_1', z_1),
        ('y_1', y_1),
        ('z_2', z_2),
        ('y_2', y_2),
        ('loss', loss),
    )
    for label, node in hooked_nodes:
        node.register_hook(save_grad(label))

    return loss, z_1, y_1, z_2, y_2


# Test code: the same fixed inputs and weights as in the retain_grad example.
x = torch.tensor([[1.0]])
y = torch.tensor([[1.0], [0.0]])
w1 = torch.tensor([[1.0], [2.0]], requires_grad=True)
w2 = torch.tensor([[3.0, 4.0], [5.0, 6.0]], requires_grad=True)

# Forward pass (this also registers the hooks on the intermediate tensors).
loss, z_1, y_1, z_2, y_2 = forwrad(x, y, w1, w2)

# Backward pass: the registered hooks store each gradient in `grads`.
loss.backward()

In [27]:
# Print the gradient captured by each hook, from the first layer to the loss.
for node_name in ('z_1', 'y_1', 'z_2', 'y_2', 'loss'):
    print(grads[node_name])

tensor([[1.2243e-04],
        [7.8005e-05]])
tensor([[0.0006],
        [0.0007]])
tensor([[-1.0728e-05],
        [ 1.3098e-04]])
tensor([[-0.0033],
        [ 0.9999]])
tensor(1.)
