构建神经网络的典型流程：
- 定义一个拥有可学习参数的神经网络
- 遍历训练数据集
- 处理输入数据使其流经神经网络
- 计算损失值
- 将网络参数的梯度进行反向传播
- 以一定的规则更新网络的权重

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [4]:
class Net(nn.Module):
    """Small LeNet-style convolutional classifier.

    Two 3x3 convolutions, each followed by ReLU and 2x2 max pooling, feed
    three fully connected layers. ``fc1`` expects 16 * 6 * 6 features, which
    is what the convolutional stack yields for 32x32 inputs
    (32 -> 30 -> 15 -> 13 -> 6).

    Args:
        in_channels: Number of channels of the input images.
        num_classes: Number of scores produced per sample.
    """

    def __init__(self, in_channels: int = 1, num_classes: int = 10) -> None:
        super().__init__()
        # in_channels input channels, 6 output channels, 3x3 kernel.
        self.conv1 = nn.Conv2d(in_channels, 6, 3)
        self.conv2 = nn.Conv2d(6, 16, 3)
        # 16 channels of 6x6 remain after the second pooling step
        # (for 32x32 inputs).
        self.fc1 = nn.Linear(16 * 6 * 6, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the raw scores, one row of ``num_classes`` values per sample."""
        # In this network each convolution is followed by ReLU and a 2x2
        # max pooling ((2, 2) and 2 are equivalent kernel sizes).
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        # The linear layers need 2D input: keep dim 0 (batch), merge the rest.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

    def num_flat_features(self, x: torch.Tensor) -> int:
        """Return the product of all dimensions of ``x`` except dimension 0.

        Dimension 0 is skipped because it is the batch size.
        """
        return x.size()[1:].numel()

In [5]:
# Instantiate the network; printing a Module lists its registered submodules.
net = Net()
print(net)

Net(
  (conv1): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=576, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


In [12]:
# Collect the trainable parameters (a weight and a bias for each of the 5 layers).
params = list(net.parameters())
# Number of parameter tensors, and the shape of the first one (conv1's weight).
len(params), params[0].size()

(10, torch.Size([6, 1, 3, 3]))

In [28]:
# Random batch of 2 single-channel 32x32 images, pushed through the network.
# NOTE(review): `input` shadows the Python builtin; later cells reuse this name.
input = torch.randn(2, 1, 32, 32)
out = net(input)
out

tensor([[ 0.0725, -0.1456, -0.0095, -0.0596, -0.0269,  0.1553,  0.0668, -0.1343,
         -0.0889,  0.0337],
        [ 0.0335, -0.0860,  0.0251, -0.1038, -0.0483,  0.1644,  0.0582, -0.0799,
         -0.1131,  0.0342]], grad_fn=<AddmmBackward0>)

In [29]:
# Zero the gradients before backpropagating; otherwise they accumulate across batches.
net.zero_grad()
# `out` is not a scalar, so backward() is given an upstream gradient of the same (2, 10) shape.
out.backward(torch.randn(2, 10))

nn.Conv2d需要一个4D Tensor，形状为(nSamples, nChannels, Height, Width)。如果输入只有单一样本形式，则需要执行input.unsqueeze(0)，主动将3D Tensor扩充成4D Tensor。

#### 损失函数
- 损失函数接收一个 (output, target) 对作为输入，然后计算出一个数值来评估output和target之间的差距
- nn.MSELoss：计算均方误差损失

In [38]:
# output and target must have matching shapes.
output = net(input)
# Random stand-in target with the same (2, 10) shape as the output.
target = torch.randn(2, 10)

# Mean squared error loss.
criterion = nn.MSELoss()

loss = criterion(output, target)
loss

tensor(1.1806, grad_fn=<MseLossBackward0>)

In [39]:
# Walk the autograd graph backwards, starting from the loss node.
print(loss.grad_fn)
# First input of the loss node: the function that produced `output`.
print(loss.grad_fn.next_functions[0][0])

<MseLossBackward0 object at 0x0000027A6892BB50>
<AddmmBackward0 object at 0x0000027A6892B970>


In [40]:
# Clear any gradients accumulated by earlier backward passes.
net.zero_grad()
# conv1's bias gradient before backpropagation.
# NOTE(review): the recorded output shows zeros; on PyTorch >= 2.0 zero_grad()
# defaults to set_to_none=True and this would print None — confirm the version.
print(net.conv1.bias.grad)

# Run backpropagation.
loss.backward()
# conv1's bias gradient after backpropagation.
print(net.conv1.bias.grad)

tensor([0., 0., 0., 0., 0., 0.])
tensor([-0.0070, -0.0017,  0.0055,  0.0171, -0.0003, -0.0050])


#### SGD
- 随机梯度下降：weight = weight - learning_rate * gradient

In [41]:
# Plain-Python SGD step: weight = weight - learning_rate * gradient.
learning_rate = 0.01
# Update inside no_grad() so the in-place change is not recorded by autograd;
# this replaces the discouraged `.data` access, which bypasses autograd's checks.
with torch.no_grad():
    for f in net.parameters():
        # Parameters that took no part in the last backward pass have no gradient;
        # skip them, as optim.SGD does.
        if f.grad is not None:
            f.sub_(f.grad * learning_rate)

In [42]:
import torch.optim as optim

In [43]:
# SGD optimizer over all of the network's parameters.
optimizer = optim.SGD(net.parameters(), lr=0.01)

# Clear the gradients left over from the previous step.
optimizer.zero_grad()

output = net(input)
loss = criterion(output, target)

loss.backward()
# Apply the parameter update.
optimizer.step()

$$L_{out}=floor((L_{in}+2padding-dilation*(kernel_{size}-1)-1)/stride+1)$$

In [19]:
# conv1 on a 32x32 input: padding 0, dilation 1, kernel 3, stride 1.
(32 + 2 * 0 - 1 * (3- 1) - 1) / 1 + 1

30.0

$$H_{out}=floor((H_{in} + 2padding[0] - dilation[0]*(kernel_{size}[0] - 1) - 1)/stride[0] + 1)$$

In [22]:
(30 + 2 * 0 - 1 * (3 - 1) - 1) / 2 + 1

14.5

In [24]:
(14 + 2 * 0 - 1 * (3- 1) - 1) / 1 + 1

12.0

In [23]:
(12 + 2 * 0 - 1 * (3 - 1) - 1) / 2 + 1

5.5