In [1]:
import torch
from torch import nn
import utils

In [2]:
def corr2d(X, K):
    """计算二维互相关运算(图像卷积)"""
    h, w = K.shape
    Y = torch.zeros((X.shape[0] - h + 1, X.shape[1] - w + 1))
    for i in range(Y.shape[0]):
        for j in range(Y.shape[1]):
            Y[i, j] = (X[i:i + h, j:j + w] * K).sum()
    return Y

In [3]:
X = torch.tensor(
    [[0.0, 1.0, 2.0], 
     [3.0, 4.0, 5.0], 
     [6.0, 7.0, 8.0]]
)
K = torch.tensor(
    [[0.0, 1.0], 
     [2.0, 3.0]]
)
corr2d(X, K)

tensor([[19., 25.],
        [37., 43.]])

In [4]:
class Conv2D(nn.Module):
    def __init__(self, kernel_size):
        super().__init__()
        self.weight = nn.Parameter(torch.rand(kernel_size))
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        return corr2d(x, self.weight) + self.bias

In [5]:
X = torch.ones((6, 8))
X[:, 2:6] = 0
X

tensor([[1., 1., 0., 0., 0., 0., 1., 1.],
        [1., 1., 0., 0., 0., 0., 1., 1.],
        [1., 1., 0., 0., 0., 0., 1., 1.],
        [1., 1., 0., 0., 0., 0., 1., 1.],
        [1., 1., 0., 0., 0., 0., 1., 1.],
        [1., 1., 0., 0., 0., 0., 1., 1.]])

In [6]:
# 该核主要用于发现左右方向上的梯度变化
K = torch.tensor([[1.0, -1.0]])

In [7]:
Y = corr2d(X, K)
Y

tensor([[ 0.,  1.,  0.,  0.,  0., -1.,  0.],
        [ 0.,  1.,  0.,  0.,  0., -1.,  0.],
        [ 0.,  1.,  0.,  0.,  0., -1.,  0.],
        [ 0.,  1.,  0.,  0.,  0., -1.,  0.],
        [ 0.,  1.,  0.,  0.,  0., -1.,  0.],
        [ 0.,  1.,  0.,  0.,  0., -1.,  0.]])

In [8]:
# X转置后无左右方向上的梯度变化
corr2d(X.t(), K)

tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]])

In [9]:
# 构造一个二维卷积层，它具有1个输出通道和形状为（1，2）的卷积核
conv2d = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=(1, 2), bias=True)

# 这个二维卷积层使用四维输入和输出格式（批量大小、通道、高度、宽度），
# 其中批量大小和通道数都为1
X = X.reshape((1, 1, 6, 8))
Y = Y.reshape((1, 1, 6, 7))
lr = 2e-3  # 学习率

for i in range(200):
    Y_hat = conv2d(X)
    l = (Y_hat - Y) ** 2
    conv2d.zero_grad()
    l.sum().backward()
    # 迭代卷积核
    conv2d.weight.data[:] -= lr * conv2d.weight.grad
    if (i + 1) % 20 == 0:
        print(f'epoch {i+1}, loss {l.sum():.3f}')

epoch 20, loss 4.132
epoch 40, loss 1.703
epoch 60, loss 0.796
epoch 80, loss 0.453
epoch 100, loss 0.323
epoch 120, loss 0.274
epoch 140, loss 0.256
epoch 160, loss 0.249
epoch 180, loss 0.246
epoch 200, loss 0.245


In [10]:
# 查看学习出来的kernel,和[[1.0, -1.0]]很接近
conv2d.weight.data, conv2d.bias.data

(tensor([[[[ 0.9274, -1.0587]]]]), tensor([0.1094]))

In [11]:
# (w + 2p - k) // s + 1

In [12]:
# 为了方便起见，我们定义了一个计算卷积层的函数。
# 此函数初始化卷积层权重，并对输入和输出提高和缩减相应的维数
def comp_conv2d(conv2d, X):
    # 这里的（1，1）表示批量大小和通道数都是1
    X = X.reshape((1, 1, X.shape[0], X.shape[1]))
    Y = conv2d(X)
    # 省略前两个维度：批量大小和通道
    return Y.reshape(Y.shape[2:])

# 请注意，这里每边都填充了1行或1列，因此总共添加了2行或2列
conv2d = nn.Conv2d(1, 1, kernel_size=3, padding=1)
X = torch.rand(size=(8, 8))
comp_conv2d(conv2d, X).shape

torch.Size([8, 8])

In [13]:
conv2d = nn.Conv2d(1, 1, kernel_size=(5, 3), padding=(2, 1))
comp_conv2d(conv2d, X).shape

torch.Size([8, 8])

In [14]:
conv2d = nn.Conv2d(1, 1, kernel_size=3, padding=1, stride=2)
comp_conv2d(conv2d, X).shape

torch.Size([4, 4])

In [15]:
conv2d = nn.Conv2d(1, 1, kernel_size=(3, 5), padding=(0, 1), stride=(3, 4))
comp_conv2d(conv2d, X).shape

torch.Size([2, 2])

In [16]:
# 多输入通道
def corr2d_multi_in(X, K):
    # 将X和K对应通道一一卷积，并将结果加在一起
    return sum(corr2d(x, k) for x, k in zip(X, K))

In [17]:
X = torch.tensor(
    [
        [[0.0, 1.0, 2.0], 
         [3.0, 4.0, 5.0], 
         [6.0, 7.0, 8.0]],
        
        [[1.0, 2.0, 3.0], 
         [4.0, 5.0, 6.0], 
         [7.0, 8.0, 9.0]]
    ]
)
K = torch.tensor(
    [
        [[0.0, 1.0], 
         [2.0, 3.0]], 
        
        [[1.0, 2.0], 
         [3.0, 4.0]]
    ]
)

corr2d_multi_in(X, K)

tensor([[ 56.,  72.],
        [104., 120.]])

In [18]:
# 多输出通道
def corr2d_multi_in_out(X, K):
    # K比X多一维度，遍历中的每个k为多通道卷积核，K为多个(输出通道)多通道(输入通道)卷积核
    # 最后将所有结果都叠加在一起
    return torch.stack([corr2d_multi_in(X, k) for k in K], dim=0)

In [19]:
K = torch.stack((K, K + 1, K + 2), dim=0)
K.shape

torch.Size([3, 2, 2, 2])

In [20]:
corr2d_multi_in_out(X, K)

tensor([[[ 56.,  72.],
         [104., 120.]],

        [[ 76., 100.],
         [148., 172.]],

        [[ 96., 128.],
         [192., 224.]]])

In [21]:
# 1 * 1 卷积
def corr2d_multi_in_out_1x1(X, K):
    c_i, h, w = X.shape
    c_o = K.shape[0]
    X = X.reshape((c_i, h * w))
    K = K.reshape((c_o, c_i))
    # 全连接层中的矩阵乘法
    Y = torch.matmul(K, X)
    return Y.reshape((c_o, h, w))

In [25]:
# 1 * 1 卷积可粗略看做一种全连接，通常用于调整网络层的通道数量和控制模型复杂性
# 【1 * 1 卷积意义为每个X的多通道对应像素融合为一个像素，即每个X的多通道融合为单通道】
X = torch.normal(0, 1, (3, 3, 3))
K = torch.normal(0, 1, (2, 3, 1, 1))

Y1 = corr2d_multi_in_out_1x1(X, K)
Y2 = corr2d_multi_in_out(X, K)
float(torch.abs(Y1 - Y2).sum()) < 1e-6

True

In [30]:
# 池化(pooling)
def pool2d(X, pool_size, mode='max'):
    p_h, p_w = pool_size
    Y = torch.zeros((X.shape[0] - p_h + 1, X.shape[1] - p_w + 1))
    for i in range(Y.shape[0]):
        for j in range(Y.shape[1]):
            if mode == 'max':
                Y[i, j] = X[i: i + p_h, j: j + p_w].max()
            elif mode == 'avg':
                Y[i, j] = X[i: i + p_h, j: j + p_w].mean()
    return Y

In [31]:
X = torch.tensor(
    [[0.0, 1.0, 2.0], 
     [3.0, 4.0, 5.0], 
     [6.0, 7.0, 8.0]]
)
pool2d(X, (2, 2))

tensor([[4., 5.],
        [7., 8.]])

In [32]:
pool2d(X, (2, 2), 'avg')

tensor([[2., 3.],
        [5., 6.]])

In [33]:
X = torch.arange(16, dtype=torch.float32).reshape((1, 1, 4, 4))
X

tensor([[[[ 0.,  1.,  2.,  3.],
          [ 4.,  5.,  6.,  7.],
          [ 8.,  9., 10., 11.],
          [12., 13., 14., 15.]]]])

In [35]:
# 默认情况下，torch中的步幅与pooling窗口的大小相同
pool2d = nn.MaxPool2d(3)
pool2d(X)

tensor([[[[10.]]]])

In [36]:
pool2d = nn.MaxPool2d(3, padding=1, stride=2)
pool2d(X)

tensor([[[[ 5.,  7.],
          [13., 15.]]]])

In [37]:
pool2d = nn.MaxPool2d((2, 3), stride=(2, 3), padding=(0, 1))
pool2d(X)

tensor([[[[ 5.,  7.],
          [13., 15.]]]])

In [38]:
# 多通道池化
X = torch.cat((X, X + 1), dim=1)
X

tensor([[[[ 0.,  1.,  2.,  3.],
          [ 4.,  5.,  6.,  7.],
          [ 8.,  9., 10., 11.],
          [12., 13., 14., 15.]],

         [[ 1.,  2.,  3.,  4.],
          [ 5.,  6.,  7.,  8.],
          [ 9., 10., 11., 12.],
          [13., 14., 15., 16.]]]])

In [39]:
pool2d = nn.MaxPool2d(3, padding=1, stride=2)
pool2d(X)

tensor([[[[ 5.,  7.],
          [13., 15.]],

         [[ 6.,  8.],
          [14., 16.]]]])