先使用 pytorch 中的卷积层进行梯度的验证

+ 前向传播:

$$
oh = \frac{h - \left(d\times\left(kh - 1\right) + 1\right)}{s} + 1
$$

+ 反向传播:

$$
kh = \frac{h - \left(s\times\left(oh - 1\right) + 1\right)}{d} + 1
$$

In [139]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.optim import SGD

# Reference gradients from PyTorch: a 5x5 ramp convolved with a 2x2 all-ones
# kernel (padding 3, stride 2, no bias); the scalar loss is the output sum.
net = nn.Conv2d(1, 1, 2, padding=3, stride=2, bias=False)
net.weight.data.fill_(1.0)

input = Variable(torch.arange(25).float().view(1, 1, 5, 5), requires_grad=True)
output = net(input)
y = output.sum()
y.backward()

# Forward result, loss, gradient w.r.t. the input, gradient w.r.t. the kernel.
print(output, y, input.grad, net.weight.grad, sep="\n")

tensor([[[[ 0.,  0.,  0.,  0.,  0.],
          [ 0.,  0.,  3.,  7.,  0.],
          [ 0., 15., 36., 44.,  0.],
          [ 0., 35., 76., 84.,  0.],
          [ 0.,  0.,  0.,  0.,  0.]]]], grad_fn=<ThnnConv2DBackward>)
tensor(300., grad_fn=<SumBackward0>)
tensor([[[[1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1.]]]])
tensor([[[[ 48.,  72.],
          [ 72., 108.]]]])


In [29]:
# Case where the stride is not 1: 8x8 ramp input, 2x2 all-ones kernel,
# stride 2, followed by one SGD step with lr=1.

input = Variable(torch.arange(64).view(1, 1, 8, 8).float(), requires_grad=True)
# stride=2 is what this cell is about; with the default stride of 1 the
# output would be 7x7 instead of 4x4.
net = nn.Conv2d(1, 1, 2, stride=2, bias=False)
net.weight.data.copy_(torch.ones_like(net.weight.data))

sgd = SGD(net.parameters(), lr=1)

output = net(input)
print(output)
y = output.sum()
print(y)

sgd.zero_grad()
y.backward()
# With lr=1 the update is simply weight <- weight - weight.grad.
sgd.step()
print(input.grad)
print(net.weight.grad)
print(net.weight.data)

tensor([[[[ 18.,  26.,  34.,  42.],
          [ 82.,  90.,  98., 106.],
          [146., 154., 162., 170.],
          [210., 218., 226., 234.]]]], grad_fn=<ThnnConv2DBackward>)
tensor(2016., grad_fn=<SumBackward0>)
tensor([[[[1., 1., 1., 1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1., 1., 1., 1.],
          [1., 1., 1., 1., 1., 1., 1., 1.]]]])
tensor([[[[432., 448.],
          [560., 576.]]]])
tensor([[[[-431., -447.],
          [-559., -575.]]]])


In [17]:
# 432 is weight.grad[0, 0] printed above; dividing by 16 gives the mean of the
# input values it summed (presumably one per position of the 4x4 output).
432 / 16

27.0

# 实现卷积层

1. 使用 `np.lib.stride_tricks.as_strided` 与 `tensordot` 函数实现 `im2col` 以及卷积操作.

    + 需要考虑到 stride, padding, dilation 等情况
    
    + 前向传播:

    $$
    oh = \frac{h - \left(d\times\left(kh - 1\right) + 1\right)}{s} + 1
    $$

    + 反向传播:

    $$
    kh = \frac{h - \left(s\times\left(oh - 1\right) + 1\right)}{d} + 1
    $$

In [30]:
import numpy as np

In [167]:
def make_padding(input, padding=(0, 0)):
    """Zero-pad the last two axes of a 4-D array symmetrically.

    Args:
        input: array of shape (B, C, H, W).
        padding: ``(pad_h, pad_w)``; ``pad_h`` rows are added above and below,
            ``pad_w`` columns are added left and right.

    Returns:
        ``input`` itself when both paddings are zero; otherwise a new float64
        array of shape (B, C, H + 2 * pad_h, W + 2 * pad_w) holding ``input``
        in its centre and zeros elsewhere.
    """
    pad_h, pad_w = padding
    if pad_h == 0 and pad_w == 0:
        return input
    B, C, H, W = input.shape
    pad = np.zeros((B, C, H + 2 * pad_h, W + 2 * pad_w))
    # Use explicit end indices: a negative end of `-0` would select an empty
    # slice whenever only one of the two axes is padded.
    pad[..., pad_h:pad_h + H, pad_w:pad_w + W] = input
    return pad

def make_dilation(input, dilation=(1, 1)):
    """Spread the elements of a 4-D array apart by inserting zeros.

    Along the last two axes, neighbouring elements end up ``dilation[0]`` rows
    and ``dilation[1]`` columns apart, with zeros in between. ``input`` itself
    is returned when ``dilation == (1, 1)``; otherwise the result is a new
    float64 array.
    """
    if dilation != (1, 1):
        batch, channels, rows, cols = input.shape
        row_step, col_step = dilation
        spread = np.zeros((batch,
                           channels,
                           row_step * (rows - 1) + 1,
                           col_step * (cols - 1) + 1))
        # Original values land on a regular grid; everything else stays zero.
        spread[..., ::row_step, ::col_step] = input
        return spread
    return input

def unwrap_padding(input, padding=(0, 0)):
    """Strip symmetric padding from the last two axes of an array.

    Args:
        input: array whose last two axes are (H, W).
        padding: ``(p, q)``; ``p`` rows are dropped from the top and bottom,
            ``q`` columns from the left and right.

    Returns:
        ``input`` itself when both paddings are zero, otherwise a view of
        shape (..., H - 2 * p, W - 2 * q).
    """
    p, q = padding
    if p == 0 and q == 0:
        return input
    H, W = input.shape[-2:]
    # Explicit end indices: `p:-p` degenerates to an empty slice when p == 0.
    return input[..., p:H - p, q:W - q]


def rotate_kernel(kernel):
    """Return ``kernel`` with its last two axes reversed (180-degree spatial rotation)."""
    upside_down = kernel[..., ::-1, :]
    return upside_down[..., ::-1]
    
# Round trip: pad a (1, 2, 2, 2) ramp by 2 on every side, then strip it again.
a = make_padding(np.arange(8).reshape(1, 2, 2, 2), (2, 2))
unwrap_padding(a, (2, 2))

array([[[[0., 1.],
         [2., 3.]],

        [[4., 5.],
         [6., 7.]]]])

In [206]:
def im2col(input, ksize, stride=(1, 1), dilation=(1, 1), writeable=False):
    """Expose every sliding window of ``input`` through a strided view.

    Implemented with ``np.lib.stride_tricks.as_strided``; the whole job is
    choosing the right ``shape`` and ``strides`` for the view. ``input`` must
    already be padded before it is passed in.

    Args:
        input: array of shape (B, C, H, W).
        ksize: ``(kh, kw)`` window size.
        stride: step between neighbouring windows, per axis.
        dilation: step between neighbouring elements inside a window, per axis.
        writeable: forwarded to ``as_strided``.

    Returns:
        A view of shape (B, C, oh, ow, kh, kw).

    Raises:
        AssertionError: if the windows do not tile the input exactly.
    """
    B, C, H, W = input.shape
    kh, kw = ksize

    def _window_count(size, k, step, gap):
        # (size - effective window extent) / step + 1, kept as a float so that
        # a non-integral result can be detected below.
        return (size - (gap * (k - 1) + 1)) / step + 1

    oh = _window_count(H, kh, stride[0], dilation[0])
    ow = _window_count(W, kw, stride[1], dilation[1])
    assert int(oh) == oh and int(ow) == ow, 'conv2d not aligned'

    s_b, s_c, s_h, s_w = input.strides
    view_strides = [
        s_b,
        s_c,
        s_h * stride[0],    # moving to the next window row
        s_w * stride[1],    # moving to the next window column
        s_h * dilation[0],  # moving down inside a window
        s_w * dilation[1],  # moving right inside a window
    ]
    return np.lib.stride_tricks.as_strided(
        input,
        shape=(B, C, int(oh), int(ow), kh, kw),
        strides=view_strides,
        writeable=writeable,
    )

# 3x3 ramp with a 2x2 window, stride 2 and dilation 2: exactly one window
# fits, and it picks up the four corner elements.
a = np.arange(9).reshape(1, 1, 3, 3)
im2col(a, (2, 2), stride=(2, 2), dilation=(2, 2))

array([[[[[[0, 2],
           [6, 8]]]]]])

In [73]:
def conv(input, kernel, padding=(0, 0), stride=(1, 1), dilation=(1, 1)):
    """2-D convolution (cross-correlation) built from im2col and tensordot.

    Args:
        input: array of shape (B, iC, H, W).
        kernel: array of shape (iC, oC, kh, kw).
        padding: zero padding added to each side of H and W.
        stride: step between neighbouring windows.
        dilation: spacing between kernel taps.

    Returns:
        Array of shape (B, oC, oh, ow), where
        oh = (H_padded - (d * (kh - 1) + 1)) / s + 1 and likewise for ow.
    """
    input = make_padding(input, padding)

    # Dilation spaces out the kernel taps, so it is handed to im2col (which
    # samples the window with that spacing) rather than applied to the input.
    x_col = im2col(input, kernel.shape[-2:], stride, dilation)
    # tensordot contracts (iC, kh, kw) and yields (B, oh, ow, oC).
    return np.tensordot(x_col, kernel, axes=[(1, 4, 5), (0, 2, 3)]).transpose(0, 3, 1, 2)

## This result is the same as the one PyTorch produces.
kernel = np.ones((1, 1, 2, 2))
input = np.arange(9).reshape(1, 1, 3, 3)
out_conv = conv(input, kernel)
print(repr(out_conv))

array([[[[ 8., 12.],
         [20., 24.]]]])


对于卷积, 前向传播时有:

$$
    oh = \frac{h - \left(d\times\left(kh - 1\right) + 1\right)}{s} + 1
$$

为了实现误差传播, 实现对前一层输入的梯度求解, 上式可以改成:

$$
h = \left[(oh - 1)\times s + 1\right] + \left[d\times (kh - 1)\right]
$$

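右边第一项表示对 output feature 的梯度进行 dilation (在相邻元素之间插入 `s - 1` 个 0, 设结果为 A), 即 dilation 的系数为 `s`; 第二项来自对 A 的 padding: 在 A 的每一侧各补 `d * (kh - 1)` 个 0, 再用翻转后的 kernel (kernel 自身的 dilation 仍为 `d`) 以步长 1 做卷积, 输出大小为 `A + 2d(kh - 1) - d(kh - 1) = A + d(kh - 1) = h`.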

而对于权重的更新, 上上式可以改为:

  $$
    kh = \frac{h - \left(s\times\left(oh - 1\right) + 1\right)}{d} + 1
  $$

In [212]:
# Backward pass of conv: gradient of the weights and propagation of the error.
def conv(input, kernel, padding=(0, 0), stride=(1, 1), dilation=(1, 1)):
    """2-D convolution (cross-correlation) built from im2col and tensordot.

    Args:
        input: array of shape (B, iC, H, W).
        kernel: array of shape (iC, oC, kh, kw).
        padding: zero padding added to each side of H and W.
        stride: step between neighbouring windows.
        dilation: spacing between kernel taps.

    Returns:
        Array of shape (B, oC, oh, ow), where
        oh = (H_padded - (d * (kh - 1) + 1)) / s + 1 and likewise for ow.
    """
    input = make_padding(input, padding)

    # Dilation spaces out the kernel taps, so it is handed to im2col (which
    # samples the window with that spacing) rather than applied to the input.
    x_col = im2col(input, kernel.shape[-2:], stride, dilation)
    # tensordot contracts (iC, kh, kw) and yields (B, oh, ow, oC).
    return np.tensordot(x_col, kernel, axes=[(1, 4, 5), (0, 2, 3)]).transpose(0, 3, 1, 2)


def backward_conv(input, kernel, eta, padding=(0, 0), stride=(1, 1), dilation=(1, 1)):
    """Gradients of a strided/dilated 2-D convolution w.r.t. input and kernel.

    Args:
        input: forward input, shape (B, iC, H, W), not padded.
        kernel: forward kernel, shape (iC, oC, kh, kw).
        eta: gradient of the loss w.r.t. the forward output, shape (B, oC, oh, ow).
        padding: padding used in the forward pass.
        stride: stride used in the forward pass.
        dilation: kernel dilation used in the forward pass.

    Returns:
        ``(input_grad, kernel_grad)`` with the shapes of ``input`` and ``kernel``.
    """
    kh, kw = kernel.shape[-2:]
    oh, ow = eta.shape[-2:]
    dh, dw = dilation

    # Error propagation: spread eta apart by the stride, pad every side by
    # d * (k - 1), then slide the rotated kernel (taps still d apart) over it
    # with stride 1. This is a "full" correlation whose output has the size of
    # the padded forward input.
    ieta = make_dilation(eta, stride)
    ieta = make_padding(ieta, (dh * (kh - 1), dw * (kw - 1)))
    x_col = im2col(ieta, (kh, kw), dilation=dilation)
    # Contract (oC, kh, kw) -> (B, H_pad, W_pad, iC), then move channels forward.
    input_grad = np.tensordot(x_col, rotate_kernel(kernel), axes=[(1, 4, 5), (1, 2, 3)]).transpose(0, 3, 1, 2)
    input_grad = unwrap_padding(input_grad, padding)

    # Weight gradient: correlate the padded input with eta. The roles of
    # stride and dilation swap: windows start d apart (one per kernel tap) and
    # sample the input s apart (one sample per output position).
    input = make_padding(input, padding)
    x_col = im2col(input, (oh, ow), stride=dilation, dilation=stride)
    # Contract (B, oh, ow) -> (iC, kh, kw, oC), then reorder to (iC, oC, kh, kw).
    kernel_grad = np.tensordot(x_col, eta, axes=[(0, 4, 5), (0, 2, 3)]).transpose(0, 3, 1, 2)
    return input_grad, kernel_grad
    

    
# Forward and backward pass on a 3x3 ramp with a 2x2 ramp kernel and padding 1;
# eta is all ones, i.e. the loss is the sum of the output.
padding = stride = dilation = (1, 1)
kernel = np.arange(4).reshape(1, 1, 2, 2)
input = np.arange(9).reshape(1, 1, 3, 3)

out_conv = conv(input, kernel, padding=padding, stride=stride, dilation=dilation)
print(out_conv.shape)

eta = np.ones_like(out_conv)
backward_conv(input, kernel, eta, padding=padding, stride=stride, dilation=dilation)

(1, 1, 4, 4)


(array([[[[6., 6., 6.],
          [6., 6., 6.],
          [6., 6., 6.]]]]), array([[[[36., 36.],
          [36., 36.]]]]))

In [213]:
# PyTorch reference for the same problem: 3x3 ramp input, 2x2 ramp kernel,
# padding 1, stride 1, loss = sum of the output.
net = nn.Conv2d(1, 1, 2, padding=1, stride=1, bias=False)
shape = net.weight.data.size()
net.weight.data.copy_(torch.arange(4).view(shape))

input = Variable(torch.arange(9).float().view(1, 1, 3, 3), requires_grad=True)
output = net(input)
y = output.sum()
y.backward()

# Forward result, loss, gradient w.r.t. the input, gradient w.r.t. the kernel.
print(output, y, input.grad, net.weight.grad, sep="\n")

tensor([[[[ 0.,  3.,  8.,  4.],
          [ 9., 19., 25., 10.],
          [21., 37., 43., 16.],
          [ 6.,  7.,  8.,  0.]]]], grad_fn=<ThnnConv2DBackward>)
tensor(216., grad_fn=<SumBackward0>)
tensor([[[[6., 6., 6.],
          [6., 6., 6.],
          [6., 6., 6.]]]])
tensor([[[[36., 36.],
          [36., 36.]]]])


In [208]:
# Quick check that a float compares equal to the same-valued int, which is the
# comparison im2col's `int(oh) == oh` alignment assert relies on.
5. == 5

True

## 尝试用循环来实现卷积层


In [299]:
import numpy as np

In [313]:
# Settings for the loop-based experiment.
p, s, d = 0, 1, 1    # padding, stride, dilation

h = w = 5            # input height / width
kh = kw = 2          # kernel height / width
N, C, oC = 2, 3, 1   # batch size, input channels, output channels

input = np.arange(N * C * h * w).astype(np.float64).reshape((N, C, h, w))
# Alternative ramp kernel:
# kernel = np.arange(kh * kw, dtype=np.float64).reshape(1, 1, kh, kw)
kernel = np.full((C, oC, kh, kw), 1.0)

def conv(input, kernel, p, s, d):
    """Loop-based 2-D convolution (cross-correlation).

    Args:
        input: array of shape (N, iC, H, W).
        kernel: array of shape (iC, oC, kh, kw).
        p: zero padding added to every side of H and W.
        s: stride.
        d: dilation (spacing between kernel taps).

    Returns:
        float64 array of shape (N, oC, oh, ow) with
        oh = (H + 2p - (d * (kh - 1) + 1)) // s + 1, and likewise for ow.

    Raises:
        AssertionError: if the channel counts of input and kernel differ.
    """
    input = np.pad(input, ((0, 0), (0, 0), (p, p), (p, p)), mode="constant")

    N, C, H, W = input.shape
    iC, oC, kh, kw = kernel.shape
    assert C == iC, 'channels not aligned'

    # Extent covered by a dilated kernel along each axis.
    span_h, span_w = d * (kh - 1) + 1, d * (kw - 1) + 1
    oh, ow = (H - span_h) // s + 1, (W - span_w) // s + 1
    out = np.zeros((N, oC, oh, ow))

    for i in range(oh):
        for j in range(ow):
            # Every d-th element inside the span: shape (N, iC, kh, kw).
            window = input[:, :, i * s:i * s + span_h:d, j * s:j * s + span_w:d]
            # Contract (iC, kh, kw) explicitly so that size-1 axes can never
            # be mis-broadcast; the result has shape (N, oC).
            out[:, :, i, j] = np.tensordot(window, kernel, axes=([1, 2, 3], [0, 2, 3]))

    return out

# Forward pass with the loop-based conv; the shape of the result is reused
# further down to build eta.
out_conv = conv(input, kernel, p, s, d)

def backward_conv(input, kernel, eta, p, s, d):
    """Loop-based gradients of a 2-D convolution w.r.t. input and kernel.

    Args:
        input: forward input, shape (N, iC, h, w), not padded.
        kernel: forward kernel, shape (iC, oC, kh, kw).
        eta: gradient of the loss w.r.t. the forward output, shape (N, oC, oh, ow).
        p: padding used in the forward pass.
        s: stride used in the forward pass.
        d: dilation used in the forward pass.

    Returns:
        ``(input_grad, kernel_grad)`` with the shapes of ``input`` and ``kernel``.
    """
    N, C, h, w = input.shape
    iC, oC, kh, kw = kernel.shape
    oh, ow = eta.shape[-2:]
    # Accumulate in a common dtype so integer inputs cannot truncate or
    # reject the in-place additions below.
    dtype = np.result_type(input.dtype, kernel.dtype, eta.dtype)

    span_h, span_w = d * (kh - 1) + 1, d * (kw - 1) + 1

    ## Error propagation.
    ## Each output gradient is scattered straight back onto the window that
    ## produced it, so the kernel does not need to be flipped here.
    padded_grad = np.zeros((N, C, h + 2 * p, w + 2 * p), dtype=dtype)
    for i in range(oh):
        for j in range(ow):
            rows = slice(i * s, i * s + span_h, d)
            cols = slice(j * s, j * s + span_w, d)
            # (N, oC) x (iC, oC, kh, kw) -> (N, iC, kh, kw): sums over the
            # output channels explicitly instead of relying on broadcasting.
            padded_grad[:, :, rows, cols] += np.tensordot(
                eta[:, :, i, j], kernel, axes=([1], [1]))
    input_grad = padded_grad[:, :, p:p + h, p:p + w]

    ## Weight gradient.
    ## Tap (i, j) saw the padded input at offset (i * d, j * d), sampled every
    ## s elements, once per output position.
    padded = np.pad(input, ((0, 0), (0, 0), (p, p), (p, p)), mode="constant")
    kernel_grad = np.zeros((iC, oC, kh, kw), dtype=dtype)
    for i in range(kh):
        for j in range(kw):
            rows = slice(i * d, i * d + s * (oh - 1) + 1, s)
            cols = slice(j * d, j * d + s * (ow - 1) + 1, s)
            # (N, iC, oh, ow) x (N, oC, oh, ow) -> (iC, oC)
            kernel_grad[:, :, i, j] = np.tensordot(
                padded[:, :, rows, cols], eta, axes=([0, 2, 3], [0, 2, 3]))
    return input_grad, kernel_grad


# All-ones upstream gradient, i.e. the loss is the plain sum of the output.
eta = np.ones_like(out_conv)
input_grad, kernel_grad = backward_conv(input, kernel, eta, p, s, d)
print(input_grad, kernel_grad, sep="\n")

[[[[1. 2. 2. 2. 1.]
   [2. 4. 4. 4. 2.]
   [2. 4. 4. 4. 2.]
   [2. 4. 4. 4. 2.]
   [1. 2. 2. 2. 1.]]

  [[1. 2. 2. 2. 1.]
   [2. 4. 4. 4. 2.]
   [2. 4. 4. 4. 2.]
   [2. 4. 4. 4. 2.]
   [1. 2. 2. 2. 1.]]

  [[1. 2. 2. 2. 1.]
   [2. 4. 4. 4. 2.]
   [2. 4. 4. 4. 2.]
   [2. 4. 4. 4. 2.]
   [1. 2. 2. 2. 1.]]]


 [[[1. 2. 2. 2. 1.]
   [2. 4. 4. 4. 2.]
   [2. 4. 4. 4. 2.]
   [2. 4. 4. 4. 2.]
   [1. 2. 2. 2. 1.]]

  [[1. 2. 2. 2. 1.]
   [2. 4. 4. 4. 2.]
   [2. 4. 4. 4. 2.]
   [2. 4. 4. 4. 2.]
   [1. 2. 2. 2. 1.]]

  [[1. 2. 2. 2. 1.]
   [2. 4. 4. 4. 2.]
   [2. 4. 4. 4. 2.]
   [2. 4. 4. 4. 2.]
   [1. 2. 2. 2. 1.]]]]
[[[[1488. 1520.]
   [1648. 1680.]]]


 [[[2288. 2320.]
   [2448. 2480.]]]


 [[[3088. 3120.]
   [3248. 3280.]]]]


In [314]:
# PyTorch reference for the loop-based experiment: same sizes, all-ones
# kernel, loss = sum of the output.
net = nn.Conv2d(C, oC, (kh, kw), padding=p, stride=s, bias=False)
shape = net.weight.data.size()
# Alternative ramp kernel:
# net.weight.data.copy_(torch.arange(kh * kw).view(shape))
net.weight.data.fill_(1.0)

input = Variable(torch.arange(N * C * h * w).float().view(N, C, h, w), requires_grad=True)
output = net(input)
y = output.sum()
y.backward()

# Gradient w.r.t. the input, then gradient w.r.t. the kernel.
print(input.grad, net.weight.grad, sep="\n")

tensor([[[[1., 2., 2., 2., 1.],
          [2., 4., 4., 4., 2.],
          [2., 4., 4., 4., 2.],
          [2., 4., 4., 4., 2.],
          [1., 2., 2., 2., 1.]],

         [[1., 2., 2., 2., 1.],
          [2., 4., 4., 4., 2.],
          [2., 4., 4., 4., 2.],
          [2., 4., 4., 4., 2.],
          [1., 2., 2., 2., 1.]],

         [[1., 2., 2., 2., 1.],
          [2., 4., 4., 4., 2.],
          [2., 4., 4., 4., 2.],
          [2., 4., 4., 4., 2.],
          [1., 2., 2., 2., 1.]]],


        [[[1., 2., 2., 2., 1.],
          [2., 4., 4., 4., 2.],
          [2., 4., 4., 4., 2.],
          [2., 4., 4., 4., 2.],
          [1., 2., 2., 2., 1.]],

         [[1., 2., 2., 2., 1.],
          [2., 4., 4., 4., 2.],
          [2., 4., 4., 4., 2.],
          [2., 4., 4., 4., 2.],
          [1., 2., 2., 2., 1.]],

         [[1., 2., 2., 2., 1.],
          [2., 4., 4., 4., 2.],
          [2., 4., 4., 4., 2.],
          [2., 4., 4., 4., 2.],
          [1., 2., 2., 2., 1.]]]])
tensor([[[[1488., 1520.],