In [1]:
# AlexNet
# 更深更大的LeNet
# 主要改进:
#   - 丢弃法
#   - ReLu
#   - MaxPooling (梯度大, 训练容易)
# 计算机视觉方法论的改变
#  - (之前) 图片 -> 人工特征提取 -> SVM
#  - (之后) 图片 -> 通过CNN学习特征 -> Softmax回归

# AlexNet架构
#  - image (3x224x224)
#  - 11x11 Conv (96), stride 4 (更大的核窗口和步长, 因为图片更大了) (通道数变大)
#  - 3x3 Maxpool, stride 2 (更大的池化窗口, 使用最大池化层)
#  - 5x5 Conv (256), pad 2 (更多的输出通道)
#  - 3x3 Maxpool, stride 2
#  - 3个3x3 Conv (384), pad 1
#  - 3x3 Maxpool, stride 2
#  - Dense (4096)
#  - Dense (4096)
#  - 1000类输出

# 更多细节
#  - 激活函数从sigmoid变成ReLu (减缓梯度丢失)
#  - 隐藏全连接层后加入丢弃层
#  - 数据增强 (截取, 改变颜色, 改变光照)

# 总结
# - AlexNet是更大更深的LeNet, 10x参数量, 260x计算复杂度
# - 新加入丢弃法, ReLu, 最大池化层, 数据增强

In [2]:
import torch
from torch import nn
from d2l import torch as d2l

net = nn.Sequential(
    nn.Conv2d(1, 96, kernel_size=11, stride=4, padding=1),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.Conv2d(96, 256, kernel_size=5, padding=2), nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.Conv2d(256, 384, kernel_size=3, padding=1), nn.ReLU(),
    nn.Conv2d(384, 384, kernel_size=3, padding=1), nn.ReLU(),
    nn.Conv2d(384, 256, kernel_size=3, padding=1), nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2), nn.Flatten(),
    nn.Linear(6400, 4096), nn.ReLU(), nn.Dropout(p=0.5),
    nn.Linear(4096, 4096), nn.ReLU(), nn.Dropout(p=0.5),
    nn.Linear(4096, 10))

In [3]:
# 构造一个单通道, 来观察每一层的形状
X = torch.rand(1, 1, 224, 224)
for layer in net:
    X = layer(X)
    print(layer.__class__.__name__, 'Output shape:\t', X.shape)

Conv2d Output shape:	 torch.Size([1, 96, 54, 54])
MaxPool2d Output shape:	 torch.Size([1, 96, 26, 26])
Conv2d Output shape:	 torch.Size([1, 256, 26, 26])
ReLU Output shape:	 torch.Size([1, 256, 26, 26])
MaxPool2d Output shape:	 torch.Size([1, 256, 12, 12])
Conv2d Output shape:	 torch.Size([1, 384, 12, 12])
ReLU Output shape:	 torch.Size([1, 384, 12, 12])
Conv2d Output shape:	 torch.Size([1, 384, 12, 12])
ReLU Output shape:	 torch.Size([1, 384, 12, 12])
Conv2d Output shape:	 torch.Size([1, 256, 12, 12])
ReLU Output shape:	 torch.Size([1, 256, 12, 12])
MaxPool2d Output shape:	 torch.Size([1, 256, 5, 5])
Flatten Output shape:	 torch.Size([1, 6400])
Linear Output shape:	 torch.Size([1, 4096])
ReLU Output shape:	 torch.Size([1, 4096])
Dropout Output shape:	 torch.Size([1, 4096])
Linear Output shape:	 torch.Size([1, 4096])
ReLU Output shape:	 torch.Size([1, 4096])
Dropout Output shape:	 torch.Size([1, 4096])
Linear Output shape:	 torch.Size([1, 10])


In [4]:
# Fashion-MNIST 分辨率低于ImageNet, 我们将它增加到224x224
batch_size = 128
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=224)

In [5]:
# 训练AlexNet
lr, num_epochs = 0.01, 10
d2l.train_ch6(ne, train_iter, test_iter, num_epochs, lr, d2l.try_gpu)


KeyboardInterrupt

