

![](../image/alexnet.png)

网络结构:
- 五个卷积层(卷积核依次为11x11, 5x5, 3个3x3).

- 三个全连接层

改进:

- 增加卷积层数目, 并使用不同大小的卷积核

- 相对于LeNet使用的sigmoid激活函数和平均汇聚层, AlexNet改用**ReLU**激活函数和**最大汇聚层**.

- AlexNet通过**暂退法**控制全连接层的模型复杂度，而LeNet只使用了权重衰减。

数据集

- AlexNet在训练时增加了大量的图像增强数据，如翻转、裁切和变色。
    
  将AlexNet直接应用于Fashion-MNIST的一个问题是，Fashion-MNIST图像的分辨率（$28 \times 28$像素）低于ImageNet图像。
  为了解决这个问题，我们将它们增加到$224 \times 224$（通常来讲这不是一个明智的做法，但我们在这里这样做是为了有效使用AlexNet架构）。

In [1]:
import torch
# nn块
from torch import nn
# data.DataLoader
from torch.utils.data import DataLoader
# torchvision.datasets.FashionMNIST
import torchvision
# 修改数据集格式
from torchvision import transforms

In [3]:
# ----------- Hyperparameters -----------
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)
# Mini-batch size, SGD learning rate and number of training epochs.
batch_size, lr, num_epochs = 128, 3e-2, 10

cuda


In [4]:
# Preprocessing applied to every image: resize to the 224x224 input used for
# AlexNet here, then convert to a tensor. Wrapping the steps in
# transforms.Compose gives a single callable that can be passed as
# `transform=trans`.
trans = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])
# Build the training split first and the test split second. Both use the
# same root directory, request a download, and share the preprocessing.
mnist_train_totensor, mnist_test_totensor = (
    torchvision.datasets.FashionMNIST(
        root="../data",
        train=is_train,
        download=True,
        transform=trans,
    )
    for is_train in (True, False)
)
# Shape of the first training image after preprocessing.
mnist_train_totensor[0][0].shape

torch.Size([1, 224, 224])

In [5]:
# Training batches are drawn in shuffled order; num_workers=4 loads the data
# with four worker processes.
train_iter = DataLoader(
    mnist_train_totensor, batch_size, shuffle=True, num_workers=4)
# Evaluation gains nothing from shuffling. A fixed order keeps the batch
# composition (and therefore any per-batch statistic) reproducible.
test_iter = DataLoader(
    mnist_test_totensor, batch_size, shuffle=False, num_workers=4)

In [6]:
# Convolutional part of the network, listed in forward order.
conv_layers = [
    # A large 11x11 window is used to capture objects, and a stride of 4
    # reduces the output height and width. The number of output channels is
    # much larger than in LeNet.
    nn.Conv2d(1, 96, kernel_size=11, stride=4, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),
    # Smaller 5x5 window; padding of 2 keeps the height and width unchanged
    # while the number of output channels grows.
    nn.Conv2d(96, 256, kernel_size=5, padding=2),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),
    # Three consecutive 3x3 convolutions. The channel count grows further
    # except in the last one, and only the last one is followed by pooling.
    nn.Conv2d(256, 384, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Conv2d(384, 384, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Conv2d(384, 256, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),
]

# Fully connected classifier. The hidden layers are several times wider than
# LeNet's, and dropout is used to mitigate overfitting.
classifier_layers = [
    nn.Flatten(),
    # 6400 = 256 channels * 5 * 5 for a 224x224 input.
    nn.Linear(6400, 4096),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(4096, 4096),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    # Output layer: Fashion-MNIST has 10 classes rather than the 1000 of the
    # original paper.
    nn.Linear(4096, 10),
]

net = nn.Sequential(*conv_layers, *classifier_layers).to(device)
net

Sequential(
  (0): Conv2d(1, 96, kernel_size=(11, 11), stride=(4, 4), padding=(1, 1))
  (1): ReLU()
  (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (3): Conv2d(96, 256, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
  (4): ReLU()
  (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (6): Conv2d(256, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (7): ReLU()
  (8): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (9): ReLU()
  (10): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (11): ReLU()
  (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (13): Flatten(start_dim=1, end_dim=-1)
  (14): Linear(in_features=6400, out_features=4096, bias=True)
  (15): ReLU()
  (16): Dropout(p=0.5, inplace=False)
  (17): Linear(in_features=4096, out_features=4096, bias=True)
  (18): ReLU()
  (19): Dropout(p=0.5, inplace=False)
  (20): Linear(in_features=4096, out_features=10, bias=True)
)

In [7]:
# Feed one random single-channel 224x224 image through the network layer by
# layer and print the output shape produced by each layer.
X = torch.rand(1, 1, 224, 224, dtype=torch.float32)
X = X.to(device)
for layer in net:
    X = layer(X)
    print(f'output shape: {layer.__class__.__name__: <15}{X.shape}')

output shape: Conv2d         torch.Size([1, 96, 54, 54])
output shape: ReLU           torch.Size([1, 96, 54, 54])
output shape: MaxPool2d      torch.Size([1, 96, 26, 26])
output shape: Conv2d         torch.Size([1, 256, 26, 26])
output shape: ReLU           torch.Size([1, 256, 26, 26])
output shape: MaxPool2d      torch.Size([1, 256, 12, 12])
output shape: Conv2d         torch.Size([1, 384, 12, 12])
output shape: ReLU           torch.Size([1, 384, 12, 12])
output shape: Conv2d         torch.Size([1, 384, 12, 12])
output shape: ReLU           torch.Size([1, 384, 12, 12])
output shape: Conv2d         torch.Size([1, 256, 12, 12])
output shape: ReLU           torch.Size([1, 256, 12, 12])
output shape: MaxPool2d      torch.Size([1, 256, 5, 5])
output shape: Flatten        torch.Size([1, 6400])
output shape: Linear         torch.Size([1, 4096])
output shape: ReLU           torch.Size([1, 4096])
output shape: Dropout        torch.Size([1, 4096])
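output shape: Linear         torch.Size([1, 4096])
output shape: ReLU           torch.Size([1, 4096])
output shape: Dropout        torch.Size([1, 4096])
output shape: Linear         torch.Size([1, 10])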

In [8]:
def init_weights(m: nn.Module) -> None:
    """Xavier-uniform initialise the weight of a linear or 2-D conv layer.

    Meant to be passed to ``Module.apply`` so that it is called once for
    every submodule. Modules of any other kind are left untouched, and only
    ``m.weight`` is re-initialised (biases are not modified).

    Args:
        m: The module to (possibly) initialise in place.
    """
    # isinstance, unlike an exact type comparison, also matches subclasses
    # of nn.Linear / nn.Conv2d.
    if isinstance(m, (nn.Linear, nn.Conv2d)):
        nn.init.xavier_uniform_(m.weight)


# Cross-entropy loss for the classification task.
loss = nn.CrossEntropyLoss()
# Run init_weights on every submodule of the network.
net.apply(init_weights)
# Plain SGD over all network parameters with the configured learning rate.
optimizer = torch.optim.SGD(params=net.parameters(), lr=lr)

In [9]:
def train_loop(train_iter, net, loss, optimizer):
    """Train ``net`` for one pass over ``train_iter``.

    Args:
        train_iter: Sized iterable yielding ``(X, y)`` batches.
        net: The model to train; it is switched to training mode.
        loss: Callable computing the batch loss from ``(y_hat, y)``.
        optimizer: Optimizer that updates the parameters of ``net``.

    Returns:
        The mean of the per-batch loss values, or ``0.0`` when
        ``train_iter`` contains no batches.
    """
    # An earlier evaluation may have left the model in eval mode, which
    # would silently disable dropout during training.
    net.train()

    # Send each batch to the device the model lives on instead of relying
    # on a module-level global. A parameterless model leaves batches as-is.
    first_param = next(net.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    # Number of batches in one pass.
    num_batchs = len(train_iter)
    # Running sum of the per-batch losses.
    total_train_loss = 0.0
    for batch, (X, y) in enumerate(train_iter):
        if target_device is not None:
            X, y = X.to(target_device), y.to(target_device)
        # Forward pass for this batch.
        y_hat = net(X)

        train_loss = loss(y_hat, y)
        total_train_loss += train_loss.item()

        # Backpropagation and parameter update.
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        # Progress indicator, rewritten in place on a single line.
        print(f"\r[{batch+1:>8d}/{num_batchs:>8d}]  ", end='')

    if num_batchs == 0:
        return 0.0
    return total_train_loss / num_batchs

# --------- Training
# Make num_epochs passes over the training data and report the value
# returned by train_loop after each pass.
for epoch in range(num_epochs):
    total_train_loss = train_loop(
        train_iter=train_iter, net=net, loss=loss, optimizer=optimizer)
    print(f'epoch {epoch + 1}, total_train_loss {total_train_loss:f}')

In [None]:
# ---------- Evaluation
def test_net(test_iter, net, loss):
    """Evaluate ``net`` on ``test_iter`` and print accuracy and mean loss.

    Accuracy is the number of correctly classified samples divided by the
    total number of samples, so a smaller final batch is not over-weighted.
    The reported loss is the mean of the per-batch loss values.

    Args:
        test_iter: Sized iterable yielding ``(X, y)`` batches.
        net: The model to evaluate; it is left in evaluation mode.
        loss: Callable computing the batch loss from ``(y_hat, y)``.
    """
    # Send each batch to the device the model lives on instead of relying
    # on a module-level global. A parameterless model leaves batches as-is.
    first_param = next(net.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    # Number of batches in the evaluation set.
    num_batchs = len(test_iter)
    # Running loss sum, correct-prediction count and sample count.
    total_test_loss, num_correct, num_samples = 0.0, 0, 0
    # Evaluation mode (disables dropout).
    net.eval()
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch, (X, y) in enumerate(test_iter):
            if target_device is not None:
                X, y = X.to(target_device), y.to(target_device)
            y_hat = net(X)

            total_test_loss += loss(y_hat, y).item()
            # A prediction is correct when the arg-max class equals the label.
            num_correct += (y_hat.argmax(1) == y).sum().item()
            num_samples += len(y)

            # Progress indicator, rewritten in place on a single line.
            print(f"\r[{batch+1:>8d}/{num_batchs:>8d}]  ", end='')

    # max(..., 1) avoids a ZeroDivisionError on an empty iterable.
    total_test_loss /= max(num_batchs, 1)
    total_correct = num_correct / max(num_samples, 1)
    print(
        f"\nTest: Accuracy: {total_correct:.1%}, Avg loss: {total_test_loss:f}")


# The ``test_`` prefix would make pytest try to collect this helper as a test
# case; mark it as a regular function.
test_net.__test__ = False
    
# Evaluate the network on the test batches and print the metrics.
test_net(test_iter=test_iter, net=net, loss=loss)

[      79/      79]  
Test: Accuracy: 87.6%, Avg loss: 0.332149
