# 卷积神经网络

LeNet是最早的卷积神经网络之一，由Yann LeCun等人在1989年提出，其经典版本LeNet-5于1998年发表。LeNet在手写数字识别等图像分类任务上取得了很好的效果，是深度学习领域的奠基性工作之一。

LeNet分为两个部分：卷积层块和全连接层块。

In [1]:
import d2lzh as d2l
import mxnet as mx
from mxnet import autograd, gluon, init, nd
from mxnet.gluon import loss as gloss, nn
import time

# LeNet: a convolutional block (two conv + max-pool stages) followed by a
# fully connected block (three dense layers).
net = nn.Sequential()

# Convolutional block: 5x5 kernels with stride 1, each followed by 2x2 max
# pooling with stride 2.
net.add(nn.Conv2D(channels=6, kernel_size=5, strides=1, activation='sigmoid'),
        nn.MaxPool2D(pool_size=2, strides=2))
net.add(nn.Conv2D(channels=16, kernel_size=5, strides=1, activation='sigmoid'),
        nn.MaxPool2D(pool_size=2, strides=2))

# Fully connected block. By default Dense reshapes input of shape
# (batch, channel, height, width) into (batch, channel * height * width).
net.add(nn.Dense(units=120, activation='sigmoid'),
        nn.Dense(units=84, activation='sigmoid'),
        nn.Dense(units=10))

In [2]:
# Feed one random single-channel 28x28 sample through the network one layer
# at a time and print the shape each layer produces.
X = nd.random.uniform(shape=(1, 1, 28, 28))
net.initialize()
for layer in net:
    X = layer(X)
    print(layer.name, 'output shape:\t', X.shape)

conv0 output shape:	 (1, 6, 24, 24)
pool0 output shape:	 (1, 6, 12, 12)
conv1 output shape:	 (1, 16, 8, 8)
pool1 output shape:	 (1, 16, 4, 4)
dense0 output shape:	 (1, 120)
dense1 output shape:	 (1, 84)
dense2 output shape:	 (1, 10)


获取数据和训练模型

In [3]:
# Mini-batch size used to build the Fashion-MNIST iterators; the same value
# is later handed to the training loop.
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)

In [11]:
def try_gpu():
    """Return a GPU context if one is usable, otherwise the CPU context.

    Returns:
        ``mx.gpu()`` when a one-element array can be allocated on it,
        ``mx.cpu()`` when that allocation raises ``mx.base.MXNetError``.
    """
    try:
        ctx = mx.gpu()
        # Allocate a tiny array on the GPU context; a failure here is what
        # triggers the CPU fallback.
        _ = nd.zeros((1,), ctx=ctx)
    except mx.base.MXNetError:
        # Only MXNet's own error is treated as "no usable GPU". A bare
        # ``except`` would also hide KeyboardInterrupt and programming
        # errors such as NameError.
        ctx = mx.cpu()
    return ctx
# Select the compute context once; the bare expression on the last line makes
# the notebook display which device was chosen.
ctx = try_gpu()
ctx

cpu(0)

In [16]:
def evaluate_accuracy(data_iter, net, ctx):
    """Return the fraction of samples in ``data_iter`` that ``net`` gets right.

    Args:
        data_iter: iterable yielding ``(X, y)`` batches.
        net: callable model; its output is reduced with ``argmax(axis=1)``.
        ctx: context that every batch is copied to before evaluation.

    Returns:
        Number of correct predictions divided by the number of labels seen.
    """
    # The running count lives on ``ctx`` so it can be accumulated in place
    # from the per-batch sums computed there.
    correct = nd.array([0], ctx=ctx)
    total = 0
    for features, labels in data_iter:
        features = features.as_in_context(ctx)
        # Labels are cast to float32 before being compared with the argmax
        # predictions.
        labels = labels.as_in_context(ctx).astype('float32')
        predictions = net(features).argmax(axis=1)
        correct += (predictions == labels).sum()
        total += labels.size
    return correct.asscalar() / total
def train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx, num_epochs):
    """Train ``net`` on ``ctx`` and print loss/accuracy/time after every epoch.

    Args:
        net: model to train.
        train_iter: iterable yielding ``(X, y)`` training batches.
        test_iter: iterable passed to ``evaluate_accuracy`` after each epoch.
        batch_size: value forwarded to ``trainer.step``.
        trainer: optimizer wrapper whose ``step`` applies the update.
        ctx: context that every batch is copied to.
        num_epochs: number of passes over ``train_iter``.
    """
    print('training on', ctx)
    loss = gloss.SoftmaxCrossEntropyLoss()
    for epoch in range(num_epochs):
        loss_total = 0.0
        correct_total = 0.0
        sample_count = 0
        start = time.time()
        for features, labels in train_iter:
            features = features.as_in_context(ctx)
            labels = labels.as_in_context(ctx)
            with autograd.record():
                outputs = net(features)
                # Per-sample losses are summed; trainer.step(batch_size)
                # below receives the batch size for the update.
                batch_loss = loss(outputs, labels).sum()
            batch_loss.backward()
            trainer.step(batch_size)
            # Cast labels to float32 before comparing with argmax output.
            labels = labels.astype('float32')
            loss_total += batch_loss.asscalar()
            hits = (outputs.argmax(axis=1) == labels).sum()
            correct_total += hits.asscalar()
            sample_count += labels.size
        test_acc = evaluate_accuracy(test_iter, net, ctx)
        elapsed = time.time() - start
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, loss_total / sample_count,
                 correct_total / sample_count, test_acc, elapsed))


In [19]:
# Baseline run: SGD with learning rate 0.9 for 5 epochs.
lr = 0.9
num_epochs = 5
# force_reinit discards the parameters created by the earlier shape trace and
# re-initializes them with Xavier on the selected context.
net.initialize(init=init.Xavier(), ctx=ctx, force_reinit=True)
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx, num_epochs)

training on cpu(0)
epoch 1, loss 2.3168, train acc 0.108, test acc 0.189, time 27.1 sec
epoch 2, loss 1.3413, train acc 0.470, test acc 0.627, time 27.1 sec
epoch 3, loss 0.8670, train acc 0.659, test acc 0.708, time 24.9 sec
epoch 4, loss 0.7295, train acc 0.715, test acc 0.739, time 26.0 sec
epoch 5, loss 0.6486, train acc 0.744, test acc 0.762, time 25.4 sec


In [21]:
# Experiment: shrink the first convolution window from 5x5 to 3x3.
# evaluate_accuracy and train_ch5 are reused from the earlier cell that
# defines them; they were previously pasted here again unchanged.

net = nn.Sequential()
# First Conv2D: 6 output channels, 3x3 kernel, stride 1, sigmoid activation.
net.add(nn.Conv2D(6, kernel_size=3, strides=1, activation='sigmoid'),
        nn.MaxPool2D(pool_size=2, strides=2),
        nn.Conv2D(16, kernel_size=5, strides=1, activation='sigmoid'),
        nn.MaxPool2D(pool_size=2, strides=2),
        # By default Dense reshapes (batch, channel, height, width) input
        # into (batch, channel * height * width).
        nn.Dense(120, activation='sigmoid'),
        nn.Dense(84, activation='sigmoid'),
        nn.Dense(10))

# Trace one random single-channel 28x28 sample through the layers and print
# each output shape.
X = nd.random.uniform(shape=(1, 1, 28, 28))
net.initialize()
for layer in net:
    X = layer(X)
    print(layer.name, 'output shape:\t', X.shape)

# Same optimizer settings as the baseline run so only the kernel size differs.
lr, num_epochs = 0.9, 5
net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier())
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx, num_epochs)

conv2 output shape:	 (1, 6, 26, 26)
pool2 output shape:	 (1, 6, 13, 13)
conv3 output shape:	 (1, 16, 9, 9)
pool3 output shape:	 (1, 16, 4, 4)
dense3 output shape:	 (1, 120)
dense4 output shape:	 (1, 84)
dense5 output shape:	 (1, 10)
training on cpu(0)
epoch 1, loss 2.3189, train acc 0.104, test acc 0.100, time 16.3 sec
epoch 2, loss 1.5385, train acc 0.412, test acc 0.652, time 22.7 sec
epoch 3, loss 0.8479, train acc 0.670, test acc 0.718, time 22.6 sec
epoch 4, loss 0.7208, train acc 0.717, test acc 0.742, time 23.8 sec
epoch 5, loss 0.6494, train acc 0.744, test acc 0.761, time 22.0 sec


卷积窗口大小决定了卷积层提取特征的感受野（Receptive Field）。

影响：

较小的卷积窗口（如 3x3）：

提取更细粒度的局部特征。

计算量较小，适合深层网络。

在图像分类任务中，通常使用多个小卷积核堆叠（如 VGG 网络）来替代大卷积核，以减少参数数量并增加非线性。

较大的卷积窗口（如 5x5 或 7x7）：

单层即可覆盖更大的感受野，提取更大范围（更接近全局）的特征。

计算量较大，可能导致过拟合。

适合输入图像较大且需要捕捉全局信息的任务。

实验结果：

本次实验中，将第一个卷积层的窗口由 5x5 改为 3x3 后，训练 5 个周期的测试准确率为 0.761，与原始设置的 0.762 基本持平，说明在这个浅层网络上窗口大小的影响并不明显。一般而言，使用较小的卷积窗口（如 3x3）通常可以获得更好的性能，尤其是在深层网络中。

较大的卷积窗口可能会导致模型过拟合，尤其是在数据量较少的情况下。

In [22]:
# Experiment: raise the first convolution's output channels from 6 to 10.
# evaluate_accuracy and train_ch5 are reused from the earlier cell that
# defines them; they were previously pasted here again unchanged.

net = nn.Sequential()
# First Conv2D: 10 output channels, 5x5 kernel, stride 1, sigmoid activation.
net.add(nn.Conv2D(10, kernel_size=5, strides=1, activation='sigmoid'),
        nn.MaxPool2D(pool_size=2, strides=2),
        nn.Conv2D(16, kernel_size=5, strides=1, activation='sigmoid'),
        nn.MaxPool2D(pool_size=2, strides=2),
        # By default Dense reshapes (batch, channel, height, width) input
        # into (batch, channel * height * width).
        nn.Dense(120, activation='sigmoid'),
        nn.Dense(84, activation='sigmoid'),
        nn.Dense(10))

# Trace one random single-channel 28x28 sample through the layers and print
# each output shape.
X = nd.random.uniform(shape=(1, 1, 28, 28))
net.initialize()
for layer in net:
    X = layer(X)
    print(layer.name, 'output shape:\t', X.shape)

# Same optimizer settings as the baseline run so only the channel count differs.
lr, num_epochs = 0.9, 5
net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier())
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx, num_epochs)

conv4 output shape:	 (1, 10, 24, 24)
pool4 output shape:	 (1, 10, 12, 12)
conv5 output shape:	 (1, 16, 8, 8)
pool5 output shape:	 (1, 16, 4, 4)
dense6 output shape:	 (1, 120)
dense7 output shape:	 (1, 84)
dense8 output shape:	 (1, 10)
training on cpu(0)
epoch 1, loss 2.3153, train acc 0.105, test acc 0.165, time 25.0 sec
epoch 2, loss 1.3913, train acc 0.458, test acc 0.615, time 43.3 sec
epoch 3, loss 0.8437, train acc 0.670, test acc 0.730, time 47.1 sec
epoch 4, loss 0.6965, train acc 0.726, test acc 0.745, time 45.8 sec
epoch 5, loss 0.6172, train acc 0.757, test acc 0.771, time 33.6 sec


输出通道数决定了卷积层提取的特征图数量。

影响：

较多的输出通道：

提取更丰富的特征，增强模型的表达能力。

计算量和参数数量增加，可能导致过拟合。

需要更多的训练数据来充分训练模型。

较少的输出通道：

计算量较小，训练速度更快。

可能无法充分捕捉数据的复杂特征，导致欠拟合。

实验结果：

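本次实验中，将第一个卷积层的输出通道数由 6 增加到 10 后，训练 5 个周期的测试准确率由 0.762 略升至 0.771，但每个周期的训练时间也明显变长（约 25～47 秒，原来约 25～27 秒）。增加输出通道数通常会提高模型的性能，但也会增加计算复杂度。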

如果通道数过多，可能会导致过拟合，尤其是在数据量较少的情况下。

通常需要在模型复杂度和性能之间找到平衡。

In [None]:
# Experiment: replace every sigmoid activation with ReLU.
# evaluate_accuracy and train_ch5 are reused from the earlier cell that
# defines them; they were previously pasted here again unchanged.

net = nn.Sequential()
# First Conv2D: 6 output channels, 5x5 kernel, stride 1, ReLU activation.
net.add(nn.Conv2D(6, kernel_size=5, strides=1, activation='relu'),
        nn.MaxPool2D(pool_size=2, strides=2),
        nn.Conv2D(16, kernel_size=5, strides=1, activation='relu'),
        nn.MaxPool2D(pool_size=2, strides=2),
        # By default Dense reshapes (batch, channel, height, width) input
        # into (batch, channel * height * width).
        nn.Dense(120, activation='relu'),
        nn.Dense(84, activation='relu'),
        nn.Dense(10))

# Trace one random single-channel 28x28 sample through the layers and print
# each output shape.
X = nd.random.uniform(shape=(1, 1, 28, 28))
net.initialize()
for layer in net:
    X = layer(X)
    print(layer.name, 'output shape:\t', X.shape)

# The learning rate deliberately stays at the baseline 0.9 so that the
# activation function is the only change in this run.
lr, num_epochs = 0.9, 5
net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier())
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx, num_epochs)

conv8 output shape:	 (1, 6, 24, 24)
pool8 output shape:	 (1, 6, 12, 12)
conv9 output shape:	 (1, 16, 8, 8)
pool9 output shape:	 (1, 16, 4, 4)
dense12 output shape:	 (1, 120)
dense13 output shape:	 (1, 84)
dense14 output shape:	 (1, 10)
training on cpu(0)
epoch 1, loss 2.3157, train acc 0.105, test acc 0.100, time 18.2 sec
epoch 2, loss 2.3036, train acc 0.100, test acc 0.100, time 26.6 sec
epoch 3, loss 2.3035, train acc 0.101, test acc 0.100, time 26.4 sec
epoch 4, loss 2.3036, train acc 0.097, test acc 0.100, time 26.2 sec
epoch 5, loss 2.3036, train acc 0.097, test acc 0.100, time 23.4 sec


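从实验结果可以看出，将激活函数从 Sigmoid 改为 ReLU（学习率仍为 0.9）后，模型的性能显著下降：训练准确率和测试准确率都停留在 10% 左右（对于 10 类分类任务来说，这相当于随机猜测），损失停在约 2.3036，接近 ln 10 ≈ 2.3026，即模型对所有类别输出几乎相同的概率。这表明模型在训练过程中没有学到有效特征，很可能是因为 0.9 的学习率对 ReLU 网络过大，导致参数更新不稳定。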

In [25]:
# Experiment: ReLU activations combined with a much smaller learning rate.
# evaluate_accuracy and train_ch5 are reused from the earlier cell that
# defines them; they were previously pasted here again unchanged.

net = nn.Sequential()
# First Conv2D: 6 output channels, 5x5 kernel, stride 1, ReLU activation.
net.add(nn.Conv2D(6, kernel_size=5, strides=1, activation='relu'),
        nn.MaxPool2D(pool_size=2, strides=2),
        nn.Conv2D(16, kernel_size=5, strides=1, activation='relu'),
        nn.MaxPool2D(pool_size=2, strides=2),
        # By default Dense reshapes (batch, channel, height, width) input
        # into (batch, channel * height * width).
        nn.Dense(120, activation='relu'),
        nn.Dense(84, activation='relu'),
        nn.Dense(10))

# Trace one random single-channel 28x28 sample through the layers and print
# each output shape.
X = nd.random.uniform(shape=(1, 1, 28, 28))
net.initialize()
for layer in net:
    X = layer(X)
    print(layer.name, 'output shape:\t', X.shape)

# Learning rate lowered from the baseline 0.9 to 0.01; epochs unchanged.
lr, num_epochs = 0.01, 5
net.initialize(force_reinit=True, ctx=ctx, init=init.Xavier())
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx, num_epochs)

conv10 output shape:	 (1, 6, 24, 24)
pool10 output shape:	 (1, 6, 12, 12)
conv11 output shape:	 (1, 16, 8, 8)
pool11 output shape:	 (1, 16, 4, 4)
dense15 output shape:	 (1, 120)
dense16 output shape:	 (1, 84)
dense17 output shape:	 (1, 10)
training on cpu(0)
epoch 1, loss 1.6687, train acc 0.412, test acc 0.633, time 24.8 sec
epoch 2, loss 0.9429, train acc 0.650, test acc 0.695, time 29.1 sec
epoch 3, loss 0.8081, train acc 0.696, test acc 0.702, time 28.3 sec
epoch 4, loss 0.7349, train acc 0.721, test acc 0.749, time 23.8 sec
epoch 5, loss 0.6913, train acc 0.737, test acc 0.761, time 23.3 sec


尝试降低学习率，以避免梯度爆炸或权重更新不稳定。将学习率从 0.9 调整为 0.01 后，模型的训练过程趋于稳定，5 个周期后测试准确率达到 0.761；各周期的测试准确率均不低于训练准确率，没有出现过拟合现象。