# 丢弃法

方法

从零开始

In [1]:
import d2lzh as d2l
from mxnet import autograd, gluon, init, nd
from mxnet.gluon import loss as gloss, nn

def dropout(X, drop_prob):
    """Apply inverted dropout to ``X``.

    Each element of ``X`` is zeroed independently with probability
    ``drop_prob``; the surviving elements are divided by ``1 - drop_prob``
    so that the expected value of every element is unchanged.

    Args:
        X: NDArray to apply dropout to.
        drop_prob: Probability of dropping an element; must lie in [0, 1].

    Returns:
        An NDArray with the same shape as ``X``.

    Raises:
        AssertionError: If ``drop_prob`` is outside [0, 1].
    """
    assert 0 <= drop_prob <= 1
    keep_prob = 1 - drop_prob
    # Every element is dropped; return early so we never divide by zero below.
    if keep_prob == 0:
        return X.zeros_like()
    # Sample the 0/1 mask on the same device as X so the element-wise product
    # below also works when X does not live on the default context.
    # An element is kept (mask == 1) with probability keep_prob.
    mask = nd.random.uniform(0, 1, shape=X.shape, ctx=X.context) < keep_prob
    # Multiplying zeroes the dropped elements; dividing rescales the kept ones.
    return mask * X / keep_prob

In [3]:
# Build a 2x8 test matrix; with drop_prob=0 every element is kept unchanged.
X = nd.arange(16).reshape((2, 8))
dropout(X, 0)


[[ 0.  1.  2.  3.  4.  5.  6.  7.]
 [ 8.  9. 10. 11. 12. 13. 14. 15.]]
<NDArray 2x8 @cpu(0)>

In [4]:
# With drop_prob=0.5 the kept elements are scaled by 1 / 0.5 = 2; the result is random.
dropout(X, 0.5)


[[ 0.  2.  4.  6.  0.  0.  0. 14.]
 [ 0. 18.  0.  0. 24. 26. 28.  0.]]
<NDArray 2x8 @cpu(0)>

In [5]:
# With drop_prob=1 every element is dropped, so the result is all zeros.
dropout(X, 1)


[[0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]]
<NDArray 2x8 @cpu(0)>

In [6]:
# Model parameters: layer sizes of the 784 -> 256 -> 256 -> 10 network.
num_inputs, num_outputs, num_hiddens1, num_hiddens2 = 784, 10, 256, 256

# One (weight, bias) pair per layer, created in layer order: weights are drawn
# from a normal distribution with standard deviation 0.01, biases start at zero.
(w1, b1), (w2, b2), (w3, b3) = [
    (nd.random.normal(scale=0.01, shape=(fan_in, fan_out)), nd.zeros(fan_out))
    for fan_in, fan_out in [
        (num_inputs, num_hiddens1),
        (num_hiddens1, num_hiddens2),
        (num_hiddens2, num_outputs),
    ]
]

# Allocate gradient buffers so autograd can store a gradient for every parameter.
params = [w1, b1, w2, b2, w3, b3]
for param in params:
    param.attach_grad()

In [7]:
# Model definition

# Drop probabilities of the two hidden layers; the layer closer to the input
# is given the smaller value.
drop_prob1, drop_prob2 = 0.2, 0.5


def net(X):
    """Forward pass of the two-hidden-layer perceptron with dropout.

    Dropout is applied after each hidden ReLU layer only while autograd is in
    training mode; otherwise the activations pass through unchanged.

    Args:
        X: Input batch; it is reshaped to (-1, num_inputs).

    Returns:
        The output of the final linear layer, ``nd.dot(H2, w3) + b3``.
    """
    training = autograd.is_training()
    hidden = X.reshape((-1, num_inputs))
    hidden_layers = ((w1, b1, drop_prob1), (w2, b2, drop_prob2))
    for weight, bias, drop_prob in hidden_layers:
        hidden = (nd.dot(hidden, weight) + bias).relu()
        if training:
            # Randomly drop part of the activations during training only.
            hidden = dropout(hidden, drop_prob)
    return nd.dot(hidden, w3) + b3

In [9]:
# Train and evaluate the from-scratch model.

num_epochs = 10
lr = 0.5
batch_size = 256

loss = gloss.SoftmaxCrossEntropyLoss()
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, params, lr)

epoch 1, loss 1.1942, train acc 0.532, test acc 0.774
epoch 2, loss 0.5952, train acc 0.779, test acc 0.829
epoch 3, loss 0.4972, train acc 0.821, test acc 0.852
epoch 4, loss 0.4540, train acc 0.835, test acc 0.853
epoch 5, loss 0.4233, train acc 0.846, test acc 0.859
epoch 6, loss 0.4015, train acc 0.855, test acc 0.872
epoch 7, loss 0.3896, train acc 0.858, test acc 0.870
epoch 8, loss 0.3704, train acc 0.865, test acc 0.878
epoch 9, loss 0.3617, train acc 0.867, test acc 0.875
epoch 10, loss 0.3511, train acc 0.871, test acc 0.877


简洁实现

In [10]:
# Gluon version of the same architecture: a Dropout layer follows each
# hidden Dense layer.
net = nn.Sequential()
net.add(
    nn.Dense(256, activation='relu'),
    nn.Dropout(drop_prob1),  # dropout after the first fully connected layer
    nn.Dense(256, activation='relu'),
    nn.Dropout(drop_prob2),  # dropout after the second fully connected layer
    nn.Dense(10),
)
net.initialize(init.Normal(sigma=0.01))

In [11]:
# Train the Gluon model with plain SGD; the optimizer is handed to train_ch3
# through `trainer`, so the `params` and `lr` slots are left as None.
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
d2l.train_ch3(
    net, train_iter, test_iter, loss, num_epochs, batch_size, None, None, trainer
)

epoch 1, loss 1.1899, train acc 0.540, test acc 0.748
epoch 2, loss 0.5856, train acc 0.781, test acc 0.818
epoch 3, loss 0.5252, train acc 0.811, test acc 0.852
epoch 4, loss 0.4528, train acc 0.835, test acc 0.857
epoch 5, loss 0.4252, train acc 0.845, test acc 0.849
epoch 6, loss 0.4005, train acc 0.854, test acc 0.869
epoch 7, loss 0.3860, train acc 0.859, test acc 0.872
epoch 8, loss 0.3738, train acc 0.865, test acc 0.865
epoch 9, loss 0.3624, train acc 0.868, test acc 0.873
epoch 10, loss 0.3522, train acc 0.872, test acc 0.876


* 可以使用丢弃法应对过拟合，丢弃法只在训练模型时使用，在测试模型时不会使用丢弃法。

In [12]:
# Swap the two drop probabilities: the dropout layer nearer the input now uses
# drop_prob2 and the one nearer the output uses drop_prob1.

net = nn.Sequential()
net.add(
    nn.Dense(256, activation='relu'),
    nn.Dropout(drop_prob2),  # dropout after the first fully connected layer
    nn.Dense(256, activation='relu'),
    nn.Dropout(drop_prob1),  # dropout after the second fully connected layer
    nn.Dense(10),
)
net.initialize(init.Normal(sigma=0.01))

trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
d2l.train_ch3(
    net, train_iter, test_iter, loss, num_epochs, batch_size, None, None, trainer
)

epoch 1, loss 1.2351, train acc 0.522, test acc 0.733
epoch 2, loss 0.6200, train acc 0.766, test acc 0.828
epoch 3, loss 0.5253, train acc 0.807, test acc 0.852
epoch 4, loss 0.4758, train acc 0.826, test acc 0.851
epoch 5, loss 0.4491, train acc 0.835, test acc 0.853
epoch 6, loss 0.4348, train acc 0.840, test acc 0.860
epoch 7, loss 0.4184, train acc 0.847, test acc 0.866
epoch 8, loss 0.4038, train acc 0.851, test acc 0.864
epoch 9, loss 0.3983, train acc 0.852, test acc 0.865
epoch 10, loss 0.3865, train acc 0.859, test acc 0.867


In [13]:
# Increase the number of epochs: keep training the current `net` (it is not
# re-initialized in this cell) for 20 more epochs with the existing `trainer`.
# NOTE: the log below does not show heavier overfitting. At the last complete
# epoch shown (18) the train acc is 0.884 and the test acc is 0.888, i.e. test
# accuracy is not below training accuracy. No dropout-free baseline is trained
# in this cell, so these numbers alone cannot compare "with dropout" against
# "without dropout".
num_epochs = 20
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, None, None, trainer)

epoch 1, loss 0.3781, train acc 0.860, test acc 0.874
epoch 2, loss 0.3732, train acc 0.862, test acc 0.876
epoch 3, loss 0.3680, train acc 0.864, test acc 0.872
epoch 4, loss 0.3607, train acc 0.867, test acc 0.871
epoch 5, loss 0.3580, train acc 0.867, test acc 0.875
epoch 6, loss 0.3500, train acc 0.871, test acc 0.878
epoch 7, loss 0.3476, train acc 0.871, test acc 0.881
epoch 8, loss 0.3417, train acc 0.872, test acc 0.883
epoch 9, loss 0.3393, train acc 0.874, test acc 0.883
epoch 10, loss 0.3376, train acc 0.876, test acc 0.883
epoch 11, loss 0.3326, train acc 0.877, test acc 0.880
epoch 12, loss 0.3295, train acc 0.877, test acc 0.883
epoch 13, loss 0.3268, train acc 0.878, test acc 0.884
epoch 14, loss 0.3219, train acc 0.880, test acc 0.887
epoch 15, loss 0.3200, train acc 0.880, test acc 0.883
epoch 16, loss 0.3140, train acc 0.883, test acc 0.888
epoch 17, loss 0.3146, train acc 0.883, test acc 0.886
epoch 18, loss 0.3111, train acc 0.884, test acc 0.888
epoch 19, loss 0.31

In [14]:
# Larger drop probability: the second dropout layer now drops with probability 0.7.

net = nn.Sequential()
net.add(
    nn.Dense(256, activation='relu'),
    nn.Dropout(0.2),  # dropout after the first fully connected layer
    nn.Dense(256, activation='relu'),
    nn.Dropout(0.7),  # dropout after the second fully connected layer
    nn.Dense(10),
)
net.initialize(init.Normal(sigma=0.01))

trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
# Train for 10 epochs (passed as a literal, independent of `num_epochs`).
d2l.train_ch3(net, train_iter, test_iter, loss, 10, batch_size, None, None, trainer)

epoch 1, loss 1.1309, train acc 0.561, test acc 0.778
epoch 2, loss 0.6095, train acc 0.775, test acc 0.814
epoch 3, loss 0.5072, train acc 0.818, test acc 0.849
epoch 4, loss 0.4646, train acc 0.834, test acc 0.858
epoch 5, loss 0.4437, train acc 0.841, test acc 0.860
epoch 6, loss 0.4179, train acc 0.850, test acc 0.868
epoch 7, loss 0.4018, train acc 0.856, test acc 0.861
epoch 8, loss 0.3809, train acc 0.862, test acc 0.864
epoch 9, loss 0.3713, train acc 0.866, test acc 0.876
epoch 10, loss 0.3629, train acc 0.869, test acc 0.878


In [25]:
# Combine dropout with weight decay.

from mxnet import autograd, gluon, init, nd
from mxnet.gluon import nn, data as gdata, loss as gloss
# Keep the `d2l` alias bound to d2lzh, exactly as in the first cell. A bare
# `import d2l` would rebind the alias that the other cells rely on for
# `d2l.train_ch3` / `d2l.load_data_fashion_mnist`.
import d2lzh as d2l

# Hyperparameters
num_epochs = 10
batch_size = 256
lr = 0.5
drop_prob1 = 0.2  # drop probability of the 1st dropout layer
drop_prob2 = 0.7  # drop probability of the 2nd dropout layer
# NOTE(review): with lr = 0.5 the product lr * wd is 1.5, so the decay term
# alone rescales every weight by roughly (1 - 1.5) = -0.5 on each step. This
# looks far too strong for this learning rate; consider a much smaller value.
weight_decay = 3

# `train_iter` and `loss` are reused from the earlier training cell, which
# built them with the same batch size (256).

# Model: each dropout layer uses the probability its name and comment refer to
# (the smaller one next to the input, the larger one next to the output).
net = nn.Sequential()
net.add(
    nn.Dense(256, activation='relu'),
    nn.Dropout(drop_prob1),  # 1st dropout layer
    nn.Dense(256, activation='relu'),
    nn.Dropout(drop_prob2),  # 2nd dropout layer
    nn.Dense(10)
)
net.initialize(init.Normal(sigma=0.01))

# Optimizer: SGD with weight decay.
trainer = gluon.Trainer(
    net.collect_params(), 'sgd', {'learning_rate': lr, 'wd': weight_decay}
)

# Training loop
for epoch in range(num_epochs):
    loss_sum, num_examples = 0.0, 0
    for X, y in train_iter:
        with autograd.record():
            l = loss(net(X), y)  # per-example losses for this mini-batch
        l.backward()
        # Kept from the original cell: parameters with stale gradients are
        # skipped instead of raising.
        trainer.step(batch_size, ignore_stale_grad=True)
        loss_sum += l.sum().asscalar()
        num_examples += y.size
    # Report the average loss over the whole epoch rather than the loss of
    # whichever mini-batch happened to come last.
    print(f'Epoch {epoch + 1}, Loss: {loss_sum / num_examples}')

KeyboardInterrupt: 

In [21]:
# Wider hidden layers: both hidden Dense layers have 512 units instead of 256.

net = nn.Sequential()
net.add(
    nn.Dense(512, activation='relu'),
    nn.Dropout(drop_prob1),  # dropout after the first fully connected layer
    nn.Dense(512, activation='relu'),
    nn.Dropout(0.7),  # dropout after the second fully connected layer
    nn.Dense(10),
)
net.initialize(init.Normal(sigma=0.01))

trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr})
# Train for 10 epochs (passed as a literal, independent of `num_epochs`).
d2l.train_ch3(net, train_iter, test_iter, loss, 10, batch_size, None, None, trainer)

epoch 1, loss 1.0519, train acc 0.597, test acc 0.791
epoch 2, loss 0.5616, train acc 0.791, test acc 0.835
epoch 3, loss 0.4764, train acc 0.826, test acc 0.852
epoch 4, loss 0.4424, train acc 0.838, test acc 0.865
epoch 5, loss 0.4160, train acc 0.849, test acc 0.869
epoch 6, loss 0.3920, train acc 0.857, test acc 0.864
epoch 7, loss 0.3755, train acc 0.863, test acc 0.877
epoch 8, loss 0.3634, train acc 0.867, test acc 0.872
epoch 9, loss 0.3485, train acc 0.873, test acc 0.878
epoch 10, loss 0.3420, train acc 0.874, test acc 0.874
