In [1]:
import d2lzh as d2l
from mxnet import autograd, gluon, init, nd
from mxnet.gluon import loss as gloss, nn

def dropout(X, drop_prob):
    """Apply inverted dropout to ``X``.

    Every element of ``X`` is zeroed independently with probability
    ``drop_prob``; the elements that survive are divided by
    ``1 - drop_prob``.

    Args:
        X: NDArray to apply dropout to.
        drop_prob: Probability of dropping an element; must lie in [0, 1].

    Returns:
        A new NDArray with the same shape as ``X``.

    Raises:
        AssertionError: If ``drop_prob`` is outside [0, 1].
    """
    assert 0 <= drop_prob <= 1
    keep_prob = 1 - drop_prob
    # drop_prob == 1: every element is dropped. Returning early also avoids
    # the division by zero that the rescaling below would perform.
    if keep_prob == 0:
        return X.zeros_like()
    # Draw one uniform sample in [0, 1) per element and compare it with
    # keep_prob to get a 0/1 mask shaped like X. The samples are allocated on
    # X's own context so the multiplication below also works when X is not on
    # the default (CPU) context.
    mask = nd.random.uniform(0, 1, X.shape, ctx=X.context) < keep_prob
    # Zero the dropped elements and rescale the kept ones by 1 / keep_prob.
    return mask * X / keep_prob

In [2]:
# Build a 2x8 array holding the values 0..15 and run it through dropout with
# drop_prob=0; the displayed result should equal X (nothing is dropped).
X = nd.arange(16).reshape((2, 8))
dropout(X, 0)


[[ 0.  1.  2.  3.  4.  5.  6.  7.]
 [ 8.  9. 10. 11. 12. 13. 14. 15.]]
<NDArray 2x8 @cpu(0)>

In [3]:
# drop_prob=0.5: a random subset of elements is zeroed and the survivors are
# divided by keep_prob=0.5 (i.e. doubled). The result differs between runs.
dropout(X, 0.5)


[[ 0.  2.  4.  6.  0.  0.  0. 14.]
 [ 0. 18.  0.  0. 24. 26. 28.  0.]]
<NDArray 2x8 @cpu(0)>

In [4]:
# drop_prob=1: dropout takes its early-return branch and yields all zeros.
dropout(X, 1)


[[0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]]
<NDArray 2x8 @cpu(0)>

In [7]:
# Scratch check of how the dropout mask is built: comparing uniform samples
# with a threshold yields an array of 0s and 1s.
# NOTE(review): X is re-created here but the expression below does not use it.
X = nd.arange(16).reshape((2, 8))
nd.random.uniform(0, 1, (2,2)) < 0.5


[[0. 0.]
 [1. 0.]]
<NDArray 2x2 @cpu(0)>

### 定义模型参数

In [8]:
# Layer widths: input features, two hidden layers, output classes.
num_inputs = 784
num_outputs = 10
num_hiddens1 = 256
num_hiddens2 = 256

# Weights are normal samples with scale 0.01; biases start at zero.
# The weights are drawn in the order W1, W2, W3.
W1 = nd.random.normal(shape=(num_inputs, num_hiddens1), scale=0.01)
b1 = nd.zeros((num_hiddens1,))
W2 = nd.random.normal(shape=(num_hiddens1, num_hiddens2), scale=0.01)
b2 = nd.zeros((num_hiddens2,))
W3 = nd.random.normal(shape=(num_hiddens2, num_outputs), scale=0.01)
b3 = nd.zeros((num_outputs,))

# Allocate gradient buffers for every trainable array.
params = [W1, b1, W2, b2, W3, b3]
for param in params:
    param.attach_grad()

### 定义模型

In [9]:
# The usual advice is to use a smaller drop probability for the layers that
# are closer to the input.
drop_prob1 = 0.2
drop_prob2 = 0.5

def net(X):
    """Forward pass of the two-hidden-layer MLP with dropout.

    ``X`` is reshaped to ``(-1, num_inputs)``, passed through two ReLU
    layers (``W1``/``b1`` and ``W2``/``b2``) and a final affine layer
    (``W3``/``b3``). Dropout with ``drop_prob1`` / ``drop_prob2`` follows
    the first / second hidden layer, but only while
    ``autograd.is_training()`` is true.

    Returns:
        The output of the last affine layer (no softmax is applied here).
    """
    # Dropout is used only while training, never at evaluation time.
    training = autograd.is_training()

    flat = X.reshape((-1, num_inputs))

    hidden1 = nd.relu(nd.dot(flat, W1) + b1)
    if training:
        # Dropout layer after the first fully connected layer.
        hidden1 = dropout(hidden1, drop_prob1)

    hidden2 = nd.relu(nd.dot(hidden1, W2) + b2)
    if training:
        # Dropout layer after the second fully connected layer.
        hidden2 = dropout(hidden2, drop_prob2)

    return nd.dot(hidden2, W3) + b3

### 训练和测试模型

In [10]:
# Training hyperparameters.
num_epochs = 5
lr = 0.5
batch_size = 256

loss = gloss.SoftmaxCrossEntropyLoss()

# Fashion-MNIST iterators yielding batches of batch_size examples.
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

# Train the from-scratch model; params and lr are handed to the d2l helper.
d2l.train_ch3(
    net, train_iter, test_iter, loss, num_epochs, batch_size, params, lr
)

epoch 1, loss 1.2214, train acc 0.528, test acc 0.768
epoch 2, loss 0.6021, train acc 0.776, test acc 0.834
epoch 3, loss 0.5061, train acc 0.814, test acc 0.850
epoch 4, loss 0.4535, train acc 0.835, test acc 0.845
epoch 5, loss 0.4280, train acc 0.843, test acc 0.864


## 简洁实现

In [11]:
# Gluon version of the same architecture, built one layer at a time.
net = nn.Sequential()
net.add(nn.Dense(256, activation="relu"))
net.add(nn.Dropout(0.2))  # dropout after the first fully connected layer
net.add(nn.Dense(256, activation="relu"))
net.add(nn.Dropout(0.5))  # dropout after the second fully connected layer
net.add(nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))

In [12]:
# Plain SGD over all of the network's parameters, using the same lr as before.
trainer = gluon.Trainer(
    net.collect_params(),
    optimizer='sgd',
    optimizer_params={'learning_rate': lr},
)
# The two None arguments occupy the positions where the from-scratch run
# passed params and lr; here the trainer is supplied instead.
d2l.train_ch3(
    net, train_iter, test_iter, loss, num_epochs, batch_size, None, None,
    trainer
)

epoch 1, loss 1.2017, train acc 0.536, test acc 0.762
epoch 2, loss 0.5930, train acc 0.780, test acc 0.835
epoch 3, loss 0.5007, train acc 0.817, test acc 0.849
epoch 4, loss 0.4581, train acc 0.833, test acc 0.860
epoch 5, loss 0.4249, train acc 0.846, test acc 0.868


## 小结

* 我们可以通过使用丢弃法应对过拟合。
* 丢弃法只在训练模型时使用。

## 练习

* 如果把本节中的两个丢弃概率超参数对调，会有什么结果？
* 增大迭代周期数，比较使用丢弃法与不使用丢弃法的结果。
* 如果将模型改得更加复杂，如增加隐藏层单元，使用丢弃法应对过拟合的效果是否更加明显？
* 以本节中的模型为例，比较使用丢弃法与权重衰减的效果。如果同时使用丢弃法和权重衰减，效果会如何？

In [13]:
# Exercise: swap the two drop probabilities (0.5 after the first hidden
# layer, 0.2 after the second) and retrain with the same settings.
net = nn.Sequential()
net.add(nn.Dense(256, activation="relu"))
net.add(nn.Dropout(0.5))  # dropout after the first fully connected layer
net.add(nn.Dense(256, activation="relu"))
net.add(nn.Dropout(0.2))  # dropout after the second fully connected layer
net.add(nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))

trainer = gluon.Trainer(
    net.collect_params(),
    optimizer='sgd',
    optimizer_params={'learning_rate': lr},
)
d2l.train_ch3(
    net, train_iter, test_iter, loss, num_epochs, batch_size, None, None,
    trainer
)

epoch 1, loss 1.1552, train acc 0.546, test acc 0.732
epoch 2, loss 0.6035, train acc 0.774, test acc 0.826
epoch 3, loss 0.5197, train acc 0.808, test acc 0.846
epoch 4, loss 0.4755, train acc 0.824, test acc 0.857
epoch 5, loss 0.4478, train acc 0.834, test acc 0.863


In [14]:
# Exercise: baseline without any dropout layers, trained for 30 epochs.
net = nn.Sequential()
net.add(nn.Dense(256, activation="relu"))
net.add(nn.Dense(256, activation="relu"))
net.add(nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))

trainer = gluon.Trainer(
    net.collect_params(),
    optimizer='sgd',
    optimizer_params={'learning_rate': lr},
)
d2l.train_ch3(
    net, train_iter, test_iter, loss, 30, batch_size, None, None, trainer
)

epoch 1, loss 1.1848, train acc 0.539, test acc 0.782
epoch 2, loss 0.5565, train acc 0.790, test acc 0.839
epoch 3, loss 0.4637, train acc 0.830, test acc 0.843
epoch 4, loss 0.4151, train acc 0.846, test acc 0.858
epoch 5, loss 0.3848, train acc 0.857, test acc 0.843
epoch 6, loss 0.3702, train acc 0.863, test acc 0.867
epoch 7, loss 0.3475, train acc 0.871, test acc 0.879
epoch 8, loss 0.3332, train acc 0.876, test acc 0.880
epoch 9, loss 0.3277, train acc 0.879, test acc 0.876
epoch 10, loss 0.3154, train acc 0.882, test acc 0.877
epoch 11, loss 0.3009, train acc 0.887, test acc 0.885
epoch 12, loss 0.2922, train acc 0.890, test acc 0.878
epoch 13, loss 0.2830, train acc 0.893, test acc 0.887
epoch 14, loss 0.2783, train acc 0.897, test acc 0.885
epoch 15, loss 0.2729, train acc 0.897, test acc 0.882
epoch 16, loss 0.2680, train acc 0.899, test acc 0.884
epoch 17, loss 0.2564, train acc 0.904, test acc 0.889
epoch 18, loss 0.2523, train acc 0.905, test acc 0.883
epoch 19, loss 0.24

In [15]:
# Exercise: the dropout model (0.2 / 0.5) trained for 30 epochs, for
# comparison with the no-dropout baseline.
net = nn.Sequential()
net.add(nn.Dense(256, activation="relu"))
net.add(nn.Dropout(0.2))  # dropout after the first fully connected layer
net.add(nn.Dense(256, activation="relu"))
net.add(nn.Dropout(0.5))  # dropout after the second fully connected layer
net.add(nn.Dense(10))
net.initialize(init.Normal(sigma=0.01))

trainer = gluon.Trainer(
    net.collect_params(),
    optimizer='sgd',
    optimizer_params={'learning_rate': lr},
)
d2l.train_ch3(
    net, train_iter, test_iter, loss, 30, batch_size, None, None, trainer
)

epoch 1, loss 1.1040, train acc 0.572, test acc 0.785
epoch 2, loss 0.5757, train acc 0.788, test acc 0.830
epoch 3, loss 0.4928, train acc 0.821, test acc 0.843
epoch 4, loss 0.4389, train acc 0.840, test acc 0.859
epoch 5, loss 0.4141, train acc 0.850, test acc 0.855
epoch 6, loss 0.3964, train acc 0.856, test acc 0.869
epoch 7, loss 0.3773, train acc 0.863, test acc 0.870
epoch 8, loss 0.3647, train acc 0.867, test acc 0.874
epoch 9, loss 0.3546, train acc 0.871, test acc 0.874
epoch 10, loss 0.3451, train acc 0.874, test acc 0.881
epoch 11, loss 0.3341, train acc 0.876, test acc 0.883
epoch 12, loss 0.3285, train acc 0.878, test acc 0.884
epoch 13, loss 0.3189, train acc 0.882, test acc 0.887
epoch 14, loss 0.3114, train acc 0.885, test acc 0.885
epoch 15, loss 0.3032, train acc 0.887, test acc 0.884
epoch 16, loss 0.3026, train acc 0.889, test acc 0.889
epoch 17, loss 0.2957, train acc 0.890, test acc 0.887
epoch 18, loss 0.2902, train acc 0.892, test acc 0.887
epoch 19, loss 0.28