### 丢弃法

设丢弃概率为 $p$，那么 $h_i$ 有 $p$ 的概率会被清零，有 $1 - p$ 的概率会除以 $1 - p$ 做拉伸。

In [1]:
import d2lzh as d2l
from mxnet import autograd, gluon, init, nd
from mxnet.gluon import loss as gloss, nn

$$h_i^{'} = \frac{\xi_i}{1 - p} h_i$$
其中随机变量 $\xi_i$ 为 0 和 1 的概率分别为 $p$ 和 $1 - p$。

In [2]:
def dropout(X, drop_prob):
    """Randomly zero elements of ``X`` and rescale the survivors.

    Each element is zeroed with probability ``drop_prob``; every element
    that is kept is divided by ``1 - drop_prob``.

    Args:
        X: NDArray to apply dropout to.
        drop_prob: Probability of zeroing an element; must lie in [0, 1].

    Returns:
        An NDArray with the same shape as ``X``.

    Raises:
        AssertionError: If ``drop_prob`` is outside [0, 1].
    """
    assert 0 <= drop_prob <= 1, 'drop_prob must be in [0, 1]'
    keep_prob = 1 - drop_prob
    if keep_prob == 0:
        # Everything is dropped; return early so we never divide by zero.
        return X.zeros_like()
    # mask is 1 where the uniform sample is below keep_prob (element kept)
    # and 0 otherwise (element dropped). The sample is drawn on X's own
    # context so the element-wise product also works when X does not live
    # on the default device.
    mask = nd.random.uniform(0, 1, X.shape, ctx=X.context) < keep_prob
    return mask * X / keep_prob

In [3]:
# Build a 2x8 test array holding the values 0..15.
X = nd.arange(16).reshape((2, 8))
# With drop_prob = 0 the keep probability is 1, so X comes back unchanged.
dropout(X, 0)


[[ 0.  1.  2.  3.  4.  5.  6.  7.]
 [ 8.  9. 10. 11. 12. 13. 14. 15.]]
<NDArray 2x8 @cpu(0)>

In [4]:
# With drop_prob = 0.5 each element is zeroed with probability 0.5 and the
# surviving elements are divided by 0.5 (i.e. doubled).
dropout(X, 0.5)


[[ 0.  2.  4.  6.  0.  0.  0. 14.]
 [ 0. 18.  0.  0. 24. 26. 28.  0.]]
<NDArray 2x8 @cpu(0)>

In [5]:
# With drop_prob = 1 the early-return branch yields an all-zero array.
dropout(X, 1)


[[0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]]
<NDArray 2x8 @cpu(0)>

In [6]:
# Layer widths of the multilayer perceptron: input, two hidden layers, output.
num_inputs = 784
num_hiddens1 = 256
num_hiddens2 = 256
num_outputs = 10

# For every layer create a weight matrix sampled from a normal distribution
# (scale 0.01) followed by a zero bias vector, in layer order.
params = []
for fan_in, fan_out in ((num_inputs, num_hiddens1),
                        (num_hiddens1, num_hiddens2),
                        (num_hiddens2, num_outputs)):
    params.append(nd.random.normal(scale=0.01, shape=(fan_in, fan_out)))
    params.append(nd.zeros(shape=fan_out))
w1, b1, w2, b2, w3, b3 = params

# Attach gradient storage to every parameter so autograd can fill it in.
for param in params:
    param.attach_grad()

In [7]:
# Dropout probabilities for the first and second hidden layer respectively.
drop_prob1, drop_prob2 = 0.2, 0.5


def net(X):
    """Forward pass of the from-scratch MLP with two hidden layers.

    The input is reshaped to rows of ``num_inputs`` values. Each hidden
    layer is a dense transform followed by ReLU; ``dropout`` is applied to
    the hidden activations only while ``autograd.is_training()`` is true.

    Args:
        X: Input NDArray; reshaped to ``(-1, num_inputs)``.

    Returns:
        The output-layer result ``nd.dot(H2, w3) + b3``.
    """
    # Dropout is only used during training, never at evaluation time.
    use_dropout = autograd.is_training()
    hidden = X.reshape(-1, num_inputs)
    for weight, bias, drop_prob in ((w1, b1, drop_prob1),
                                    (w2, b2, drop_prob2)):
        hidden = (nd.dot(hidden, weight) + bias).relu()
        if use_dropout:
            hidden = dropout(hidden, drop_prob)
    return nd.dot(hidden, w3) + b3

In [8]:
# Hyperparameters for training the from-scratch model.
num_epochs = 5
lr = 0.5
batch_size = 256

loss = gloss.SoftmaxCrossEntropyLoss()
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
# The raw parameter list and learning rate are passed here (no Trainer).
d2l.train_ch3(
    net, train_iter, test_iter, loss, num_epochs, batch_size, params, lr)

epoch 1, loss 1.2301, train acc 0.524, test acc 0.762
epoch 2, loss 0.6027, train acc 0.775, test acc 0.812
epoch 3, loss 0.5059, train acc 0.814, test acc 0.843
epoch 4, loss 0.4584, train acc 0.833, test acc 0.860
epoch 5, loss 0.4290, train acc 0.843, test acc 0.866


In [9]:
# Gluon version of the MLP: two ReLU hidden layers of width 256, each followed
# by a Dropout layer, then a 10-unit output layer.
# NOTE(review): drop_prob2 is applied after the first hidden layer and
# drop_prob1 after the second, the reverse of the from-scratch net -- confirm
# that this swap is intentional.
net = nn.Sequential()
for layer in (
        nn.Dense(256, activation='relu'),
        nn.Dropout(drop_prob2),
        nn.Dense(256, activation='relu'),
        nn.Dropout(drop_prob1),
        nn.Dense(10),
):
    net.add(layer)
net.initialize(init.Normal(sigma=0.01))

In [10]:
# Baseline with the same Dense layers as the Gluon net but no Dropout layers.
net_without_dropout = nn.Sequential()
for layer in (
        nn.Dense(256, activation='relu'),
        nn.Dense(256, activation='relu'),
        nn.Dense(10),
):
    net_without_dropout.add(layer)
net_without_dropout.initialize(init.Normal(sigma=0.01))

In [11]:
# Train the Gluon models below for 15 epochs (replaces the earlier value of 5).
num_epochs = 15

In [13]:
# Train the baseline network (no dropout) with SGD through a Gluon Trainer;
# the params/lr arguments are None because the trainer is supplied instead.
trainer = gluon.Trainer(
    net_without_dropout.collect_params(),
    'sgd',
    {'learning_rate': lr},
)
d2l.train_ch3(
    net_without_dropout, train_iter, test_iter, loss, num_epochs, batch_size,
    None, None, trainer)

epoch 1, loss 1.1916, train acc 0.537, test acc 0.574
epoch 2, loss 0.5931, train acc 0.778, test acc 0.832
epoch 3, loss 0.4646, train acc 0.827, test acc 0.852
epoch 4, loss 0.4201, train acc 0.843, test acc 0.862
epoch 5, loss 0.3877, train acc 0.857, test acc 0.859
epoch 6, loss 0.3651, train acc 0.865, test acc 0.870
epoch 7, loss 0.3495, train acc 0.871, test acc 0.872
epoch 8, loss 0.3334, train acc 0.875, test acc 0.869
epoch 9, loss 0.3217, train acc 0.879, test acc 0.882
epoch 10, loss 0.3116, train acc 0.883, test acc 0.881
epoch 11, loss 0.3045, train acc 0.887, test acc 0.882
epoch 12, loss 0.2932, train acc 0.891, test acc 0.888
epoch 13, loss 0.2852, train acc 0.893, test acc 0.887
epoch 14, loss 0.2789, train acc 0.895, test acc 0.881
epoch 15, loss 0.2780, train acc 0.897, test acc 0.882


In [14]:
# Train the Gluon network that contains Dropout layers with the same SGD
# settings; the params/lr arguments are None because the trainer is supplied.
trainer = gluon.Trainer(
    net.collect_params(),
    'sgd',
    {'learning_rate': lr},
)
d2l.train_ch3(
    net, train_iter, test_iter, loss, num_epochs, batch_size,
    None, None, trainer)

epoch 1, loss 1.1319, train acc 0.557, test acc 0.764
epoch 2, loss 0.5959, train acc 0.774, test acc 0.829
epoch 3, loss 0.5090, train acc 0.810, test acc 0.846
epoch 4, loss 0.4621, train acc 0.830, test acc 0.839
epoch 5, loss 0.4418, train acc 0.836, test acc 0.860
epoch 6, loss 0.4212, train acc 0.845, test acc 0.866
epoch 7, loss 0.4090, train acc 0.850, test acc 0.866
epoch 8, loss 0.3955, train acc 0.854, test acc 0.871
epoch 9, loss 0.3834, train acc 0.859, test acc 0.869
epoch 10, loss 0.3765, train acc 0.861, test acc 0.873
epoch 11, loss 0.3651, train acc 0.865, test acc 0.879
epoch 12, loss 0.3618, train acc 0.866, test acc 0.877
epoch 13, loss 0.3532, train acc 0.869, test acc 0.880
epoch 14, loss 0.3484, train acc 0.870, test acc 0.882
epoch 15, loss 0.3430, train acc 0.872, test acc 0.883
