In [1]:
%matplotlib inline
import torch
import torch.nn as nn
import numpy as np
import sys
# Extend the module search path with the parent directory before importing
# d2lzh_pytorch (presumably the package lives one level up — confirm against
# the repository layout).
sys.path.append("..") 
import d2lzh_pytorch as d2l

# Print the installed PyTorch version so the recorded outputs can be tied to it.
print(torch.__version__)

1.8.1


In [2]:
def dropout(X, drop_prob):
    """Apply inverted dropout to ``X``.

    Each element is zeroed independently with probability ``drop_prob``; the
    surviving elements are divided by ``1 - drop_prob`` so that the expected
    value of every element stays the same.

    Args:
        X: Input tensor of any shape. It is cast to float32 before masking.
        drop_prob: Probability of dropping an element, in the closed range [0, 1].

    Returns:
        A float tensor with the same shape (and device) as ``X``.

    Raises:
        AssertionError: If ``drop_prob`` is outside [0, 1].
    """
    X = X.float()
    assert 0 <= drop_prob <= 1
    keep_prob = 1 - drop_prob
    # Every element is dropped in this case; returning early also avoids the
    # division by zero that the rescaling below would otherwise perform.
    if keep_prob == 0:
        return torch.zeros_like(X)
    # torch.rand draws from the uniform distribution on [0, 1). The mask is
    # sampled on X's own device so the function also works when X is not a
    # CPU tensor (a CPU mask multiplied with a CUDA tensor raises an error).
    mask = (torch.rand(X.shape, device=X.device) < keep_prob).float()

    # Dropped elements become 0; kept elements are scaled up by 1 / keep_prob.
    return mask * X / keep_prob


In [3]:
# Sanity-check dropout on a small 2x8 integer tensor: drop nothing, then drop
# each element with probability 0.5, and compare the sums with the original.
X = torch.arange(16).view(2, 8)
X_drop_0, X_drop_05 = dropout(X, 0), dropout(X, 0.5)

print('origin X\n', X.sum())

for label, dropped in (('dropout 0\n', X_drop_0), ('dropout 0.5\n', X_drop_05)):
    print(label, dropped, dropped.sum())

origin X
 tensor(120)
dropout 0
 tensor([[ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11., 12., 13., 14., 15.]]) tensor(120.)
dropout 0.5
 tensor([[ 0.,  2.,  4.,  0.,  0., 10.,  0.,  0.],
        [16.,  0.,  0., 22.,  0.,  0., 28.,  0.]]) tensor(82.)


In [4]:
# Layer widths of the multilayer perceptron.
num_inputs, num_hiddens1, num_hiddens2, num_outputs = 784, 256, 256, 10
# Drop probabilities used after the first and the second hidden layer.
drop_prob1 = 0.2
drop_prob2 = 0.5
# The learning rate is deliberately set very large here; the reason is the
# same as in section 3.9.6.
num_epochs = 10
lr = 100.0
batch_size = 256
loss = torch.nn.CrossEntropyLoss()
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

# Weights are drawn from a normal distribution with mean 0 and std 0.01, in
# the order W1, W2, W3; biases start at zero. All are leaf tensors that
# track gradients.
layer_shapes = [
    (num_inputs, num_hiddens1),
    (num_hiddens1, num_hiddens2),
    (num_hiddens2, num_outputs),
]
W1, W2, W3 = (
    torch.tensor(np.random.normal(0, 0.01, size=shape), dtype=torch.float, requires_grad=True)
    for shape in layer_shapes
)
b1, b2, b3 = (
    torch.zeros(width, requires_grad=True)
    for width in (num_hiddens1, num_hiddens2, num_outputs)
)

params = [W1, b1, W2, b2, W3, b3]

# Independent leaf copies holding the same initial values, so the network
# without dropout starts from exactly the same point as the one with dropout.
params2 = [p.clone().detach().requires_grad_(True) for p in params]
W1_, b1_, W2_, b2_, W3_, b3_ = params2

# Print both first-layer weight tensors to confirm the copies match.
print(W1)
print(W1_)

len(mnist_train) 60000
len(mnist_test) 10000
tensor([[-0.0074,  0.0095, -0.0035,  ...,  0.0124,  0.0058, -0.0042],
        [ 0.0073, -0.0075, -0.0042,  ...,  0.0023,  0.0010,  0.0002],
        [ 0.0086, -0.0140, -0.0075,  ..., -0.0030,  0.0179, -0.0003],
        ...,
        [ 0.0071,  0.0027,  0.0003,  ..., -0.0006, -0.0082, -0.0028],
        [ 0.0083,  0.0016, -0.0102,  ..., -0.0283, -0.0167, -0.0107],
        [-0.0171,  0.0008,  0.0097,  ...,  0.0016,  0.0099,  0.0042]],
       requires_grad=True)
tensor([[-0.0074,  0.0095, -0.0035,  ...,  0.0124,  0.0058, -0.0042],
        [ 0.0073, -0.0075, -0.0042,  ...,  0.0023,  0.0010,  0.0002],
        [ 0.0086, -0.0140, -0.0075,  ..., -0.0030,  0.0179, -0.0003],
        ...,
        [ 0.0071,  0.0027,  0.0003,  ..., -0.0006, -0.0082, -0.0028],
        [ 0.0083,  0.0016, -0.0102,  ..., -0.0283, -0.0167, -0.0107],
        [-0.0171,  0.0008,  0.0097,  ...,  0.0016,  0.0099,  0.0042]],
       requires_grad=True)


In [5]:

def net(X, is_training=True):
    """Forward pass of the three-layer perceptron with dropout.

    Uses the module-level parameters ``W1, b1, W2, b2, W3, b3``. Dropout is
    applied after each hidden ReLU layer, and only when ``is_training`` is true.

    Args:
        X: Input batch; it is reshaped to ``(-1, num_inputs)``.
        is_training: Whether to apply dropout to the hidden activations.

    Returns:
        The output-layer values (no softmax applied).
    """
    flat = X.view(-1, num_inputs)  # reshape the input to (batch_size, num_inputs)

    hidden1 = torch.relu(flat @ W1 + b1)
    if is_training:  # dropout is used only while training the model
        hidden1 = dropout(hidden1, drop_prob1)  # dropout after the first fully connected layer

    hidden2 = torch.relu(hidden1 @ W2 + b2)
    if is_training:
        hidden2 = dropout(hidden2, drop_prob2)  # dropout after the second fully connected layer

    return hidden2 @ W3 + b3

def net_without_dropout(X, is_training=True):
    """Forward pass of the same three-layer perceptron, without dropout.

    Uses the module-level parameter copies ``W1_, b1_, W2_, b2_, W3_, b3_``.

    Args:
        X: Input batch; it is reshaped to ``(-1, num_inputs)``.
        is_training: Accepted for signature parity with ``net``; it is not
            used by this function.

    Returns:
        The output-layer values (no softmax applied).
    """
    flat = X.view(-1, num_inputs)  # reshape the input to (batch_size, num_inputs)
    hidden1 = torch.relu(flat @ W1_ + b1_)
    hidden2 = torch.relu(hidden1 @ W2_ + b2_)
    return hidden2 @ W3_ + b3_

In [6]:
# Compare prediction accuracy with dropout and without dropout.
# The dropout network is trained first, then the baseline, each with its own
# parameter list.
for scratch_net, scratch_params in ((net, params), (net_without_dropout, params2)):
    d2l.train_ch3(scratch_net, train_iter, test_iter, loss, num_epochs, batch_size, scratch_params, lr)

epoch 1, loss 0.0045, train acc 0.555, test acc 0.738
epoch 2, loss 0.0023, train acc 0.786, test acc 0.746
epoch 3, loss 0.0019, train acc 0.821, test acc 0.810
epoch 4, loss 0.0017, train acc 0.840, test acc 0.840
epoch 5, loss 0.0016, train acc 0.850, test acc 0.831
epoch 6, loss 0.0015, train acc 0.856, test acc 0.856
epoch 7, loss 0.0015, train acc 0.862, test acc 0.860
epoch 8, loss 0.0014, train acc 0.866, test acc 0.848
epoch 9, loss 0.0014, train acc 0.871, test acc 0.833
epoch 10, loss 0.0014, train acc 0.873, test acc 0.857
epoch 1, loss 0.0044, train acc 0.572, test acc 0.730
epoch 2, loss 0.0023, train acc 0.786, test acc 0.753
epoch 3, loss 0.0018, train acc 0.829, test acc 0.832
epoch 4, loss 0.0016, train acc 0.848, test acc 0.837
epoch 5, loss 0.0015, train acc 0.856, test acc 0.827
epoch 6, loss 0.0014, train acc 0.867, test acc 0.852
epoch 7, loss 0.0014, train acc 0.872, test acc 0.858
epoch 8, loss 0.0013, train acc 0.877, test acc 0.852
epoch 9, loss 0.0012, train

In [7]:
# PyTorch implementation
# The same architecture as the from-scratch `net`, assembled from torch.nn
# modules: flatten -> linear -> ReLU -> dropout -> linear -> ReLU -> dropout -> linear.
net2 = nn.Sequential(
    nn.Flatten(),
    nn.Linear(num_inputs, num_hiddens1),
    nn.ReLU(),
    nn.Dropout(drop_prob1),
    nn.Linear(num_hiddens1, num_hiddens2),
    nn.ReLU(),
    nn.Dropout(drop_prob2),
    # Use the shared num_outputs constant rather than a literal so the output
    # width follows the configuration, like every other layer size here.
    nn.Linear(num_hiddens2, num_outputs),
)

# Baseline for comparison: the same stack of linear and ReLU layers with the
# dropout layers left out.
net2_without_dropout = nn.Sequential(
    nn.Flatten(),
    nn.Linear(num_inputs, num_hiddens1),
    nn.ReLU(),
    nn.Linear(num_hiddens1, num_hiddens2),
    nn.ReLU(),
    # Use the shared num_outputs constant rather than a literal so the output
    # width follows the configuration, like every other layer size here.
    nn.Linear(num_hiddens2, num_outputs),
)


In [8]:
# One plain SGD optimizer per torch.nn model, both with the same step size.
optimizer2, optimizer2_without_dropout = (
    torch.optim.SGD(module.parameters(), lr=0.5)
    for module in (net2, net2_without_dropout)
)

# Train the dropout model first, then the baseline. The params and lr
# arguments are passed as None and an optimizer object is supplied instead
# (how d2l.train_ch3 chooses between them is not shown in this notebook).
for module, module_optimizer in (
    (net2, optimizer2),
    (net2_without_dropout, optimizer2_without_dropout),
):
    d2l.train_ch3(module, train_iter, test_iter, loss, num_epochs, batch_size, None, None, module_optimizer)

epoch 1, loss 0.0035, train acc 0.669, test acc 0.758
epoch 2, loss 0.0021, train acc 0.805, test acc 0.806
epoch 3, loss 0.0018, train acc 0.831, test acc 0.769
epoch 4, loss 0.0017, train acc 0.847, test acc 0.838
epoch 5, loss 0.0016, train acc 0.853, test acc 0.844
epoch 6, loss 0.0015, train acc 0.861, test acc 0.855
epoch 7, loss 0.0015, train acc 0.864, test acc 0.842
epoch 8, loss 0.0014, train acc 0.869, test acc 0.830
epoch 9, loss 0.0014, train acc 0.873, test acc 0.859
epoch 10, loss 0.0013, train acc 0.877, test acc 0.866
epoch 1, loss 0.0034, train acc 0.671, test acc 0.798
epoch 2, loss 0.0019, train acc 0.817, test acc 0.794
epoch 3, loss 0.0017, train acc 0.842, test acc 0.785
epoch 4, loss 0.0015, train acc 0.855, test acc 0.824
epoch 5, loss 0.0014, train acc 0.864, test acc 0.853
epoch 6, loss 0.0014, train acc 0.871, test acc 0.847
epoch 7, loss 0.0013, train acc 0.877, test acc 0.859
epoch 8, loss 0.0012, train acc 0.883, test acc 0.863
epoch 9, loss 0.0012, train