### 3.13 丢弃法

In [1]:
import torch
import numpy as np
import torch.nn as nn
import sys
import os, sys
sys.path.append("..")
import d2lzh_pytorch.utils as d2l
np.random.seed(666)
cur_path = os.path.abspath(os.path.dirname('__file__'))
data_path = cur_path.replace('dl\dive-into-dl\chapter03-dl-basics', 'data\\')
np.random.seed(666)
torch.manual_seed(666)

<torch._C.Generator at 0x11775af70>

In [2]:
def dropout(X, drop_prob):
    X = X.float()
    assert 0 <= drop_prob <= 1
    keep_prob = 1 - drop_prob
    # 这种情况下将元素全部丢弃掉
    if keep_prob == 0:
        return torch.zeros_like(X)
    # 返回一个张量，包含了从区间[0,1)的均匀分布中抽取的一组随机数
    mask = (torch.rand(X.shape) < keep_prob).float()
    return mask * X / keep_prob

In [3]:
X = torch.arange(16).view(2, 8)
dropout(X, 0)

tensor([[ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11., 12., 13., 14., 15.]])

In [4]:
dropout(X, 0.5)

tensor([[ 0.,  2.,  4.,  0.,  8., 10.,  0.,  0.],
        [16.,  0., 20., 22., 24.,  0., 28., 30.]])

In [5]:
dropout(X, 1)

tensor([[0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [6]:
num_inputs, num_outputs, num_hiddens1, num_hiddens2 = 784, 10, 256, 256

W1 = torch.tensor(np.random.normal(0, 0.01, size=(num_inputs, num_hiddens1)), dtype=torch.float, requires_grad=True)
b1 = torch.zeros(num_hiddens1, requires_grad=True)
W2 = torch.tensor(np.random.normal(0, 0.01, size=(num_hiddens1, num_hiddens2)), dtype=torch.float, requires_grad=True)
b2 = torch.zeros(num_hiddens2, requires_grad=True)
W3 = torch.tensor(np.random.normal(0, 0.01, size=(num_hiddens2, num_outputs)), dtype=torch.float, requires_grad=True)
b3 = torch.zeros(num_outputs, requires_grad=True)

params = [W1, b1, W2, b2, W3, b3]

In [9]:
drop_prob1, drop_prob2 = 0.2, 0.5

def net(X, is_training=True):
    X = X.view(-1, num_inputs)
    H1 = (torch.matmul(X, W1) + b1).relu()
    if is_training: # 只在训练模型时候使用丢弃
        H1 = dropout(H1, drop_prob1) # 第一层hidden添加丢弃
    H2 = (torch.matmul(H1, W2) + b2).relu()
    if is_training:
        H2 = dropout(H2, drop_prob2)
    return torch.matmul(H2, W3) + b3

In [10]:
num_epochs, lr, batch_size = 5, 100.0, 256
loss = torch.nn.CrossEntropyLoss()
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, params, lr)

epoch 1, loss 0.0046, train acc 0.545, test acc 0.724
epoch 2, loss 0.0023, train acc 0.786, test acc 0.818
epoch 3, loss 0.0019, train acc 0.825, test acc 0.830
epoch 4, loss 0.0018, train acc 0.837, test acc 0.844
epoch 5, loss 0.0016, train acc 0.849, test acc 0.831


In [12]:
net = nn.Sequential(
    d2l.FlattenLayer(),
    nn.Linear(num_inputs, num_hiddens1),
    nn.ReLU(),
    nn.Dropout(drop_prob1),
    nn.Linear(num_hiddens1, num_hiddens2),
    nn.ReLU(),
    nn.Dropout(drop_prob2),
    nn.Linear(num_hiddens2, num_inputs)
)
for param in net.parameters():
    nn.init.normal_(param, mean=0, std=0.01)

In [13]:
optimizer = torch.optim.SGD(net.parameters(), lr=0.5)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, None, None, optimizer)

epoch 1, loss 0.0078, train acc 0.431, test acc 0.704
epoch 2, loss 0.0028, train acc 0.729, test acc 0.749
epoch 3, loss 0.0023, train acc 0.781, test acc 0.821
epoch 4, loss 0.0021, train acc 0.803, test acc 0.832
epoch 5, loss 0.0020, train acc 0.820, test acc 0.805
