# Dropout 正则化

In [1]:
import torch
import torch.nn as nn
import numpy as np
import sys
sys.path.append(r'C:\D\ProgramFile\jupyter\torch_learn\dive_to_dp\utils')
import d2lzh as d2l

In [2]:
def dropout(X, drop_prob):
    """Apply inverted dropout to ``X``.

    Every element is zeroed independently with probability ``drop_prob``.
    Surviving elements are divided by ``1 - drop_prob`` so that the expected
    value of the output equals the input.

    Args:
        X: Input tensor; it is cast to float before masking.
        drop_prob: Probability of dropping an element, in ``[0, 1]``.

    Returns:
        A float tensor with the same shape as ``X``.

    Raises:
        AssertionError: If ``drop_prob`` is outside ``[0, 1]``.
    """
    X = X.float()
    assert 0 <= drop_prob <= 1, "drop_prob must be in [0, 1]"
    keep_prob = 1 - drop_prob
    if keep_prob == 0:
        # Everything is dropped; returning early also avoids dividing by zero.
        return torch.zeros_like(X)
    # The mask must come from a *uniform* [0, 1) sample so that
    # P(sample < keep_prob) == keep_prob. A normal sample (torch.randn) would
    # keep elements with probability Phi(keep_prob) instead.
    # rand_like also creates the mask on the same device as X.
    mask = (torch.rand_like(X) < keep_prob).float()

    return X * mask / keep_prob

In [3]:
# Small 3x4 standard-normal sample used to try out dropout by hand.
test_tensor = torch.randn(3, 4)

In [4]:
# Show the sampled input so it can be compared with the dropout output below.
test_tensor

tensor([[ 0.5961, -0.7102,  0.6794,  1.4897],
        [ 0.0999,  0.9866, -1.3454,  1.0851],
        [-1.3748, -0.7760,  0.3260,  0.6384]])

In [5]:
# Drop each element with probability 0.1; kept elements are divided by keep_prob = 0.9.
dropout(test_tensor, 0.1)

tensor([[ 0.6624, -0.7891,  0.7549,  0.0000],
        [ 0.0000,  1.0962, -1.4948,  1.2057],
        [-1.5276, -0.0000,  0.3622,  0.7093]])

## 模型创建

### 参数定义

In [6]:
# Layer widths of the MLP: flattened input size, number of classes,
# and the sizes of the two hidden layers.
num_inputs = 784
num_outputs = 10
num_hidden1 = 256
num_hidden2 = 256

In [7]:
# Weights are drawn from N(0, 0.01^2) with NumPy and converted to float32;
# biases start at zero. Every tensor is a leaf that tracks gradients.
W1 = torch.from_numpy(np.random.normal(0, 0.01, (num_inputs, num_hidden1))).float().requires_grad_()
b1 = torch.zeros(num_hidden1, dtype=torch.float32).requires_grad_()
W2 = torch.from_numpy(np.random.normal(0, 0.01, (num_hidden1, num_hidden2))).float().requires_grad_()
b2 = torch.zeros(num_hidden2, dtype=torch.float32).requires_grad_()
W3 = torch.from_numpy(np.random.normal(0, 0.01, (num_hidden2, num_outputs))).float().requires_grad_()
b3 = torch.zeros(num_outputs, dtype=torch.float32).requires_grad_()

In [8]:
# All trainable tensors, in layer order, for the manual SGD update.
params = [
    W1, b1,
    W2, b2,
    W3, b3,
]

### 模型定义

In [9]:
# Dropout probabilities applied after the first and second hidden layers.
drop_prob1 = 0.5
drop_prob2 = 0.5

In [10]:
def net(X, is_training=True):
    """Forward pass of the two-hidden-layer MLP with optional dropout.

    The input is reshaped to rows of ``num_inputs`` features, then passed
    through two affine + ReLU layers (``W1``/``b1`` and ``W2``/``b2``) and a
    final affine output layer (``W3``/``b3``).

    Args:
        X: Input batch; viewed as ``(-1, num_inputs)``.
        is_training: When true, ``dropout`` is applied after each hidden
            layer using ``drop_prob1`` and ``drop_prob2`` respectively.

    Returns:
        The output of the last affine layer (no softmax applied).
    """
    flat = X.view((-1, num_inputs))

    hidden = torch.relu(flat @ W1 + b1)
    if is_training:
        hidden = dropout(hidden, drop_prob1)

    hidden = torch.relu(hidden @ W2 + b2)
    if is_training:
        hidden = dropout(hidden, drop_prob2)

    return hidden @ W3 + b3

### 损失函数

In [11]:
# Cross-entropy on raw logits; the default reduction averages over the batch.
loss = nn.CrossEntropyLoss()

## 加载数据

In [12]:
# Mini-batch size; it is also the divisor used in the manual SGD update.
batch_size = 256
# Project helper from d2lzh; presumably returns iterables of (X, y) batches
# for the Fashion-MNIST train and test splits — TODO confirm against the helper.
train_iter, test_iter = d2l.load_fashion_mnist_data(batch_size)

## 训练模型

In [13]:
# Step size and number of passes over the training data. The training loop
# divides the gradient by batch_size, which is why lr is so large.
lr = 100
num_epochs = 5

In [14]:
def _eval_net(X, is_training=False):
    """Run ``net`` with dropout disabled, for use during evaluation.

    ``net`` defaults to ``is_training=True``, so passing it directly to an
    evaluation helper could leave dropout active at test time.
    The ``is_training`` parameter is accepted for compatibility with helpers
    that pass it explicitly, but dropout is always disabled here.
    """
    return net(X, is_training=False)


# Manual mini-batch SGD over the hand-written parameters.
for epoch in range(num_epochs):
    train_loss, train_acc, n = 0.0, 0.0, 0
    for X, y in train_iter:
        y_hat = net(X)
        # .sum() is a no-op for the default mean-reduced CrossEntropyLoss;
        # it only guarantees a scalar if the reduction is ever changed.
        l = loss(y_hat, y).sum()
        acc = (y_hat.argmax(dim=-1) == y).sum().item()

        # Clear gradients left over from the previous step. Checking for None
        # (instead of a "first step" flag) also clears stale gradients when
        # this cell is re-run on parameters that already have .grad set.
        for param in params:
            if param.grad is not None:
                param.grad.zero_()
        l.backward()

        # SGD step. The update must not be tracked by autograd.
        # NOTE: the loss is already a batch mean, so the effective step size
        # is lr / batch_size.
        with torch.no_grad():
            for param in params:
                param -= lr * param.grad / batch_size

        # .item() stores a plain float instead of a graph-attached tensor.
        # NOTE: train_loss is a sum of per-batch means, so train_loss / n
        # below is roughly (mean loss) / batch_size, not the mean loss.
        train_loss += l.item()
        train_acc += acc
        n += y.shape[0]

    # Evaluate without dropout and without building autograd graphs.
    # NOTE(review): d2l.evaluate_accuracy is a project helper; confirm it
    # accepts a plain callable as its second argument.
    with torch.no_grad():
        test_acc = d2l.evaluate_accuracy(test_iter, _eval_net)
    print('epoch{:d}: train_loss {:.4f} train_acc {:.4f} test_acc {:.4f}'.format(epoch + 1, train_loss / n, train_acc / n, test_acc))

epoch1: train_loss 0.0047 train_acc 0.5452 test_acc 0.6960
epoch2: train_loss 0.0026 train_acc 0.7487 test_acc 0.7890
epoch3: train_loss 0.0021 train_acc 0.8067 test_acc 0.8175
epoch4: train_loss 0.0019 train_acc 0.8243 test_acc 0.8270
epoch5: train_loss 0.0017 train_acc 0.8371 test_acc 0.8042


注意:如果去掉隐藏层的 ReLU 激活函数,训练时 loss 可能会变为 NaN。