# Dropout丢弃法缓解过拟合

In [1]:
import collections
import math
import os
import random
import sys
import tarfile
import time
import json
import zipfile
from tqdm import tqdm
from PIL import Image
from collections import namedtuple

from IPython import display
from matplotlib import pyplot as plt
import torch
from torch import nn
from torch.nn import init
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import torchtext
import torchtext.vocab as Vocab
import numpy as np


## 获取和读取数据

In [2]:
def load_data_fashion_mnist(batch_size, resize=None, root='data/FashionMNIST'):
    """Download Fashion-MNIST (if needed) and wrap it in two data loaders.

    Args:
        batch_size: Number of samples per mini-batch for both loaders.
        resize: Optional size passed to ``torchvision.transforms.Resize``;
            when falsy the images are not resized.
        root: Directory the dataset is downloaded to / read from.

    Returns:
        A ``(train_iter, test_iter)`` tuple of ``DataLoader`` objects. The
        training loader shuffles its samples, the test loader does not.
    """
    # Optional resize first, then conversion to a tensor.
    steps = [torchvision.transforms.Resize(size=resize)] if resize else []
    steps.append(torchvision.transforms.ToTensor())
    transform = torchvision.transforms.Compose(steps)

    mnist_train = torchvision.datasets.FashionMNIST(
        root=root, train=True, download=True, transform=transform)
    mnist_test = torchvision.datasets.FashionMNIST(
        root=root, train=False, download=True, transform=transform)

    # 0 means no extra worker processes are used to speed up data loading;
    # that setting is chosen on Windows, 4 workers everywhere else.
    num_workers = 0 if sys.platform.startswith('win') else 4

    loader_kwargs = dict(batch_size=batch_size, num_workers=num_workers)
    train_iter = torch.utils.data.DataLoader(mnist_train, shuffle=True, **loader_kwargs)
    test_iter = torch.utils.data.DataLoader(mnist_test, shuffle=False, **loader_kwargs)

    return train_iter, test_iter

In [4]:
# Mini-batch size used by both data loaders.
batch_size = 256
# Build the training and test iterators with the default root and no resizing.
train_iter, test_iter = load_data_fashion_mnist(batch_size)

## 初始化模型参数
已知样本输入为28×28像素（展平后为784维），共10个类别。这里使用含两个隐藏层（各256个单元）的多层感知机，因此W1、W2、W3的形状分别为784×256、256×256和256×10，b1、b2、b3的长度分别为256、256和10。

In [5]:
# Layer sizes: 784 inputs, two hidden layers of 256 units each, 10 outputs.
num_inputs, num_outputs, num_hiddens1, num_hiddens2 = 784, 10, 256, 256
# Each weight matrix is drawn from a normal distribution with mean 0 and
# standard deviation 0.01; each bias starts at zero. requires_grad=True makes
# autograd record gradients for these tensors.
W1 = torch.tensor(np.random.normal(0, 0.01, size=(num_inputs, num_hiddens1)), dtype=torch.float, requires_grad=True)
b1 = torch.zeros(num_hiddens1, requires_grad=True)
W2 = torch.tensor(np.random.normal(0, 0.01, size=(num_hiddens1, num_hiddens2)), dtype=torch.float, requires_grad=True)
b2 = torch.zeros(num_hiddens2, requires_grad=True)
W3 = torch.tensor(np.random.normal(0, 0.01, size=(num_hiddens2, num_outputs)), dtype=torch.float, requires_grad=True)
b3 = torch.zeros(num_outputs, requires_grad=True)

# All trainable tensors collected in a single list.
params = [W1, b1, W2, b2, W3, b3]

softmax 运算和激活函数

In [7]:
def softmax(x):
    """Return the softmax of ``x`` taken along dimension 1.

    Args:
        x: Tensor with at least two dimensions; each slice along dim 1 is
            treated as one vector of raw scores.

    Returns:
        Tensor of the same shape as ``x`` whose entries along dim 1 are
        non-negative and sum to 1.
    """
    # Subtracting the per-row maximum does not change the mathematical result
    # but keeps exp() from overflowing to inf (which would turn the ratio
    # below into nan) when the scores are large.
    shifted = x - torch.max(x, dim=1, keepdim=True)[0]
    x_exp = shifted.exp()
    # Sum along dim 1; keepdim=True keeps the axis so the division broadcasts.
    tot = x_exp.sum(dim=1, keepdim=True)
    return x_exp / tot

def relu(x):
    """Return the element-wise maximum of ``x`` and zero."""
    zero = torch.tensor(0.0)
    return torch.max(x, zero)

实现`dropout`函数

In [8]:
def dropout(X, drop_prob):
    """Apply inverted dropout to ``X``.

    Each element is zeroed with probability ``drop_prob``; the surviving
    elements are divided by ``1 - drop_prob``.

    Args:
        X: Input tensor; it is converted to float before masking.
        drop_prob: Probability of dropping an element, in ``[0, 1]``.

    Returns:
        A float tensor with the same shape as ``X``.

    Raises:
        AssertionError: If ``drop_prob`` is outside ``[0, 1]``.
    """
    X = X.float()
    assert 0 <= drop_prob <= 1
    keep_prob = 1 - drop_prob

    # Everything is dropped: return zeros directly instead of dividing by 0.
    if keep_prob == 0:
        return torch.zeros_like(X)

    # Sample the mask on the same device as X so that masking also works for
    # tensors that do not live on the CPU.
    mask = (torch.rand(X.shape, device=X.device) < keep_prob).float()

    return mask * X / keep_prob

定义模型

In [18]:
drop_prob1, drop_prob2 = 0.2, 0.4


def net(x, is_training=True):
    """Forward pass of the two-hidden-layer network.

    Args:
        x: Input batch; it is reshaped to ``(-1, num_inputs)``.
        is_training: When true, dropout is applied after each hidden layer
            (with ``drop_prob1`` and ``drop_prob2`` respectively).

    Returns:
        The softmax output of the final linear layer.
    """
    hidden = x.view(-1, num_inputs)
    hidden_layers = ((W1, b1, drop_prob1), (W2, b2, drop_prob2))
    for weight, bias, drop_prob in hidden_layers:
        hidden = relu(torch.mm(hidden, weight) + bias)
        # Dropout is only active while training.
        if is_training:
            hidden = dropout(hidden, drop_prob)
    return softmax(torch.mm(hidden, W3) + b3)

损失函数和优化函数

In [15]:
def loss(y_hat, y):
    """Return the per-sample negative log of the probability of the true label.

    Args:
        y_hat: Tensor whose dim 1 indexes the predicted class probabilities.
        y: Integer tensor of true class indices, one per row of ``y_hat``.

    Returns:
        A column tensor (one row per sample) of negative log-probabilities.
    """
    # y.view(-1, 1) reshapes the labels into a column so that gather can pick,
    # for every row of y_hat, the entry at that row's label index. For example
    # with y = [2, 0] it selects y_hat[0, 2] and y_hat[1, 0].
    label_column = y.view(-1, 1)
    true_class_prob = y_hat.gather(1, label_column)
    return -torch.log(true_class_prob)
def sgd(params, lr, batch_size):
    """Apply one mini-batch stochastic gradient descent step in place.

    Every parameter is updated as ``param -= lr * param.grad / batch_size``.

    Args:
        params: Iterable of tensors to update.
        lr: Learning rate.
        batch_size: Divisor applied to the gradient (the loss passed to
            ``backward`` is a sum over the batch in this file).
    """
    # The update itself must not be recorded by autograd.
    with torch.no_grad():
        for param in params:
            # A parameter that took no part in the last backward pass has no
            # gradient; leave it untouched instead of failing on None.
            if param.grad is None:
                continue
            param -= lr * param.grad / batch_size
        
def evaluate_accuracy(data_iter, net):
    """Return the classification accuracy of ``net`` over ``data_iter``.

    Args:
        data_iter: Iterable yielding ``(X, y)`` batches, where ``y`` holds the
            true class indices.
        net: Either a ``torch.nn.Module`` or a plain callable. A Module is put
            into evaluation mode (which turns dropout off) for the duration of
            the call and restored to its previous mode afterwards. A plain
            callable that accepts an ``is_training`` parameter is called with
            ``is_training=False``.

    Returns:
        Fraction of samples whose arg-max prediction along dim 1 equals the
        label.

    Raises:
        ZeroDivisionError: If ``data_iter`` yields no samples.
    """
    # Local import keeps this block self-contained.
    import inspect

    is_module = isinstance(net, torch.nn.Module)
    call_kwargs = {}
    if is_module:
        # Remember the current mode so it can be restored afterwards instead
        # of unconditionally forcing training mode.
        was_training = net.training
        net.eval()
    else:
        # Look at the real parameters only; local variables of the callable
        # must not count, and callables without an inspectable signature are
        # simply called without the flag.
        try:
            accepts_flag = 'is_training' in inspect.signature(net).parameters
        except (TypeError, ValueError):
            accepts_flag = False
        if accepts_flag:
            call_kwargs['is_training'] = False

    acc_sum, n = 0.0, 0
    try:
        # No gradients are needed for evaluation, so skip graph construction.
        with torch.no_grad():
            for X, y in data_iter:
                predictions = net(X, **call_kwargs).argmax(dim=1)
                acc_sum += (predictions == y).float().sum().item()
                n += y.shape[0]
    finally:
        if is_module:
            net.train(was_training)
    return acc_sum / n

## 训练模型

In [21]:
num_epochs = 10
lr = 0.1


def train(net, train_iter, test_iter, loss, num_epochs, batch_size, params, lr):
    """Train ``net`` with mini-batch SGD and print metrics after every epoch.

    Args:
        net: Callable mapping an input batch to predictions.
        train_iter: Iterable of ``(X, y)`` training batches.
        test_iter: Iterable of ``(X, y)`` batches used for the test accuracy.
        loss: Callable ``loss(y_hat, y)``; its result is summed per batch.
        num_epochs: Number of passes over ``train_iter``.
        batch_size: Passed to ``sgd`` as the gradient divisor.
        params: Tensors updated by ``sgd``.
        lr: Learning rate passed to ``sgd``.
    """
    for epoch in range(1, num_epochs + 1):
        loss_total = 0.0
        correct_total = 0.0
        sample_count = 0
        for X, y in train_iter:
            y_hat = net(X)
            batch_loss = loss(y_hat, y).sum()

            # Reset the gradients left over from the previous step; a
            # parameter has no gradient yet before its first backward pass.
            for param in params:
                if param.grad is None:
                    continue
                param.grad.data.zero_()

            # Back-propagate the summed loss, then take one SGD step.
            batch_loss.backward()
            sgd(params, lr, batch_size)

            # Accumulate the running loss, correct predictions and sample count.
            loss_total += batch_loss.item()
            correct_total += (y_hat.argmax(dim=1) == y).sum().item()
            sample_count += y.shape[0]

        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
              % (epoch, loss_total / sample_count, correct_total / sample_count, test_acc))

In [22]:
# Run the training loop with the settings defined above; the six weight and
# bias tensors are passed explicitly as the parameters to update.
train (net, train_iter, test_iter, loss, num_epochs, batch_size, [W1, b1, W2, b2, W3, b3], lr)

epoch 1, loss 0.6861, train acc 0.751, test acc 0.786
epoch 2, loss 0.6017, train acc 0.786, test acc 0.808
epoch 3, loss 0.5407, train acc 0.808, test acc 0.823
epoch 4, loss 0.5001, train acc 0.823, test acc 0.832
epoch 5, loss 0.4689, train acc 0.834, test acc 0.840
epoch 6, loss 0.4463, train acc 0.841, test acc 0.836
epoch 7, loss 0.4279, train acc 0.847, test acc 0.847
epoch 8, loss 0.4139, train acc 0.852, test acc 0.851
epoch 9, loss 0.4011, train acc 0.857, test acc 0.855
epoch 10, loss 0.3910, train acc 0.861, test acc 0.847
