In [1]:
import torch
import torchvision
from torch.utils import data
from torchvision import transforms
from d2l import torch as d2l
from torch import nn

In [2]:
import func

# Hand MLP

In [23]:
batch_size = 256
train_iter, test_iter = func.load_data_fashion_mnist(batch_size)

In [24]:
num_inputs, num_outputs, num_hiddens = 784, 10, 256

W1 = nn.Parameter(torch.randn(num_inputs, num_hiddens, requires_grad=True) * 0.1)
b1 = nn.Parameter(torch.zeros(num_hiddens, requires_grad=True))
W2 = nn.Parameter(torch.randn(
    num_hiddens, num_outputs, requires_grad=True) * 0.1)
b2 = nn.Parameter(torch.zeros(num_outputs, requires_grad=True))

params = [W1, b1, W2, b2]

In [25]:
def relu(X):
    a = torch.zeros_like(X)
    return torch.max(X, a)

def net(X):
    X = X.view((-1, num_inputs))
    H = relu(torch.mm(X, W1) + b1)  # 这里“@”代表矩阵乘法
    return (torch.mm(H, W2) + b2)

In [26]:
net 

<function __main__.net(X)>

In [27]:
loss1 = nn.CrossEntropyLoss()
loss2 = nn.CrossEntropyLoss(reduction='none')

In [None]:
loss1(net(x), y)

In [107]:
loss2(net(x), y).mean() 

tensor(0.3717, grad_fn=<MeanBackward0>)

In [28]:
batch_size

256

In [29]:
lr

0.001953125

In [32]:
num_epochs, lr = 3, 0.5
updater = torch.optim.SGD(params, lr=lr)

In [33]:
func.train_ch3(net, train_iter, test_iter, loss2, num_epochs, updater)

epochs 1: train acc 0.10218333333333333, test acc 0.1
epochs 2: train acc 0.09833333333333333, test acc 0.1
epochs 3: train acc 0.09886666666666667, test acc 0.1


# MLP on mnist

In [6]:
net = nn.Sequential(nn.Flatten(),
                    nn.Linear(784, 256),
                    nn.ReLU(),
                    nn.Linear(256, 10))

def init_weights(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, std=0.01)

net.apply(init_weights);

In [9]:
net._modules

OrderedDict([('0', Flatten(start_dim=1, end_dim=-1)),
             ('1', Linear(in_features=784, out_features=256, bias=True)),
             ('2', ReLU()),
             ('3', Linear(in_features=256, out_features=10, bias=True))])

In [14]:
batch_size, lr, num_epochs = 256, 0.35, 5
loss = nn.CrossEntropyLoss()
trainer = torch.optim.SGD(net.parameters(), lr=lr)
train_iter, test_iter = func.load_data_fashion_mnist(batch_size)

In [None]:
func.train_ch3(net, train_iter, test_iter, loss, num_epochs, trainer)

# dropout

In [None]:
def dropout_layer(X, dropout):
    assert 0 <= dropout <= 1
    # 在本情况中，所有元素都被丢弃
    if dropout == 1:
        return torch.zeros_like(X)
    # 在本情况中，所有元素都被保留
    if dropout == 0:
        return X
    mask = (torch.rand(X.shape) > dropout).float()
    return mask * X / (1.0 - dropout)

In [10]:
dropout1, dropout2 = 0.2, 0.5

net = nn.Sequential(nn.Flatten(),
        nn.Linear(784, 256),
        nn.ReLU(),
        # 在第一个全连接层之后添加一个dropout层
        nn.Dropout(dropout1),
        nn.Linear(256, 256),
        nn.ReLU(),
        # 在第二个全连接层之后添加一个dropout层
        nn.Dropout(dropout2),
        nn.Linear(256, 10))

def init_weights(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, std=0.01)

net.apply(init_weights);

In [11]:
batch_size, lr, num_epochs = 256, 0.5, 10
loss = nn.CrossEntropyLoss()
trainer = torch.optim.SGD(net.parameters(), lr=lr)
train_iter, test_iter = func.load_data_fashion_mnist(batch_size)

In [12]:
func.train_ch3(net, train_iter, test_iter, loss, num_epochs, trainer)

epochs 1: train acc 0.5441666666666667, test acc 0.7098
epochs 2: train acc 0.7868, test acc 0.8089
epochs 3: train acc 0.8187166666666666, test acc 0.8014
epochs 4: train acc 0.8375166666666667, test acc 0.8392
epochs 5: train acc 0.8469833333333333, test acc 0.8468
epochs 6: train acc 0.8544166666666667, test acc 0.856
epochs 7: train acc 0.85855, test acc 0.8411
epochs 8: train acc 0.86395, test acc 0.8265
epochs 9: train acc 0.8666166666666667, test acc 0.7903
epochs 10: train acc 0.8710333333333333, test acc 0.8525
