In [1]:
import torch
import numpy as np
import sys
import sys
sys.path.append(r'C:\D\ProgramFile\jupyter\torch_learn\dive_to_dp\utils')
import d2lzh as d2l

## **MLP的计算过程**
第一层计算
$X \in \mathbb R^{n \times b}$           
$W_1 \in \mathbb R^{b \times h}$       
$Z_1 = XW_1 + b_1$       
$A_1 = activation(Z_1)$    
第二层计算    
$A_1 \in \mathbb R^{n \times h}$                   
$W_2 \in \mathbb R^{h \times output_dims}$     
$Z_2 = A_1W_2 + b_2$      
$output = activation(Z_2)$

In [2]:
batch_size = 256
train_iter, test_iter = d2l.load_fashion_mnist_data(batch_size)

## **定义模型的参数**

In [3]:
num_inputs, num_outputs, num_hiddens = 784, 10, 256
W_1 = torch.tensor(np.random.normal(0, 0.01, (num_inputs, num_hiddens)), dtype = torch.float)
b_1 = torch.zeros(num_hiddens, dtype = torch.float)
W_2 = torch.tensor(np.random.normal(0, 0.01, (num_hiddens, num_outputs)), dtype = torch.float)
b_2 = torch.zeros(num_outputs, dtype = torch.float)

In [4]:
parmas = [W_1, b_1, W_2, b_2]
for parma in parmas:
    parma.requires_grad_(True)

## ** 激活函数**

In [5]:
def relu(X):
    return torch.max(X, torch.tensor(0.0))

In [6]:
relu(torch.tensor([1, 2.0, -4]))

tensor([1., 2., 0.])

## **定义模型**

In [7]:
def net(X):
    X = X.view((-1, num_inputs))
    H = relu(torch.matmul(X, W_1) + b_1)
    return torch.matmul(H, W_2) + b_2

In [8]:
# 损失函数
loss = torch.nn.CrossEntropyLoss()

## **训练模型**

In [9]:
num_epochs, lr = 5, 100.0

In [12]:
for epoch in range(num_epochs):
    total_acc, total_loss, n = 0.0, 0.0, 0
    for X, y in train_iter:
        y_hat = net(X)
        l = loss(y_hat, y).sum()
        if not n == 0:
            for parma in parmas:
                parma.grad.data.zero_()
        l.backward()
        for parma in parmas:
            parma.data -= lr * parma.grad /batch_size
        total_loss += l
        total_acc += (torch.argmax(y_hat, dim = 1)==y).sum().item()
        n += y.shape[0]
    test_acc = d2l.evaluate_accuracy(test_iter, net)
    print('epoch{:d}: loss {:.4f} acc{:.4f} test_acc {:.4f}'.format(epoch + 1, total_loss / n, total_acc / n, test_acc))

epoch1: loss 0.0030 acc0.7147 test_acc 0.7745
epoch2: loss 0.0019 acc0.8223 test_acc 0.8234
epoch3: loss 0.0017 acc0.8446 test_acc 0.8202
epoch4: loss 0.0015 acc0.8556 test_acc 0.8266
epoch5: loss 0.0015 acc0.8632 test_acc 0.8371


初始化的时候张量的grad一般初始化为None，一般只在进行一次反向传播里面才会有梯度，所以一旦出现了NoneType has no attribute data的情况，请在允许一次backward后再计算