In [1]:
import torch
import torchvision
import numpy as np
import sys
import d2lzh_pytorch as d2l

#### 读取数据

In [2]:
# Mini-batch size: handed to the data loader here and later to train_ch3.
batch_size = 256
# The d2l helper returns a (train, test) pair of iterables; later cells iterate
# each one as (X, y) batches. Presumably Fashion-MNIST, going by the helper's
# name -- the loader itself is not visible in this notebook.
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

#### 初始化模型参数

In [37]:
# Width of one flattened input row (net reshapes every batch to (-1, num_inputs)).
num_inputs = 784
# Number of columns of W / entries of b, i.e. scores produced per input row.
num_outputs = 10

# Weights: Gaussian noise (mean 0, std 0.01) drawn with NumPy, stored as float32.
W = torch.tensor(
    np.random.normal(loc=0, scale=0.01, size=(num_inputs, num_outputs)),
    dtype=torch.float32,
)
# Biases: one zero per output, float32.
b = torch.zeros(num_outputs, dtype=torch.float32)

In [38]:
# 打开模型参数梯度
W.requires_grad_(True);
b.requires_grad_(True);

#### 定义Softmax函数<br>
$$
\mathrm{Softmax}(x)=\left(\frac{e^{x_1}}{\sum_{i=1}^{n}e^{x_i}},\frac{e^{x_2}}{\sum_{i=1}^{n}e^{x_i}},\ldots,\frac{e^{x_n}}{\sum_{i=1}^{n}e^{x_i}}\right)
$$

In [8]:
def softmax(X):
    """Softmax over dimension 1 of ``X``.

    Every row is exponentiated and divided by the sum of its exponentials,
    so each output row is non-negative and sums to 1.

    The row maximum is subtracted before exponentiating. Softmax is invariant
    to adding a constant to a row, so the result is mathematically unchanged,
    but the largest exponent becomes 0 and ``exp`` can no longer overflow to
    ``inf`` (which would otherwise produce ``inf / inf = nan``).

    Args:
        X: tensor with at least two dimensions; dimension 1 is normalised.

    Returns:
        Tensor of the same shape as ``X``.
    """
    # max(dim=...) returns (values, indices); [0] keeps the values.
    row_max = X.max(dim=1, keepdim=True)[0]
    X_exp = (X - row_max).exp()
    partition = X_exp.sum(dim=1, keepdim=True)
    return X_exp / partition

#### 定义模型

In [24]:
def net(X):
    """Linear layer followed by softmax.

    ``X`` is reshaped to rows of ``num_inputs`` values, multiplied by the
    global weight matrix ``W``, shifted by the global bias ``b``, and passed
    through ``softmax``.
    """
    flat = X.view((-1, num_inputs))
    logits = torch.mm(flat, W) + b
    return softmax(logits)

#### 定义损失函数
**交叉熵损失函数**<br>
$$
H(y,\hat{y})=-\log{\hat{y}_{i}},\quad \text{where } y_i = 1
$$

In [11]:
def cross_entropy(y_hat,y):
    '''
    Per-sample cross-entropy loss.

    input:
    y_hat: n x m float tensor of predicted probabilities, 0 <= y_hat <= 1
    y    : integer tensor with n class indices, 0 <= y < m
           (torch.gather requires an int64 index)
    output:
    n x 1 float tensor holding -log(y_hat[i, y[i]]) for each sample i
    '''
    # Pick, for every row, the probability assigned to its true class.
    picked = y_hat.gather(1, y.view(-1, 1))
    # A probability that underflowed to exactly 0 would give log(0) = -inf and
    # poison the loss and gradients. Clamping at the smallest positive normal
    # float leaves every ordinary probability untouched.
    floor = torch.finfo(y_hat.dtype).tiny
    return -torch.log(picked.clamp(min=floor))
# How gather works:
# torch.gather(a, axis, b) or a.gather(axis, b)
# a is the source tensor; b holds the indices to pick along `axis`
# the result is a tensor with the same shape as b

#### 计算分类准确率

In [12]:
def accuracy(y_hat, y):
    """Fraction of rows whose highest-scoring column equals the label.

    Args:
        y_hat: score tensor; the prediction for each row is its argmax over dim 1.
        y: tensor of labels compared element-wise with the predictions.

    Returns:
        The mean of the 0/1 match indicators as a Python float.
    """
    predicted = y_hat.argmax(dim=1)
    hits = predicted.eq(y).float()
    return hits.mean().item()

In [13]:
def evaluate_accuracy(data_iter, net):
    """Classification accuracy of ``net`` over every batch in ``data_iter``.

    Args:
        data_iter: iterable yielding ``(X, y)`` batches.
        net: callable mapping ``X`` to a score tensor; the prediction for each
            row is the argmax over dim 1.

    Returns:
        Number of correct predictions divided by the number of samples seen
        (a Python float).

    Raises:
        ZeroDivisionError: if ``data_iter`` yields no samples.
    """
    acc_sum, n = 0.0, 0
    # Evaluation never calls backward(), so disable autograd: the forward pass
    # then records no graph on the model parameters.
    with torch.no_grad():
        for X, y in data_iter:
            acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()
            n += y.shape[0]
    return acc_sum / n

#### 训练模型

In [14]:
def sgd(params, lr, batch_size):
    """One in-place mini-batch SGD step.

    Each parameter is updated as ``param -= lr * param.grad / batch_size``.

    Args:
        params: iterable of tensors to update.
        lr: learning rate (step size).
        batch_size: divisor applied to the gradient, turning the gradient of a
            summed loss into a per-sample average.

    Parameters whose ``grad`` is ``None`` (they took no part in the last
    backward pass) are left untouched instead of raising ``TypeError``.
    """
    # The update itself must not be recorded by autograd. no_grad() replaces
    # the older ``param.data -= ...`` idiom and keeps autograd's in-place
    # version tracking intact.
    with torch.no_grad():
        for param in params:
            if param.grad is None:
                continue
            param.sub_(lr * param.grad / batch_size)

In [42]:
# Training hyperparameters: number of passes over train_iter and the SGD
# learning rate; both are handed to train_ch3 in the training call below.
num_epochs, lr = 5, 0.1

def train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, params=None, lr=None, optimizer=None):
    """Train ``net`` and print loss / accuracy after every epoch.

    For each batch the per-sample losses are summed and back-propagated. The
    parameters are then updated either by ``optimizer.step()`` or, when no
    optimizer is given, by the notebook's ``sgd(params, lr, batch_size)``.

    Args:
        net: callable mapping a batch ``X`` to a score tensor.
        train_iter: iterable of ``(X, y)`` training batches.
        test_iter: iterable of ``(X, y)`` batches passed to ``evaluate_accuracy``.
        loss: callable ``loss(y_hat, y)``; its result is reduced with ``.sum()``.
        num_epochs: number of passes over ``train_iter``.
        batch_size: divisor forwarded to ``sgd`` (unused when ``optimizer`` is set).
        params: tensors updated by ``sgd``; required when ``optimizer`` is None.
        lr: learning rate for ``sgd``; required when ``optimizer`` is None.
        optimizer: optional object with ``zero_grad()`` and ``step()``; takes
            precedence over ``params`` / ``lr``.

    Returns:
        None. One line per epoch is printed with the mean training loss,
        training accuracy and test accuracy.

    Raises:
        ValueError: if neither an optimizer nor both ``params`` and ``lr`` are given.
    """
    # Fail fast: without this check the problem only surfaced inside sgd, as a
    # TypeError after a complete forward/backward pass.
    if optimizer is None and (params is None or lr is None):
        raise ValueError('train_ch3 needs either an optimizer or both params and lr')

    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
        for X, y in train_iter:
            y_hat = net(X)
            l = loss(y_hat, y).sum()

            # backward() accumulates into .grad, so clear stale gradients first.
            if optimizer is not None:
                optimizer.zero_grad()
            else:
                # Check every parameter individually: before the first backward
                # pass .grad is None, and looking only at params[0] would raise
                # on an empty list or skip zeroing the remaining parameters.
                for param in params:
                    if param.grad is not None:
                        param.grad.data.zero_()

            l.backward()
            if optimizer is None:
                sgd(params, lr, batch_size)
            else:
                optimizer.step()

            train_l_sum += l.item()
            # Accuracy uses the predictions made before this batch's update.
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().item()
            n += y.shape[0]

        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
              % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc))
        

In [43]:
# Train the model defined above: cross-entropy loss, manual SGD on W and b.
train_ch3(
    net,
    train_iter,
    test_iter,
    loss=cross_entropy,
    num_epochs=num_epochs,
    batch_size=batch_size,
    params=[W, b],
    lr=lr,
)

epoch 1, loss 0.5705, train acc 0.813, test acc 0.809
epoch 2, loss 0.5262, train acc 0.826, test acc 0.818
epoch 3, loss 0.5025, train acc 0.832, test acc 0.823
epoch 4, loss 0.4858, train acc 0.837, test acc 0.827
epoch 5, loss 0.4740, train acc 0.839, test acc 0.828
