多层感知机就是含有至少一个隐藏层的由全连接层组成的神经网络，且每个隐藏层的输出通过激活函数进行变换。

In [3]:
import torch
import torchvision
import torchvision.transforms as transforms
import numpy as np
import sys

# 获取和读取数据

In [4]:
# Mini-batch size used by both data loaders below.
batch_size = 256

# Fashion-MNIST train/test splits, read from a local directory (download=False);
# each sample is passed through transforms.ToTensor().
mnist_train = torchvision.datasets.FashionMNIST(
    root='./Datasets/FashionMNIST',
    train=True,
    download=False,
    transform=transforms.ToTensor(),
)
mnist_test = torchvision.datasets.FashionMNIST(
    root='./Datasets/FashionMNIST',
    train=False,
    download=False,
    transform=transforms.ToTensor(),
)

# No loader worker processes on platforms whose name starts with 'win', four elsewhere.
# NOTE(review): presumably this avoids multi-process loading problems on Windows - confirm.
num_workers = 0 if sys.platform.startswith('win') else 4

# Only the training loader shuffles; the test loader keeps dataset order.
train_iter = torch.utils.data.DataLoader(
    mnist_train,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
)
test_iter = torch.utils.data.DataLoader(
    mnist_test,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers,
)

# 定义模型参数

In [6]:
# Layer widths: input features, output classes, hidden units.
num_inputs, num_outputs, num_hiddens = 784, 10, 256

# Weights are sampled with NumPy from a normal distribution (mean 0, std 0.01)
# and converted to float32 tensors; biases start at zero.
W1 = torch.from_numpy(np.random.normal(0, 0.01, (num_inputs, num_hiddens))).float()
b1 = torch.zeros(num_hiddens, dtype=torch.float32)
W2 = torch.from_numpy(np.random.normal(0, 0.01, (num_hiddens, num_outputs))).float()
b2 = torch.zeros(num_outputs, dtype=torch.float32)

# Every tensor in this list is trainable, so switch on gradient tracking for each.
params = [W1, b1, W2, b2]
for param in params:
    param.requires_grad_(True)

# 定义激活函数

In [8]:
def relu(X):
    """Return the element-wise maximum of ``X`` and a scalar zero tensor (ReLU)."""
    zero = torch.tensor(0.0)
    return torch.max(X, zero)

# 定义模型

In [9]:
def net(X):
    """Forward pass of the one-hidden-layer perceptron.

    ``X`` is viewed as rows of ``num_inputs`` features, passed through the
    hidden affine layer (``W1``, ``b1``) followed by ``relu``, and then through
    the output affine layer (``W2``, ``b2``). The output-layer result is
    returned with no further activation applied.
    """
    flat = X.view(-1, num_inputs)
    hidden = relu(flat @ W1 + b1)
    return hidden @ W2 + b2

# 定义损失函数

In [10]:
# Loss criterion passed to the training loop: torch.nn.CrossEntropyLoss built
# with its default arguments.
# NOTE(review): the default reduction is expected to average over the batch -
# confirm against the installed torch version, since the training loop and the
# hand-written sgd apply their own normalisation as well.
loss=torch.nn.CrossEntropyLoss()

# 定义优化算法

In [13]:
def sgd(params,lr,batch_size):
    """Apply one in-place mini-batch SGD step to every tensor in ``params``.

    Each parameter is decreased by ``lr * param.grad / batch_size``.

    Args:
        params: iterable of tensors whose ``.grad`` has been populated.
        lr: learning rate (step size).
        batch_size: divisor applied to the gradient before the step.

    Parameters whose ``.grad`` is ``None`` (no gradient has been computed for
    them) are left untouched instead of raising ``TypeError``.
    """
    # The update itself must not be recorded by autograd; ``torch.no_grad()``
    # is the supported way to do that, replacing the legacy ``.data`` access.
    with torch.no_grad():
        for param in params:
            if param.grad is None:
                continue
            param.sub_(lr * param.grad / batch_size)

# 训练模型

In [11]:
def evaluate_accuracy(data_iter,net):
    """Return the fraction of samples in ``data_iter`` that ``net`` classifies correctly.

    Args:
        data_iter: iterable yielding ``(X, y)`` batches; ``y`` is compared
            element-wise with the predicted class indices and its first
            dimension is taken as the number of samples in the batch.
        net: callable mapping ``X`` to scores with the classes along dim 1.

    Returns:
        Number of correct predictions divided by the number of samples seen.

    Raises:
        ZeroDivisionError: if ``data_iter`` yields no samples.
    """
    acc_sum, n = 0.0, 0
    # Evaluation never backpropagates, so do not build an autograd graph for
    # the forward passes.
    with torch.no_grad():
        for X, y in data_iter:
            acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()
            n += y.shape[0]
    return acc_sum / n

In [14]:
def train_ch3(net,train_iter,test_iter,loss,num_epochs,batch_size,params=None,lr=None,optimizer=None):
    """Train ``net`` for ``num_epochs`` epochs and print metrics after each one.

    Args:
        net: callable mapping a batch ``X`` to class scores (classes on dim 1).
        train_iter: iterable of ``(X, y)`` training batches, re-iterated every epoch.
        test_iter: iterable of ``(X, y)`` batches passed to ``evaluate_accuracy``.
        loss: callable ``loss(y_hat, y)``; its result is reduced with ``.sum()``.
        num_epochs: number of passes over ``train_iter``.
        batch_size: forwarded to ``sgd`` when no optimizer is supplied.
        params: tensors updated by ``sgd``; required when ``optimizer`` is None.
        lr: learning rate for ``sgd``; required when ``optimizer`` is None.
        optimizer: optional object exposing ``zero_grad()`` and ``step()``;
            when given, ``params`` and ``lr`` are not used.

    Raises:
        ValueError: if ``optimizer`` is None and ``params`` or ``lr`` is missing.

    The printed ``loss`` is the sum of the per-batch loss values divided by the
    number of training samples.
    NOTE(review): if ``loss`` already averages over the batch, that figure is a
    batch mean divided by the sample count again - confirm this is intended.
    """
    # Fail early with a clear message instead of a TypeError from inside sgd
    # after a full forward/backward pass.
    if optimizer is None and (params is None or lr is None):
        raise ValueError('train_ch3 requires either optimizer or both params and lr')

    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
        for X, y in train_iter:
            y_hat = net(X)
            l = loss(y_hat, y).sum()

            # Clear gradients left over from the previous step. Each parameter
            # is checked on its own, so one without a gradient yet cannot cause
            # an AttributeError and cannot hide stale gradients on the others.
            if optimizer is not None:
                optimizer.zero_grad()
            else:
                for param in params:
                    if param.grad is not None:
                        param.grad.data.zero_()

            l.backward()
            if optimizer is None:
                sgd(params, lr, batch_size)
            else:
                optimizer.step()

            train_l_sum += l.item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().item()
            n += y.shape[0]

        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d,loss %.4f,train acc %.3f,test acc %.3f'%(epoch+1,train_l_sum/n,train_acc_sum/n,test_acc))

In [15]:
# Run the training loop with the hand-written sgd (no optimizer object is passed).
# NOTE(review): sgd divides each gradient by batch_size; if the loss criterion
# also averages over the batch, the effective step is lr / batch_size, which
# would explain the unusually large learning rate - confirm.
num_epochs, lr = 5, 100
train_ch3(
    net,
    train_iter,
    test_iter,
    loss,
    num_epochs,
    batch_size,
    params=params,
    lr=lr,
)

epoch 1,loss 0.0030,train acc 0.715,test acc 0.731
epoch 2,loss 0.0019,train acc 0.822,test acc 0.799
epoch 3,loss 0.0017,train acc 0.846,test acc 0.837
epoch 4,loss 0.0015,train acc 0.854,test acc 0.844
epoch 5,loss 0.0015,train acc 0.863,test acc 0.859
