In [2]:
import torchvision
import torchvision.transforms as transforms
import time
import sys

In [1]:
from torch.nn import init

In [1]:
import torch
from matplotlib import pyplot as plt
import numpy as np
import random
import torch.nn as nn
%matplotlib

ModuleNotFoundError: No module named 'matplotlib'

# pytorch

## 继承Module类来构造模型

In [10]:
class MLP(nn.Module):
    # 声明带有模型参数的层，这里声明了两个全连接层
    def __init__(self, **kwargs):
        # 调用MLP父类Module的构造函数来进行必要的初始化。这样在构造实例时还可以指定其他函数
        # 参数，如“模型参数的访问、初始化和共享”一节将介绍的模型参数params
        super(MLP, self).__init__(**kwargs)
        self.hidden = nn.Linear(5, 3) # 隐藏层
        self.act = nn.ReLU()
        self.output = nn.Linear(3, 1)  # 输出层


    # 定义模型的前向计算，即如何根据输入x计算返回所需要的模型输出
    def forward(self, x):
        a = self.act(self.hidden(x))
        return self.output(a)

## Sequential类

当模型的前向计算为简单串联各个层的计算时，Sequential类可以通过更加简单的方式定义模型。  
这正是Sequential类的目的：它可以接收一个子模块的有序字典（OrderedDict）或者一系列子模块作为参数来逐一添加Module的实例，  
而模型的前向计算就是将这些实例按添加的顺序逐一计算。  

下面我们实现一个与Sequential类有相同功能的MySequential类。这或许可以帮助读者更加清晰地理解Sequential类的工作机制。  

In [None]:
class MySequential(nn.Module):
    from collections import OrderedDict
    def __init__(self, *args):
        super(MySequential, self).__init__()
        if len(args) == 1 and isinstance(args[0], OrderedDict): # 如果传入的是一个OrderedDict
            for key, module in args[0].items():
                self.add_module(key, module)  # add_module方法会将module添加进self._modules(一个OrderedDict)
        else:  # 传入的是一些Module
            for idx, module in enumerate(args):
                self.add_module(str(idx), module)
    def forward(self, input):
        # self._modules返回一个 OrderedDict，保证会按照成员添加时的顺序遍历成员
        for module in self._modules.values():
            input = module(input)
        return input

In [None]:
net = MySequential(
        nn.Linear(784, 256),
        nn.ReLU(),
        nn.Linear(256, 10), 
        )
print(net)

+ 可以通过继承Module类来构造模型。  
+ Sequential、ModuleList、ModuleDict类都继承自Module类。  
+ 与Sequential不同，ModuleList和ModuleDict并没有定义一个完整的网络，它们只是将不同的模块存放在一起，需要自己定义forward函数。  
+ 虽然Sequential等类可以使模型构造更加简单，但直接继承Module类可以极大地拓展模型构造的灵活性。  

## 模型参数及初始化

In [3]:
net = nn.Sequential(nn.Linear(4, 3), nn.ReLU(), nn.Linear(3, 1))  # pytorch已进行默认初始化

In [17]:
net1 = MLP()

In [None]:
for name, param in net.named_parameters():
    print(name, param)

In [18]:
# 对weight进行正态初始化
for name, param in net.named_parameters():
    if 'weight' in name:
        init.normal_(param, mean=0, std=0.01)
        print(name, param.data)

0.weight tensor([[ 0.0017, -0.0090, -0.0178,  0.0006],
        [-0.0129,  0.0072, -0.0127,  0.0072],
        [ 0.0021, -0.0057, -0.0134,  0.0087]])
2.weight tensor([[ 0.0175, -0.0124, -0.0074]])


In [None]:
# 对bias进行0初始化
for name, param in net.named_parameters():
    if 'bias' in name:
        init.constant_(param, val=0)
        print(name, param.data)

# 基础

## 线性回归

我们使用线性回归模型真实权重 w=[2,−3.4]⊤ 和偏差 b=4.2以及一个随机噪声项 ϵϵ 来生成标签   
y=Xw+b+ϵ

### 手写

In [86]:
# 生成1000个数据
torch.manual_seed(10)
num_inputs = 2
num_examples = 100
true_w = [2, -3.4]
true_b = 4.2
features = torch.randn(num_examples, num_inputs, dtype=torch.float32)
labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
labels += torch.tensor(np.random.normal(0, 0.1, size=labels.size()), dtype=torch.float32)

In [91]:
# 每次返回batch_size（批量⼤小）个随机样本的特征和标签
def data_iter(batch_size, features, labels):
    num_examples = len(features)
    indices = list(range(num_examples))
    random.shuffle(indices)  # 样本的读取顺序是随机的
    for i in range(0, num_examples, batch_size):
        j = torch.LongTensor(indices[i: min(i + batch_size, num_examples)]) # 最后一次可能不足一个batch
        yield  features.index_select(0, j), labels.index_select(0, j)

#定义模型
def linreg(X, w, b):  
    return torch.mm(X, w) + b

# 损失函数
def squared_loss(y_hat, y):  
    # print(y.view(y_hat.size()))
    return (y_hat - y.view(y_hat.size())) ** 2 / 2

def sgd(params, lr, batch_size):  # 本函数已保存在d2lzh_pytorch包中方便以后使用
    for param in params:
        param.data -= lr * param.grad / batch_size # 注意这里更改param时用的param.data

In [92]:
# 初始化参数
w = torch.tensor(np.random.normal(0, 0.01, (num_inputs, 1)), dtype=torch.float32, requires_grad=True)
b = torch.zeros(1, dtype=torch.float32, requires_grad=True)

lr = 0.03
num_epochs = 10
net = linreg
loss = squared_loss

for epoch in range(num_epochs):  # 训练模型一共需要num_epochs个迭代周期
    # 在每一个迭代周期中，会使用训练数据集中所有样本一次（假设样本数能够被批量大小整除）。X
    # 和y分别是小批量样本的特征和标签
    for X, y in data_iter(batch_size, features, labels):
        l = loss(net(X, w, b), y).sum()  # l是有关小批量X和y的损失
        l.backward()  # 小批量的损失对模型参数求梯度
        sgd([w, b], lr, batch_size)  # 使用小批量随机梯度下降迭代模型参数

        # 不要忘了梯度清零
        w.grad.data.zero_()
        b.grad.data.zero_()
    train_l = loss(net(features, w, b), labels)
    print('epoch %d, loss %f' % (epoch + 1, train_l.mean()))

epoch 1, loss 9.067955
epoch 2, loss 4.756135
epoch 3, loss 2.508253
epoch 4, loss 1.328605
epoch 5, loss 0.707993
epoch 6, loss 0.379959
epoch 7, loss 0.206142
epoch 8, loss 0.113420
epoch 9, loss 0.063855
epoch 10, loss 0.037072


In [93]:
print(true_w, '\n', w)
print(true_b, '\n', b)

[2, -3.4] 
 tensor([[ 1.8336],
        [-3.2457]], requires_grad=True)
4.2 
 tensor([4.0469], requires_grad=True)


## LR简洁实现

In [None]:
# 生成1000个数据
torch.manual_seed(10)
num_inputs = 2
num_examples = 100
true_w = [2, -3.4]
true_b = 4.2
features = torch.randn(num_examples, num_inputs, dtype=torch.float32)
labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
labels += torch.tensor(np.random.normal(0, 0.1, size=labels.size()), dtype=torch.float32)

### 读取数据

In [105]:
import torch.utils.data as Data

batch_size = 10
# 将训练数据的特征和标签组合
dataset = Data.TensorDataset(features, labels)
# 随机读取小批量
data_iter = Data.DataLoader(dataset, batch_size, shuffle=True)

In [99]:
# DataLoader(dataset, batch_size=1, shuffle=False, sampler=None,
#            batch_sampler=None, num_workers=0, collate_fn=None,
#            pin_memory=False, drop_last=False, timeout=0,
#            worker_init_fn=None, *, prefetch_factor=2,
#            persistent_workers=False)

### 定义一个nn

In [178]:
class LinearNet(nn.Module):
    def __init__(self, n_feature):
        super(LinearNet, self).__init__()
        self.linear = nn.Linear(n_feature, 1)
    # forward 定义前向传播
    def forward(self, x):
        y = self.linear(x)
        return y

net = LinearNet(num_inputs)
print(net) # 使用print可以打印出网络的结构

LinearNet(
  (linear): Linear(in_features=2, out_features=1, bias=True)
)


In [None]:
# 事实上我们还可以用nn.Sequential来更加方便地搭建网络，Sequential是一个有序的容器，
# 网络层将按照在传入Sequential的顺序依次被添加到计算图中。

# 写法一
net = nn.Sequential(
    nn.Linear(num_inputs, 1)
    # 此处还可以传入其他层
    )

# 写法二
net = nn.Sequential()
net.add_module('linear', nn.Linear(num_inputs, 1))
# net.add_module ......

In [176]:
# 可以通过net.parameters()来查看模型所有的可学习参数，此函数将返回一个生成器。
for param in net.parameters():
    print(param)

Parameter containing:
tensor([[-0.4271, -0.3423]], requires_grad=True)
Parameter containing:
tensor([0.5118], requires_grad=True)
Parameter containing:
tensor([[-0.7674]], requires_grad=True)
Parameter containing:
tensor([-0.0857], requires_grad=True)


### 初始化参数

在使用net前，我们需要初始化模型参数，如线性回归模型中的权重和偏差。PyTorch在init模块中提供了  
多种参数初始化方法。这里的init是initializer的缩写形式。我们通过init.normal_将权重参数每个元素初  
始化为随机采样于均值为0、标准差为0.01的正态分布。偏差会初始化为零。  

In [115]:
net.linear.bias

Parameter containing:
tensor([-0.0942], requires_grad=True)

In [140]:
from torch.nn import init

init.normal_(net.linear.weight, mean=0, std=0.1)
init.constant_(net.linear.bias, val=0)  # 也可以直接修改bias的data: net[0].bias.data.fill_(0)

Parameter containing:
tensor([0.], requires_grad=True)

### 损失函数

In [141]:
loss = nn.MSELoss()

### optim

同样，我们也无须自己实现小批量随机梯度下降算法。torch.optim模块提供了很多常用的优  
化算法比如SGD、Adam和RMSProp等。下面我们创建一个用于优化net所有参数的优化器实例，  
并指定学习率为0.03的小批量随机梯度下降（SGD）为优化算法。  

In [142]:
import torch.optim as optim

optimizer = optim.SGD(net.parameters(), lr=0.03)
print(optimizer)

SGD (
Parameter Group 0
    dampening: 0
    lr: 0.03
    momentum: 0
    nesterov: False
    weight_decay: 0
)


### 训练

In [143]:
num_epochs = 4
for epoch in range(1, num_epochs + 1):
    for X, y in data_iter:
        output = net(X)
        # print(output.shape)
        l = loss(output, y.view(output.size()))
        optimizer.zero_grad() # 梯度清零，等价于net.zero_grad()
        l.backward()
        optimizer.step()
    print('epoch %d, loss: %f' % (epoch, l.item()))

epoch 1, loss: 7.204216
epoch 2, loss: 3.043922
epoch 3, loss: 0.444130
epoch 4, loss: 0.328976


In [171]:
print(true_w, net.linear.weight)

[2, -3.4] Parameter containing:
tensor([[ 1.7104, -3.1337]], requires_grad=True)


In [172]:
print(true_b, net.linear.bias)

4.2 Parameter containing:
tensor([3.9224], requires_grad=True)


## softmax

In [87]:
mnist_train = torchvision.datasets.FashionMNIST(root='F:/Jupyter_note/DL_pytorch/FashionMNIST', train=True, download=True, transform=transforms.ToTensor())
mnist_test = torchvision.datasets.FashionMNIST(root='F:/Jupyter_note/DL_pytorch/FashionMNIST', train=False, download=True, transform=transforms.ToTensor())

In [14]:
print(len(mnist_train), len(mnist_test))

60000 10000


Fashion-MNIST中一共包括了10个类别，分别为t-shirt（T恤）、trouser（裤子）、pullover（套衫）、  
dress（连衣裙）、coat（外套）、sandal（凉鞋）、shirt（衬衫）、sneaker（运动鞋）、bag（包）和ankle boot（短靴）。  
以下函数可以将数值标签转成相应的文本标签。  

In [None]:
def get_fashion_mnist_labels(labels):
    text_labels = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat',
                   'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
    return [text_labels[int(i)] for i in labels]

### 批量读取

In [16]:
import torch.utils.data as Data

In [17]:
batch_size = 256
if sys.platform.startswith('win'):
    num_workers = 0  # 0表示不用额外的进程来加速读取数据
else:
    num_workers = 4
train_iter = Data.DataLoader(mnist_train, batch_size=batch_size, shuffle=True, num_workers=num_workers)
test_iter = Data.DataLoader(mnist_test, batch_size=batch_size, shuffle=False, num_workers=num_workers)

In [19]:
start = time.time()
for X, y in train_iter:
    continue
print('%.2f sec' % (time.time() - start))

5.00 sec


### 定义模型

在3.4节（softmax回归）中提到，softmax回归的输出层是一个全连接层，所以我们用一个线性模块就可以了。 
因为前面我们数据返回的每个batch样本x的形状为(batch_size, 1, 28, 28),   
所以我们要先用view()将x的形状转换成(batch_size, 784)才送入全连接层。  

In [84]:
num_inputs = 784
num_outputs = 10

class LinearNet(nn.Module):
    def __init__(self, num_inputs, num_outputs):
        super(LinearNet, self).__init__()
        self.linear = nn.Linear(num_inputs, num_outputs)
    def forward(self, x): # x shape: (batch, 1, 28, 28)
        y = self.linear(x.view(x.shape[0], -1))
        return y

net = LinearNet(num_inputs, num_outputs)

In [85]:
init.normal_(net.linear.weight, mean=0, std=0.1)
init.constant_(net.linear.bias, val=0) 
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr=0.1)

In [82]:
def evaluate_accuracy(data_iter, net):
    acc_sum, n = 0.0, 0
    for X, y in data_iter:
        acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()
        n += y.shape[0]
    return acc_sum / n

In [86]:
num_epochs = 5
for epoch in range(num_epochs):
    train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
    for X, y in train_iter:
        y_hat = net(X)
        l = loss(y_hat, y).sum()
        # 梯度清零
        optimizer.zero_grad()
        # 计算梯度并更新
        l.backward()
        optimizer.step()


        train_l_sum += l.item()
        train_acc_sum += (y_hat.argmax(dim=1) == y).sum().item()
        n += y.shape[0]
    test_acc = evaluate_accuracy(test_iter, net)
    print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
          % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc))

epoch 1, loss 0.0032, train acc 0.730, test acc 0.777
epoch 2, loss 0.0023, train acc 0.805, test acc 0.804
epoch 3, loss 0.0021, train acc 0.821, test acc 0.812
epoch 4, loss 0.0020, train acc 0.827, test acc 0.819
epoch 5, loss 0.0019, train acc 0.833, test acc 0.824


### 多层感知机

In [15]:
import torchvision
import torchvision.transforms as transforms
from torch.nn import init
import torch
import numpy as np
import random
import torch.nn as nn
import torch.utils.data as Data

In [16]:
def evaluate_accuracy(data_iter, net):
    acc_sum, n = 0.0, 0
    for X, y in data_iter:
        acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()
        n += y.shape[0]
    return acc_sum / n

def get_data_iter():
    # 读取数据
    mnist_train = torchvision.datasets.FashionMNIST(root='F:/Jupyter_note/DL_pytorch/FashionMNIST',
                                                    train=True, download=True, transform=transforms.ToTensor())
    mnist_test = torchvision.datasets.FashionMNIST(root='F:/Jupyter_note/DL_pytorch/FashionMNIST',
                                                   train=False, download=True, transform=transforms.ToTensor())
    # 生成iter
    batch_size = 256
    num_workers = 0  # 0表示不用额外的进程来加速读取数据
    train_iter = Data.DataLoader(mnist_train, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    test_iter = Data.DataLoader(mnist_test, batch_size=batch_size, shuffle=False, num_workers=num_workers)
    return train_iter, test_iter

In [42]:
class Net(nn.Module):
    def __init__(self, num_inputs, num_outputs, num_hiddens):
        super(Net, self).__init__()
        self.linear_1 = nn.Linear(num_inputs, num_hiddens)
        self.ReLU = nn.ReLU()
        self.linear_2 = nn.Linear(num_hiddens, num_outputs)
        
    def forward(self, x): # x shape: (batch, 1, 28, 28)
        y = x.view(x.shape[0], -1)
        y = self.linear_1(y)
        y = self.ReLU(y)
        y = self.linear_2(y)
        return y

In [50]:
num_inputs, num_outputs, num_hiddens = 784, 10, 256
net = Net(num_inputs, num_outputs, num_hiddens)
for params in net.parameters():
    init.normal_(params, mean=0, std=0.1)

In [53]:
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr=0.5)

In [17]:
train_iter, test_iter = get_data_iter()

In [54]:
num_epochs = 5
for epoch in range(num_epochs):
    train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
    for X, y in train_iter:
        y_hat = net(X)
        l = loss(y_hat, y).sum()
        # 梯度清零
        optimizer.zero_grad()
        # 计算梯度并更新
        l.backward()
        optimizer.step()
        # 计算预测acc
        train_l_sum += l.item()
        train_acc_sum += (y_hat.argmax(dim=1) == y).sum().item()
        n += y.shape[0]
    test_acc = evaluate_accuracy(test_iter, net)
    print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
          % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc))

epoch 1, loss 0.0034, train acc 0.732, test acc 0.808
epoch 2, loss 0.0018, train acc 0.830, test acc 0.818
epoch 3, loss 0.0016, train acc 0.847, test acc 0.846
epoch 4, loss 0.0015, train acc 0.862, test acc 0.829
epoch 5, loss 0.0014, train acc 0.869, test acc 0.828


# 卷积神经网络

## Lenet

In [26]:
import torchvision
import torchvision.transforms as transforms

In [24]:
import torch.utils.data as Data

In [1]:
import time
import torch
from torch import nn, optim

In [67]:
def evaluate_accuracy(data_iter, net):
    acc_sum, n = 0.0, 0
    net.eval()
    for X, y in data_iter:
        X = X.to(device)
        y = y.to(device)
        acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()
        n += y.shape[0]
    net.train()
    return acc_sum / n

def get_data_iter():
    # 读取数据
    mnist_train = torchvision.datasets.FashionMNIST(root='F:/Jupyter_note/DL_pytorch/FashionMNIST',
                                                    train=True, download=True, transform=transforms.ToTensor())
    mnist_test = torchvision.datasets.FashionMNIST(root='F:/Jupyter_note/DL_pytorch/FashionMNIST',
                                                   train=False, download=True, transform=transforms.ToTensor())
    # 生成iter
    batch_size = 256
    num_workers = 0  # 0表示不用额外的进程来加速读取数据
    train_iter = Data.DataLoader(mnist_train, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    test_iter = Data.DataLoader(mnist_test, batch_size=batch_size, shuffle=False, num_workers=num_workers)
    return train_iter, test_iter

In [97]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
class LeNet(nn.Module):
    def __init__(self):
        super(LeNet, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(1, 6, 5), # in_channels, out_channels, kernel_size
            nn.Sigmoid(),
            nn.MaxPool2d(2, 2), # kernel_size, stride
            nn.Conv2d(6, 16, 5),
            nn.Sigmoid(),
            nn.MaxPool2d(2, 2)
        )
        self.fc = nn.Sequential(
            nn.Linear(16*4*4, 120),
            nn.Sigmoid(),
            nn.Linear(120, 84),
            nn.Sigmoid(),
            nn.Linear(84, 10)
        )

    def forward(self, img):# img (batch, 28, 28)
        feature = self.conv(img)# feature (batch, 16, 4, 4)
        output = self.fc(feature.view(img.shape[0], -1))
        return output

In [98]:
net = LeNet()
net.to(device)
print(net)

LeNet(
  (conv): Sequential(
    (0): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
    (1): Sigmoid()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
    (4): Sigmoid()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc): Sequential(
    (0): Linear(in_features=256, out_features=120, bias=True)
    (1): Sigmoid()
    (2): Linear(in_features=120, out_features=84, bias=True)
    (3): Sigmoid()
    (4): Linear(in_features=84, out_features=10, bias=True)
  )
)


In [99]:
for name, param in net.named_parameters():
    print(name, param)
    break

conv.0.weight Parameter containing:
tensor([[[[-0.0447,  0.1498,  0.1664, -0.1300, -0.1933],
          [-0.0044, -0.0787,  0.0704, -0.1642, -0.0604],
          [-0.0275,  0.0372,  0.0364, -0.1809,  0.0630],
          [-0.1100,  0.0910, -0.0143, -0.1816,  0.1740],
          [ 0.1726, -0.0012,  0.0366, -0.1901,  0.0428]]],


        [[[-0.0514, -0.0151, -0.0135,  0.0042, -0.1135],
          [ 0.1302, -0.0471,  0.1057, -0.1515,  0.1249],
          [-0.0178,  0.0595,  0.0802,  0.1590, -0.1888],
          [-0.1518, -0.1957, -0.1996, -0.1830, -0.1277],
          [-0.1735,  0.0279,  0.0448, -0.0790, -0.0608]]],


        [[[-0.1874,  0.0503,  0.0393,  0.0466,  0.0062],
          [ 0.0718,  0.0105,  0.1346,  0.0446, -0.0668],
          [ 0.1350,  0.1670,  0.1149,  0.0852, -0.0159],
          [ 0.1259, -0.0072, -0.0493,  0.0628, -0.0880],
          [ 0.1495, -0.0791, -0.1339,  0.1067, -0.0509]]],


        [[[ 0.1717,  0.1382,  0.0347,  0.1180,  0.1857],
          [ 0.0899,  0.0757,  0.0455,  0

In [83]:
for name, param in net.named_parameters():
    print(name, param)
    break

conv.0.weight Parameter containing:
tensor([[[[ 0.1408, -0.1915, -0.0606,  0.1904,  0.1267],
          [-0.1221,  0.0621, -0.1017, -0.1557,  0.0313],
          [ 0.1557,  0.1507,  0.0728,  0.0564, -0.0363],
          [ 0.1698, -0.0343,  0.0551,  0.1881,  0.1608],
          [ 0.1054, -0.0694,  0.1368, -0.1188,  0.0069]]],


        [[[ 0.0129,  0.1824,  0.1115,  0.1559, -0.0152],
          [ 0.0184, -0.0912, -0.1593, -0.0969, -0.0770],
          [-0.1178,  0.1054,  0.0763, -0.1982,  0.1377],
          [-0.1782,  0.1849,  0.1206, -0.1703,  0.0546],
          [ 0.0590, -0.0479,  0.0069, -0.0824, -0.1583]]],


        [[[-0.1276,  0.1847,  0.1320, -0.1566, -0.1842],
          [-0.1012,  0.0070, -0.0594, -0.0676, -0.1713],
          [-0.1644,  0.1918, -0.1226, -0.0124, -0.1112],
          [ 0.0790, -0.1671,  0.1654,  0.1232, -0.0453],
          [-0.1147,  0.0073, -0.1968, -0.0542,  0.0527]]],


        [[[-0.0663,  0.0003,  0.1822, -0.0942, -0.0629],
          [ 0.0748, -0.1549, -0.0485, -0

In [44]:
train_iter, test_iter = get_data_iter()

In [100]:
optimizer = torch.optim.Adam(net.parameters(), lr=0.01)
loss = nn.CrossEntropyLoss()

In [46]:
import time 

In [101]:
num_epochs = 2

start = time.time()
for epoch in range(num_epochs):
    for X, y in train_iter:
        X = X.to(device)
        y = y.to(device)
        y_hat = net(X)
        l = loss(y_hat, y).sum()
        # 梯度清零
        optimizer.zero_grad()
        # 计算梯度并更新
        l.backward()
        optimizer.step()
    test_acc = evaluate_accuracy(test_iter, net)
    print('epoch %d, test acc %.3f, time %d'
          % (epoch + 1, test_acc, time.time() - start))

epoch 1, test acc 0.746, time 5
epoch 2, test acc 0.812, time 11
