In [2]:
import torchvision
import torchvision.transforms as transforms
import time
import sys

In [34]:
from torch.nn import init

In [25]:
import torch
from matplotlib import pyplot as plt
import numpy as np
import random
import torch.nn as nn
%matplotlib

Using matplotlib backend: Qt5Agg


# 基础

## 线性回归

我们使用线性回归模型真实权重 w=[2,−3.4]⊤ 和偏差 b=4.2以及一个随机噪声项 ϵϵ 来生成标签   
y=Xw+b+ϵ

### 手写

In [86]:
# 生成1000个数据
torch.manual_seed(10)
num_inputs = 2
num_examples = 100
true_w = [2, -3.4]
true_b = 4.2
features = torch.randn(num_examples, num_inputs, dtype=torch.float32)
labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
labels += torch.tensor(np.random.normal(0, 0.1, size=labels.size()), dtype=torch.float32)

In [91]:
# 每次返回batch_size（批量⼤小）个随机样本的特征和标签
def data_iter(batch_size, features, labels):
    num_examples = len(features)
    indices = list(range(num_examples))
    random.shuffle(indices)  # 样本的读取顺序是随机的
    for i in range(0, num_examples, batch_size):
        j = torch.LongTensor(indices[i: min(i + batch_size, num_examples)]) # 最后一次可能不足一个batch
        yield  features.index_select(0, j), labels.index_select(0, j)

#定义模型
def linreg(X, w, b):  
    return torch.mm(X, w) + b

# 损失函数
def squared_loss(y_hat, y):  
    # print(y.view(y_hat.size()))
    return (y_hat - y.view(y_hat.size())) ** 2 / 2

def sgd(params, lr, batch_size):  # 本函数已保存在d2lzh_pytorch包中方便以后使用
    for param in params:
        param.data -= lr * param.grad / batch_size # 注意这里更改param时用的param.data

In [92]:
# 初始化参数
w = torch.tensor(np.random.normal(0, 0.01, (num_inputs, 1)), dtype=torch.float32, requires_grad=True)
b = torch.zeros(1, dtype=torch.float32, requires_grad=True)

lr = 0.03
num_epochs = 10
net = linreg
loss = squared_loss

for epoch in range(num_epochs):  # 训练模型一共需要num_epochs个迭代周期
    # 在每一个迭代周期中，会使用训练数据集中所有样本一次（假设样本数能够被批量大小整除）。X
    # 和y分别是小批量样本的特征和标签
    for X, y in data_iter(batch_size, features, labels):
        l = loss(net(X, w, b), y).sum()  # l是有关小批量X和y的损失
        l.backward()  # 小批量的损失对模型参数求梯度
        sgd([w, b], lr, batch_size)  # 使用小批量随机梯度下降迭代模型参数

        # 不要忘了梯度清零
        w.grad.data.zero_()
        b.grad.data.zero_()
    train_l = loss(net(features, w, b), labels)
    print('epoch %d, loss %f' % (epoch + 1, train_l.mean()))

epoch 1, loss 9.067955
epoch 2, loss 4.756135
epoch 3, loss 2.508253
epoch 4, loss 1.328605
epoch 5, loss 0.707993
epoch 6, loss 0.379959
epoch 7, loss 0.206142
epoch 8, loss 0.113420
epoch 9, loss 0.063855
epoch 10, loss 0.037072


In [93]:
print(true_w, '\n', w)
print(true_b, '\n', b)

[2, -3.4] 
 tensor([[ 1.8336],
        [-3.2457]], requires_grad=True)
4.2 
 tensor([4.0469], requires_grad=True)


## LR简洁实现

In [None]:
# 生成1000个数据
torch.manual_seed(10)
num_inputs = 2
num_examples = 100
true_w = [2, -3.4]
true_b = 4.2
features = torch.randn(num_examples, num_inputs, dtype=torch.float32)
labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
labels += torch.tensor(np.random.normal(0, 0.1, size=labels.size()), dtype=torch.float32)

### 读取数据

In [105]:
import torch.utils.data as Data

batch_size = 10
# 将训练数据的特征和标签组合
dataset = Data.TensorDataset(features, labels)
# 随机读取小批量
data_iter = Data.DataLoader(dataset, batch_size, shuffle=True)

In [99]:
# DataLoader(dataset, batch_size=1, shuffle=False, sampler=None,
#            batch_sampler=None, num_workers=0, collate_fn=None,
#            pin_memory=False, drop_last=False, timeout=0,
#            worker_init_fn=None, *, prefetch_factor=2,
#            persistent_workers=False)

### 定义一个nn

In [178]:
class LinearNet(nn.Module):
    def __init__(self, n_feature):
        super(LinearNet, self).__init__()
        self.linear = nn.Linear(n_feature, 1)
    # forward 定义前向传播
    def forward(self, x):
        y = self.linear(x)
        return y

net = LinearNet(num_inputs)
print(net) # 使用print可以打印出网络的结构

LinearNet(
  (linear): Linear(in_features=2, out_features=1, bias=True)
)


In [None]:
# 事实上我们还可以用nn.Sequential来更加方便地搭建网络，Sequential是一个有序的容器，
# 网络层将按照在传入Sequential的顺序依次被添加到计算图中。

# 写法一
net = nn.Sequential(
    nn.Linear(num_inputs, 1)
    # 此处还可以传入其他层
    )

# 写法二
net = nn.Sequential()
net.add_module('linear', nn.Linear(num_inputs, 1))
# net.add_module ......

In [176]:
# 可以通过net.parameters()来查看模型所有的可学习参数，此函数将返回一个生成器。
for param in net.parameters():
    print(param)

Parameter containing:
tensor([[-0.4271, -0.3423]], requires_grad=True)
Parameter containing:
tensor([0.5118], requires_grad=True)
Parameter containing:
tensor([[-0.7674]], requires_grad=True)
Parameter containing:
tensor([-0.0857], requires_grad=True)


### 初始化参数

在使用net前，我们需要初始化模型参数，如线性回归模型中的权重和偏差。PyTorch在init模块中提供了  
多种参数初始化方法。这里的init是initializer的缩写形式。我们通过init.normal_将权重参数每个元素初  
始化为随机采样于均值为0、标准差为0.01的正态分布。偏差会初始化为零。  

In [115]:
net.linear.bias

Parameter containing:
tensor([-0.0942], requires_grad=True)

In [140]:
from torch.nn import init

init.normal_(net.linear.weight, mean=0, std=0.1)
init.constant_(net.linear.bias, val=0)  # 也可以直接修改bias的data: net[0].bias.data.fill_(0)

Parameter containing:
tensor([0.], requires_grad=True)

### 损失函数

In [141]:
loss = nn.MSELoss()

### optim

同样，我们也无须自己实现小批量随机梯度下降算法。torch.optim模块提供了很多常用的优  
化算法比如SGD、Adam和RMSProp等。下面我们创建一个用于优化net所有参数的优化器实例，  
并指定学习率为0.03的小批量随机梯度下降（SGD）为优化算法。  

In [142]:
import torch.optim as optim

optimizer = optim.SGD(net.parameters(), lr=0.03)
print(optimizer)

SGD (
Parameter Group 0
    dampening: 0
    lr: 0.03
    momentum: 0
    nesterov: False
    weight_decay: 0
)


### 训练

In [143]:
num_epochs = 4
for epoch in range(1, num_epochs + 1):
    for X, y in data_iter:
        output = net(X)
        # print(output.shape)
        l = loss(output, y.view(output.size()))
        optimizer.zero_grad() # 梯度清零，等价于net.zero_grad()
        l.backward()
        optimizer.step()
    print('epoch %d, loss: %f' % (epoch, l.item()))

epoch 1, loss: 7.204216
epoch 2, loss: 3.043922
epoch 3, loss: 0.444130
epoch 4, loss: 0.328976


In [171]:
print(true_w, net.linear.weight)

[2, -3.4] Parameter containing:
tensor([[ 1.7104, -3.1337]], requires_grad=True)


In [172]:
print(true_b, net.linear.bias)

4.2 Parameter containing:
tensor([3.9224], requires_grad=True)


## softmax

In [87]:
mnist_train = torchvision.datasets.FashionMNIST(root='F:/Jupyter_note/DL_pytorch/FashionMNIST', train=True, download=True, transform=transforms.ToTensor())
mnist_test = torchvision.datasets.FashionMNIST(root='F:/Jupyter_note/DL_pytorch/FashionMNIST', train=False, download=True, transform=transforms.ToTensor())

In [14]:
print(len(mnist_train), len(mnist_test))

60000 10000


Fashion-MNIST中一共包括了10个类别，分别为t-shirt（T恤）、trouser（裤子）、pullover（套衫）、  
dress（连衣裙）、coat（外套）、sandal（凉鞋）、shirt（衬衫）、sneaker（运动鞋）、bag（包）和ankle boot（短靴）。  
以下函数可以将数值标签转成相应的文本标签。  

In [None]:
def get_fashion_mnist_labels(labels):
    text_labels = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat',
                   'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
    return [text_labels[int(i)] for i in labels]

### 批量读取

In [16]:
import torch.utils.data as Data

In [17]:
batch_size = 256
if sys.platform.startswith('win'):
    num_workers = 0  # 0表示不用额外的进程来加速读取数据
else:
    num_workers = 4
train_iter = Data.DataLoader(mnist_train, batch_size=batch_size, shuffle=True, num_workers=num_workers)
test_iter = Data.DataLoader(mnist_test, batch_size=batch_size, shuffle=False, num_workers=num_workers)

In [19]:
start = time.time()
for X, y in train_iter:
    continue
print('%.2f sec' % (time.time() - start))

5.00 sec


### 定义模型

在3.4节（softmax回归）中提到，softmax回归的输出层是一个全连接层，所以我们用一个线性模块就可以了。 
因为前面我们数据返回的每个batch样本x的形状为(batch_size, 1, 28, 28),   
所以我们要先用view()将x的形状转换成(batch_size, 784)才送入全连接层。  

In [84]:
num_inputs = 784
num_outputs = 10

class LinearNet(nn.Module):
    def __init__(self, num_inputs, num_outputs):
        super(LinearNet, self).__init__()
        self.linear = nn.Linear(num_inputs, num_outputs)
    def forward(self, x): # x shape: (batch, 1, 28, 28)
        y = self.linear(x.view(x.shape[0], -1))
        return y

net = LinearNet(num_inputs, num_outputs)

In [85]:
init.normal_(net.linear.weight, mean=0, std=0.1)
init.constant_(net.linear.bias, val=0) 
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr=0.1)

In [82]:
def evaluate_accuracy(data_iter, net):
    acc_sum, n = 0.0, 0
    for X, y in data_iter:
        acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()
        n += y.shape[0]
    return acc_sum / n

In [86]:
num_epochs = 5
for epoch in range(num_epochs):
    train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
    for X, y in train_iter:
        y_hat = net(X)
        l = loss(y_hat, y).sum()
        # 梯度清零
        optimizer.zero_grad()
        # 计算梯度并更新
        l.backward()
        optimizer.step()


        train_l_sum += l.item()
        train_acc_sum += (y_hat.argmax(dim=1) == y).sum().item()
        n += y.shape[0]
    test_acc = evaluate_accuracy(test_iter, net)
    print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
          % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc))

epoch 1, loss 0.0032, train acc 0.730, test acc 0.777
epoch 2, loss 0.0023, train acc 0.805, test acc 0.804
epoch 3, loss 0.0021, train acc 0.821, test acc 0.812
epoch 4, loss 0.0020, train acc 0.827, test acc 0.819
epoch 5, loss 0.0019, train acc 0.833, test acc 0.824


In [56]:
x = mnist_train[0][0]