In [1]:
import torch
# torchvision.datasets.FashionMNIST
import torchvision
# 修改数据集格式
from torchvision import transforms
# DataLoader
from torch.utils.data import DataLoader
# nn块
from torch import nn

# net模式

In [None]:
# Switch the model to evaluation mode
net.eval()

# Switch the model back to training mode
net.train()

# ------------------------------- usage example
# Only call train() when net really is an nn.Module instance
if isinstance(net, nn.Module):
    net.train()

In [None]:
# Draw a (100, 10) tensor from the standard normal distribution N(0, 1)
torch.normal(mean=0.0, std=1.0, size=(100, 10))

# 激活函数

## 修正线性单元（Rectified linear unit，ReLU）

$$ \operatorname{ReLU}(x) = \max(x, 0) $$

求导表现得特别好(要么是0,要么是1)：要么让参数消失，要么让参数通过

0点不可导, 不光滑

当使用不同的参数初始化方法时，ReLU激活函数使训练模型更加容易。 当sigmoid激活函数的输出非常接近于0或1时，这些区域的梯度几乎为0，因此反向传播无法继续更新一些模型参数。 相反，ReLU激活函数在正区间的梯度总是1。 因此，如果模型参数没有正确初始化，sigmoid函数可能在正区间内得到几乎为0的梯度，从而使模型无法得到有效的训练。

## sigmoid 挤压函数（squashing function）

$$ \operatorname{sigmoid}(x) = \frac{1}{1 + \exp(-x)} $$

$$ \frac{d}{dx} \operatorname{sigmoid}(x) = \frac{\exp(-x)}{(1 + \exp(-x))^2} = \operatorname{sigmoid}(x)\left(1-\operatorname{sigmoid}(x)\right) $$

它将范围（-inf, inf）中的任意输入压缩到区间（0, 1）中的某个值.

平滑的、可微的.

当输入为0时，sigmoid函数的导数达到最大值0.25； 而输入在任一方向上越远离0点时，导数越接近0。



## tanh函数
$$\operatorname{tanh}(x) = \frac{1 - \exp(-2x)}{1 + \exp(-2x)}$$

将其输入压缩转换到区间(-1, 1)

不同的是tanh函数关于坐标系原点中心对称。

$$\frac{d}{dx} \operatorname{tanh}(x) = 1 - \operatorname{tanh}^2(x)$$

当输入接近0时，tanh函数的导数接近最大值1。 与我们在sigmoid函数图像中看到的类似， 输入在任一方向上越远离0点，导数越接近0。

# loss

In [None]:
def init_weights(m):
    """Apply Xavier-uniform initialisation to the weight of a Linear/Conv2d layer.

    Written to be handed to ``net.apply``. Only modules whose exact type is
    ``nn.Linear`` or ``nn.Conv2d`` are touched (subclasses do not match);
    any other module is left as it is, and biases are never modified.

    Args:
        m: The module to (possibly) re-initialise in place.
    """
    if type(m) in (nn.Linear, nn.Conv2d):
        nn.init.xavier_uniform_(m.weight)


# Run init_weights on net and, recursively, on each of its submodules
net.apply(init_weights)
# Classification criterion: cross-entropy
loss = nn.CrossEntropyLoss()
# Plain SGD over all parameters of net, with the notebook-level learning rate lr
optimizer = torch.optim.SGD(params=net.parameters(), lr=lr)

In [None]:
# Both criteria take a `reduction` argument ('none' | 'mean' | 'sum');
# 'mean' is the default and is spelled out here.


# Cross-entropy loss
loss = nn.CrossEntropyLoss(reduction='mean')

# Mean squared error (this second assignment is the one that remains bound to `loss`)
loss = nn.MSELoss(reduction='mean')

> 均方误差

reduction的意思是维度要不要缩减，以及怎么缩减。

$$
\ell(x, y) = \begin{cases}
L,\ \text{where } L_i = (x_i - y_i)^2 & \text{if reduction='none'} \\
\operatorname{sum}(L) & \text{if reduction='sum'} \\
\operatorname{mean}(L) & \text{if reduction='mean'}
\end{cases}
$$

In [1]:
import torch
import torch.nn as nn

# Two small 2x2 float32 tensors to compare element by element
a = torch.arange(1, 5, dtype=torch.float32).reshape(2, 2)
b = torch.tensor([[3.0, 5.0], [8.0, 6.0]], dtype=torch.float32)

# reduction='none': keep every squared error, same shape as the inputs
loss_fn1 = nn.MSELoss(reduction='none')
loss1 = loss_fn1(a, b)
print(loss1)

# reduction='sum': add all squared errors into one scalar
loss_fn2 = nn.MSELoss(reduction='sum')
loss2 = loss_fn2(a, b)
print(loss2)

# reduction='mean': average the squared errors over all elements
loss_fn3 = nn.MSELoss(reduction='mean')
loss3 = loss_fn3(a, b)
print(loss3)

tensor([[ 4.,  9.],
        [25.,  4.]])
tensor(42.)
tensor(10.5000)


In [5]:
# Two random integer-valued tensors in [0, 9), cast to float32 (a is drawn before b)
a = torch.randint(low=0, high=9, size=(2, 2, 3)).to(torch.float32)
b = torch.randint(low=0, high=9, size=(2, 2, 3)).to(torch.float32)
print('a:', a)
print('b:', b)

# Element-wise squared errors, no reduction
loss_fn1 = nn.MSELoss(reduction='none')
loss1 = loss_fn1(a, b)
print('loss_none:', loss1)

# Sum of all squared errors
loss_fn2 = nn.MSELoss(reduction='sum')
loss2 = loss_fn2(a, b)
print('loss_sum:', loss2)

# Mean of all squared errors
loss_fn3 = nn.MSELoss(reduction='mean')
loss3 = loss_fn3(a, b)
print('loss_mean:', loss3)

a: tensor([[[3., 0., 0.],
         [6., 5., 4.]],

        [[0., 3., 7.],
         [7., 4., 2.]]])
b: tensor([[[6., 3., 7.],
         [2., 6., 4.]],

        [[0., 4., 2.],
         [5., 6., 2.]]])
loss_none: tensor([[[ 9.,  9., 49.],
         [16.,  1.,  0.]],

        [[ 0.,  1., 25.],
         [ 4.,  4.,  0.]]])
loss_sum: tensor(118.)
loss_mean: tensor(9.8333)


# train_loop

![](../image/eval.png)

In [None]:
# One optimisation step; the three calls must run in this order.
optimizer.zero_grad()  # reset the gradients left over from the previous step
train_loss.backward()  # backpropagation: compute gradients of train_loss
optimizer.step()       # update the parameters using those gradients

- 优化器

  测试网络时并不需要优化器，所以确实没必要把它写在外面，也能少传一个参数。

In [None]:
def train_loop(train_iter, net, loss, optimizer):
    """Train ``net`` for one pass over ``train_iter`` and return the mean batch loss.

    Args:
        train_iter: Sized iterable (``len()`` is called on it) yielding
            ``(X, y)`` batches, e.g. a DataLoader.
        net: Model, called as ``net(X)``.
        loss: Callable ``loss(y_hat, y)`` returning a scalar tensor.
        optimizer: Optimizer whose ``zero_grad``/``step`` are called once per batch.

    Returns:
        float: Sum of the per-batch loss values divided by the number of batches.

    Raises:
        ZeroDivisionError: If ``train_iter`` contains no batches.

    Note:
        Batches are moved to the notebook-level global ``device``, which has to
        be defined before this function is called.
    """
    # Evaluation code calls net.eval(); without switching back, every later
    # epoch would train with dropout / batch-norm in evaluation behaviour.
    # The isinstance guard keeps hand-written (non-Module) nets working.
    if isinstance(net, torch.nn.Module):
        net.train()

    # Number of batches in one pass
    num_batches = len(train_iter)
    # Running sum of the per-batch losses
    total_train_loss = 0
    for batch, (X, y) in enumerate(train_iter):
        # Move the batch to the target device
        X, y = X.to(device), y.to(device)
        # Forward pass for this batch
        y_hat = net(X)

        train_loss = loss(y_hat, y)
        total_train_loss += train_loss.item()

        # Backpropagation and parameter update
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        # Progress indicator, rewritten in place on a single line
        print(f"\r[{batch+1:>8d}/{num_batches:>8d}]  ", end='')

    return total_train_loss / num_batches

In [None]:
# --------- training: one train_loop pass per epoch, reporting the value it returns
for epoch in range(num_epochs):
    total_train_loss = train_loop(
        train_iter=train_iter, net=net, loss=loss, optimizer=optimizer)
    print(f'epoch {epoch + 1}, total_train_loss {total_train_loss:f}')

In [None]:
# ----------预测
def test_net(test_iter, net, loss):
    # 共有几批
    num_batchs = len(test_iter)
    # 总平均loss, 总平均准确率
    total_test_loss, total_correct = 0, 0
    # 设定评估模式
    net.eval()
    # 不要梯度
    with torch.no_grad():
        for batch, (X, y) in enumerate(test_iter):
            # move to device
            X, y = X.to(device), y.to(device)
            y_hat = net(X)

            test_loss = loss(y_hat, y)
            # 分类0,1,2,3的类别对的上否
            correct = (y_hat.argmax(1) == y).float().sum().item()
            total_test_loss += test_loss.item()
            total_correct += correct/len(X)

            # --------打印进度
            print(f"\r[{batch+1:>8d}/{num_batchs:>8d}]  ", end='')


    total_test_loss /= num_batchs
    total_correct /= num_batchs
    print(
        f"\nTest: Accuracy: {total_correct:.1%}, Avg loss: {total_test_loss:f}")
    
# Evaluate on the test set; the trailing ';' keeps the cell from echoing a return value
test_net(test_iter=test_iter, net=net, loss=loss);