<a href="https://colab.research.google.com/github/veager/StudyNotes/blob/new/Codes/PyTorch-Tutorial/PyTorch-torch.optim%E6%A8%A1%E5%9D%97-%E4%BC%98%E5%8C%96%E5%99%A8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

PyTorch `torch.optim` 模块 优化器

参考资料：

- 博客：Pytorch torch.optim 模块 优化器, [site](https://www.cnblogs.com/veager/articles/16305151.html)

- Github：Codes/PyTorch-Tutorial/PyTorch-torch.optim模块-优化器.ipynb

# 0 加载数据

In [1]:
import time
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F

## 0.1 加载 Diabetes（糖尿病）数据集

In [2]:
from sklearn.datasets import load_diabetes
from sklearn.preprocessing import MinMaxScaler

# Load the diabetes dataset and pull out the feature matrix and the target vector.
data = load_diabetes()
X, Y = data.data, data.target

# Min-max scale inputs and target to [0, 1].
# The target is reshaped to a single column before it is handed to its scaler.
scaler_X = MinMaxScaler().fit(X)
scaler_Y = MinMaxScaler().fit(Y.reshape(-1, 1))

Xs = scaler_X.transform(X)
Ys = scaler_Y.transform(Y.reshape(-1, 1))

# Sanity checks: shapes, first five rows, summary statistics.
print(Xs.shape, Ys.shape)
print(Xs[:5], Ys[:5])
print(pd.DataFrame(Xs).describe(), pd.DataFrame(Ys).describe())

# Convert the scaled arrays to float32 tensors.
Xs_tensor = torch.tensor(Xs, dtype=torch.float32)
Ys_tensor = torch.tensor(Ys, dtype=torch.float32)

(442, 10) (442, 1)
[[0.66666667 1.         0.58264463 0.54929577 0.29411765 0.25697211
  0.20779221 0.28208745 0.56221737 0.43939394]
 [0.48333333 0.         0.14876033 0.35211268 0.42156863 0.30677291
  0.62337662 0.14104372 0.22244301 0.16666667]
 [0.88333333 1.         0.51652893 0.43661972 0.28921569 0.25896414
  0.24675325 0.28208745 0.49658437 0.40909091]
 [0.08333333 0.         0.30165289 0.30985915 0.49509804 0.44721116
  0.23376623 0.42313117 0.57293604 0.46969697]
 [0.51666667 0.         0.20661157 0.54929577 0.46568627 0.41733068
  0.38961039 0.28208745 0.36236911 0.33333333]] [[0.39252336]
 [0.15576324]
 [0.36137072]
 [0.56386293]
 [0.34267913]]
                0           1           2           3           4           5  \
count  442.000000  442.000000  442.000000  442.000000  442.000000  442.000000   
mean     0.491968    0.468326    0.346107    0.459818    0.451668    0.367725   
std      0.218484    0.499561    0.182567    0.194806    0.169647    0.151460   
min      0

## 0.2 全局参数设置

In [3]:
# --- Dataset sizes used for the train / valid / test split ---
N_SAMPLE = Xs_tensor.shape[0]               # total number of samples
N_TRAIN = int(N_SAMPLE * 0.7)               # number of training samples
N_VALID = int(N_SAMPLE * 0.2)               # number of validation samples
N_TEST = N_SAMPLE - (N_TRAIN + N_VALID)     # whatever is left is the test set


# --- Training hyper-parameters ---
BATCH_SIZE = 64
N_EPOCH = 1000
LEARNING_RATE = 0.05


# --- Network layer sizes ---
HIDDEN_DIM = 4
INPUT_DIM = Xs_tensor.shape[1]      # second dimension of the input tensor
OUTPUT_DIM = Ys_tensor.shape[1]     # second dimension of the target tensor
print("NN Structure:", INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM)


# --- Device: first CUDA device when available, otherwise the CPU ---
DEVICE = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
print("device:", DEVICE)

NN Structure: 10 4 1
device: cpu


## 0.3 创建结构化数据

In [4]:
from torch.utils.data import TensorDataset, Dataset, random_split, DataLoader
from torch.utils.data.dataloader import default_collate

# Seed for the train/valid/test partition, so the same split is produced on every run.
SPLIT_SEED = 0


def _collate_to_device(samples):
    """Collate a list of dataset samples into one batch and move it to DEVICE.

    Shared by all three DataLoaders below (replaces three identical lambdas).

    Args:
        samples: list of per-sample items handed over by the DataLoader.

    Returns:
        tuple with every element of the default-collated batch moved to DEVICE.
    """
    return tuple(t.to(DEVICE) for t in default_collate(samples))


# Wrap the input / target tensors in a Dataset
ds = TensorDataset(Xs_tensor, Ys_tensor)

# Split into training, validation and testing subsets.
# A seeded generator makes the partition reproducible.
# NOTE: the validation subset keeps its original (misspelled) name `ds_vaild`.
ds_train, ds_vaild, ds_test = random_split(
    ds,
    lengths=[N_TRAIN, N_VALID, N_TEST],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
print(len(ds_train), len(ds_vaild), len(ds_test))


# One DataLoader per subset; only the training loader is created with shuffle=True.
dl_train = DataLoader(ds_train, batch_size=BATCH_SIZE, shuffle=True,
                      collate_fn=_collate_to_device)

dl_valid = DataLoader(ds_vaild, batch_size=BATCH_SIZE,
                      collate_fn=_collate_to_device)

dl_test = DataLoader(ds_test, batch_size=BATCH_SIZE,
                     collate_fn=_collate_to_device)

309 88 45


## 0.4 定义模型

In [5]:
# Variant 2: every operation is expressed as a layer object (nn.Module subclass)
class BPNNModeler2(nn.Module):
    """Two-layer fully connected network with sigmoid activations.

    Pipeline: Linear(input_dim -> hidden_dim) -> Sigmoid ->
    Linear(hidden_dim -> output_dim) -> Sigmoid -> Flatten(0, -1).

    Args:
        input_dim: input feature count of the first linear layer.
        hidden_dim: output size of the first linear layer.
        output_dim: output size of the second linear layer.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()

        # Hidden layer: affine map followed by a sigmoid
        self.layer1_linear = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.layer1_sigmoid = nn.Sigmoid()

        # Output layer: affine map followed by a sigmoid
        self.layer2_linear = nn.Linear(in_features=hidden_dim, out_features=output_dim)
        self.layer2_sigmoid = nn.Sigmoid()

        # Flattens from dim 0 through the last dim, so the result is 1-D
        self.layer2_flattern = nn.Flatten(start_dim=0, end_dim=-1)

    def forward(self, x):
        """Run the forward pass and return the flattened output of the second layer."""
        hidden = self.layer1_sigmoid(self.layer1_linear(x))
        activated = self.layer2_sigmoid(self.layer2_linear(hidden))
        return self.layer2_flattern(activated)
# ==============================================================================

### 0.4.1 初始化模型

In [6]:
# Instantiate the network with the configured layer sizes and move it to DEVICE
model = BPNNModeler2(INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM).to(DEVICE)

## 0.5 定义损失函数

In [7]:
# Loss function: mean squared error
def loss_func(model_out, target, reduction='mean'):
    """Return the MSE loss between ``model_out`` and ``target``.

    Args:
        model_out: predictions produced by the model.
        target: reference values compared against ``model_out``.
        reduction: forwarded unchanged to ``F.mse_loss`` (defaults to 'mean').

    Returns:
        The value returned by ``F.mse_loss`` for the given reduction.
    """
    return F.mse_loss(input=model_out, target=target, reduction=reduction)

# 1 优化器：`torch.optim` 模块

## 1.1 优化器 `torch.optim.Optimizer` 类

### 1.1.1 主要参数

**实例**：传入参数组

In [8]:
# Single parameter group: all model parameters share the same options
optimizer_1 = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)

# Two parameter groups, passed as a list of dicts
optimizer_2 = torch.optim.SGD(
    [
        # Group 1: first linear layer, with its own 'lr'
        {'params': model.layer1_linear.parameters(), 'lr': 0.5},
        # Group 2: second linear layer, no per-group options (falls back to the global ones)
        {'params': model.layer2_linear.parameters()},
    ],
    # Keyword arguments outside the list are the optimizer-wide options
    lr=LEARNING_RATE,
)

# Print the optimizer state together with the options of each group
print(optimizer_2.state_dict())
# Output (group 2 shows lr = LEARNING_RATE):
# {'state': {}, 'param_groups': [{'lr': 0.5, ...},
#                                {'lr': 0.05, ...}]}

{'state': {}, 'param_groups': [{'lr': 0.5, 'momentum': 0, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'maximize': False, 'params': [0, 1]}, {'lr': 0.05, 'momentum': 0, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'maximize': False, 'params': [2, 3]}]}


### 1.1.2 主要方法

In [9]:
# Print state and per-group options of both optimizers, in definition order
for opt in (optimizer_1, optimizer_2):
    print(opt.state_dict())

{'state': {}, 'param_groups': [{'lr': 0.05, 'momentum': 0, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'maximize': False, 'params': [0, 1, 2, 3]}]}
{'state': {}, 'param_groups': [{'lr': 0.5, 'momentum': 0, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'maximize': False, 'params': [0, 1]}, {'lr': 0.05, 'momentum': 0, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'maximize': False, 'params': [2, 3]}]}


### 1.1.3 主要属性

In [10]:
# Optimizer-wide default options
print(optimizer_2.defaults)

# Parameter-group details: the live attribute, then the state_dict() view
print(optimizer_2.param_groups)
print(optimizer_2.state_dict()['param_groups'])

# Parameters stored under 'params' in the first group; their types are printed
first_group = optimizer_2.param_groups[0]
param_group_1 = first_group['params']
print(type(param_group_1), type(param_group_1[0]))

# Values of the first parameter of that group
print(param_group_1[0].data)

{'lr': 0.05, 'momentum': 0, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'maximize': False}
[{'params': [Parameter containing:
tensor([[ 0.0235,  0.2899,  0.0748, -0.0052, -0.0145,  0.0643,  0.0654,  0.0825,
         -0.2345,  0.2105],
        [-0.1453, -0.1634, -0.2978, -0.0543, -0.2249,  0.1285,  0.0708, -0.0051,
         -0.1920,  0.2287],
        [ 0.2997, -0.2347,  0.2108,  0.2138, -0.0253,  0.3065, -0.2477, -0.1689,
          0.3011,  0.2740],
        [ 0.1893, -0.0620, -0.0113,  0.1808, -0.0836, -0.0066, -0.2588, -0.1311,
         -0.1181,  0.1099]], requires_grad=True), Parameter containing:
tensor([-0.2799, -0.1103, -0.0127, -0.0850], requires_grad=True)], 'lr': 0.5, 'momentum': 0, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'maximize': False}, {'params': [Parameter containing:
tensor([[ 0.4514, -0.3771,  0.2805,  0.4148]], requires_grad=True), Parameter containing:
tensor([0.4143], requires_grad=True)], 'lr': 0.05, 'momentum': 0, 'dampening': 0, 'weight_dec

## 1.2 优化实例

### 1.2.1 优化器和模型的`.zero_grad()`方法使用区别


**实例 1**： 网络模型训练实例，使用 `optimizer.zero_grad()` 方法



In [11]:
# Step 1: build the optimizer over all model parameters
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)

for epoch in range(N_EPOCH):

    # Running sum of the per-batch losses of this epoch
    train_total_loss = 0.0

    for i, (X_batch, Y_batch) in enumerate(dl_train):

        # Step 2: reset gradients through the optimizer
        optimizer.zero_grad()

        # Forward pass; the target is flattened before computing the loss
        out = model(X_batch)
        loss = loss_func(out, torch.flatten(Y_batch))

        # Step 3: back-propagation computes the gradients
        loss.backward()
        # Step 4: one optimization step updates the parameters
        optimizer.step()

        train_total_loss = train_total_loss + loss.item()


**实例 2**：使用梯度下降更新参数，使用 `model.zero_grad()` 方法

In [12]:
for epoch in range(N_EPOCH):

    # Running sum of the per-batch losses of this epoch
    train_total_loss = 0.0

    for i, (X_batch, Y_batch) in enumerate(dl_train):

        # Step 1: reset gradients through the model itself (no optimizer involved)
        model.zero_grad()

        # Forward pass; the target is flattened before computing the loss
        out = model(X_batch)
        loss = loss_func(out, torch.flatten(Y_batch))

        # Step 2: back-propagation computes the gradients
        loss.backward()

        # Step 3: plain gradient-descent update, done in place with autograd disabled
        with torch.no_grad():
            for param in model.parameters():
                param.sub_(LEARNING_RATE * param.grad)

        train_total_loss = train_total_loss + loss.item()

### 1.2.2 部分参数训练

In [13]:
# Settings for this cell.
# NOTE: the rate is stored under the name LEANING_RATE (sic); LEARNING_RATE is not modified here.
LEANING_RATE = 0.1
N_EPOCH = 10

# Snapshot of the parameters before training, used as the reference below
w1, b1 = (p.data.clone() for p in (model.layer1_linear.weight, model.layer1_linear.bias))
w2, b2 = (p.data.clone() for p in (model.layer2_linear.weight, model.layer2_linear.bias))

# Only the parameters of the second linear layer are handed to the optimizer
param = model.layer2_linear.parameters()

# Step 1: build the optimizer
# NOTE(review): layer1 is not registered with this optimizer, so optimizer.zero_grad()
# presumably leaves layer1's .grad untouched while loss.backward() still fills it -
# confirm against the torch.optim documentation.
optimizer = torch.optim.SGD(param, lr=LEANING_RATE)

for epoch in range(N_EPOCH):
    train_total_loss = 0.0
    for i, (X_batch, Y_batch) in enumerate(dl_train):

        # Step 2: reset gradients
        optimizer.zero_grad()

        out = model(X_batch)
        loss = loss_func(out, torch.flatten(Y_batch))

        # Step 3: back-propagation computes the gradients
        loss.backward()
        # Step 4: one optimization step updates the registered parameters
        optimizer.step()

        train_total_loss = train_total_loss + loss.item()

    # Compare the current parameters with the snapshot:
    # sum of absolute differences per tensor (0.0 means unchanged)
    w1_new = model.layer1_linear.weight.data
    b1_new = model.layer1_linear.bias.data
    w2_new = model.layer2_linear.weight.data
    b2_new = model.layer2_linear.bias.data
    print(epoch + 1,
          'w1:', (w1 - w1_new).abs().sum().item(),
          'b1:', (b1 - b1_new).abs().sum().item(),
          'w2:', (w2 - w2_new).abs().sum().item(),
          'b2:', (b2 - b2_new).abs().sum().item())

1 w1: 0.0 b1: 0.0 w2: 0.0031652748584747314 b2: 0.0001932978630065918
2 w1: 0.0 b1: 0.0 w2: 0.0065653324127197266 b2: 0.0001818239688873291
3 w1: 0.0 b1: 0.0 w2: 0.00960451364517212 b2: 0.0004006922245025635
4 w1: 0.0 b1: 0.0 w2: 0.012634485960006714 b2: 0.0006747841835021973
5 w1: 0.0 b1: 0.0 w2: 0.015293598175048828 b2: 0.0012349188327789307
6 w1: 0.0 b1: 0.0 w2: 0.01871246099472046 b2: 0.0012495815753936768
7 w1: 0.0 b1: 0.0 w2: 0.022029966115951538 b2: 0.0013096928596496582
8 w1: 0.0 b1: 0.0 w2: 0.02543380856513977 b2: 0.0013377070426940918
9 w1: 0.0 b1: 0.0 w2: 0.028123706579208374 b2: 0.0018067657947540283
10 w1: 0.0 b1: 0.0 w2: 0.03120484948158264 b2: 0.0019614696502685547


# 2 学习率调整策略：`torch.optim.lr_scheduler` 模块

In [14]:
optimizer = torch.optim.SGD(
    # 'initial_lr' is written into the parameter group explicitly
    [dict(params=model.parameters(), initial_lr=LEARNING_RATE)],
    # optimizer-wide learning rate (the original note says it must not be omitted -
    # presumably for the torch version in use; verify)
    lr=LEARNING_RATE,
)

## 2.1 `StepLR()`

In [15]:
N_EPOCH = 50
LEARNING_RATE = 1.0

# Optimizer: one parameter group carrying an explicit 'initial_lr'
optimizer = torch.optim.SGD(
    [dict(params=model.parameters(), initial_lr=LEARNING_RATE)],
    lr=LEARNING_RATE,
)

# Learning-rate schedule: StepLR configured with step_size=10 and gamma=0.1
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

for epoch in range(N_EPOCH):
    train_total_loss = 0.0

    # Training pass over all batches
    for i, (X_batch, Y_batch) in enumerate(dl_train):
        optimizer.zero_grad()
        out = model(X_batch)
        loss = loss_func(out, torch.flatten(Y_batch))
        loss.backward()
        optimizer.step()
        train_total_loss = train_total_loss + loss.item()

    # Advance the schedule once per epoch
    scheduler.step()

    # Print training information.
    # NOTE: this runs after scheduler.step(), so 'LR' is the value read from the
    # parameter group after the scheduler has been advanced for this epoch.
    group = optimizer.param_groups[0]
    print('Epoch: {0:>4}, Train Loss: {1:>10.5f}, LR: {2:>10.8f}, Init LR: {3:>10.8f}'.format(
        epoch + 1,
        train_total_loss,
        group['lr'],
        group['initial_lr']))

Epoch:    1, Train Loss:    0.15933, LR: 1.00000000, Init LR: 1.00000000
Epoch:    2, Train Loss:    0.15805, LR: 1.00000000, Init LR: 1.00000000
Epoch:    3, Train Loss:    0.15863, LR: 1.00000000, Init LR: 1.00000000
Epoch:    4, Train Loss:    0.15653, LR: 1.00000000, Init LR: 1.00000000
Epoch:    5, Train Loss:    0.15495, LR: 1.00000000, Init LR: 1.00000000
Epoch:    6, Train Loss:    0.15516, LR: 1.00000000, Init LR: 1.00000000
Epoch:    7, Train Loss:    0.15625, LR: 1.00000000, Init LR: 1.00000000
Epoch:    8, Train Loss:    0.15471, LR: 1.00000000, Init LR: 1.00000000
Epoch:    9, Train Loss:    0.15203, LR: 1.00000000, Init LR: 1.00000000
Epoch:   10, Train Loss:    0.15387, LR: 0.10000000, Init LR: 1.00000000
Epoch:   11, Train Loss:    0.15218, LR: 0.10000000, Init LR: 1.00000000
Epoch:   12, Train Loss:    0.15082, LR: 0.10000000, Init LR: 1.00000000
Epoch:   13, Train Loss:    0.15110, LR: 0.10000000, Init LR: 1.00000000
Epoch:   14, Train Loss:    0.15186, LR: 0.10000000