PyTorch 的优化器：管理并更新模型中可学习参数的值，使模型的输出更接近真实值  
基本属性：
* defaults：优化器超参数
* state：参数的缓存，如 momentum 的缓存
* param_groups：管理的参数组
* _step_count：记录更新次数，学习率调整时使用

### 1 基本方法

In [1]:
import torch
from torch.optim import SGD
import os

* zero_grad()：清空所管理参数的梯度
* step()：执行更新
* add_param_group():添加参数组
* state_dict():获取优化器当前状态信息字典
* load_state_dict():加载状态信息字典

In [2]:
# A 2x2 tensor created with requires_grad=True; its gradient is assigned by
# hand (all ones) so the optimizer can be exercised without a backward pass.
weight = torch.randn(2, 2, requires_grad=True)
weight.grad = torch.ones(2, 2)

# Plain SGD managing that single tensor, with learning rate 0.1.
optimizer = SGD(params=[weight], lr=0.1)

1.step

In [31]:
# Show the parameter values on both sides of a single optimizer update.
# With the all-ones gradient and lr=0.1, the recorded output shows every
# entry decreasing by 0.1.
print("weight before step:{}".format(weight.data))
optimizer.step()        # change lr to 1 or 0.1 and observe the result
print("weight after step:{}".format(weight.data))

weight before step:tensor([[ 1.1966, -0.1597],
        [-0.1541, -0.4374]])
weight after step:tensor([[ 1.0966, -0.2597],
        [-0.2541, -0.5374]])


2.zero_grad

In [32]:
# The optimizer keeps a reference to the tensor it was given rather than a
# copy, so the two ids printed here are identical.
managed_param = optimizer.param_groups[0]['params'][0]
print("weight in optimizer:{}\nweight in weight:{}\n".format(id(managed_param), id(weight)))

# Print the gradient, clear it through the optimizer, then print it again.
# NOTE(review): the recorded output shows a zero-filled gradient; on
# PyTorch >= 2.0 zero_grad() defaults to set_to_none=True and this would
# print None instead -- confirm against the PyTorch version in use.
print("weight.grad is {}\n".format(weight.grad))
optimizer.zero_grad()
print("after optimizer.zero_grad(), weight.grad is\n{}".format(weight.grad))

weight in optimizer:140293055829192
weight in weight:140293055829192

weight.grad is tensor([[1., 1.],
        [1., 1.]])

after optimizer.zero_grad(), weight.grad is
tensor([[0., 0.],
        [0., 0.]])


3.add_param_group

In [33]:
# Dump the optimizer's parameter groups: a list of dicts, each holding the
# managed tensors under 'params' plus that group's hyper-parameters.
print("optimizer.param_groups is\n{}".format(optimizer.param_groups))

optimizer.param_groups is
[{'params': [tensor([[ 1.0966, -0.2597],
        [-0.2541, -0.5374]], requires_grad=True)], 'lr': 0.1, 'momentum': 0, 'dampening': 0, 'weight_decay': 0, 'nesterov': False}]


In [34]:
 # A second tensor (3x3, requires_grad=True) to be registered with the
 # optimizer as its own parameter group.
 w2 = torch.randn((3, 3), requires_grad=True)

In [35]:
# Register w2 as a new parameter group with its own learning rate. Options
# not given here (momentum, dampening, weight_decay, nesterov) are filled in
# from the optimizer's defaults, as the printed param_groups show.
optimizer.add_param_group({"params": w2, 'lr': 0.0001})

In [36]:
# Dump the parameter groups again; there are now two, the second holding w2
# with lr=0.0001.
print("optimizer.param_groups is\n{}".format(optimizer.param_groups))

optimizer.param_groups is
[{'params': [tensor([[ 1.0966, -0.2597],
        [-0.2541, -0.5374]], requires_grad=True)], 'lr': 0.1, 'momentum': 0, 'dampening': 0, 'weight_decay': 0, 'nesterov': False}, {'params': [tensor([[ 1.0436, -0.0512,  0.5632],
        [ 1.8535, -0.5516, -1.3481],
        [-1.2838, -0.8173,  1.8077]], requires_grad=True)], 'lr': 0.0001, 'momentum': 0, 'dampening': 0, 'weight_decay': 0, 'nesterov': False}]


In [6]:
# Fresh 2x2 tensor for the state_dict demo, again with a hand-assigned
# all-ones gradient in place of a real backward pass.
weight1 = torch.randn(2, 2, requires_grad=True)
weight1.grad = torch.ones(2, 2)

4.state_dict

In [7]:
# SGD with momentum, so the optimizer has per-parameter state to inspect.
optimizer1 = SGD(params=[weight1], lr=0.1, momentum=0.9)

# Snapshot taken before any step has run: 'state' is still empty and only
# 'param_groups' carries information (see the recorded output).
opt_state_dict = optimizer1.state_dict()

print("state_dict before step:\n", opt_state_dict)

state_dict before step:
 {'state': {}, 'param_groups': [{'lr': 0.1, 'momentum': 0.9, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'params': [140617682880696]}]}


In [39]:
# Apply ten updates in a row. The gradient is never recomputed, so every step
# reuses the same all-ones gradient and the momentum buffer accumulates
# 1 + 0.9 + ... + 0.9**9 = 6.5132, matching the recorded state_dict output.
for step_idx in range(10):
    optimizer1.step()

In [40]:
# After stepping, 'state' holds a momentum_buffer entry for the parameter.
print("state_dict after step:\n", optimizer1.state_dict())

state_dict after step:
 {'state': {140294388229536: {'momentum_buffer': tensor([[6.5132, 6.5132],
        [6.5132, 6.5132]])}}, 'param_groups': [{'lr': 0.1, 'momentum': 0.9, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'params': [140294388229536]}]}


In [41]:
 # Write the optimizer's current state_dict to disk so it can be restored
 # later. NOTE(review): the directory is a hard-coded absolute path and must
 # match the one used when loading.
 torch.save(optimizer1.state_dict(), os.path.join('/home/zhanggong-study/pytorch_tutorial', "optimizer_state_dict.pkl"))

In [3]:
# Read the saved optimizer state back from disk.
# NOTE: torch.load unpickles the file, so only load files from a trusted source.
state_dict = torch.load(os.path.join('/home/zhanggong-study/pytorch_tutorial', "optimizer_state_dict.pkl"))

In [8]:
# State of optimizer1 before restoring; in the recorded run it was freshly
# constructed at this point, so 'state' is empty.
print("state_dict before load state:\n", optimizer1.state_dict())

state_dict before load state:
 {'state': {}, 'param_groups': [{'lr': 0.1, 'momentum': 0.9, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'params': [140617682880696]}]}


In [9]:
# Restore the saved state into optimizer1. The recorded output shows the
# momentum_buffer values from the saved run now present in 'state', keyed to
# this optimizer's own parameter.
optimizer1.load_state_dict(state_dict)
print("state_dict after load state:\n", optimizer1.state_dict())

state_dict after load state:
 {'state': {140617682880696: {'momentum_buffer': tensor([[6.5132, 6.5132],
        [6.5132, 6.5132]])}}, 'param_groups': [{'lr': 0.1, 'momentum': 0.9, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'params': [140617682880696]}]}
