In [1]:
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

In [6]:
#build a simple network
class SimpleNet(nn.Module):
    def __init__(self,num_class=10):
        super(SimpleNet,self).__init__()
        self.num_class_ = num_class
        self.layers1 = nn.Sequential(
            nn.Conv2d(3,16,7),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(2,2),
            )
        self.layers2 = nn.Sequential(
            nn.Conv2d(16,64,3),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(2,2),
        )
        self.fc3 = nn.Linear(64,256)
        self.fc4 = nn.Linear(256,num_class)
    
    def forward(self,x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.fc3(out)
        out = self.fc4(out)
        return out
        

In [10]:
net = SimpleNet()
for name,param in net.named_parameters():
    print(name,param.shape)

layers1.0.weight torch.Size([16, 3, 7, 7])
layers1.0.bias torch.Size([16])
layers1.1.weight torch.Size([16])
layers1.1.bias torch.Size([16])
layers2.0.weight torch.Size([64, 16, 3, 3])
layers2.0.bias torch.Size([64])
layers2.1.weight torch.Size([16])
layers2.1.bias torch.Size([16])
fc3.weight torch.Size([256, 64])
fc3.bias torch.Size([256])
fc4.weight torch.Size([10, 256])
fc4.bias torch.Size([10])


In [13]:
## optimizer 
## only list command optim
optimizer1 = optim.SGD(net.parameters(),lr=0.001,momentum=0.9,weight_decay=1e-5)
optimizer2 = optim.Adam(net.parameters(),lr=0.01,betas=(0.9, 0.999), eps=1e-08, weight_decay=1e-5)
optimizer3 = optim.RMSprop(net.parameters(),lr=0.01,alpha=0.99,eps=1e-08,weight_decay=1e-5)
#迭代时一般做两步骤
#1、optimizer.zero_grad()
#2、optimizer.step()

In [14]:
## adjust learning rate
## 两种方法调整学习率，1自己写函数、2是用optim.lr_scheduler
#1
base_lr = 0.1
def adjust_learning_rate(optimizer,epoch):
    lr = base_lr*(0.1**(epoch//200))
    for param in optimizer.param_groups:
        param['lr'] = lr
        

In [16]:
#2 lr_scheduler
## 2.1 StepLR
scheduler = optim.lr_scheduler.StepLR(optimizer1,step_size=200,gamma=0.1)
for epoch in range(600):
    scheduler.step()
    #train(....)
    #validate(....)
## lr = 0.1  if epoch < 200
## lr = 0.01 if 200 <= epoch < 400
## lr = 0.001 if 400 <= epoch < 600


In [17]:
#2 lr_scheduler
## 2.2 MultiStepLR
millestones = [200,500,800]
scheduler = optim.lr_scheduler.MultiStepLR(optimizer2,millestones,gamma=0.1)
for epoch in range(1000):
    scheduler.step()
    #train(....)
    #validate(....)
## lr = 0.1  if epoch < 200
## lr = 0.01 if 200 <= epoch <500
## lr = 0.001 if 500 <= epoch < 800
## lr = 0.0001 if 800 <= epoch

In [18]:
#2 lr_scheduler
## 2.2 ExponentialLR
scheduler = optim.lr_scheduler.ExponentialLR(optimizer3,0.95,)
for epoch in range(1000):
    scheduler.step()
    #train(....)
    #validate(....)
## lr = lr * gamma**epoch

In [22]:
## 通常我们finetune的时候会为不同的层次设置不同的学习率。
# 1， 前面的层次不学习了，只更新后续更改的层次
for param in net.parameters():
    param.requires_grad = False

net.fc4 = nn.Linear(256,100) # 10 类别变成了100  只学习该层次的参数
optimizer4 = optim.SGD(params=[net.fc4.weight,net.fc4.bias],lr = base_lr,weight_decay=1e-5,momentum=0.9)

In [23]:
#2,前面的层次也学习，但是学习率与后续的不一致
ignore_param = list(map(id,net.fc4.parameters()))
base_param = filter(lambda p: id(p) not in ignore_param,net.parameters())
optimizer5 = optim.SGD([
    {'params':base_param},
    {'params':net.fc4.parameters(),'lr':base_lr*10}],0.001,momentum=0.9,weight_decay=1e-4
)

In [29]:
for param in optimizer5.param_groups:
    print(param['lr'])

0.001
0.1
