 https://huggingface.co/transformers/main_classes/optimizer_schedules.html#learning-rate-schedules-pytorch

In [30]:
import numpy as np
import plotly
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
import plotly.offline as pyo
pio.templates.default='plotly_white'

import torch
import torch.nn as nn

import transformers
from transformers import AutoModel
from transformers import AdamW

from transformers import (
    get_constant_schedule,
    get_constant_schedule_with_warmup,
    get_cosine_schedule_with_warmup,
    get_cosine_with_hard_restarts_schedule_with_warmup,
    get_linear_schedule_with_warmup,
    get_polynomial_decay_schedule_with_warmup
)
# Number of optimizer/scheduler steps simulated by each demo loop below;
# also the number of points on the x-axis of every plot.
epochs = 10


In [31]:
class Net(nn.Module):
    """Pretrained encoder followed by a single-output linear head.

    The head is applied to the encoder's first output (``outputs[0]``);
    ``nn.Linear`` acts on the last dimension only, so any leading dimensions
    of that tensor are preserved.
    """

    def __init__(self, model_name):
        """Load the pretrained encoder ``model_name`` and build the head.

        Args:
            model_name: identifier or path passed to
                ``AutoModel.from_pretrained``.
        """
        super().__init__()
        self.roberta = AutoModel.from_pretrained(model_name)
        # Size the head from the loaded checkpoint instead of assuming the
        # 768-wide base model; fall back to 768 if the config lacks the field.
        hidden_size = getattr(self.roberta.config, "hidden_size", 768)
        self.classifier = nn.Linear(hidden_size, 1)

    def forward(self, input_ids):
        """Run the encoder on ``input_ids`` and apply the linear head.

        Args:
            input_ids: token-id tensor forwarded to the encoder unchanged.

        Returns:
            The head's output for ``outputs[0]`` (presumably the last hidden
            state, shape ``(batch, seq_len, hidden)`` - TODO confirm).
        """
        outputs = self.roberta(input_ids)
        sequence_output = outputs[0]
        return self.classifier(sequence_output)

## 差分学习率策略





In [32]:
def get_optimizer_params(model, type='s'):
    """Build the ``params`` argument for an optimizer (differential LR / decay).

    Args:
        model: module exposing a ``roberta`` sub-module. Every parameter whose
            qualified name does not contain ``"roberta"`` is treated as the
            head.
        type: grouping strategy (the name shadows the builtin but is kept so
            existing keyword callers keep working):

            * ``'s'`` - one flat list of all trainable parameters.
            * ``'i'`` - three groups: encoder with weight decay, encoder
              without weight decay, head (lr ``1e-3``).
            * ``'a'`` - nine groups: for each of "with decay" and "without
              decay", the encoder parameters outside layers 0-11, layers 0-3
              (lr / 2.6), layers 4-7 (lr) and layers 8-11 (lr * 2.6);
              the ninth group is the head (lr ``1e-3``).

    Returns:
        A list of parameters for ``'s'``; otherwise a list of param-group
        dicts in the order described above.

    Raises:
        ValueError: if ``type`` is not one of ``'s'``, ``'i'``, ``'a'``.
    """
    learning_rate = 5e-5
    head_lr = 1e-3
    # 'gamma'/'beta' are legacy LayerNorm names; current Hugging Face models
    # name the scale 'LayerNorm.weight', so match that as well.
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']

    def _matches(name, patterns):
        # True when any pattern occurs as a substring of the parameter name.
        return any(pattern in name for pattern in patterns)

    if type == 's':
        # Return a list rather than a one-shot filter iterator so the result
        # can be inspected or reused.
        return [p for p in model.parameters() if p.requires_grad]

    if type not in ('i', 'a'):
        raise ValueError(
            "unknown type {!r}; expected 's', 'i' or 'a'".format(type)
        )

    encoder_params = list(model.roberta.named_parameters())
    head_params = [p for n, p in model.named_parameters() if "roberta" not in n]

    if type == 'i':
        # The optimizer reads the per-group key 'weight_decay'; the former
        # 'weight_decay_rate' key was silently ignored.
        return [
            {'params': [p for n, p in encoder_params if not _matches(n, no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in encoder_params if _matches(n, no_decay)],
             'weight_decay': 0.0},
            {'params': head_params,
             'lr': head_lr,
             'weight_decay': 0.01},
        ]

    # type == 'a': layer-wise learning rates in three tiers of four layers.
    group1 = ['layer.0.', 'layer.1.', 'layer.2.', 'layer.3.']
    group2 = ['layer.4.', 'layer.5.', 'layer.6.', 'layer.7.']
    group3 = ['layer.8.', 'layer.9.', 'layer.10.', 'layer.11.']
    group_all = group1 + group2 + group3

    # (name patterns, lr); patterns None means "not in any encoder layer",
    # lr None means "use the optimizer's default lr".
    tiers = [
        (None, None),
        (group1, learning_rate / 2.6),
        (group2, learning_rate),
        (group3, learning_rate * 2.6),
    ]

    optimizer_parameters = []
    for decays, weight_decay in ((True, 0.01), (False, 0.0)):
        for patterns, lr in tiers:
            params = [
                p for n, p in encoder_params
                if _matches(n, no_decay) != decays
                and (_matches(n, patterns) if patterns is not None
                     else not _matches(n, group_all))
            ]
            group = {'params': params, 'weight_decay': weight_decay}
            if lr is not None:
                group['lr'] = lr
            optimizer_parameters.append(group)

    # NOTE(review): AdamW has no momentum hyperparameter, so "momentum" is
    # presumably ignored; kept unchanged for backward compatibility.
    optimizer_parameters.append(
        {'params': head_params, 'lr': head_lr, "momentum": 0.99}
    )
    return optimizer_parameters

## 辅助函数

In [33]:
def get_default_layout(title):
    """Return the ``go.Layout`` shared by all learning-rate figures.

    The layout is 1200x400, uses the ``plotly_white`` template and the
    'Courier New' font, labels the axes 'Epochs' and 'Learning Rate', and
    shows a horizontal legend anchored at ``x=0.1, y=1.1``.

    Args:
        title: text shown as the centred figure title.
    """
    typeface = 'Courier New'

    def black_font():
        # Separate dict per use so title and legend never share one object.
        return {'family': typeface, 'size': 14, 'color': 'black'}

    def linear_axis(label, **overrides):
        # Settings common to both axes; ``overrides`` adds axis-specific keys.
        axis = dict(
            title=label,
            showgrid=True,
            type='linear',
            categoryarray=None,
            gridwidth=1,
            ticks='outside',
            showline=True,
            showticklabels=True,
            tickangle=0,
            tickmode='array',
        )
        axis.update(overrides)
        return axis

    return go.Layout(dict(
        height=400,
        width=1200,
        template='plotly_white',
        dragmode='zoom',
        hovermode='x',
        hoverlabel=dict(font_size=14, font_family=typeface),
        font=dict(size=14, family=typeface, color='rgb(128, 128, 128)'),
        xaxis=linear_axis('Epochs'),
        yaxis=linear_axis('Learning Rate', exponentformat='none'),
        title=dict(
            text=title,
            x=0.5,
            y=0.95,
            xanchor='center',
            yanchor='top',
            font=black_font(),
        ),
        showlegend=True,
        legend=dict(
            x=0.1,
            y=1.1,
            orientation='h',
            itemclick='toggleothers',
            font=black_font(),
        ),
    ))

## Constant Schedule 常数学习率策略

学习率始终等于优化器中设置的初始值，在整个训练过程中保持不变。

In [34]:
# Constant schedule, single parameter group ('s' strategy).
model = Net('roberta-base')
parameters = get_optimizer_params(model, 's')
kwargs = dict(betas=(0.9, 0.999), eps=1e-08)
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
scheduler = get_constant_schedule(optimizer)

# Step optimizer and scheduler once per epoch and log the resulting LR.
learning_rates = []
for step in range(epochs):
    optimizer.step()
    scheduler.step()
    current_lr = optimizer.param_groups[0]["lr"]
    learning_rates.append(current_lr)

trace = go.Scatter(
    name='LR',
    mode='markers+lines',
    x=np.arange(epochs),
    y=learning_rates,
    texttemplate="%{y:.6f}",
    marker={'color': '#3498d5'},
)
layout = get_default_layout('Constant Schedule')
go.Figure(data=[trace], layout=layout)

In [35]:
# Constant schedule with separate encoder/head groups ('i' strategy).
model = Net('roberta-base')
parameters = get_optimizer_params(model, 'i')
kwargs = dict(betas=(0.9, 0.999), eps=1e-08)
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
scheduler = get_constant_schedule(optimizer)

# Group 0 is an encoder group (optimizer default LR); group 2 is the head.
learning_rates1, learning_rates2 = [], []
for step in range(epochs):
    optimizer.step()
    scheduler.step()
    learning_rates1.append(optimizer.param_groups[0]["lr"])
    learning_rates2.append(optimizer.param_groups[2]["lr"])

trace1, trace2 = [
    go.Scatter(
        name=label,
        mode='markers+lines',
        x=np.arange(epochs),
        y=series,
        texttemplate="%{y:.6f}",
        marker={'color': colour},
    )
    for series, label, colour in (
        (learning_rates1, 'Roberta', '#3498d5'),
        (learning_rates2, 'Regressor', '#f29191'),
    )
]
layout = get_default_layout('Constant Schedule')
go.Figure(data=[trace1, trace2], layout=layout)

## Constant Schedule with Warmup

In [36]:
# Constant schedule with a 3-step warmup, single parameter group ('s').
model = Net('roberta-base')
parameters = get_optimizer_params(model, 's')
kwargs = dict(betas=(0.9, 0.999), eps=1e-08)
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=3)

# Step optimizer and scheduler once per epoch and log the resulting LR.
learning_rates = []
for step in range(epochs):
    optimizer.step()
    scheduler.step()
    current_lr = optimizer.param_groups[0]["lr"]
    learning_rates.append(current_lr)

trace = go.Scatter(
    name='LR',
    mode='markers+lines',
    x=np.arange(epochs),
    y=learning_rates,
    texttemplate="%{y:.6f}",
    marker={'color': '#3498d5'},
)
layout = get_default_layout('Constant Schedule with Warmup')
go.Figure(data=[trace], layout=layout)

In [37]:
# Constant schedule with a 3-step warmup, encoder/head groups ('i').
model = Net('roberta-base')
parameters = get_optimizer_params(model, 'i')
kwargs = dict(betas=(0.9, 0.999), eps=1e-08)
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=3)

# Group 0 is an encoder group (optimizer default LR); group 2 is the head.
learning_rates1, learning_rates2 = [], []
for step in range(epochs):
    optimizer.step()
    scheduler.step()
    learning_rates1.append(optimizer.param_groups[0]["lr"])
    learning_rates2.append(optimizer.param_groups[2]["lr"])

trace1, trace2 = [
    go.Scatter(
        name=label,
        mode='markers+lines',
        x=np.arange(epochs),
        y=series,
        texttemplate="%{y:.6f}",
        marker={'color': colour},
    )
    for series, label, colour in (
        (learning_rates1, 'Roberta', '#3498d5'),
        (learning_rates2, 'Regressor', '#f29191'),
    )
]
layout = get_default_layout('Constant Schedule with Warmup')
go.Figure(data=[trace1, trace2], layout=layout)

## Cosine with Warmup

In [38]:
# Cosine schedule with warmup, encoder/head groups ('i' strategy).
# NOTE(review): this cell repeats the 'i' configuration of the cell that
# follows it; the other sections open with the single-group 's' variant -
# confirm whether 's' was intended here.
model = Net('roberta-base')
parameters = get_optimizer_params(model, 'i')
kwargs = {
    'betas': (0.9, 0.999),
    'eps': 1e-08
}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
# Tie the schedule length to the number of simulated steps so the schedule
# and the plot stay in sync when `epochs` changes (was hard-coded to 10).
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=epochs)

# Group 0 is an encoder group (optimizer default LR); group 2 is the head.
learning_rates1, learning_rates2 = [], []
for i in range(epochs):
    optimizer.step()
    scheduler.step()
    learning_rates1.append(optimizer.param_groups[0]["lr"])
    learning_rates2.append(optimizer.param_groups[2]["lr"])

trace1, trace2 = [
    go.Scatter(
        x=np.arange(0, epochs, 1),
        y=series,
        texttemplate="%{y:.6f}",
        mode='markers+lines',
        name=label,
        marker=dict(color=colour),
    )
    for series, label, colour in (
        (learning_rates1, 'Roberta', '#3498d5'),
        (learning_rates2, 'Regressor', '#f29191'),
    )
]
layout = get_default_layout('Cosine Schedule with Warmup')
go.Figure(data=[trace1, trace2], layout=layout)

In [39]:
# Cosine schedule with warmup, encoder/head groups ('i' strategy).
model = Net('roberta-base')
parameters = get_optimizer_params(model, 'i')
kwargs = {
    'betas': (0.9, 0.999),
    'eps': 1e-08
}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
# Tie the schedule length to the number of simulated steps so the schedule
# and the plot stay in sync when `epochs` changes (was hard-coded to 10).
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=epochs)

# Group 0 is an encoder group (optimizer default LR); group 2 is the head.
learning_rates1, learning_rates2 = [], []
for i in range(epochs):
    optimizer.step()
    scheduler.step()
    learning_rates1.append(optimizer.param_groups[0]["lr"])
    learning_rates2.append(optimizer.param_groups[2]["lr"])

trace1, trace2 = [
    go.Scatter(
        x=np.arange(0, epochs, 1),
        y=series,
        texttemplate="%{y:.6f}",
        mode='markers+lines',
        name=label,
        marker=dict(color=colour),
    )
    for series, label, colour in (
        (learning_rates1, 'Roberta', '#3498d5'),
        (learning_rates2, 'Regressor', '#f29191'),
    )
]
layout = get_default_layout('Cosine Schedule with Warmup')
go.Figure(data=[trace1, trace2], layout=layout)

In [40]:
# Cosine schedule with warmup, layer-wise groups ('a' strategy).
model = Net('roberta-base')
parameters = get_optimizer_params(model, 'a')
kwargs = {
    'betas': (0.9, 0.999),
    'eps': 1e-08
}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
# Tie the schedule length to the number of simulated steps so the schedule
# and the plot stay in sync when `epochs` changes (was hard-coded to 10).
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=epochs)

# Groups 1-3 are the three encoder layer tiers; group 8 is the head.
learning_rates1, learning_rates2, learning_rates3, learning_rates4 = [], [], [], []
for i in range(epochs):
    optimizer.step()
    scheduler.step()
    learning_rates1.append(optimizer.param_groups[1]["lr"])
    learning_rates2.append(optimizer.param_groups[2]["lr"])
    learning_rates3.append(optimizer.param_groups[3]["lr"])
    learning_rates4.append(optimizer.param_groups[8]["lr"])

trace1, trace2, trace3, trace4 = [
    go.Scatter(
        x=np.arange(0, epochs, 1),
        y=series,
        texttemplate="%{y:.6f}",
        mode='markers+lines',
        name=label,
        marker=dict(color=colour),
    )
    for series, label, colour in (
        (learning_rates1, 'Roberta Layers 1-4', '#3498d5'),
        (learning_rates2, 'Roberta Layers 5-8', '#a678de'),
        (learning_rates3, 'Roberta Layers 9-12', '#6ad49b'),
        (learning_rates4, 'Regressor', '#f29191'),
    )
]
layout = get_default_layout('Cosine Schedule with Warmup')
go.Figure(data=[trace1, trace2, trace3, trace4], layout=layout)

## Cosine With Hard Restarts

In [41]:
# Cosine schedule with hard restarts and warmup, single group ('s').
model = Net('roberta-base')
parameters = get_optimizer_params(model, 's')
kwargs = {
    'betas': (0.9, 0.999),
    'eps': 1e-08
}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
# Tie the schedule length to the number of simulated steps so the schedule
# and the plot stay in sync when `epochs` changes (was hard-coded to 10).
scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=epochs, num_cycles=5)

# Step optimizer and scheduler once per epoch and log the resulting LR.
learning_rates = []
for i in range(epochs):
    optimizer.step()
    scheduler.step()
    learning_rates.append(optimizer.param_groups[0]["lr"])

trace = go.Scatter(
    x=np.arange(0, epochs, 1),
    y=learning_rates,
    texttemplate="%{y:.6f}",
    mode='markers+lines',
    name='LR',
    marker=dict(color='#3498d5'),
)
layout = get_default_layout('Cosine Schedule with Hard Restarts with Warmup')
go.Figure(data=[trace], layout=layout)

In [42]:
# Cosine schedule with hard restarts and warmup, encoder/head groups ('i').
model = Net('roberta-base')
parameters = get_optimizer_params(model, 'i')
kwargs = {
    'betas': (0.9, 0.999),
    'eps': 1e-08
}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
# Tie the schedule length to the number of simulated steps so the schedule
# and the plot stay in sync when `epochs` changes (was hard-coded to 10).
scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=epochs, num_cycles=5)

# Group 0 is an encoder group (optimizer default LR); group 2 is the head.
learning_rates1, learning_rates2 = [], []
for i in range(epochs):
    optimizer.step()
    scheduler.step()
    learning_rates1.append(optimizer.param_groups[0]["lr"])
    learning_rates2.append(optimizer.param_groups[2]["lr"])

trace1, trace2 = [
    go.Scatter(
        x=np.arange(0, epochs, 1),
        y=series,
        texttemplate="%{y:.6f}",
        mode='markers+lines',
        name=label,
        marker=dict(color=colour),
    )
    for series, label, colour in (
        (learning_rates1, 'Roberta', '#3498d5'),
        (learning_rates2, 'Regressor', '#f29191'),
    )
]
layout = get_default_layout('Cosine Schedule with Hard Restarts with Warmup')
go.Figure(data=[trace1, trace2], layout=layout)

In [43]:
# Cosine schedule with hard restarts and warmup, layer-wise groups ('a').
model = Net('roberta-base')
parameters = get_optimizer_params(model, 'a')
kwargs = {
    'betas': (0.9, 0.999),
    'eps': 1e-08
}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
# Tie the schedule length to the number of simulated steps so the schedule
# and the plot stay in sync when `epochs` changes (was hard-coded to 10).
scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=epochs, num_cycles=5)

# Groups 1-3 are the three encoder layer tiers; group 8 is the head.
learning_rates1, learning_rates2, learning_rates3, learning_rates4 = [], [], [], []
for i in range(epochs):
    optimizer.step()
    scheduler.step()
    learning_rates1.append(optimizer.param_groups[1]["lr"])
    learning_rates2.append(optimizer.param_groups[2]["lr"])
    learning_rates3.append(optimizer.param_groups[3]["lr"])
    learning_rates4.append(optimizer.param_groups[8]["lr"])

trace1, trace2, trace3, trace4 = [
    go.Scatter(
        x=np.arange(0, epochs, 1),
        y=series,
        texttemplate="%{y:.6f}",
        mode='markers+lines',
        name=label,
        marker=dict(color=colour),
    )
    for series, label, colour in (
        (learning_rates1, 'Roberta Layers 1-4', '#3498d5'),
        (learning_rates2, 'Roberta Layers 5-8', '#a678de'),
        (learning_rates3, 'Roberta Layers 9-12', '#6ad49b'),
        (learning_rates4, 'Regressor', '#f29191'),
    )
]
layout = get_default_layout('Cosine Schedule with Hard Restarts with Warmup')
go.Figure(data=[trace1, trace2, trace3, trace4], layout=layout)

## Linear Schedule with Warmup

In [44]:
# Linear schedule with warmup, single parameter group ('s').
model = Net('roberta-base')
parameters = get_optimizer_params(model, 's')
kwargs = {
    'betas': (0.9, 0.999),
    'eps': 1e-08
}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
# Tie the schedule length to the number of simulated steps so the schedule
# and the plot stay in sync when `epochs` changes (was hard-coded to 10).
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=epochs)

# Step optimizer and scheduler once per epoch and log the resulting LR.
learning_rates = []
for i in range(epochs):
    optimizer.step()
    scheduler.step()
    learning_rates.append(optimizer.param_groups[0]["lr"])

trace = go.Scatter(
    x=np.arange(0, epochs, 1),
    y=learning_rates,
    texttemplate="%{y:.6f}",
    mode='markers+lines',
    name='LR',
    marker=dict(color='#3498d5'),
)
layout = get_default_layout('Linear Schedule with Warmup')
go.Figure(data=[trace], layout=layout)

In [45]:
# Linear schedule with warmup, encoder/head groups ('i').
model = Net('roberta-base')
parameters = get_optimizer_params(model, 'i')
kwargs = {
    'betas': (0.9, 0.999),
    'eps': 1e-08
}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
# Tie the schedule length to the number of simulated steps so the schedule
# and the plot stay in sync when `epochs` changes (was hard-coded to 10).
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=epochs)

# Group 0 is an encoder group (optimizer default LR); group 2 is the head.
learning_rates1, learning_rates2 = [], []
for i in range(epochs):
    optimizer.step()
    scheduler.step()
    learning_rates1.append(optimizer.param_groups[0]["lr"])
    learning_rates2.append(optimizer.param_groups[2]["lr"])

trace1, trace2 = [
    go.Scatter(
        x=np.arange(0, epochs, 1),
        y=series,
        texttemplate="%{y:.6f}",
        mode='markers+lines',
        name=label,
        marker=dict(color=colour),
    )
    for series, label, colour in (
        (learning_rates1, 'Roberta', '#3498d5'),
        (learning_rates2, 'Regressor', '#f29191'),
    )
]
layout = get_default_layout('Linear Schedule with Warmup')
go.Figure(data=[trace1, trace2], layout=layout)

In [46]:
# Linear schedule with warmup, layer-wise groups ('a').
model = Net('roberta-base')
parameters = get_optimizer_params(model, 'a')
kwargs = {
    'betas': (0.9, 0.999),
    'eps': 1e-08
}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
# Tie the schedule length to the number of simulated steps so the schedule
# and the plot stay in sync when `epochs` changes (was hard-coded to 10).
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=epochs)

# Groups 1-3 are the three encoder layer tiers; group 8 is the head.
learning_rates1, learning_rates2, learning_rates3, learning_rates4 = [], [], [], []
for i in range(epochs):
    optimizer.step()
    scheduler.step()
    learning_rates1.append(optimizer.param_groups[1]["lr"])
    learning_rates2.append(optimizer.param_groups[2]["lr"])
    learning_rates3.append(optimizer.param_groups[3]["lr"])
    learning_rates4.append(optimizer.param_groups[8]["lr"])

trace1, trace2, trace3, trace4 = [
    go.Scatter(
        x=np.arange(0, epochs, 1),
        y=series,
        texttemplate="%{y:.6f}",
        mode='markers+lines',
        name=label,
        marker=dict(color=colour),
    )
    for series, label, colour in (
        (learning_rates1, 'Roberta Layers 1-4', '#3498d5'),
        (learning_rates2, 'Roberta Layers 5-8', '#a678de'),
        (learning_rates3, 'Roberta Layers 9-12', '#6ad49b'),
        (learning_rates4, 'Regressor', '#f29191'),
    )
]
layout = get_default_layout('Linear Schedule with Warmup')
go.Figure(data=[trace1, trace2, trace3, trace4], layout=layout)


## Polynomial Decay with Warmup

In [47]:
# Polynomial decay (power=2) with warmup, single parameter group ('s').
model = Net('roberta-base')
parameters = get_optimizer_params(model, 's')
kwargs = {
    'betas': (0.9, 0.999),
    'eps': 1e-08
}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
# Tie the schedule length to the number of simulated steps so the schedule
# and the plot stay in sync when `epochs` changes (was hard-coded to 10).
scheduler = get_polynomial_decay_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=epochs, power=2)

# Step optimizer and scheduler once per epoch and log the resulting LR.
learning_rates = []
for i in range(epochs):
    optimizer.step()
    scheduler.step()
    learning_rates.append(optimizer.param_groups[0]["lr"])

trace = go.Scatter(
    x=np.arange(0, epochs, 1),
    y=learning_rates,
    texttemplate="%{y:.6f}",
    mode='markers+lines',
    name='LR',
    marker=dict(color='#3498d5'),
)
layout = get_default_layout('Polynomial Decay Schedule with Warmup')
go.Figure(data=[trace], layout=layout)

In [48]:
# Polynomial decay (power=2) with warmup, encoder/head groups ('i').
model = Net('roberta-base')
parameters = get_optimizer_params(model, 'i')
kwargs = {
    'betas': (0.9, 0.999),
    'eps': 1e-08
}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
# Tie the schedule length to the number of simulated steps so the schedule
# and the plot stay in sync when `epochs` changes (was hard-coded to 10).
scheduler = get_polynomial_decay_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=epochs, power=2)

# Group 0 is an encoder group (optimizer default LR); group 2 is the head.
learning_rates1, learning_rates2 = [], []
for i in range(epochs):
    optimizer.step()
    scheduler.step()
    learning_rates1.append(optimizer.param_groups[0]["lr"])
    learning_rates2.append(optimizer.param_groups[2]["lr"])

trace1, trace2 = [
    go.Scatter(
        x=np.arange(0, epochs, 1),
        y=series,
        texttemplate="%{y:.6f}",
        mode='markers+lines',
        name=label,
        marker=dict(color=colour),
    )
    for series, label, colour in (
        (learning_rates1, 'Roberta', '#3498d5'),
        (learning_rates2, 'Regressor', '#f29191'),
    )
]
layout = get_default_layout('Polynomial Decay Schedule with Warmup')
go.Figure(data=[trace1, trace2], layout=layout)

In [49]:
# Polynomial decay (power=2) with warmup, layer-wise groups ('a').
model = Net('roberta-base')
parameters = get_optimizer_params(model, 'a')
kwargs = {
    'betas': (0.9, 0.999),
    'eps': 1e-08
}
optimizer = AdamW(parameters, lr=5e-5, **kwargs)
# Tie the schedule length to the number of simulated steps so the schedule
# and the plot stay in sync when `epochs` changes (was hard-coded to 10).
scheduler = get_polynomial_decay_schedule_with_warmup(optimizer, num_warmup_steps=3, num_training_steps=epochs, power=2)

# Groups 1-3 are the three encoder layer tiers; group 8 is the head.
learning_rates1, learning_rates2, learning_rates3, learning_rates4 = [], [], [], []
for i in range(epochs):
    optimizer.step()
    scheduler.step()
    learning_rates1.append(optimizer.param_groups[1]["lr"])
    learning_rates2.append(optimizer.param_groups[2]["lr"])
    learning_rates3.append(optimizer.param_groups[3]["lr"])
    learning_rates4.append(optimizer.param_groups[8]["lr"])

trace1, trace2, trace3, trace4 = [
    go.Scatter(
        x=np.arange(0, epochs, 1),
        y=series,
        texttemplate="%{y:.6f}",
        mode='markers+lines',
        name=label,
        marker=dict(color=colour),
    )
    for series, label, colour in (
        (learning_rates1, 'Roberta Layers 1-4', '#3498d5'),
        (learning_rates2, 'Roberta Layers 5-8', '#a678de'),
        (learning_rates3, 'Roberta Layers 9-12', '#6ad49b'),
        (learning_rates4, 'Regressor', '#f29191'),
    )
]
layout = get_default_layout('Polynomial Decay Schedule with Warmup')
go.Figure(data=[trace1, trace2, trace3, trace4], layout=layout)

https://www.kaggle.com/rhtsingh/guide-to-huggingface-schedulers-differential-lrs