Skip to content
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
113 lines (95 sloc) 4.92 KB
import torch
from .optimizer import Optimizer
class Adagradnorm(Optimizer):
# Modified from Adagrad
"""Implements Adagradnorm algorithm.
It has been proposed in `Adaptive Subgradient Methods for Online Learning
and Stochastic Optimization`_.
params (iterable): iterable of parameters to optimize or dicts defining
parameter groups
lr (float, optional): learning rate (default: 1.0)
momentum (float, optional): momentum (default: 0.0)
lr_decay (float, optional): learning rate decay (default: 0)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
initial_accumulator_value (float, optional): initial value of the adaptive sum (default: 1e-2)
.. _AdaGrad stepsizes: Sharp convergence over nonconvex landscapes, from any
.. _Adaptive Subgradient Methods for Online Learning and Stochastic
def __init__(self, params, lr=1.0, momentum=0.0, lr_decay=0, weight_decay=0, initial_accumulator_value=1e-2):
if not 0.0 <= lr:
raise ValueError("Invalid learning rate: {}".format(lr))
if not 0.0 <= lr_decay:
raise ValueError("Invalid lr_decay value: {}".format(lr_decay))
if not 0.0 <= weight_decay:
raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
if not 0.0 <= initial_accumulator_value:
raise ValueError("Invalid initial_accumulator_value value: {}".format(initial_accumulator_value))
defaults = dict(lr=lr, momentum=momentum,lr_decay=lr_decay, weight_decay=weight_decay,
super(Adagradnorm, self).__init__(params, defaults)
for group in self.param_groups:
for p in group['params']:
grad =
state = self.state[p]
state['step'] = 0
if len(grad.size())==4:
state['sum'] = torch.ones_like(grad.view(grad.size()[0],-1)).mul_(initial_accumulator_value)
state['sum'] = torch.ones_like(grad).mul_(initial_accumulator_value)
if group['momentum'] > 0:
state['momentum_buffer'] = torch.zeros_like(grad)
def share_memory(self):
for group in self.param_groups:
for p in group['params']:
state = self.state[p]
def step(self, closure=None):
"""Performs a single optimization step.
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
loss = None
if closure is not None:
loss = closure()
for group in self.param_groups:
for p in group['params']:
if p.grad is None:
grad =
state = self.state[p]
state['step'] += 1
clr = group['lr'] / (1 + (state['step'] - 1) * group['lr_decay'])
# grad.is_sparse TODO not yet implement
if len(grad.size()) == 4:
output_channel, input_channel, filter_h, filter_w = grad.size()
# Note you could also try:
# grad_sqaure = grad.view(output_channel*input_channel,-1).norm(dim=1,p=2)**2
grad_sqaure = grad.view(output_channel,-1).norm(dim=1,p=2)**2
elif len(grad.size()) == 2:
output_channel, input_channel = grad.size()
grad_sqaure = grad.norm(dim=1,p=2)**2
## a tricky way to find the learning rate to update the bias term
state['sum'].addcmul_(1.0, grad, grad)
state['sum'].addcmul_(1.0, grad, grad)
std = state['sum'].sqrt().add_(1e-10).view(*grad.shape)
if group['momentum'] > 0:
buf = state['momentum_buffer']
buf.mul_(group['momentum']).add_(1-group['momentum'], grad), buf, std)
else:, grad, std)
if group['weight_decay'] != 0:
raise RuntimeError("weight_decay option is not compatible with sparse gradients")*group['weight_decay'], grad)
return loss
You can’t perform that action at this time.