In [None]:
import numpy as np

# 需要了解的知识：
# 指数移动平均EMA: w_t = β·w_{t-1} + (1-β)·x_t （x_t 为当前时刻的观测值）

class EMA():
    """Exponential moving average over explicitly registered named values.

    Each name must be seeded with ``register`` before ``update`` is called
    for it. The running averages live in the ``shadow`` dict.

    Args:
        decay: Weight given to the previous average; the new observation
            is weighted by ``1 - decay``.
    """

    def __init__(self, decay):
        self.shadow = {}
        self.decay = decay

    def register(self, name, val):
        """Store ``val`` as the starting average for ``name``."""
        self.shadow[name] = val

    def get(self, name):
        """Return the current average for ``name`` (KeyError if unknown)."""
        return self.shadow[name]

    def update(self, name, x):
        """Blend observation ``x`` into the average for ``name``.

        The new average is printed and then stored back in ``shadow``.
        An AssertionError is raised when ``name`` was never registered.
        """
        assert name in self.shadow
        previous = self.shadow[name]
        blended = (1.0 - self.decay) * x + self.decay * previous
        print(blended)
        self.shadow[name] = blended
        
# 参考：https://blog.csdn.net/apodx/article/details/124646664       
              
class EMA:
    """Exponential moving average that seeds each series on its first update.

    No explicit registration step is needed: the first value passed for a
    given name becomes that name's initial average.

    Args:
        delay: Decay factor, i.e. the weight given to the previous average
            (the new observation is weighted by ``1 - delay``). The
            parameter keeps its original spelling for backward
            compatibility; it is stored as ``self.decay``.
    """

    def __init__(self,delay):
        # Store the argument itself; the previous body read an undefined
        # name instead of the `delay` parameter.
        self.decay = delay
        self.pre_w = {}

    def update(self, name, x):
        """Blend observation ``x`` into the running average for ``name``.

        The new average is printed and stored in ``self.pre_w[name]``.

        Args:
            name: Key identifying the series being averaged.
            x: New observation.
        """
        # Test the variable `name`, not the string literal 'name'; otherwise
        # the stored average is overwritten with x on every call and no
        # averaging ever happens.
        if name not in self.pre_w:
            self.pre_w[name] = x
        ema = (1.0 - self.decay) * x + self.decay * self.pre_w[name]
        print(ema)
        self.pre_w[name] = ema

        
# 带偏差修正的指数加权平均
'''
在机器学习中，多数的指数加权平均运算并不会使用偏差修正，因为大多数人更愿意在初始阶段用一个稍带偏差的值进行运算。
不过，如果关心初始阶段的偏差，那么在指数加权移动平均仍处于预热阶段时，偏差修正可以给出更好的估计。
'''

In [1]:
# SGD 随机梯度下降: w = w-lr*grad(w)

class SGD:
    """Plain stochastic gradient descent: ``w <- w - lr * grad(w)``.

    Args:
        lr: Learning rate (step size) applied to every gradient.
    """

    def __init__(self,lr=0.01):
        self.lr = lr

    def update(self,params,grads):
        """Apply one SGD step to every entry of ``params``.

        Args:
            params: Dict mapping parameter name to its current value; the
                entries are updated in this dict.
            grads: Dict mapping the same names to their gradients.

        Raises:
            KeyError: If a name in ``params`` has no entry in ``grads``.
        """
        # dict exposes `.keys()`, not `.key()`; iterating the dict directly
        # yields its keys and is safe here because no key is added/removed.
        for key in params:
            params[key] -= self.lr * grads[key]

In [2]:
# 使用d2l画图
from d2l import torch as d2l

# 参考：https://blog.csdn.net/tcn760/article/details/123965374


In [3]:
# momentum
class Momentum:
    """SGD with momentum.

    Update rule per parameter::

        v <- β * v + lr * grad
        w <- w - v

    Args:
        lr: Learning rate.
        β: Momentum coefficient, the fraction of the previous velocity
            carried into the next step.
    """

    def __init__(self,lr,β=0.9):
        self.lr = lr
        self.β = β
        self.v = None  # velocity per parameter, created on first update

    def update(self,params,grads):
        """Apply one momentum step to every entry of ``params``.

        Args:
            params: Dict mapping parameter name to its value; entries are
                updated in this dict.
            grads: Dict mapping the same names to their gradients.
        """
        if self.v is None:
            # Start every velocity at zero, shaped like its parameter;
            # later calls keep accumulating into the same entries.
            self.v = {key: np.zeros_like(val) for key, val in params.items()}

        # dict exposes `.keys()`, not `.key()`; iterate the dict directly.
        for key in params:
            self.v[key] = self.β * self.v[key] + self.lr * grads[key]
            params[key] -= self.v[key]

In [4]:
# Adagrad:引入了过去的梯度的平方之和进行优化
class AdaGrad:
    """AdaGrad: scale each step by the accumulated sum of squared gradients.

    Update rule per parameter::

        gg <- gg + grad * grad
        w  <- w - lr * grad / sqrt(gg + 1e-8)

    Args:
        lr: Learning rate.
    """

    def __init__(self,lr):
        self.lr = lr
        self.gg = None  # running sum of squared gradients per parameter

    def update(self,params,grads):
        """Apply one AdaGrad step to every entry of ``params``.

        Args:
            params: Dict mapping parameter name to its value; entries are
                updated in this dict.
            grads: Dict mapping the same names to their gradients.
        """
        if self.gg is None:
            # Start every accumulator at zero, shaped like its parameter;
            # later calls keep adding to the same entries.
            self.gg = {key: np.zeros_like(val) for key, val in params.items()}

        # dict exposes `.keys()`, not `.key()`; iterate the dict directly.
        for key in params:
            # Use the `grads` argument; the optimizer has no `self.grads`.
            self.gg[key] += grads[key] * grads[key]
            # 1e-8 inside the sqrt avoids division by zero for zero gradients.
            params[key] -= self.lr * grads[key] / np.sqrt(self.gg[key] + 1e-8)


# Backward-compatible alias: this cell originally defined the class under
# the name `Momentum`.
Momentum = AdaGrad

In [6]:
# RMSprop:利用了历史梯度的平方的EMA
class RMSProp:
    """RMSprop: scale each step by an EMA of the squared gradients.

    Update rule per parameter::

        v_gg <- β * v_gg + (1 - β) * grad * grad
        w    <- w - lr * grad / sqrt(v_gg + 1e-8)

    Args:
        lr: Learning rate.
        β: Decay factor of the squared-gradient moving average.
    """

    def __init__(self,lr=0.1,β=0.9):
        self.lr = lr
        self.β = β
        self.v_gg = None  # EMA of squared gradients per parameter

    def update(self,params,grads):
        """Apply one RMSprop step to every entry of ``params``.

        Args:
            params: Dict mapping parameter name to its value; entries are
                updated in this dict.
            grads: Dict mapping the same names to their gradients.
        """
        if self.v_gg is None:
            # Start every average at zero, shaped like its parameter;
            # later calls keep updating the same entries.
            self.v_gg = {key: np.zeros_like(val) for key, val in params.items()}

        # dict exposes `.keys()`, not `.key()`; iterate the dict directly.
        for key in params:
            squared = grads[key] * grads[key]
            self.v_gg[key] = self.β * self.v_gg[key] + (1 - self.β) * squared
            # 1e-8 inside the sqrt avoids division by zero for zero gradients.
            params[key] -= self.lr * grads[key] / np.sqrt(self.v_gg[key] + 1e-8)


# Backward-compatible alias: this cell originally defined the class under
# the name `Momentum`.
Momentum = RMSProp

In [None]:
# Adam:结合了momentum和RMSprop
class Adam:
    """Adam: momentum-style first moment combined with RMSprop-style scaling.

    Update rule per parameter::

        m    <- β1 * m    + (1 - β1) * grad
        v_gg <- β2 * v_gg + (1 - β2) * grad * grad
        w    <- w - lr * m / sqrt(v_gg + 1e-8)

    No bias correction is applied to ``m`` or ``v_gg``.

    Args:
        lr: Learning rate.
        β1: Decay factor of the gradient moving average (first moment).
        β2: Decay factor of the squared-gradient moving average.
    """

    def __init__(self,lr=0.1,β1=0.9,β2=0.999):
        self.lr = lr
        self.β1 = β1
        self.β2 = β2
        self.m = None     # EMA of gradients per parameter
        self.v_gg = None  # EMA of squared gradients per parameter

    def update(self,params,grads):
        """Apply one Adam step to every entry of ``params``.

        Args:
            params: Dict mapping parameter name to its value; entries are
                updated in this dict.
            grads: Dict mapping the same names to their gradients.
        """
        # Both moment estimates start at zero, shaped like their parameter;
        # later calls keep updating the same entries.
        if self.v_gg is None:
            self.v_gg = {key: np.zeros_like(val) for key, val in params.items()}
        if self.m is None:
            self.m = {key: np.zeros_like(val) for key, val in params.items()}

        # dict exposes `.keys()`, not `.key()`; iterate the dict directly.
        for key in params:
            grad = grads[key]
            # β1 must be read from the instance; there is no local `β1` here.
            self.m[key] = self.β1 * self.m[key] + (1 - self.β1) * grad
            self.v_gg[key] = self.β2 * self.v_gg[key] + (1 - self.β2) * grad * grad
            # 1e-8 inside the sqrt avoids division by zero for zero gradients.
            params[key] -= self.lr * self.m[key] / np.sqrt(self.v_gg[key] + 1e-8)


# Backward-compatible alias: this cell originally defined the class under
# the name `Momentum`.
Momentum = Adam