# AdamW Optimization

References:
https://www.ruder.io/optimizing-gradient-descent/

https://medium.com/%E9%9B%9E%E9%9B%9E%E8%88%87%E5%85%94%E5%85%94%E7%9A%84%E5%B7%A5%E7%A8%8B%E4%B8%96%E7%95%8C/%E6%A9%9F%E5%99%A8%E5%AD%B8%E7%BF%92ml-note-sgd-momentum-adagrad-adam-optimizer-f20568c968db

https://gaomj.cn/sgd-adam/

Claude-3.5-Sonnet 提供初版代码，改写成非自动求导版本

In [4]:
import torch
torch.manual_seed(42)

# model
# y = x @ w

# evaluation
# loss = 0.5 ||x @ w - label||^2


class Adam:
    """Hand-written Adam / AdamW update rule for a single weight tensor.

    The optimizer does not use autograd: the caller computes the gradient
    and passes it to :meth:`step`, which returns the updated weights.

    Args:
        params: Weight tensor whose shape is used to allocate the moment
            buffers. It is also stored as ``self.w``.
        lr: Learning rate.
        betas: Decay rates ``(beta1, beta2)`` for the first- and
            second-moment running averages.
        eps: Constant added to the denominator to avoid division by zero.
    """

    def __init__(
        self,
        params: torch.Tensor,
        lr: float = 1e-3,
        betas: tuple = (0.9, 0.999),
        eps: float = 1e-8,
    ) -> None:
        # Use the constructor argument, not a module-level global, so the
        # class works regardless of what names exist in the caller's scope.
        self.w = params
        self.lr = lr
        self.beta1, self.beta2 = betas
        self.eps = eps

        # Running averages of the gradient and of the squared gradient.
        self.m = torch.zeros_like(params)
        self.v = torch.zeros_like(params)
        # Number of steps taken so far; drives the bias correction.
        self.t = 0

    def step(
        self,
        w: torch.Tensor,
        grad: torch.Tensor,
        weight_decay=1e-2,
    ) -> torch.Tensor:
        """Apply one update and return the new weights.

        Args:
            w: Current weights.
            grad: Gradient of the loss with respect to ``w``.
            weight_decay: Decoupled weight-decay coefficient (AdamW). Pass
                ``None`` to get the plain Adam update.

        Returns:
            The updated weight tensor; ``w`` itself is not modified.
        """
        self.t += 1

        # First-moment estimate: smooths the update direction.
        self.m = self.beta1 * self.m + (1 - self.beta1) * grad
        # Second-moment estimate: scales the per-element step size.
        self.v = self.beta2 * self.v + (1 - self.beta2) * grad.pow(2)

        # Bias correction compensates for the zero-initialised buffers.
        m_hat = self.m / (1 - self.beta1 ** self.t)
        v_hat = self.v / (1 - self.beta2 ** self.t)

        if weight_decay is not None:
            # AdamW: the decay term is added to the step directly instead
            # of being folded into the gradient.
            return w - self.lr * (
                m_hat / (v_hat.sqrt() + self.eps) + weight_decay * w
            )

        return w - self.lr * m_hat / (v_hat.sqrt() + self.eps)

# Randomly initialised weights of the linear model y = x @ w.
w = torch.randn((10, 1))
optimizer = Adam(w)
# Random inputs (8 samples, 10 features) and random regression targets.
input_data = torch.randn((8, 10))
target = torch.randn((8, 1))

def loss_function(pred, label):
    """Return the element-wise squared error scaled by ``0.5 / b``.

    ``b`` is the size of the first dimension of ``pred``, which must be
    two-dimensional. No reduction is applied here; the caller reduces the
    result (for example with ``.mean()``).
    """
    rows, _cols = pred.shape
    scale = 0.5 / rows
    return scale * (pred - label).pow(2.0)

for epoch in range(1000):
    # Forward pass of the linear model.
    output = input_data @ w

    # Report the mean loss every 100 epochs.
    if epoch % 100 == 0:
        print(loss_function(output, target).mean())

    # Analytic gradient of 0.5 * ||x @ w - target||^2 with respect to w.
    grad = input_data.T @ (output - target)

    # A numeric weight_decay selects the AdamW update; call
    # optimizer.step(w, grad, weight_decay=None) instead for plain Adam.
    w = optimizer.step(w, grad, weight_decay=1e-2)
    