# Adam
Adam 是一个结合了动量法和 RMSProp 的优化算法，兼具两者的优点。

## Adam 算法
Adam 算法会使用一个动量变量 v 和一个 RMSProp 中的梯度元素平方的指数加权移动平均 s，首先将它们全部初始化为 0，然后在每次迭代中，用当前梯度 g 计算它们的指数加权移动平均进行更新

$$
v = \beta_1 v + (1 - \beta_1) g \\
s = \beta_2 s + (1 - \beta_2) g^2
$$

在 adam 算法里，为了减轻 v 和 s 被初始化为 0 的初期对计算指数加权移动平均的影响，每次 v 和 s 都做下面的修正

$$
\hat{v} = \frac{v}{1 - \beta_1^t} \\
\hat{s} = \frac{s}{1 - \beta_2^t}
$$

这里 t 是迭代次数（从 1 开始），可以看到，当 $0 \leq \beta_1, \beta_2 < 1$ 的时候，迭代到后期 t 比较大，那么 $\beta_1^t$ 和 $\beta_2^t$ 就几乎为 0，修正项的分母接近 1，就不会对 v 和 s 有任何影响了，算法作者建议 $\beta_1 = 0.9$, $\beta_2 = 0.999$。

最后使用修正之后的 $\hat{v}$ 和 $\hat{s}$ 进行学习率的重新计算

$$
g' = \frac{\eta \hat{v}}{\sqrt{\hat{s} + \epsilon}}
$$

这里 $\eta$ 是学习率，$\epsilon$ 仍然是为了数值稳定性而添加的常数，最后参数更新有

$$
\theta_t = \theta_{t-1} - g'
$$

下面我们来实现一下 Adam 算法

In [None]:
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function

import tensorflow as tf
import numpy as np
import time
import tensorflow.examples.tutorials.mnist.input_data as input_data

import sys
sys.path.append('..')
from utils.layers import hidden_layer, DNN

# Set TensorFlow's graph-level random seed to a fixed value.
tf.set_random_seed(2017)

In [None]:
# Load MNIST with one-hot labels and keep handles to the train/test splits
mnist = input_data.read_data_sets('../MNIST_data', one_hot=True)

train_set, test_set = mnist.train, mnist.test

In [None]:
# Define the model: a placeholder for 784-wide inputs, a placeholder for
# 10-wide labels, and the network built on top of the input placeholder
input_ph = tf.placeholder(tf.float32, shape=(None, 784))
label_ph = tf.placeholder(tf.int64, shape=(None, 10))

dnn = DNN(
    input_ph,
    [200],
    weights_collection='params',
    biases_collection='params',
)

In [None]:
# Build `loss` and `acc`
loss = tf.losses.softmax_cross_entropy(onehot_labels=label_ph, logits=dnn)

# Accuracy is the fraction of rows whose arg-max over the model output
# matches the arg-max over the label row
predicted_class = tf.argmax(dnn, axis=-1)
labelled_class = tf.argmax(label_ph, axis=-1)
is_correct = tf.cast(tf.equal(predicted_class, labelled_class), dtype=tf.float32)
acc = tf.reduce_mean(is_correct)

In [None]:
# Fetch everything registered in the 'params' collection and build the
# gradient of the loss with respect to each entry
params = tf.get_collection('params')
gradients = tf.gradients(ys=loss, xs=params)

### 定义`Adam`更新算法

In [None]:
def adam_update(params, gradients, vs, sqrs, lr, t, beta1=0.9, beta2=0.999, name='adam_update'):
    """Build one grouped op that applies an Adam step to every parameter.

    For each ``(param, gradient, v, sqr)`` tuple the op overwrites the
    first-moment accumulator ``v`` and the second-moment accumulator ``sqr``
    with their new exponentially weighted averages, divides both by the
    ``1 - beta ** t`` bias-correction term, and subtracts
    ``lr * v_hat / sqrt(s_hat + eps)`` from the parameter.

    Args:
        params: variables to update.
        gradients: gradients of the loss, one per entry of ``params``.
        vs: first-moment accumulator variables, one per parameter.
        sqrs: second-moment accumulator variables, one per parameter.
        lr: learning rate.
        t: iteration count used in the bias correction. Anything that
            supports ``beta ** t`` works (a Python number, or presumably a
            scalar tensor -- confirm against the caller).
        beta1: decay rate of the first-moment average.
        beta2: decay rate of the second-moment average.
        name: name given to the grouped update op.

    Returns:
        The op returned by ``tf.group`` over all per-parameter updates.
    """
    eps = 1e-8

    update_ops = []
    for param, gradient, v, sqr in zip(params, gradients, vs, sqrs):
        v_update = v.assign(beta1 * v + (1 - beta1) * gradient)
        sqr_update = sqr.assign(beta2 * sqr + (1 - beta2) * tf.square(gradient))
        with tf.control_dependencies([v_update, sqr_update]):
            # Read the moments through the tensors returned by `assign`, not
            # through the variables themselves: a control dependency alone
            # does not force a fresh read of a variable, so using `v`/`sqr`
            # here could pick up the values from before this step's update.
            v_hat = v_update / (1 - beta1 ** t)
            s_hat = sqr_update / (1 - beta2 ** t)
            update_ops.append(param.assign_sub(lr * v_hat / tf.sqrt(s_hat + eps)))

    update_op = tf.group(*update_ops, name=name)
    return update_op

### 定义辅助变量

In [None]:
# Create one zero-initialised accumulator per parameter for each of the two
# Adam moments and register it in its own collection. These variables are
# optimiser state rather than model weights, so they are created with
# trainable=False to keep them out of tf.trainable_variables().
with tf.variable_scope('sqrs'):
    for param in params:
        sqr = tf.get_variable(param.op.name, shape=param.get_shape(), initializer=tf.zeros_initializer(), dtype=tf.float32, trainable=False)
        tf.add_to_collection('sqrs', sqr)

with tf.variable_scope('vs'):
    for param in params:
        v = tf.get_variable(param.op.name, shape=param.get_shape(), initializer=tf.zeros_initializer(), dtype=tf.float32, trainable=False)
        tf.add_to_collection('vs', v)

In [None]:
# Read back the accumulator variables registered in the 'sqrs' and 'vs' collections.
sqrs = tf.get_collection('sqrs')
vs = tf.get_collection('vs')

### 使用adam定义更新`op`

In [None]:
# Adam's bias correction divides by (1 - beta ** t), so t has to advance with
# every update. A Python constant would be baked into the graph once and the
# correction would stay frozen at its first-step value. Keep the counter in a
# non-trainable variable instead; evaluating `t` increments it and yields the
# new value, which happens once per run of the update op.
adam_step = tf.Variable(0.0, trainable=False, name='adam_step')
t = tf.assign_add(adam_step, 1.0)

update_op = adam_update(params, gradients, vs, sqrs, 1e-3, t)

In [None]:
# Open the session used by all the training cells below.
sess = tf.InteractiveSession()

### 开始训练

In [None]:
batch_size = 64

sess.run(tf.global_variables_initializer())

train_losses = []

epoch, samples_passed, step = 0, 0, 0
epoch_done = False

_start = time.time()
while epoch < 5:
    # Size of this step's batch: the last batch of an epoch takes only the
    # samples that are left and rolls the epoch counters over.
    remaining = mnist.train.num_examples - samples_passed
    if remaining <= batch_size:
        this_batch = remaining
        samples_passed = 0
        epoch += 1
        epoch_done = True
    else:
        this_batch = batch_size
        samples_passed += batch_size

    # Fetch this step's training samples
    images, labels = train_set.next_batch(this_batch)
    batch_feed = {input_ph: images, label_ph: labels}

    if epoch_done:
        # Average the loss over batches of 100 drawn from the training set
        train_loss = []
        for _ in range(train_set.num_examples // 100):
            image, label = train_set.next_batch(100)
            train_loss.append(sess.run(loss, feed_dict={input_ph: image, label_ph: label}))

        print('Epoch {} Train loss: {:.6f}'.format(epoch, np.asarray(train_loss).mean()))
        epoch_done = False

    # Record the loss on the current batch every 30 steps
    if step % 30 == 0:
        train_losses.append(sess.run(loss, feed_dict=batch_feed))

    sess.run(update_op, feed_dict=batch_feed)
    step += 1

_end = time.time()
print('Train Done! Cost Time: {:.2f}s'.format(_end - _start))

我们来看看结果

In [None]:
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Spread the recorded losses evenly over the interval [0, 5] and draw them
# with a logarithmic y axis.
x_axis = np.linspace(0, 5, len(train_losses), endpoint=True)
plt.semilogy(x_axis, train_losses, label='adam')
plt.legend(loc='best')

### tf.train.AdamOptimizer
`tensorflow`中也集成了`Adam`方法

In [None]:
# Build the training op from TensorFlow's built-in Adam optimizer
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
train_op = optimizer.minimize(loss)

In [None]:
sess.run(tf.global_variables_initializer())

train_losses1 = []

epoch, samples_passed, step = 0, 0, 0
epoch_done = False

_start = time.time()
while epoch < 5:
    # Size of this step's batch: the last batch of an epoch takes only the
    # samples that are left and rolls the epoch counters over.
    remaining = mnist.train.num_examples - samples_passed
    if remaining <= batch_size:
        this_batch = remaining
        samples_passed = 0
        epoch += 1
        epoch_done = True
    else:
        this_batch = batch_size
        samples_passed += batch_size

    # Fetch this step's training samples
    images, labels = train_set.next_batch(this_batch)
    batch_feed = {input_ph: images, label_ph: labels}

    if epoch_done:
        # Average the loss over batches of 100 drawn from the training set
        train_loss = []
        for _ in range(train_set.num_examples // 100):
            image, label = train_set.next_batch(100)
            train_loss.append(sess.run(loss, feed_dict={input_ph: image, label_ph: label}))

        print('Epoch {} Train loss: {:.6f}'.format(epoch, np.asarray(train_loss).mean()))
        epoch_done = False

    # Record the loss on the current batch every 30 steps
    if step % 30 == 0:
        train_losses1.append(sess.run(loss, feed_dict=batch_feed))

    sess.run(train_op, feed_dict=batch_feed)
    step += 1

_end = time.time()
print('Train Done! Cost Time: {:.2f}s'.format(_end - _start))