# Adadelta
Adadelta 算是 Adagrad 法的延伸，它跟 RMSProp 一样，都是为了解决 Adagrad 中学习率不断减小的问题。RMSProp 是通过指数加权移动平均的方式来解决的，而 Adadelta 采用的是另一种方法，有趣的是，它并不需要学习率这个参数。

## Adadelta 法
Adadelta 跟 RMSProp 一样，先使用移动平均来计算 s

$$
s = \rho s + (1 - \rho) g^2
$$

这里的 $\rho$ 和 RMSProp 中的 $\alpha$ 一样，都是移动平均系数，$g$ 是参数的梯度。然后我们计算参数的更新量 $g'$

$$
g' = \frac{\sqrt{\Delta \theta + \epsilon}}{\sqrt{s + \epsilon}} g
$$

$\Delta \theta$ 初始为 0 张量，每一步做如下的指数加权移动平均更新

$$
\Delta \theta = \rho \Delta \theta + (1 - \rho) g'^2
$$

最后参数更新如下

$$
\theta = \theta - g'
$$

下面我们实现一下 Adadelta

In [None]:
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function

import tensorflow as tf
import numpy as np
import time
import tensorflow.examples.tutorials.mnist.input_data as input_data

import sys
sys.path.append('..')
from utils.layers import hidden_layer, DNN

# Fix the graph-level random seed (before any ops are created) so that
# randomly initialised variables are repeatable from run to run.
tf.set_random_seed(2017)

In [None]:
# Load the MNIST data from ../MNIST_data with one-hot encoded labels.
mnist = input_data.read_data_sets('../MNIST_data', one_hot=True)

# Short handles for the training and test splits.
train_set = mnist.train
test_set = mnist.test

In [None]:
# Define the model.
# Placeholders for a batch of inputs (784 features per row) and the matching
# 10-column label rows; the batch dimension is left unspecified.
input_ph = tf.placeholder(shape=(None, 784), dtype=tf.float32)
label_ph = tf.placeholder(shape=(None, 10), dtype=tf.int64)

# Build the network with the project helper `DNN`, asking it to register its
# weights and biases in the 'params' collection (fetched again further down).
# NOTE(review): presumably [200] means one hidden layer of 200 units - confirm
# against utils.layers.
dnn = DNN(input_ph, [200], weights_collection='params', biases_collection='params')

In [None]:
# Build `loss` and `acc`.
# Softmax cross-entropy between the network output (used as logits) and the
# one-hot labels.
loss = tf.losses.softmax_cross_entropy(logits=dnn, onehot_labels=label_ph)

# Accuracy: fraction of rows whose arg-max prediction equals the arg-max label.
acc = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(dnn, axis=-1), tf.argmax(label_ph, axis=-1)), dtype=tf.float32))

In [None]:
# Get the gradients.
# Everything stored in the 'params' collection is treated as a trainable
# parameter of the model.
params = tf.get_collection('params')

# One gradient tensor of the loss per entry of `params`, in the same order.
gradients = tf.gradients(loss, params)

### 定义`Adadelta`更新算法

In [None]:
def adadelta_update(params, gradients, sqrs, deltas, rho, name='adadelta_update', eps=1e-6):
    """Build a single op that applies one Adadelta step to every parameter.

    For each (param, gradient, sqr, delta) tuple the graph performs, in order:

        sqr   <- rho * sqr + (1 - rho) * gradient^2
        step   = sqrt(delta + eps) / sqrt(sqr + eps) * gradient
        delta <- rho * delta + (1 - rho) * step^2
        param <- param - step

    Note that `delta` accumulates the squared *step*, not the squared
    gradient, and that `step` is computed from the value of `delta` before
    this iteration's update.

    Args:
        params: iterable of variables to update.
        gradients: iterable of gradient tensors, aligned with `params`.
        sqrs: iterable of variables holding the moving average of squared
            gradients, aligned with `params`.
        deltas: iterable of variables holding the moving average of squared
            steps, aligned with `params`.
        rho: moving-average coefficient shared by both accumulators.
        name: name given to the grouped update op.
        eps: small constant added inside both square roots for numerical
            stability.

    Returns:
        The op created by `tf.group` over all per-parameter updates.
    """
    update_ops = []
    for param, gradient, sqr, delta in zip(params, gradients, sqrs, deltas):
        # Use the tensor returned by `assign` so the step is guaranteed to see
        # the freshly updated moving average of squared gradients.
        new_sqr = sqr.assign(rho * sqr + (1 - rho) * tf.square(gradient))
        step = tf.sqrt(delta + eps) / tf.sqrt(new_sqr + eps) * gradient

        # Only overwrite `delta` once the step (which reads the old `delta`)
        # has been computed, and accumulate the squared step.
        with tf.control_dependencies([step]):
            delta_update = delta.assign(rho * delta + (1 - rho) * tf.square(step))

        # Apply the step after both accumulators have been written.
        with tf.control_dependencies([delta_update]):
            update_ops.append(param.assign_sub(step))

    return tf.group(*update_ops, name=name)

### 定义辅助变量

In [None]:
# Create, for every parameter, one zero-initialised float32 variable of the
# same shape under the 'sqrs' scope and one under the 'deltas' scope, and
# register each in the collection named after its scope. All 'sqrs' variables
# are created before any 'deltas' variable.
for scope_name in ('sqrs', 'deltas'):
    with tf.variable_scope(scope_name):
        for param in params:
            v = tf.get_variable(param.op.name, shape=param.get_shape(), initializer=tf.zeros_initializer(), dtype=tf.float32)
            tf.add_to_collection(scope_name, v)

In [None]:
# Fetch the auxiliary variables back from their collections; each list is in
# the order the variables were added, i.e. aligned with `params`.
sqrs = tf.get_collection('sqrs')
deltas = tf.get_collection('deltas')

### 使用adadelta定义更新`op`

In [None]:
# Build the Adadelta update op with moving-average coefficient rho = 0.9.
update_op = adadelta_update(params, gradients, sqrs, deltas, 0.9)

In [None]:
# Session used by the cells below to run the graph.
sess = tf.InteractiveSession()

### 开始训练

In [None]:
batch_size = 64

# Initialise every variable in the graph before training starts.
sess.run(tf.global_variables_initializer())

# Mini-batch losses sampled every 30 steps; plotted after training.
train_losses = []

epoch = 0
samples_passed = 0  # samples consumed so far in the current epoch
epoch_done = False
step = 0

_start = time.time()
while (epoch < 5):
    # If the next full batch would reach the end of the training set, take
    # only the remaining samples, reset the counter and flag the epoch as done.
    if samples_passed + batch_size >= mnist.train.num_examples:
        this_batch = mnist.train.num_examples - samples_passed
        samples_passed = 0
        epoch += 1
        epoch_done = True
    else:
        samples_passed += batch_size
        this_batch = batch_size
        
    # Fetch the next mini-batch (`this_batch` training samples).
    images, labels = train_set.next_batch(this_batch)
    if epoch_done:
        # Compute the loss over the training samples: average the loss of
        # `num_examples // 100` batches of 100 samples each.
        train_loss = []
        for _ in range(train_set.num_examples // 100):
            image, label = train_set.next_batch(100)
            loss_train = sess.run(loss, feed_dict={input_ph: image, label_ph: label})
            train_loss.append(loss_train)

        print('Epoch {} Train loss: {:.6f}'.format(epoch, np.array(train_loss).mean()))
        epoch_done = False
        
    # Record the training loss of the current batch every 30 steps.
    if step % 30 == 0:
        loss_train = sess.run(loss, feed_dict={input_ph: images, label_ph: labels})
        train_losses.append(loss_train)
        
    # Apply one hand-written Adadelta update on the current batch.
    sess.run(update_op, feed_dict={input_ph: images, label_ph: labels})
    step += 1

_end = time.time()
print('Train Done! Cost Time: {:.2f}s'.format(_end - _start))

我们来看看结果

In [None]:
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Spread the recorded losses evenly over [0, 5] (the 5 training epochs) and
# plot them with a logarithmic y axis.
x_axis = np.linspace(0, 5, len(train_losses), endpoint=True)
plt.semilogy(x_axis, train_losses, label='adadelta')
plt.legend(loc='best')

### tf.train.AdadeltaOptimizer
`tensorflow`中也集成了`Adadelta`方法

In [None]:
# Training op built with TensorFlow's own Adadelta optimizer
# (learning_rate=1.0, rho=0.9) minimising the same `loss`.
train_op = tf.train.AdadeltaOptimizer(learning_rate=1.0, rho=0.9).minimize(loss)

In [None]:
# Re-initialise every variable in the graph, so this run starts from freshly
# initialised parameters rather than from the previously trained ones.
sess.run(tf.global_variables_initializer())

# Mini-batch losses sampled every 30 steps for the built-in optimizer run.
train_losses1 = []

epoch = 0
samples_passed = 0  # samples consumed so far in the current epoch
epoch_done = False
step = 0

_start = time.time()
while (epoch < 5):
    # If the next full batch would reach the end of the training set, take
    # only the remaining samples, reset the counter and flag the epoch as done.
    if samples_passed + batch_size >= mnist.train.num_examples:
        this_batch = mnist.train.num_examples - samples_passed
        samples_passed = 0
        epoch += 1
        epoch_done = True
    else:
        samples_passed += batch_size
        this_batch = batch_size
        
    # Fetch the next mini-batch (`this_batch` training samples).
    images, labels = train_set.next_batch(this_batch)
    if epoch_done:
        # Compute the loss over the training samples: average the loss of
        # `num_examples // 100` batches of 100 samples each.
        train_loss = []
        for _ in range(train_set.num_examples // 100):
            image, label = train_set.next_batch(100)
            loss_train = sess.run(loss, feed_dict={input_ph: image, label_ph: label})
            train_loss.append(loss_train)

        print('Epoch {} Train loss: {:.6f}'.format(epoch, np.array(train_loss).mean()))
        epoch_done = False
        
    # Record the training loss of the current batch every 30 steps.
    if step % 30 == 0:
        loss_train = sess.run(loss, feed_dict={input_ph: images, label_ph: labels})
        train_losses1.append(loss_train)
        
    # Apply one update of the built-in Adadelta optimizer on the current batch.
    sess.run(train_op, feed_dict={input_ph: images, label_ph: labels})
    step += 1

_end = time.time()
print('Train Done! Cost Time: {:.2f}s'.format(_end - _start))