diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index c57895b460afc1..415b8926653f6a 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -41,6 +41,28 @@ struct ApplyGradientDescent {
   }
 };
 
+template <typename T>
+struct ApplyAdadelta<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat accum,
+                  typename TTypes<T>::Flat update_accum,
+                  typename TTypes<T>::ConstScalar decay_rate,
+                  typename TTypes<T>::ConstScalar epsilon,
+                  typename TTypes<T>::ConstFlat grad) {
+    if (DoInline(var.size())) {
+      accum = accum * decay_rate() + grad.square() * (1 - decay_rate());
+      auto update = (update_accum + epsilon()).sqrt() * (accum + epsilon()).rsqrt() * grad;
+      update_accum = update_accum * decay_rate() + update.square() * (1 - decay_rate());
+      var -= update;
+    } else {
+      accum.device(d) = accum * decay_rate() + grad.square() * (1 - decay_rate());
+      auto update = (update_accum + epsilon()).sqrt() * (accum + epsilon()).rsqrt() * grad;
+      update_accum.device(d) = update_accum * decay_rate() + update.square() * (1 - decay_rate());
+      var.device(d) -= update;
+    }
+  }
+};
+
 template <typename T>
 struct ApplyAdagrad<CPUDevice, T> {
   void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
@@ -202,6 +224,126 @@ REGISTER_KERNELS(GPU, double);
 #endif
 #undef REGISTER_KERNELS
 
+template <typename Device, typename T>
+class ApplyAdadeltaOp : public OpKernel {
+ public:
+  explicit ApplyAdadeltaOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    if (use_exclusive_lock_) {
+      mutex_lock l1(*ctx->input_ref_mutex(0));
+      // Don't try to acquire a lock on the second ref as they share the same
+      // mutex.
+      //
+      // mutex_lock l2(*ctx->input_ref_mutex(1));
+      DoValidate(ctx);
+      if (!ctx->status().ok()) return;
+      DoCompute(ctx);
+    } else {
+      DoValidate(ctx);
+      if (!ctx->status().ok()) return;
+      DoCompute(ctx);
+    }
+    ctx->forward_ref_input_to_ref_output(0, 0);
+  }
+
+ private:
+  bool use_exclusive_lock_;
+
+  void DoValidate(OpKernelContext* ctx) {
+    Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
+    Tensor accum = ctx->mutable_input(1, use_exclusive_lock_);
+    Tensor update_accum = ctx->mutable_input(2, use_exclusive_lock_);
+
+    OP_REQUIRES(
+        ctx, var.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", def().input(0)));
+    OP_REQUIRES(
+        ctx, accum.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", def().input(1)));
+    OP_REQUIRES(
+        ctx, update_accum.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", def().input(2)));
+
+    const Tensor& decay_rate = ctx->input(3);
+    const Tensor& epsilon = ctx->input(4);
+    const Tensor& grad = ctx->input(5);
+
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(decay_rate.shape()),
+                errors::InvalidArgument("decay_rate is not a scalar: ",
+                                        decay_rate.shape().DebugString()));
+
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon.shape()),
+                errors::InvalidArgument("epsilon is not a scalar: ",
+                                        epsilon.shape().DebugString()));
+
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(accum.shape()),
+        errors::InvalidArgument("var and accum do not have the same shape",
+                                var.shape().DebugString(), " ",
+                                accum.shape().DebugString()));
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(grad.shape()),
+        errors::InvalidArgument("var and grad do not have the same shape",
+                                var.shape().DebugString(), " ",
+                                grad.shape().DebugString()));
+  }
+
+  void DoCompute(OpKernelContext* ctx) {
+    const Device& device = ctx->template eigen_device<Device>();
+    Tensor var = ctx->mutable_input(0, use_exclusive_lock_);
+    Tensor accum = ctx->mutable_input(1, use_exclusive_lock_);
+    Tensor update_accum = ctx->mutable_input(2, use_exclusive_lock_);
+
+    const Tensor& decay_rate = ctx->input(3);
+    const Tensor& epsilon = ctx->input(4);
+    const Tensor& grad = ctx->input(5);
+
+    functor::ApplyAdadelta<Device, T>()(device, var.flat<T>(), accum.flat<T>(),
+                                        update_accum.flat<T>(), decay_rate.scalar<T>(),
+                                        epsilon.scalar<T>(), grad.flat<T>());
+  }
+};
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+#define REGISTER_KERNELS(D, T)                                         \
+  REGISTER_KERNEL_BUILDER(                                             \
+      Name("ApplyAdadelta").Device(DEVICE_##D).TypeConstraint<T>("T"), \
+      ApplyAdadeltaOp<D##Device, T>);
+
+REGISTER_KERNELS(CPU, float);
+REGISTER_KERNELS(CPU, double);
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T)                               \
+  template <>                                             \
+  void ApplyAdadelta<GPUDevice, T>::operator()(           \
+      const GPUDevice& d, typename TTypes<T>::Flat var,   \
+      typename TTypes<T>::Flat accum,                     \
+      typename TTypes<T>::Flat update_accum,              \
+      typename TTypes<T>::ConstScalar decay_rate,         \
+      typename TTypes<T>::ConstScalar epsilon,            \
+      typename TTypes<T>::ConstFlat grad);                \
+  extern template struct ApplyAdadelta<GPUDevice, T>;
+DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
+#undef DECLARE_GPU_SPEC
+}  // namespace functor
+
+REGISTER_KERNELS(GPU, float);
+REGISTER_KERNELS(GPU, double);
+#endif
+#undef REGISTER_KERNELS
+
 template <typename Device, typename T>
 class ApplyAdagradOp : public OpKernel {
  public:
diff --git a/tensorflow/core/kernels/training_ops.h b/tensorflow/core/kernels/training_ops.h
index 3cc31155cb9fe5..bc88e18aeac43d 100644
--- a/tensorflow/core/kernels/training_ops.h
+++ b/tensorflow/core/kernels/training_ops.h
@@ -33,6 +33,16 @@ struct ApplyGradientDescent {
                   typename TTypes<T>::ConstFlat delta);
 };
 
+template <typename Device, typename T>
+struct ApplyAdadelta {
+  void operator()(const Device& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat accum,
+                  typename TTypes<T>::Flat update_accum,
+                  typename TTypes<T>::ConstScalar decay_rate,
+                  typename TTypes<T>::ConstScalar epsilon,
+                  typename TTypes<T>::ConstFlat grad);
+};
+
 template <typename Device, typename T>
 struct ApplyAdagrad {
   void operator()(const Device& d, typename TTypes<T>::Flat var,
diff --git a/tensorflow/core/ops/training_ops.cc b/tensorflow/core/ops/training_ops.cc
index f156e972c8118c..fa01dc220a8af3 100644
--- a/tensorflow/core/ops/training_ops.cc
+++ b/tensorflow/core/ops/training_ops.cc
@@ -35,6 +35,35 @@ use_locking: If True, the subtraction will be protected by a lock;
   otherwise the behavior is undefined, but may exhibit less contention.
 )doc");
 
+REGISTER_OP("ApplyAdadelta")
+    .Input("var: Ref(T)")
+    .Input("accum: Ref(T)")
+    .Input("update_accum: Ref(T)")
+    .Input("decay_rate: T")
+    .Input("epsilon: T")
+    .Input("grad: T")
+    .Output("out: Ref(T)")
+    .Attr("T: numbertype")
+    .Attr("use_locking: bool = false")
+    .Doc(R"doc(
+Update '*var' according to the adadelta scheme.
+
+accum = decay_rate * accum + (1 - decay_rate) * grad.square();
+update = (update_accum + epsilon).sqrt() * (accum + epsilon).rsqrt() * grad;
+update_accum = decay_rate * update_accum + (1 - decay_rate) * update.square();
+var -= update;
+
+var: Should be from a Variable().
+accum: Should be from a Variable().
+update_accum: Should be from a Variable().
+decay_rate: Scaling factor. Must be a scalar.
+epsilon: Constant factor. Must be a scalar.
+grad: The gradient.
+out: Same as "var".
+use_locking: If True, updating of the var, accum and update_accum tensors will
+  be protected by a lock; otherwise the behavior is undefined, but may exhibit less contention.
+)doc");
+
 REGISTER_OP("ApplyAdagrad")
     .Input("var: Ref(T)")
     .Input("accum: Ref(T)")
diff --git a/tensorflow/python/training/adadelta.py b/tensorflow/python/training/adadelta.py
new file mode 100644
index 00000000000000..a0a734c37cffc5
--- /dev/null
+++ b/tensorflow/python/training/adadelta.py
@@ -0,0 +1,84 @@
+# Copyright 2015 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Adadelta for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import constant_op
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import training_ops
+
+
+class AdadeltaOptimizer(optimizer.Optimizer):
+  """Optimizer that implements the Adadelta algorithm.
+
+  @@__init__
+  """
+
+  def __init__(self, decay_rate=0.001, epsilon=1e-8,
+               use_locking=False, name="Adadelta"):
+    """Construct a new Adadelta optimizer.
+
+    Implementation is based on http://arxiv.org/abs/1212.5701
+
+    Args:
+      decay_rate: A `Tensor` or a floating point value. The decay rate.
+      epsilon: A `Tensor` or a floating point value. A constant epsilon used
+        to better condition the grad update.
+      use_locking: If `True` use locks for update operations.
+      name: Optional name prefix for the operations created when applying
+        gradients. Defaults to "Adadelta".
+    """
+    super(AdadeltaOptimizer, self).__init__(use_locking, name)
+    self._decay_rate = decay_rate
+    self._epsilon = epsilon
+
+    # Tensor versions of the constructor arguments, created in _prepare().
+    self._decay_rate_t = None
+    self._epsilon_t = None
+
+  def _create_slots(self, var_list):
+    for v in var_list:
+      self._zeros_slot(v, "decay_rate", self._name)
+      self._zeros_slot(v, "epsilon", self._name)
+      self._zeros_slot(v, "accum", self._name)
+      self._zeros_slot(v, "update_accum", self._name)
+
+  def _prepare(self):
+    self._decay_rate_t = ops.convert_to_tensor(self._decay_rate,
+                                               name="decay_rate")
+    self._epsilon_t = ops.convert_to_tensor(self._epsilon,
+                                            name="epsilon")
+
+  def _apply_dense(self, grad, var):
+    decay_rate = self.get_slot(var, "decay_rate")
+    epsilon = self.get_slot(var, "epsilon")
+    accum = self.get_slot(var, "accum")
+    update_accum = self.get_slot(var, "update_accum")
+
+    return training_ops.apply_adadelta(
+        var, accum, update_accum,
+        self._decay_rate_t, self._epsilon_t, grad,
+        use_locking=self._use_locking).op
+
+  # def _apply_sparse(self, grad, var):
+  #   mom = self.get_slot(var, "adadelta")
+  #   return training_ops.sparse_apply_adadelta(
+  #       var, mom,
+  #       self._learning_rate_tensor, grad.values, grad.indices,
+  #       self._adadelta_tensor, use_locking=self._use_locking).op
diff --git a/tensorflow/python/training/adadelta_test.py b/tensorflow/python/training/adadelta_test.py
new file mode 100644
index 00000000000000..3426b3f61a2f9c
--- /dev/null
+++ b/tensorflow/python/training/adadelta_test.py
@@ -0,0 +1,83 @@
+# Copyright 2015 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for Adadelta."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow.python.platform
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+import tensorflow as tf
+
+
+class AdadeltaOptimizerTest(tf.test.TestCase):
+
+  def testBasic(self):
+    with self.test_session():
+      var0 = tf.Variable([1.0, 2.0])
+      var1 = tf.Variable([3.0, 4.0])
+      grads0 = tf.constant([0.1, 0.1])
+      grads1 = tf.constant([0.01, 0.01])
+      adadelta_opt = tf.train.AdadeltaOptimizer()
+      adadelta_update = adadelta_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+      tf.initialize_all_variables().run()
+
+      # Check we have slots
+      self.assertEqual(["accum", "decay_rate", "epsilon", "update_accum"], adadelta_opt.get_slot_names())
+      slot0 = adadelta_opt.get_slot(var0, "accum")
+      self.assertEquals(slot0.get_shape(), var0.get_shape())
+      self.assertFalse(slot0 in tf.trainable_variables())
+      slot1 = adadelta_opt.get_slot(var1, "accum")
+      self.assertEquals(slot1.get_shape(), var1.get_shape())
+      self.assertFalse(slot1 in tf.trainable_variables())
+
+      # Fetch params to validate initial values
+      self.assertAllClose([1.0, 2.0], var0.eval())
+      self.assertAllClose([3.0, 4.0], var1.eval())
+
+      adadelta_update.run()
+
+      # Check that the accumulators have been updated.
+      self.assertAllClose(np.array([0.1, 0.1]), slot0.eval())
+      self.assertAllClose(np.array([0.01, 0.01]), slot1.eval())
+
+      # Check that the parameters have been updated.
+      self.assertAllClose(np.array([1.0 - (0.1 * 2.0),
+                                    2.0 - (0.1 * 2.0)]),
+                          var0.eval())
+      self.assertAllClose(np.array([3.0 - (0.01 * 2.0),
+                                    4.0 - (0.01 * 2.0)]),
+                          var1.eval())
+      # Step 2: the accumulators contain the previous update.
+      adadelta_update.run()
+      # Check that the accumulators have been updated again.
+      self.assertAllClose(np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]),
+                          slot0.eval())
+      self.assertAllClose(np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
+                          slot1.eval())
+      # Check that the parameters have been updated.
+      self.assertAllClose(
+          np.array([1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
+                    2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)]),
+          var0.eval())
+      self.assertAllClose(np.array([2.98 - ((0.9 * 0.01 + 0.01) * 2.0),
+                                    3.98 - ((0.9 * 0.01 + 0.01) * 2.0)]),
+                          var1.eval())
+
+if __name__ == "__main__":
+  tf.test.main()
diff --git a/tensorflow/python/training/momentum_test.py b/tensorflow/python/training/momentum_test.py
index f7e1e3095c2223..79b4ebb9c79745 100644
--- a/tensorflow/python/training/momentum_test.py
+++ b/tensorflow/python/training/momentum_test.py
@@ -165,7 +165,7 @@ def _dbParamsMom01(self):
     """Return dist-belief momentum values.
 
     Return values been generated from the dist-belief momentum unittest,
-    running with a learning rate of 0.1 and a momemntum of 0.1.
+    running with a learning rate of 0.1 and a momentum of 0.1.
     These values record how a parameter vector of size 10, initialized with 0.0,
     gets updated with 10 consecutive momentum steps.  It uses random gradients.
 
diff --git a/tensorflow/python/training/training.py b/tensorflow/python/training/training.py
index b9f5f9a54b04ac..bc36266b49586a 100644
--- a/tensorflow/python/training/training.py
+++ b/tensorflow/python/training/training.py
@@ -28,6 +28,7 @@
 
 @@Optimizer
 @@GradientDescentOptimizer
+@@AdadeltaOptimizer
 @@AdagradOptimizer
 @@MomentumOptimizer
 @@AdamOptimizer
@@ -128,6 +129,7 @@
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.training.adadelta import AdadeltaOptimizer
 from tensorflow.python.training.adagrad import AdagradOptimizer
 from tensorflow.python.training.adam import AdamOptimizer
 from tensorflow.python.training.ftrl import FtrlOptimizer
diff --git a/tensorflow/python/training/training_ops.py b/tensorflow/python/training/training_ops.py
index 363ceda978f3ed..c1ac386baec723 100644
--- a/tensorflow/python/training/training_ops.py
+++ b/tensorflow/python/training/training_ops.py
@@ -47,6 +47,17 @@ def _AssertInputIsScalar(op, index):
   op.inputs[index].get_shape().assert_is_compatible_with(tensor_shape.scalar())
 
 
+@ops.RegisterShape("ApplyAdadelta")
+def _ApplyAdadeltaShape(op):
+  """Shape function for the ApplyAdadelta op."""
+  var_shape = op.inputs[0].get_shape()
+  accum_shape = op.inputs[1].get_shape().merge_with(var_shape)
+  update_accum_shape = op.inputs[2].get_shape().merge_with(var_shape)
+  _AssertInputIsScalar(op, 3)  # decay_rate
+  _AssertInputIsScalar(op, 4)  # epsilon
+  grad_shape = op.inputs[5].get_shape().merge_with(accum_shape)
+  return [grad_shape]
+
 @ops.RegisterShape("ApplyAdagrad")
 def _ApplyAdagradShape(op):
   """Shape function for the ApplyAdagrad op."""
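
Reviewer note: for reference, here is a minimal NumPy sketch of the update rule that the `ApplyAdadelta` op doc and the CPU functor above implement. The standalone function `adadelta_step` is illustrative only and not part of the patch; the `decay_rate`/`epsilon` defaults simply mirror the `AdadeltaOptimizer` constructor.

```python
import numpy as np

def adadelta_step(var, accum, update_accum, grad, decay_rate=0.001, epsilon=1e-8):
  """One Adadelta step mirroring the ApplyAdadelta doc string (illustrative only)."""
  # Running average of squared gradients.
  accum = decay_rate * accum + (1 - decay_rate) * np.square(grad)
  # Scale the gradient by the RMS of past updates over the RMS of past gradients.
  update = np.sqrt(update_accum + epsilon) / np.sqrt(accum + epsilon) * grad
  # Running average of squared updates, then apply the step.
  update_accum = decay_rate * update_accum + (1 - decay_rate) * np.square(update)
  return var - update, accum, update_accum
```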
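
With the patch applied, the optimizer is reachable as `tf.train.AdadeltaOptimizer` through the `training.py` change above. A small usage sketch against the TF API of this era follows; the toy variable and loss are made up for illustration and assume the rest of this patch is built in.

```python
import tensorflow as tf

var = tf.Variable([1.0, 2.0])
loss = tf.reduce_sum(tf.square(var))  # toy objective, for illustration only
opt = tf.train.AdadeltaOptimizer(decay_rate=0.001, epsilon=1e-8)
train_step = opt.minimize(loss)

with tf.Session() as sess:
  sess.run(tf.initialize_all_variables())  # pre-1.0 initializer, as in the test above
  for _ in range(5):
    sess.run(train_step)
  print(sess.run(var))
```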