tensorflow · nsthorat · Oct 12, 2017 · Oct 7, 2017 · Oct 11, 2017
diff --git a/src/graph/optimizers/adamax_optimizer.ts b/src/graph/optimizers/adamax_optimizer.ts
@@ -0,0 +1,116 @@
+/**
+ * @license
+ * Copyright 2017 Google Inc. All Rights Reserved.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ */
+
+import {NDArrayMath} from '../../math/math';
+import {NDArray, Scalar} from '../../math/ndarray';
+import {Node} from '../graph';
+import {SessionRuntime} from '../session';
+import {SummedTensorArrayMap, TensorArrayMap} from '../tensor_array_map';
+
+import {Optimizer} from './optimizer';
+
+/**
+ * AdaMax optimizer, the infinity-norm variant of Adam.
+ *
+ * After each batch, every variable is updated as
+ *   m_t = beta1 * m_{t-1} + (1 - beta1) * g
+ *   u_t = max(beta2 * u_{t-1}, |g|)
+ *   v_t = v_{t-1} + (c / (1 - beta1)) * m_t / (u_t + eps)
+ * where g is the gradient taken from `variableGradients`, and `c` and `one`
+ * are scalars inherited from `Optimizer`.
+ *
+ * NOTE(review): `c` is presumably -learningRate / batchSize; confirm against
+ * the `Optimizer` base class.
+ * NOTE(review): the AdaMax paper divides by (1 - beta1^t); this class uses
+ * the constant (1 - beta1) at every step. Kept as is so existing numeric
+ * expectations hold; revisit together with the tests.
+ */
+export class AdamMaxOptimizer extends Optimizer {
+  /**
+   * @param learningRate Step size, forwarded to `Optimizer`.
+   * @param beta1 Decay rate of the first-moment estimate.
+   * @param beta2 Decay rate of the weighted infinity norm.
+   * @param specifiedVariableList Optional nodes, forwarded to `Optimizer`.
+   */
+  constructor(
+      protected learningRate: number,
+      private beta1: number, private beta2: number,
+      specifiedVariableList?: Node[]) {
+    super(learningRate, specifiedVariableList);
+    // Must be created here: dispose() releases it, and afterBatch() adds it
+    // to the denominator of the update.
+    this.eps = Scalar.new(1e-8);
+    // b1, b2 keep initial value of beta* hyperparameters.
+    this.b1 = Scalar.new(this.beta1);
+    this.b2 = Scalar.new(this.beta2);
+  }
+
+  /**
+   * Runs the base-class hook, then lazily creates zero-filled accumulators
+   * (one per variable node) the first time a batch is processed.
+   */
+  beforeBatch(
+      math: NDArrayMath, batchSize: number, runtime: SessionRuntime,
+      activationArrayMap: TensorArrayMap,
+      gradientArrayMap: SummedTensorArrayMap) {
+    super.beforeBatch(
+        math, batchSize, runtime, activationArrayMap, gradientArrayMap);
+
+    if (this.firstMoment.size() === 0) {
+      this.variableNodes.forEach(node => {
+        this.firstMoment.set(node.output, NDArray.zeros(node.output.shape));
+      });
+    }
+
+    if (this.weightedInfNorm.size() === 0) {
+      this.variableNodes.forEach(node => {
+        this.weightedInfNorm.set(node.output, NDArray.zeros(node.output.shape));
+      });
+    }
+  }
+
+  /**
+   * Applies one AdaMax step to every variable node, stores the new variable
+   * and accumulator values, and resets `variableGradients`.
+   */
+  afterBatch(
+      math: NDArrayMath, batchSize: number, runtime: SessionRuntime,
+      activationArrayMap: TensorArrayMap,
+      gradientArrayMap: SummedTensorArrayMap) {
+    math.scope((keep) => {
+      this.variableNodes.forEach(node => {
+        const oldVariable = activationArrayMap.get(node.output);
+
+        const gradient = this.variableGradients.get(node.output);
+        const oldFirstMoment = this.firstMoment.get(node.output);
+        const oldWeightedInfNorm = this.weightedInfNorm.get(node.output);
+
+        // m_t = beta1 * m_{t-1} + (1 - beta1) * g
+        const newFirstMoment = math.scaledArrayAdd(
+            this.b1, oldFirstMoment, math.sub(this.one, this.b1), gradient);
+
+        const ut0 = math.multiply(this.b2, oldWeightedInfNorm);
+        const ut1 = math.abs(gradient);
+
+        // Element-wise max(ut0, ut1), expressed as relu(ut0 - ut1) + ut1.
+        const newWeightedInfNorm =
+            math.add(math.relu(math.sub(ut0, ut1)), ut1);
+
+        // eps keeps the quotient finite where both the first moment and the
+        // infinity norm are zero (e.g. a zero gradient on the first batch);
+        // without it 0 / 0 would write NaN into the variable.
+        const variable = math.scaledArrayAdd(
+            this.one, oldVariable,
+            math.divide(this.c, math.sub(this.one, this.b1)),
+            math.divide(
+                newFirstMoment, math.add(this.eps, newWeightedInfNorm)));
+
+        // keep() lets these results outlive the enclosing math.scope.
+        activationArrayMap.set(node.output, keep(variable));
+        node.data = variable;
+
+        this.firstMoment.set(node.output, keep(newFirstMoment));
+        this.weightedInfNorm.set(node.output, keep(newWeightedInfNorm));
+
+        // The previous values have been replaced in their maps above.
+        oldVariable.dispose();
+        gradient.dispose();
+        oldFirstMoment.dispose();
+        oldWeightedInfNorm.dispose();
+      });
+    });
+
+    this.variableGradients.dispose();
+    this.variableGradients = new TensorArrayMap();
+  }
+
+  /** Releases the base-class resources, both accumulators and the scalars. */
+  dispose() {
+    super.dispose();
+    this.firstMoment.dispose();
+    this.weightedInfNorm.dispose();
+    this.eps.dispose();
+    this.b1.dispose();
+    this.b2.dispose();
+  }
+
+  // Exponentially decayed average of the gradient (first moment), per variable.
+  private firstMoment = new TensorArrayMap();
+  // Exponentially weighted infinity norm of the gradient, per variable.
+  private weightedInfNorm = new TensorArrayMap();
+  // Small constant added to the infinity norm before dividing.
+  private eps: Scalar;
+  private b1: Scalar;
+  private b2: Scalar;
+}
diff --git a/src/graph/session_test.ts b/src/graph/session_test.ts
@@ -28,6 +28,7 @@ import {RMSPropOptimizer} from './optimizers/rmsprop_optimizer';
 import {SGDOptimizer} from './optimizers/sgd_optimizer';
 import {AdadeltaOptimizer} from './optimizers/adadelta_optimizer';
 import {AdamOptimizer} from './optimizers/adam_optimizer';
+import {AdamMaxOptimizer} from './optimizers/adamax_optimizer';
 import {FeedDictionary, FeedEntry, Session} from './session';
 
 describe('FeedDictionary', () => {
@@ -500,7 +501,7 @@ describe('Session', () => {
     });
   });
 
-    it('adam', () => {
+  it('adam', () => {
     const x = g.placeholder('x', [2]);
     const w = g.variable('w', NDArray.zeros([1, 2]));
     const b = g.variable('b', NDArray.zeros([1]));
@@ -557,5 +558,69 @@ describe('Session', () => {
       test_util.expectArraysClose(
           dydw2, new Float32Array([-.2, -.2]), 2e-5);
     });
+    });
+
+  it('adamax', () => {
+    // Trains y = reduce_sum(w . x + b) for two single-example batches and
+    // checks w after each AdaMax step (lr = 0.1, beta1 = 0.8, beta2 = 0.9).
+    const x = g.placeholder('x', [2]);
+    const w = g.variable('w', NDArray.zeros([1, 2]));
+    const b = g.variable('b', NDArray.zeros([1]));
+    const y = g.reduceSum(g.add(g.matmul(w, x), b));
+
+    const safeMode = true;
+    const optimizer = new AdamMaxOptimizer(0.1, 0.8, 0.9);
+    const math = new NDArrayMathCPU(safeMode);
+    const session = new Session(g, math);
+    const inputProvider: InputProvider = {
+      getNextCopy() {
+        return Array1D.new([2, 4]);
+      },
+      disposeCopy(math, example) {}
+    };
+
+    math.scope(() => {
+      // y = reduce_sum(w_1*x_1 + w_2*x_2 + b), so grad_w = x = [2, 4]
+      // new_first_m = [beta1*old_first_m_w1 + (1-beta1)*grad_w1,
+      //                beta1*old_first_m_w2 + (1-beta1)*grad_w2]
+      //             = [.4, .8]
+      //
+      // ut_0 = beta2*old_weighted_inf_norm = [0, 0]
+      // ut_1 = [|grad_w1|, |grad_w2|] = [2, 4]
+      // new_weighted_inf_norm = max(ut_0, ut_1) = [2, 4]
+      //
+      // coefficient = lr/(1-beta1) = 0.5
+      // updates = coefficient*[new_first_m1/new_weighted_inf_norm1,
+      //                        new_first_m2/new_weighted_inf_norm2]
+      //         = 0.5*[.2, .2] = [0.1, 0.1]
+      // w = [w1_old - updates_1, w2_old - updates_2]
+      //   = [-0.1, -0.1]
+      //
+      session.train(y, [{tensor: x, data: inputProvider}], 1, optimizer);
+      const wAfterStep1 = session.activationArrayMap.get(w).getValues();
+      test_util.expectArraysClose(
+          wAfterStep1, new Float32Array([-0.1, -0.1]), 1e-5);
+
+      // The gradient is unchanged: grad_w = x = [2, 4]
+      // new_first_m = [beta1*old_first_m_w1 + (1-beta1)*grad_w1,
+      //                beta1*old_first_m_w2 + (1-beta1)*grad_w2]
+      //             = [0.8*0.4 + 0.2*2, 0.8*0.8 + 0.2*4]
+      //             = [0.72, 1.44]
+      //
+      // ut_0 = beta2*old_weighted_inf_norm = [1.8, 3.6]
+      // ut_1 = [|grad_w1|, |grad_w2|] = [2, 4]
+      // new_weighted_inf_norm = max(ut_0, ut_1) = [2, 4]
+      //
+      // coefficient = lr/(1-beta1) = 0.5
+      // updates = coefficient*[new_first_m1/new_weighted_inf_norm1,
+      //                        new_first_m2/new_weighted_inf_norm2]
+      //         = 0.5*[.36, .36] = [0.18, 0.18]
+      // w = [w1_old - updates_1, w2_old - updates_2]
+      //   = [-0.28, -0.28]
+
+      session.train(y, [{tensor: x, data: inputProvider}], 1, optimizer);
+      const wAfterStep2 = session.activationArrayMap.get(w).getValues();
+      test_util.expectArraysClose(
+          wAfterStep2, new Float32Array([-.28, -.28]), 2e-5);
+    });
   });
+
 });
diff --git a/src/index.ts b/src/index.ts
@@ -37,6 +37,7 @@ export {Optimizer} from './graph/optimizers/optimizer';
 export {RMSPropOptimizer} from './graph/optimizers/rmsprop_optimizer';
 export {SGDOptimizer} from './graph/optimizers/sgd_optimizer';
 export {AdamOptimizer} from './graph/optimizers/adam_optimizer';
+export {AdamMaxOptimizer} from './graph/optimizers/adamax_optimizer';
 export {CostReduction, FeedEntry, Session} from './graph/session';
 // tslint:disable-next-line:max-line-length
 export {GraphRunner, GraphRunnerEventObserver, MetricReduction} from './graph_runner';