diff --git a/tfjs-backend-webgl/src/argminmax_gpu.ts b/tfjs-backend-webgl/src/argminmax_gpu.ts
index cbaca78deee..d3e63c0e6a9 100644
--- a/tfjs-backend-webgl/src/argminmax_gpu.ts
+++ b/tfjs-backend-webgl/src/argminmax_gpu.ts
@@ -26,10 +26,7 @@ export class ArgMinMaxProgram implements GPGPUProgram {
   constructor(
       reduceInfo: backend_util.ReduceInfo, op: 'max'|'min',
       firstPass: boolean) {
-    const windowSize = reduceInfo.windowSize;
-    const batchSize = reduceInfo.batchSize;
-    const inSize = reduceInfo.inSize;
-    const outSize = Math.ceil(inSize / windowSize);
+    const {windowSize, batchSize, outSize} = reduceInfo;
     if (!firstPass) {
       this.variableNames.push('bestIndicesA');
     }
diff --git a/tfjs-backend-webgl/src/backend_webgl.ts b/tfjs-backend-webgl/src/backend_webgl.ts
index a939a659304..30c2168e9c6 100644
--- a/tfjs-backend-webgl/src/backend_webgl.ts
+++ b/tfjs-backend-webgl/src/backend_webgl.ts
@@ -1043,7 +1043,8 @@ export class MathBackendWebGL extends KernelBackend {
     const batchSize = x.shape[0];
     const inSize = x.shape[1];
     const windowSize = backend_util.computeOptimalWindowSize(inSize);
-    const reduceInfo = {windowSize, inSize, batchSize};
+    const outSize = Math.ceil(inSize / windowSize);
+    const reduceInfo = {windowSize, inSize, batchSize, outSize};
     const program = new ReduceProgram(reduceInfo, reduceType);
     const output = this.compileAndRun<Tensor2D>(program, [x], dtype);
     // No need to run another GPGPU program.
@@ -1063,7 +1064,12 @@ export class MathBackendWebGL extends KernelBackend {
       inSize = bestIndicesA.shape[1];
     }
     const windowSize = backend_util.computeOptimalWindowSize(inSize);
-    const reduceInfo = {windowSize, inSize, batchSize};
+    const reduceInfo = {
+      windowSize,
+      inSize,
+      batchSize,
+      outSize: Math.ceil(inSize / windowSize)
+    };
     const program =
         new ArgMinMaxProgram(reduceInfo, reduceType, bestIndicesA == null);
     const inputs = [x];
diff --git a/tfjs-backend-webgl/src/kernel_utils/reduce.ts b/tfjs-backend-webgl/src/kernel_utils/reduce.ts
index edf6df8db73..9198b7b60bc 100644
--- a/tfjs-backend-webgl/src/kernel_utils/reduce.ts
+++ b/tfjs-backend-webgl/src/kernel_utils/reduce.ts
@@ -22,18 +22,44 @@ import {ReduceProgram} from '../reduce_gpu';
 
 type ReduceTypes = 'all'|'any'|'max'|'min'|'sum'|'prod';
 
+// Plans the chain of reduction passes over the second dimension of `inShape`.
+// Each entry records one pass: its input width (`inSize`), the window chosen
+// by `backend_util.computeOptimalWindowSize`, and the resulting `outSize`.
+// Passes are appended until one produces an `outSize` of exactly 1.
+function getReductionStages(inShape: number[]):
+    Array<{inSize: number, windowSize: number, outSize: number}> {
+  const plan: Array<{inSize: number, windowSize: number, outSize: number}> = [];
+  // The first pass consumes the full input width; every later pass consumes
+  // the output width of the pass before it.
+  let remaining = inShape[1];
+
+  do {
+    const windowSize = backend_util.computeOptimalWindowSize(remaining);
+    const outSize = Math.ceil(remaining / windowSize);
+    plan.push({inSize: remaining, windowSize, outSize});
+    remaining = outSize;
+  } while (remaining !== 1);
+  return plan;
+}
+
 export function reduce(
     x: TensorInfo, dtype: DataType, reductionType: ReduceTypes,
     backend: MathBackendWebGL): TensorInfo {
-  const [batchSize, inSize] = x.shape;
-  const windowSize = backend_util.computeOptimalWindowSize(inSize);
-  const reduceInfo = {windowSize, inSize, batchSize};
-  const program = new ReduceProgram(reduceInfo, reductionType);
-  const output = backend.runWebGLProgram(program, [x], dtype);
-
-  if (output.shape[1] === 1) {
-    return output;
+  // Run one ReduceProgram per planned stage, chaining each stage's output
+  // into the next stage's input.
+  const batchSize = x.shape[0];
+  let reduced = x;
+  for (const {inSize, windowSize, outSize} of getReductionStages(x.shape)) {
+    const stageInput = reduced;
+    const program = new ReduceProgram(
+        {windowSize, inSize, batchSize, outSize}, reductionType);
+    reduced = backend.runWebGLProgram(program, [stageInput], dtype);
+    // Dispose intermediate stage outputs once consumed; the caller's input
+    // `x` is never disposed here.
+    if (stageInput.dataId !== x.dataId) {
+      backend.disposeData(stageInput.dataId);
+    }
   }
 
-  return reduce(output, dtype, reductionType, backend);
+  return reduced;
 }
diff --git a/tfjs-backend-webgl/src/kernels/Max_test.ts b/tfjs-backend-webgl/src/kernels/Max_test.ts
new file mode 100644
index 00000000000..e323b3c8fae
--- /dev/null
+++ b/tfjs-backend-webgl/src/kernels/Max_test.ts
@@ -0,0 +1,40 @@
+/**
+ * @license
+ * Copyright 2020 Google LLC. All Rights Reserved.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ */
+
+import * as tf from '@tensorflow/tfjs-core';
+// tslint:disable-next-line: no-imports-from-dist
+import {ALL_ENVS, describeWithFlags} from '@tensorflow/tfjs-core/dist/jasmine_util';
+
+describeWithFlags('Max', ALL_ENVS, () => {
+  it('does not have memory leak when calling reduce multiple times.',
+     async () => {
+       // Reads the backend's current data-id count on every call.
+       const countDataIds = () => tf.engine().backend.numDataIds();
+       const baselineDataIds = countDataIds();
+
+       // Input must be large enough to need more than one reduction pass.
+       const x = tf.ones([100, 100]);
+       const xMax = x.max();
+
+       // Only the input and the final result should remain registered.
+       expect(countDataIds()).toEqual(baselineDataIds + 2);
+
+       x.dispose();
+       xMax.dispose();
+       expect(countDataIds()).toEqual(baselineDataIds);
+     });
+});
diff --git a/tfjs-backend-webgl/src/reduce_gpu.ts b/tfjs-backend-webgl/src/reduce_gpu.ts
index 553639965ac..ff9541a39a9 100644
--- a/tfjs-backend-webgl/src/reduce_gpu.ts
+++ b/tfjs-backend-webgl/src/reduce_gpu.ts
@@ -26,10 +26,7 @@ export class ReduceProgram implements GPGPUProgram {
   constructor(
       reduceInfo: backend_util.ReduceInfo,
       reduceType: 'all'|'any'|'max'|'min'|'sum'|'prod') {
-    const windowSize = reduceInfo.windowSize;
-    const batchSize = reduceInfo.batchSize;
-    const inSize = reduceInfo.inSize;
-    const outSize = Math.ceil(inSize / windowSize);
+    const {windowSize, batchSize, inSize, outSize} = reduceInfo;
     this.outputShape = [batchSize, outSize];
 
     let initializationValue = '0.0';
diff --git a/tfjs-core/src/ops/max_test.ts b/tfjs-core/src/ops/max_test.ts
index f48f4cf7567..3da2c8568ac 100644
--- a/tfjs-core/src/ops/max_test.ts
+++ b/tfjs-core/src/ops/max_test.ts
@@ -32,6 +32,14 @@ describeWithFlags('max', ALL_ENVS, () => {
     expectArraysClose(await r.data(), 3);
   });
 
+  it('with a large dimension', async () => {
+    // 1000 elements, all zero except the first, so the expected max is 1.
+    const values = new Float32Array(1000);
+    values[0] = 1;
+    const result = tf.max(tf.tensor1d(values));
+    expectArraysClose(await result.data(), 1);
+  });
+
   it('ignores NaNs', async () => {
     expectArraysClose(await tf.max([3, NaN, 2]).data(), 3);
   });
diff --git a/tfjs-core/src/ops/reduce_util.ts b/tfjs-core/src/ops/reduce_util.ts
index 275b42deaff..3a0290d68be 100644
--- a/tfjs-core/src/ops/reduce_util.ts
+++ b/tfjs-core/src/ops/reduce_util.ts
@@ -27,6 +27,7 @@ export interface ReduceInfo {
   windowSize: number;
   batchSize: number;
   inSize: number;
+  outSize: number;
 }
 
 export function computeOptimalWindowSize(inSize: number): number {