From f4c9762540def5c2e4b88101a49fb63583cf7aad Mon Sep 17 00:00:00 2001 From: Yannick Assogba Date: Mon, 13 Jul 2020 09:48:08 -0400 Subject: [PATCH 01/12] split out fused ops into their own files --- tfjs-core/src/kernel_names.ts | 35 ++ tfjs-core/src/ops/fused/conv2d.ts | 298 +++++++++ tfjs-core/src/ops/fused/depthwise_conv2d.ts | 234 +++++++ tfjs-core/src/ops/fused/mat_mul.ts | 208 ++++++ tfjs-core/src/ops/fused_ops.ts | 664 +------------------- tfjs-core/src/ops/fused_util.ts | 48 ++ 6 files changed, 828 insertions(+), 659 deletions(-) create mode 100644 tfjs-core/src/ops/fused/conv2d.ts create mode 100644 tfjs-core/src/ops/fused/depthwise_conv2d.ts create mode 100644 tfjs-core/src/ops/fused/mat_mul.ts diff --git a/tfjs-core/src/kernel_names.ts b/tfjs-core/src/kernel_names.ts index 70cf4cb01fa..237ccca263d 100644 --- a/tfjs-core/src/kernel_names.ts +++ b/tfjs-core/src/kernel_names.ts @@ -21,6 +21,7 @@ import {ExplicitPadding} from '../src/ops/conv_util'; import {NamedTensorInfoMap, TensorInfo} from './kernel_registry'; +import {Activation} from './ops/fused_util'; import {DataType, PixelData} from './types'; export const Abs = 'Abs'; @@ -642,3 +643,37 @@ export interface FromPixelsInputs { export interface FromPixelsAttrs { numChannels: number; } + +export const _FusedMatMul = '_FusedMatMul'; +export type _FusedMatMulInputs = + Pick; +// tslint:disable-next-line: class-name +export interface _FusedMatMulAttrs { + transposeA: number; + transposeB: number; + activation: Activation; +} + +export const FusedConv2D = 'FusedConv2D'; +export type FusedConv2DInputs = + Pick; +export interface FusedConv2DAttrs { + strides: [number, number]|number; + pad: 'valid'|'same'|number|ExplicitPadding; + dataFormat: 'NHWC'|'NCHW'; + dilations: [number, number]|number; + dimRoundingMode: 'floor'|'round'|'ceil'; + activation: Activation; +} + +export const FusedDepthwiseConv2D = 'FusedDepthwiseConv2D'; +export type FusedDepthwiseConv2DInputs = + Pick; +export interface FusedDepthwiseConv2DAttrs { + strides: [number, number]|number; + pad: 'valid'|'same'|number; + dataFormat: 'NHWC'|'NCHW'; + dilations: [number, number]|number; + dimRoundingMode: 'floor'|'round'|'ceil'; + activation: Activation; +} diff --git a/tfjs-core/src/ops/fused/conv2d.ts b/tfjs-core/src/ops/fused/conv2d.ts new file mode 100644 index 00000000000..e3504076aa3 --- /dev/null +++ b/tfjs-core/src/ops/fused/conv2d.ts @@ -0,0 +1,298 @@ +/** + * @license + * Copyright 2019 Google LLC. All Rights Reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * ============================================================================= + */ + +import {ENGINE} from '../../engine'; +import {Tensor, Tensor3D, Tensor4D} from '../../tensor'; +import {makeTypesMatch} from '../../tensor_util'; +import {convertToTensor} from '../../tensor_util_env'; +import {TensorLike} from '../../types'; +import * as util from '../../util'; +import {add} from '../add'; +import * as broadcast_util from '../broadcast_util'; +import {conv2d as unfusedConv2d} from '../conv2d'; +import {conv2DBackpropFilter} from '../conv2d_backprop_filter'; +import {conv2DBackpropInput} from '../conv2d_backprop_input'; +import {applyActivation, getFusedBiasGradient, getFusedDyActivation} from '../fused_util'; +import {Activation, shouldFuse} from '../fused_util'; +import * as conv_util from '../ops/../conv_util'; +import {op} from '../ops/../operation'; + +/** + * Computes a 2D convolution over the input x, optionally fused with adding a + * bias and applying an activation. + * + * ```js + * const inputDepth = 2; + * const inShape = [2, 2, 2, inputDepth]; + * const outputDepth = 2; + * const fSize = 1; + * const pad = 0; + * const strides = 1; + * + * const x = tf.tensor4d( [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + * 16], inShape); + * const w = tf.tensor4d([-1, 1, -2, 0.5], [fSize, fSize, inputDepth, + * outputDepth]); + * + * tf.fused.conv2d({ x, filter: w, strides, pad, dataFormat: 'NHWC', + * dilations: [1, 1], bias: tf.scalar(5), activation: 'relu' }).print(); + * ``` + * + * @param obj An object with the following properties: + * @param x The input tensor, of rank 4 or rank 3, of shape + * `[batch, height, width, inChannels]`. If rank 3, batch of 1 is + * assumed. + * @param filter The filter, rank 4, of shape + * `[filterHeight, filterWidth, inDepth, outDepth]`. + * @param strides The strides of the convolution: `[strideHeight, + * strideWidth]`. + * @param pad The type of padding algorithm. + * - `same` and stride 1: output will be of same size as input, + * regardless of filter size. + * - `valid` output will be smaller than input if filter is larger + * than 1x1. + * - For more info, see this guide: + * [https://www.tensorflow.org/api_guides/python/nn#Convolution]( + * https://www.tensorflow.org/api_guides/python/nn#Convolution) + * @param dataFormat An optional string from: "NHWC", "NCHW". Defaults to + * "NHWC". Specify the data format of the input and output data. With the + * default format "NHWC", the data is stored in the order of: [batch, + * height, width, channels]. Only "NHWC" is currently supported. + * @param dilations The dilation rates: `[dilationHeight, dilationWidth]` + * in which we sample input values across the height and width dimensions + * in atrous convolution. Defaults to `[1, 1]`. If `dilations` is a single + * number, then `dilationHeight == dilationWidth`. If it is greater than + * 1, then all values of `strides` must be 1. + * @param dimRoundingMode The rounding mode used when computing output + * dimensions if pad is a number. If none is provided, it will not round + * and error if the output is of fractional size. + * @param bias Tensor to be added to the result. + * @param activation Name of activation kernel (defaults to `linear`) to be + * applied + * after biasAdd. + * @param preluActivationWeights Tensor of prelu weights to be applied as part + * of a `prelu` activation, typically the same shape as `x`. + */ +/** + * Computes a 2D convolution over the input x, optionally fused with adding a + * bias and applying an activation. 
+ */ +function fusedConv2d_({ + x, + filter, + strides, + pad, + dataFormat = 'NHWC', + dilations = [1, 1], + dimRoundingMode, + bias, + activation = 'linear', + preluActivationWeights +}: { + x: T|TensorLike, + filter: Tensor4D|TensorLike, + strides: [number, number]|number, + pad: 'valid'|'same'|number|conv_util.ExplicitPadding, + dataFormat?: 'NHWC'|'NCHW', + dilations?: [number, number]|number, + dimRoundingMode?: 'floor'|'round'|'ceil', + bias?: Tensor|TensorLike, + activation?: Activation, + preluActivationWeights?: Tensor +}): T { + activation = activation || 'linear'; + if (shouldFuse(ENGINE.state.gradientDepth, activation) === false) { + let result = unfusedConv2d( + x, filter, strides, pad, dataFormat, dilations, dimRoundingMode); + if (bias != null) { + result = add(result, bias); + } + + return applyActivation(result, activation, preluActivationWeights) as T; + } + + const $x = convertToTensor(x, 'x', 'conv2d'); + const $filter = convertToTensor(filter, 'filter', 'conv2d'); + + let x4D = $x as Tensor4D; + let reshapedTo4D = false; + + if ($x.rank === 3) { + reshapedTo4D = true; + x4D = $x.as4D(1, $x.shape[0], $x.shape[1], $x.shape[2]); + } + util.assert( + x4D.rank === 4, + () => `Error in fused conv2d: input must be rank 4, but got rank ` + + `${x4D.rank}.`); + util.assert( + $filter.rank === 4, + () => `Error in fused conv2d: filter must be rank 4, but got rank ` + + `${$filter.rank}.`); + if (dimRoundingMode != null) { + util.assert( + util.isInt(pad as number), + () => `Error in fused conv2d: pad must be an integer when using, ` + + `dimRoundingMode ${dimRoundingMode} but got pad ${pad}.`); + } + + util.assert( + x4D.shape[3] === $filter.shape[2], + () => `Error in conv2d: depth of input (${x4D.shape[3]}) must match ` + + `input depth for filter ${$filter.shape[2]}.`); + util.assert( + conv_util.eitherStridesOrDilationsAreOne(strides, dilations), + () => 'Error in conv2D: Either strides or dilations must be 1. ' + + `Got strides ${strides} and dilations '${dilations}'`); + util.assert( + dataFormat === 'NHWC', + () => `Error in conv2d: got dataFormat of ${ + dataFormat} but only NHWC is currently supported.`); + + const convInfo = conv_util.computeConv2DInfo( + x4D.shape, $filter.shape, strides, dilations, pad, dimRoundingMode); + + let $bias: Tensor; + if (bias != null) { + $bias = convertToTensor(bias, 'bias', 'fused conv2d'); + [$bias] = makeTypesMatch($bias, $x); + + broadcast_util.assertAndGetBroadcastShape(convInfo.outShape, $bias.shape); + } + + let $preluActivationWeights: Tensor; + if (preluActivationWeights != null) { + $preluActivationWeights = convertToTensor( + preluActivationWeights, 'prelu weights', 'fused conv2d'); + } + + const grad = (dy: Tensor4D, saved: Tensor[]) => { + const [$filter, x4D, y] = saved as [Tensor4D, Tensor4D, Tensor4D]; + + const dyActivation = getFusedDyActivation(dy, y, activation) as Tensor4D; + + util.assert( + conv_util.tupleValuesAreOne(dilations), + () => 'Error in gradient of fused conv2D: ' + + `dilation rates greater than 1 ` + + `are not yet supported in gradients. 
Got dilations '${dilations}'`); + + let biasGradient = {}; + if (bias != null) { + biasGradient = {bias: () => getFusedBiasGradient($bias, dyActivation)}; + } + + return Object.assign( + { + x: () => conv2DBackpropInput( + x4D.shape, dyActivation, $filter, strides, pad), + filter: () => conv2DBackpropFilter( + x4D, dyActivation, $filter.shape, strides, pad) + }, + biasGradient); + }; + + const inputs: { + x: Tensor, + filter: Tensor, + bias?: Tensor, + preluActivationWeights?: Tensor + } = {x: x4D, filter: $filter}; + if (bias != null) { + inputs.bias = $bias; + } + if (preluActivationWeights != null) { + inputs.preluActivationWeights = $preluActivationWeights; + } + + const inputsToSave = [$filter, x4D]; + const outputsToSave = [true]; // Save the only output. + const res = ENGINE.runKernelFunc( + (backend, save) => { + const res = backend.fusedConv2d({ + input: x4D, + filter: $filter, + convInfo, + bias: $bias, + activation, + preluActivationWeights: $preluActivationWeights + }); + save([$filter, x4D, res]); + return res; + }, + inputs, grad, 'FusedConv2D', {convInfo, activation}, inputsToSave, + outputsToSave); + + if (reshapedTo4D) { + return res.as3D(res.shape[1], res.shape[2], res.shape[3]) as T; + } + + return res as T; +} +export const conv2d = op({fusedConv2d_}); diff --git a/tfjs-core/src/ops/fused/depthwise_conv2d.ts b/tfjs-core/src/ops/fused/depthwise_conv2d.ts new file mode 100644 index 00000000000..cadefd6d964 --- /dev/null +++ b/tfjs-core/src/ops/fused/depthwise_conv2d.ts @@ -0,0 +1,234 @@ +/** + * @license + * Copyright 2019 Google LLC. All Rights Reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ============================================================================= + */ + +import {ENGINE} from '../../engine'; +import * as conv_util from '../../ops/conv_util'; +import {op} from '../../ops/operation'; +import {Tensor, Tensor3D, Tensor4D} from '../../tensor'; +import {makeTypesMatch} from '../../tensor_util'; +import {convertToTensor} from '../../tensor_util_env'; +import {TensorLike} from '../../types'; +import * as util from '../../util'; +import {add} from '../add'; +import * as broadcast_util from '../broadcast_util'; +import {depthwiseConv2d as unfusedDepthwiseConv2d} from '../depthwise_conv2d'; +import {depthwiseConv2dNativeBackpropFilter} from '../depthwise_conv2d_native_backprop_filter'; +import {depthwiseConv2dNativeBackpropInput} from '../depthwise_conv2d_native_backprop_input'; +import {Activation, applyActivation, getFusedBiasGradient, getFusedDyActivation, shouldFuse} from '../fused_util'; + +/** + * Computes depthwise 2D convolution, optionally fused with adding a + * bias and applying an activation. 
+ * + * Given a 4D `input` array and a `filter` array of shape + * `[filterHeight, filterWidth, inChannels, channelMultiplier]` containing + * `inChannels` convolutional filters of depth 1, this op applies a + * different filter to each input channel (expanding from 1 channel to + * `channelMultiplier` channels for each), then concatenates the results + * together. The output has `inChannels * channelMultiplier` channels. + * + * See + * [https://www.tensorflow.org/api_docs/python/tf/nn/depthwise_conv2d]( + * https://www.tensorflow.org/api_docs/python/tf/nn/depthwise_conv2d) + * for more details. + * + * @param obj An object with the following properties: + * @param x The input tensor, of rank 4 or rank 3, of shape + * `[batch, height, width, inChannels]`. If rank 3, batch of 1 is + * assumed. + * @param filter The filter tensor, rank 4, of shape + * `[filterHeight, filterWidth, inChannels, channelMultiplier]`. + * @param strides The strides of the convolution: `[strideHeight, + * strideWidth]`. If strides is a single number, then `strideHeight == + * strideWidth`. + * @param pad The type of padding algorithm. + * - `same` and stride 1: output will be of same size as input, + * regardless of filter size. + * - `valid`: output will be smaller than input if filter is larger + * than 1x1. + * - For more info, see this guide: + * [https://www.tensorflow.org/api_guides/python/nn#Convolution]( + * https://www.tensorflow.org/api_guides/python/nn#Convolution) + * @param dilations The dilation rates: `[dilationHeight, dilationWidth]` + * in which we sample input values across the height and width dimensions + * in atrous convolution. Defaults to `[1, 1]`. If `rate` is a single + * number, then `dilationHeight == dilationWidth`. If it is greater than + * 1, then all values of `strides` must be 1. + * @param dataFormat: An optional string from: "NHWC", "NCHW". Defaults to + * "NHWC". Specify the data format of the input and output data. With the + * default format "NHWC", the data is stored in the order of: [batch, + * height, width, channels]. Only "NHWC" is currently supported. + * @param dimRoundingMode The rounding mode used when computing output + * dimensions if pad is a number. If none is provided, it will not round + * and error if the output is of fractional size. + * @param bias Tensor to be added to the result. + * @param activation Name of activation kernel (defaults to `linear`). + * @param preluActivationWeights Tensor of prelu weights to be applied as part + * of a `prelu` activation, typically the same shape as `x`. 
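+ *
+ * A minimal usage sketch (the shapes and values below are illustrative only,
+ * not taken from the original docs): a 1x1 depthwise filter with channel
+ * multiplier 2, fused with a scalar bias add and a `relu` activation.
+ *
+ * ```js
+ * const x = tf.tensor4d([1, 2, 3, 4], [1, 2, 2, 1]);
+ * const filter = tf.tensor4d([1, 0.5], [1, 1, 1, 2]);
+ *
+ * tf.fused.depthwiseConv2d({ x, filter, strides: 1, pad: 'valid',
+ *   bias: tf.scalar(1), activation: 'relu' }).print();
+ * ```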
+ */ +function fusedDepthwiseConv2d_({ + x, + filter, + strides, + pad, + dataFormat = 'NHWC', + dilations = [1, 1], + dimRoundingMode, + bias, + activation = 'linear', + preluActivationWeights +}: { + x: T|TensorLike, + filter: Tensor4D|TensorLike, + strides: [number, number]|number, + pad: 'valid'|'same'|number, + dataFormat?: 'NHWC'|'NCHW', + dilations?: [number, number]|number, + dimRoundingMode?: 'floor'|'round'|'ceil', + bias?: Tensor|TensorLike, + activation?: Activation, + preluActivationWeights?: Tensor +}): T { + if (shouldFuse(ENGINE.state.gradientDepth, activation) === false) { + let result = unfusedDepthwiseConv2d( + x, filter, strides, pad, dataFormat, dilations, dimRoundingMode); + if (bias != null) { + result = add(result, bias); + } + + return applyActivation(result, activation, preluActivationWeights) as T; + } + + const $x = convertToTensor(x, 'x', 'depthwiseConv2d'); + const $filter = convertToTensor(filter, 'filter', 'depthwiseConv2d'); + + let x4D = $x as Tensor4D; + let reshapedTo4D = false; + if ($x.rank === 3) { + reshapedTo4D = true; + x4D = $x.as4D(1, $x.shape[0], $x.shape[1], $x.shape[2]); + } + util.assert( + x4D.rank === 4, + () => `Error in fused depthwiseConv2d: input must be rank 4, but got ` + + `rank ${x4D.rank}.`); + util.assert( + $filter.rank === 4, + () => `Error in fused depthwiseConv2d: filter must be rank 4, ` + + `but got rank ${$filter.rank}.`); + util.assert( + x4D.shape[3] === $filter.shape[2], + () => `Error in fused depthwiseConv2d: number of input channels ` + + `(${x4D.shape[3]}) must match the inChannels dimension in ` + + `filter ${$filter.shape[2]}.`); + if (dilations == null) { + dilations = [1, 1]; + } + util.assert( + conv_util.eitherStridesOrDilationsAreOne(strides, dilations), + () => + 'Error in fused depthwiseConv2d: Either strides or dilations must ' + + `be 1. Got strides ${strides} and dilations '${dilations}'`); + + if (dimRoundingMode != null) { + util.assert( + util.isInt(pad as number), + () => `Error in fused depthwiseConv2d: pad must be an integer when ` + + `using dimRoundingMode ${dimRoundingMode} but got pad ${pad}.`); + } + + const convInfo = conv_util.computeConv2DInfo( + x4D.shape, $filter.shape, strides, dilations, pad, dimRoundingMode, + true /* depthwise */); + + let $bias: Tensor; + if (bias != null) { + $bias = convertToTensor(bias, 'bias', 'fused conv2d'); + [$bias] = makeTypesMatch($bias, $x); + + broadcast_util.assertAndGetBroadcastShape(convInfo.outShape, $bias.shape); + } + + let $preluActivationWeights: Tensor; + if (preluActivationWeights != null) { + $preluActivationWeights = convertToTensor( + preluActivationWeights, 'prelu weights', 'fused depthwiseConv2d'); + } + + const grad = (dy: Tensor4D, saved: Tensor[]) => { + util.assert( + conv_util.tupleValuesAreOne(dilations), + () => 'Error in gradient of fused depthwiseConv2d: dilation rates ' + + `greater than 1 are not yet supported. 
Got dilations ` + + `'${dilations}'`); + const [$filter, x4D, y] = saved; + + const dyActivation = getFusedDyActivation(dy, y, activation) as Tensor4D; + + let biasGradient = {}; + if (bias != null) { + biasGradient = {bias: () => getFusedBiasGradient($bias, dyActivation)}; + } + + return Object.assign( + { + x: () => depthwiseConv2dNativeBackpropInput( + (x4D as Tensor4D).shape, dyActivation, $filter as Tensor4D, + convInfo), + filter: () => depthwiseConv2dNativeBackpropFilter( + x4D as Tensor4D, dyActivation, ($filter as Tensor4D).shape, + convInfo), + }, + biasGradient); + }; + + const inputs: { + x: Tensor, + filter: Tensor, + bias?: Tensor, + preluActivationWeights?: Tensor + } = {x: x4D, filter: $filter}; + if (bias != null) { + inputs.bias = $bias; + } + if (preluActivationWeights != null) { + inputs.preluActivationWeights = $preluActivationWeights; + } + + const inputsToSave = [$filter, x4D]; + const outputsToSave = [true]; + const res = ENGINE.runKernelFunc( + (backend, save) => { + const res = backend.fusedDepthwiseConv2D({ + input: x4D, + filter: $filter, + convInfo, + bias: $bias, + activation, + preluActivationWeights: $preluActivationWeights + }); + save([$filter, x4D, res]); + return res; + }, + inputs, grad, 'FusedDepthwiseConv2D', {convInfo, activation}, + inputsToSave, outputsToSave); + if (reshapedTo4D) { + return res.as3D(res.shape[1], res.shape[2], res.shape[3]) as T; + } + return res as T; +} +export const depthwiseConv2d = op({fusedDepthwiseConv2d_}); diff --git a/tfjs-core/src/ops/fused/mat_mul.ts b/tfjs-core/src/ops/fused/mat_mul.ts new file mode 100644 index 00000000000..db0a2ed2f95 --- /dev/null +++ b/tfjs-core/src/ops/fused/mat_mul.ts @@ -0,0 +1,208 @@ +/** + * @license + * Copyright 2019 Google LLC. All Rights Reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ============================================================================= + */ + +import {ENGINE} from '../../engine'; +import {op} from '../../ops/operation'; +import {Tensor, Tensor3D} from '../../tensor'; +import {makeTypesMatch} from '../../tensor_util'; +import {convertToTensor} from '../../tensor_util_env'; +import {TensorLike} from '../../types'; +import * as util from '../../util'; +import {add} from '../add'; +import * as broadcast_util from '../broadcast_util'; +import {Activation, applyActivation, getFusedBiasGradient, getFusedDyActivation, shouldFuse} from '../fused_util'; +import {matMul as unfusedMatMul} from '../mat_mul'; + +/** + * Computes the dot product of two matrices with optional activation and bias. + * + * ```js + * const a = tf.tensor2d([-1, -2], [1, 2]); + * const b = tf.tensor2d([1, 2, 3, 4], [2, 2]); + * const bias = tf.tensor2d([1, 2], [1, 2]); + * + * tf.fused.matMul({a, b, bias, activation: 'relu'}).print(); + * ``` + * + * @param obj An object with the following properties: + * - `a` First matrix in dot product operation. + * - `b` Second matrix in dot product operation. + * - `transposeA` If true, `a` is transposed before multiplication. 
+ * - `transposeB` If true, `b` is transposed before multiplication. + * - `bias` Matrix to be added to the result. + * - `activation` Name of activation kernel (defaults to `linear`). + * - `preluActivationWeights` Tensor of prelu weights. + */ +function fusedMatMul_({ + a, + b, + transposeA = false, + transposeB = false, + bias, + activation = 'linear', + preluActivationWeights +}: { + a: T|TensorLike, + b: T|TensorLike, + transposeA?: boolean, + transposeB?: boolean, + bias?: Tensor|TensorLike, + activation?: Activation, + preluActivationWeights?: Tensor +}): T { + if (shouldFuse(ENGINE.state.gradientDepth, activation) === false) { + let result = unfusedMatMul(a, b, transposeA, transposeB); + if (bias != null) { + result = add(result, bias); + } + + return applyActivation(result, activation, preluActivationWeights) as T; + } + + let $a = convertToTensor(a, 'a', 'fused matMul'); + let $b = convertToTensor(b, 'b', 'fused matMul'); + [$a, $b] = makeTypesMatch($a, $b); + + const innerShapeA = + transposeA ? $a.shape[$a.rank - 2] : $a.shape[$a.rank - 1]; + const innerShapeB = + transposeB ? $b.shape[$b.rank - 1] : $b.shape[$b.rank - 2]; + + const outerShapeA = + transposeA ? $a.shape[$a.rank - 1] : $a.shape[$a.rank - 2]; + const outerShapeB = + transposeB ? $b.shape[$b.rank - 2] : $b.shape[$b.rank - 1]; + + const outerDimsA = $a.shape.slice(0, -2); + const outerDimsB = $b.shape.slice(0, -2); + const batchDimA = util.sizeFromShape(outerDimsA); + const batchDimB = util.sizeFromShape(outerDimsB); + + util.assert( + $a.rank >= 2 && $b.rank >= 2 && $a.rank === $b.rank, + () => + `Error in fused matMul: inputs must have the same rank of at least ` + + `2, got ranks ${$a.rank} and ${$b.rank}.`); + + util.assert( + util.arraysEqual(outerDimsA, outerDimsB), + () => `Error in fused matMul: outer dimensions (${outerDimsA}) and (` + + `${outerDimsB}) of Tensors with shapes ${$a.shape} and ` + + `${$b.shape} must match.`); + + util.assert( + innerShapeA === innerShapeB, + () => `Error in fused matMul: inner shapes (${innerShapeA}) and (` + + `${innerShapeB}) of Tensors with shapes ${$a.shape} and ` + + `${$b.shape} and transposeA=${transposeA}` + + ` and transposeB=${transposeB} must match.`); + + const outShape = $a.shape.slice(0, -2).concat([outerShapeA, outerShapeB]); + + const a3D = transposeA ? $a.as3D(batchDimA, innerShapeA, outerShapeA) : + $a.as3D(batchDimA, outerShapeA, innerShapeA); + const b3D = transposeB ? 
$b.as3D(batchDimB, outerShapeB, innerShapeB) : + $b.as3D(batchDimB, innerShapeB, outerShapeB); + + let $bias: Tensor; + if (bias != null) { + $bias = convertToTensor(bias, 'bias', 'fused matMul'); + [$bias] = makeTypesMatch($bias, $a); + + broadcast_util.assertAndGetBroadcastShape(outShape, $bias.shape); + } + + let $preluActivationWeights: Tensor; + if (preluActivationWeights != null) { + $preluActivationWeights = convertToTensor( + preluActivationWeights, 'prelu weights', 'fused matMul'); + } + + const grad = (dy: Tensor3D, saved: Tensor[]) => { + const [a3D, b3D, y] = saved; + const dyActivation = getFusedDyActivation(dy, y, activation); + + let biasGradient = {}; + if (bias != null) { + biasGradient = {bias: () => getFusedBiasGradient($bias, dyActivation)}; + } + + if (!transposeA && !transposeB) { + return Object.assign( + { + a: () => dyActivation.matMul(b3D as Tensor3D, false, true), + b: () => a3D.matMul(dyActivation, true, false) + }, + biasGradient); + } else if (!transposeA && transposeB) { + return Object.assign( + { + a: () => dyActivation.matMul(b3D as Tensor3D, false, false), + b: () => dyActivation.matMul(a3D as Tensor3D, true, false) + }, + biasGradient); + } else if (transposeA && !transposeB) { + return Object.assign( + { + a: () => b3D.matMul(dyActivation, false, true), + b: () => a3D.matMul(dyActivation, false, false) + }, + biasGradient); + } else { + return Object.assign( + { + a: () => b3D.matMul(dyActivation, true, true), + b: () => dyActivation.matMul(a3D as Tensor3D, true, true) + }, + biasGradient); + } + }; + + const inputs: + {a: Tensor, b: Tensor, + bias?: Tensor, + preluActivationWeights?: Tensor} = {a: a3D, b: b3D}; + if (bias != null) { + inputs.bias = $bias; + } + if (preluActivationWeights != null) { + inputs.preluActivationWeights = $preluActivationWeights; + } + + const inputsToSave = [a3D, b3D]; + const outputsToSave = [true]; + + const res = ENGINE.runKernelFunc( + (backend, save) => { + const y = backend.fusedBatchMatMul({ + a: a3D, + b: b3D, + transposeA, + transposeB, + bias: $bias, + activation, + preluActivationWeights: $preluActivationWeights + }); + save([a3D, b3D, y]); + return y; + }, + inputs, grad, '_FusedMatMul', {transposeA, transposeB, activation}, + inputsToSave, outputsToSave); + return res.reshape(outShape); +} + +export const matMul = op({fusedMatMul_}); diff --git a/tfjs-core/src/ops/fused_ops.ts b/tfjs-core/src/ops/fused_ops.ts index f416d224a7d..ba930839e3c 100644 --- a/tfjs-core/src/ops/fused_ops.ts +++ b/tfjs-core/src/ops/fused_ops.ts @@ -15,663 +15,9 @@ * ============================================================================= */ -import {ENGINE} from '../engine'; -import * as conv_util from '../ops/conv_util'; -import {op} from '../ops/operation'; -import {Tensor, Tensor3D, Tensor4D} from '../tensor'; -import {makeTypesMatch} from '../tensor_util'; -import {convertToTensor} from '../tensor_util_env'; -import {TensorLike} from '../types'; -import * as util from '../util'; +import {conv2d} from './fused/conv2d'; +import {depthwiseConv2d} from './fused/depthwise_conv2d'; +import {matMul} from './fused/mat_mul'; +import {Activation} from './fused_util'; -import {add} from './add'; -import * as broadcast_util from './broadcast_util'; -import {conv2d as unfusedConv2d} from './conv2d'; -import {conv2DBackpropFilter} from './conv2d_backprop_filter'; -import {conv2DBackpropInput} from './conv2d_backprop_input'; -import {depthwiseConv2d as unfusedDepthwiseConv2d} from './depthwise_conv2d'; -import 
{depthwiseConv2dNativeBackpropFilter} from './depthwise_conv2d_native_backprop_filter'; -import {depthwiseConv2dNativeBackpropInput} from './depthwise_conv2d_native_backprop_input'; -import {elu} from './elu'; -import {Activation, shouldFuse} from './fused_util'; -import {matMul as unfusedMatMul} from './mat_mul'; -import {prelu} from './prelu'; -import {relu} from './relu'; -import {relu6} from './relu6'; - -// Returns gradient for fused activation. -const getFusedDyActivation = - (dy: Tensor, y: Tensor, activation: Activation): Tensor => { - if (activation == null || activation === 'linear') { - return dy; - } - if (activation === 'relu') { - return dy.mul(y.step()); - } - throw new Error( - `Gradient for activation ${activation} has not been ` + - `implemented yet.`); - }; - -// Returns gradient for fused bias. -const getFusedBiasGradient = (bias: Tensor, dyActivation: Tensor): Tensor => { - let res = dyActivation; - const reduceAxes = - broadcast_util.getReductionAxes(bias.shape, dyActivation.shape); - if (reduceAxes.length > 0) { - res = res.sum(reduceAxes); - } - return res.reshape(bias.shape); -}; - -const applyActivation = - (x: Tensor, activation: Activation, preluActivationWeights?: Tensor): - Tensor => { - if (activation === 'linear') { - return x; - } else if (activation === 'relu') { - return relu(x); - } else if (activation === 'elu') { - return elu(x); - } else if (activation === 'relu6') { - return relu6(x); - } else if (activation === 'prelu') { - return prelu(x, preluActivationWeights); - } - throw new Error(`Unknown fused activation ${activation}.`); - }; - -/** - * Computes the dot product of two matrices with optional activation and bias. - * - * ```js - * const a = tf.tensor2d([-1, -2], [1, 2]); - * const b = tf.tensor2d([1, 2, 3, 4], [2, 2]); - * const bias = tf.tensor2d([1, 2], [1, 2]); - * - * tf.fused.matMul({a, b, bias, activation: 'relu'}).print(); - * ``` - * - * @param obj An object with the following properties: - * - `a` First matrix in dot product operation. - * - `b` Second matrix in dot product operation. - * - `transposeA` If true, `a` is transposed before multiplication. - * - `transposeB` If true, `b` is transposed before multiplication. - * - `bias` Matrix to be added to the result. - * - `activation` Name of activation kernel (defaults to `linear`). - * - `preluActivationWeights` Tensor of prelu weights. - */ -function fusedMatMul_({ - a, - b, - transposeA = false, - transposeB = false, - bias, - activation = 'linear', - preluActivationWeights -}: { - a: T|TensorLike, - b: T|TensorLike, - transposeA?: boolean, - transposeB?: boolean, - bias?: Tensor|TensorLike, - activation?: Activation, - preluActivationWeights?: Tensor -}): T { - if (shouldFuse(ENGINE.state.gradientDepth, activation) === false) { - let result = unfusedMatMul(a, b, transposeA, transposeB); - if (bias != null) { - result = add(result, bias); - } - - return applyActivation(result, activation, preluActivationWeights) as T; - } - - let $a = convertToTensor(a, 'a', 'fused matMul'); - let $b = convertToTensor(b, 'b', 'fused matMul'); - [$a, $b] = makeTypesMatch($a, $b); - - const innerShapeA = - transposeA ? $a.shape[$a.rank - 2] : $a.shape[$a.rank - 1]; - const innerShapeB = - transposeB ? $b.shape[$b.rank - 1] : $b.shape[$b.rank - 2]; - - const outerShapeA = - transposeA ? $a.shape[$a.rank - 1] : $a.shape[$a.rank - 2]; - const outerShapeB = - transposeB ? 
$b.shape[$b.rank - 2] : $b.shape[$b.rank - 1]; - - const outerDimsA = $a.shape.slice(0, -2); - const outerDimsB = $b.shape.slice(0, -2); - const batchDimA = util.sizeFromShape(outerDimsA); - const batchDimB = util.sizeFromShape(outerDimsB); - - util.assert( - $a.rank >= 2 && $b.rank >= 2 && $a.rank === $b.rank, - () => - `Error in fused matMul: inputs must have the same rank of at least ` + - `2, got ranks ${$a.rank} and ${$b.rank}.`); - - util.assert( - util.arraysEqual(outerDimsA, outerDimsB), - () => `Error in fused matMul: outer dimensions (${outerDimsA}) and (` + - `${outerDimsB}) of Tensors with shapes ${$a.shape} and ` + - `${$b.shape} must match.`); - - util.assert( - innerShapeA === innerShapeB, - () => `Error in fused matMul: inner shapes (${innerShapeA}) and (` + - `${innerShapeB}) of Tensors with shapes ${$a.shape} and ` + - `${$b.shape} and transposeA=${transposeA}` + - ` and transposeB=${transposeB} must match.`); - - const outShape = $a.shape.slice(0, -2).concat([outerShapeA, outerShapeB]); - - const a3D = transposeA ? $a.as3D(batchDimA, innerShapeA, outerShapeA) : - $a.as3D(batchDimA, outerShapeA, innerShapeA); - const b3D = transposeB ? $b.as3D(batchDimB, outerShapeB, innerShapeB) : - $b.as3D(batchDimB, innerShapeB, outerShapeB); - - let $bias: Tensor; - if (bias != null) { - $bias = convertToTensor(bias, 'bias', 'fused matMul'); - [$bias] = makeTypesMatch($bias, $a); - - broadcast_util.assertAndGetBroadcastShape(outShape, $bias.shape); - } - - let $preluActivationWeights: Tensor; - if (preluActivationWeights != null) { - $preluActivationWeights = convertToTensor( - preluActivationWeights, 'prelu weights', 'fused matMul'); - } - - const grad = (dy: Tensor3D, saved: Tensor[]) => { - const [a3D, b3D, y] = saved; - const dyActivation = getFusedDyActivation(dy, y, activation); - - let biasGradient = {}; - if (bias != null) { - biasGradient = {bias: () => getFusedBiasGradient($bias, dyActivation)}; - } - - if (!transposeA && !transposeB) { - return Object.assign( - { - a: () => dyActivation.matMul(b3D as Tensor3D, false, true), - b: () => a3D.matMul(dyActivation, true, false) - }, - biasGradient); - } else if (!transposeA && transposeB) { - return Object.assign( - { - a: () => dyActivation.matMul(b3D as Tensor3D, false, false), - b: () => dyActivation.matMul(a3D as Tensor3D, true, false) - }, - biasGradient); - } else if (transposeA && !transposeB) { - return Object.assign( - { - a: () => b3D.matMul(dyActivation, false, true), - b: () => a3D.matMul(dyActivation, false, false) - }, - biasGradient); - } else { - return Object.assign( - { - a: () => b3D.matMul(dyActivation, true, true), - b: () => dyActivation.matMul(a3D as Tensor3D, true, true) - }, - biasGradient); - } - }; - - const inputs: - {a: Tensor, b: Tensor, - bias?: Tensor, - preluActivationWeights?: Tensor} = {a: a3D, b: b3D}; - if (bias != null) { - inputs.bias = $bias; - } - if (preluActivationWeights != null) { - inputs.preluActivationWeights = $preluActivationWeights; - } - - const inputsToSave = [a3D, b3D]; - const outputsToSave = [true]; - - const res = ENGINE.runKernelFunc( - (backend, save) => { - const y = backend.fusedBatchMatMul({ - a: a3D, - b: b3D, - transposeA, - transposeB, - bias: $bias, - activation, - preluActivationWeights: $preluActivationWeights - }); - save([a3D, b3D, y]); - return y; - }, - inputs, grad, '_FusedMatMul', {transposeA, transposeB, activation}, - inputsToSave, outputsToSave); - return res.reshape(outShape); -} - -/** - * Computes a 2D convolution over the input x, optionally fused 
with adding a - * bias and applying an activation. - * - * ```js - * const inputDepth = 2; - * const inShape = [2, 2, 2, inputDepth]; - * const outputDepth = 2; - * const fSize = 1; - * const pad = 0; - * const strides = 1; - * - * const x = tf.tensor4d( [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - * 16], inShape); - * const w = tf.tensor4d([-1, 1, -2, 0.5], [fSize, fSize, inputDepth, - * outputDepth]); - * - * tf.fused.conv2d({ x, filter: w, strides, pad, dataFormat: 'NHWC', - * dilations: [1, 1], bias: tf.scalar(5), activation: 'relu' }).print(); - * ``` - * - * @param obj An object with the following properties: - * @param x The input tensor, of rank 4 or rank 3, of shape - * `[batch, height, width, inChannels]`. If rank 3, batch of 1 is - * assumed. - * @param filter The filter, rank 4, of shape - * `[filterHeight, filterWidth, inDepth, outDepth]`. - * @param strides The strides of the convolution: `[strideHeight, - * strideWidth]`. - * @param pad The type of padding algorithm. - * - `same` and stride 1: output will be of same size as input, - * regardless of filter size. - * - `valid` output will be smaller than input if filter is larger - * than 1x1. - * - For more info, see this guide: - * [https://www.tensorflow.org/api_guides/python/nn#Convolution]( - * https://www.tensorflow.org/api_guides/python/nn#Convolution) - * @param dataFormat An optional string from: "NHWC", "NCHW". Defaults to - * "NHWC". Specify the data format of the input and output data. With the - * default format "NHWC", the data is stored in the order of: [batch, - * height, width, channels]. Only "NHWC" is currently supported. - * @param dilations The dilation rates: `[dilationHeight, dilationWidth]` - * in which we sample input values across the height and width dimensions - * in atrous convolution. Defaults to `[1, 1]`. If `dilations` is a single - * number, then `dilationHeight == dilationWidth`. If it is greater than - * 1, then all values of `strides` must be 1. - * @param dimRoundingMode The rounding mode used when computing output - * dimensions if pad is a number. If none is provided, it will not round - * and error if the output is of fractional size. - * @param bias Tensor to be added to the result. - * @param activation Name of activation kernel (defaults to `linear`) to be - * applied - * after biasAdd. - * @param preluActivationWeights Tensor of prelu weights to be applied as part - * of a `prelu` activation, typically the same shape as `x`. 
- */ -function fusedConv2d_({ - x, - filter, - strides, - pad, - dataFormat = 'NHWC', - dilations = [1, 1], - dimRoundingMode, - bias, - activation = 'linear', - preluActivationWeights -}: { - x: T|TensorLike, - filter: Tensor4D|TensorLike, - strides: [number, number]|number, - pad: 'valid'|'same'|number|conv_util.ExplicitPadding, - dataFormat?: 'NHWC'|'NCHW', - dilations?: [number, number]|number, - dimRoundingMode?: 'floor'|'round'|'ceil', - bias?: Tensor|TensorLike, - activation?: Activation, - preluActivationWeights?: Tensor -}): T { - activation = activation || 'linear'; - if (shouldFuse(ENGINE.state.gradientDepth, activation) === false) { - let result = unfusedConv2d( - x, filter, strides, pad, dataFormat, dilations, dimRoundingMode); - if (bias != null) { - result = add(result, bias); - } - - return applyActivation(result, activation, preluActivationWeights) as T; - } - - const $x = convertToTensor(x, 'x', 'conv2d'); - const $filter = convertToTensor(filter, 'filter', 'conv2d'); - - let x4D = $x as Tensor4D; - let reshapedTo4D = false; - - if ($x.rank === 3) { - reshapedTo4D = true; - x4D = $x.as4D(1, $x.shape[0], $x.shape[1], $x.shape[2]); - } - util.assert( - x4D.rank === 4, - () => `Error in fused conv2d: input must be rank 4, but got rank ` + - `${x4D.rank}.`); - util.assert( - $filter.rank === 4, - () => `Error in fused conv2d: filter must be rank 4, but got rank ` + - `${$filter.rank}.`); - if (dimRoundingMode != null) { - util.assert( - util.isInt(pad as number), - () => `Error in fused conv2d: pad must be an integer when using, ` + - `dimRoundingMode ${dimRoundingMode} but got pad ${pad}.`); - } - - util.assert( - x4D.shape[3] === $filter.shape[2], - () => `Error in conv2d: depth of input (${x4D.shape[3]}) must match ` + - `input depth for filter ${$filter.shape[2]}.`); - util.assert( - conv_util.eitherStridesOrDilationsAreOne(strides, dilations), - () => 'Error in conv2D: Either strides or dilations must be 1. ' + - `Got strides ${strides} and dilations '${dilations}'`); - util.assert( - dataFormat === 'NHWC', - () => `Error in conv2d: got dataFormat of ${ - dataFormat} but only NHWC is currently supported.`); - - const convInfo = conv_util.computeConv2DInfo( - x4D.shape, $filter.shape, strides, dilations, pad, dimRoundingMode); - - let $bias: Tensor; - if (bias != null) { - $bias = convertToTensor(bias, 'bias', 'fused conv2d'); - [$bias] = makeTypesMatch($bias, $x); - - broadcast_util.assertAndGetBroadcastShape(convInfo.outShape, $bias.shape); - } - - let $preluActivationWeights: Tensor; - if (preluActivationWeights != null) { - $preluActivationWeights = convertToTensor( - preluActivationWeights, 'prelu weights', 'fused conv2d'); - } - - const grad = (dy: Tensor4D, saved: Tensor[]) => { - const [$filter, x4D, y] = saved as [Tensor4D, Tensor4D, Tensor4D]; - - const dyActivation = getFusedDyActivation(dy, y, activation) as Tensor4D; - - util.assert( - conv_util.tupleValuesAreOne(dilations), - () => 'Error in gradient of fused conv2D: ' + - `dilation rates greater than 1 ` + - `are not yet supported in gradients. 
Got dilations '${dilations}'`); - - let biasGradient = {}; - if (bias != null) { - biasGradient = {bias: () => getFusedBiasGradient($bias, dyActivation)}; - } - - return Object.assign( - { - x: () => conv2DBackpropInput( - x4D.shape, dyActivation, $filter, strides, pad), - filter: () => conv2DBackpropFilter( - x4D, dyActivation, $filter.shape, strides, pad) - }, - biasGradient); - }; - - const inputs: { - x: Tensor, - filter: Tensor, - bias?: Tensor, - preluActivationWeights?: Tensor - } = {x: x4D, filter: $filter}; - if (bias != null) { - inputs.bias = $bias; - } - if (preluActivationWeights != null) { - inputs.preluActivationWeights = $preluActivationWeights; - } - - const inputsToSave = [$filter, x4D]; - const outputsToSave = [true]; // Save the only output. - const res = ENGINE.runKernelFunc( - (backend, save) => { - const res = backend.fusedConv2d({ - input: x4D, - filter: $filter, - convInfo, - bias: $bias, - activation, - preluActivationWeights: $preluActivationWeights - }); - save([$filter, x4D, res]); - return res; - }, - inputs, grad, 'FusedConv2D', {convInfo, activation}, inputsToSave, - outputsToSave); - - if (reshapedTo4D) { - return res.as3D(res.shape[1], res.shape[2], res.shape[3]) as T; - } - - return res as T; -} - -/** - * Computes depthwise 2D convolution, optionally fused with adding a - * bias and applying an activation. - * - * Given a 4D `input` array and a `filter` array of shape - * `[filterHeight, filterWidth, inChannels, channelMultiplier]` containing - * `inChannels` convolutional filters of depth 1, this op applies a - * different filter to each input channel (expanding from 1 channel to - * `channelMultiplier` channels for each), then concatenates the results - * together. The output has `inChannels * channelMultiplier` channels. - * - * See - * [https://www.tensorflow.org/api_docs/python/tf/nn/depthwise_conv2d]( - * https://www.tensorflow.org/api_docs/python/tf/nn/depthwise_conv2d) - * for more details. - * - * @param obj An object with the following properties: - * @param x The input tensor, of rank 4 or rank 3, of shape - * `[batch, height, width, inChannels]`. If rank 3, batch of 1 is - * assumed. - * @param filter The filter tensor, rank 4, of shape - * `[filterHeight, filterWidth, inChannels, channelMultiplier]`. - * @param strides The strides of the convolution: `[strideHeight, - * strideWidth]`. If strides is a single number, then `strideHeight == - * strideWidth`. - * @param pad The type of padding algorithm. - * - `same` and stride 1: output will be of same size as input, - * regardless of filter size. - * - `valid`: output will be smaller than input if filter is larger - * than 1x1. - * - For more info, see this guide: - * [https://www.tensorflow.org/api_guides/python/nn#Convolution]( - * https://www.tensorflow.org/api_guides/python/nn#Convolution) - * @param dilations The dilation rates: `[dilationHeight, dilationWidth]` - * in which we sample input values across the height and width dimensions - * in atrous convolution. Defaults to `[1, 1]`. If `rate` is a single - * number, then `dilationHeight == dilationWidth`. If it is greater than - * 1, then all values of `strides` must be 1. - * @param dataFormat: An optional string from: "NHWC", "NCHW". Defaults to - * "NHWC". Specify the data format of the input and output data. With the - * default format "NHWC", the data is stored in the order of: [batch, - * height, width, channels]. Only "NHWC" is currently supported. 
- * @param dimRoundingMode The rounding mode used when computing output - * dimensions if pad is a number. If none is provided, it will not round - * and error if the output is of fractional size. - * @param bias Tensor to be added to the result. - * @param activation Name of activation kernel (defaults to `linear`). - * @param preluActivationWeights Tensor of prelu weights to be applied as part - * of a `prelu` activation, typically the same shape as `x`. - */ -function fusedDepthwiseConv2d_({ - x, - filter, - strides, - pad, - dataFormat = 'NHWC', - dilations = [1, 1], - dimRoundingMode, - bias, - activation = 'linear', - preluActivationWeights -}: { - x: T|TensorLike, - filter: Tensor4D|TensorLike, - strides: [number, number]|number, - pad: 'valid'|'same'|number, - dataFormat?: 'NHWC'|'NCHW', - dilations?: [number, number]|number, - dimRoundingMode?: 'floor'|'round'|'ceil', - bias?: Tensor|TensorLike, - activation?: Activation, - preluActivationWeights?: Tensor -}): T { - if (shouldFuse(ENGINE.state.gradientDepth, activation) === false) { - let result = unfusedDepthwiseConv2d( - x, filter, strides, pad, dataFormat, dilations, dimRoundingMode); - if (bias != null) { - result = add(result, bias); - } - - return applyActivation(result, activation, preluActivationWeights) as T; - } - - const $x = convertToTensor(x, 'x', 'depthwiseConv2d'); - const $filter = convertToTensor(filter, 'filter', 'depthwiseConv2d'); - - let x4D = $x as Tensor4D; - let reshapedTo4D = false; - if ($x.rank === 3) { - reshapedTo4D = true; - x4D = $x.as4D(1, $x.shape[0], $x.shape[1], $x.shape[2]); - } - util.assert( - x4D.rank === 4, - () => `Error in fused depthwiseConv2d: input must be rank 4, but got ` + - `rank ${x4D.rank}.`); - util.assert( - $filter.rank === 4, - () => `Error in fused depthwiseConv2d: filter must be rank 4, ` + - `but got rank ${$filter.rank}.`); - util.assert( - x4D.shape[3] === $filter.shape[2], - () => `Error in fused depthwiseConv2d: number of input channels ` + - `(${x4D.shape[3]}) must match the inChannels dimension in ` + - `filter ${$filter.shape[2]}.`); - if (dilations == null) { - dilations = [1, 1]; - } - util.assert( - conv_util.eitherStridesOrDilationsAreOne(strides, dilations), - () => - 'Error in fused depthwiseConv2d: Either strides or dilations must ' + - `be 1. Got strides ${strides} and dilations '${dilations}'`); - - if (dimRoundingMode != null) { - util.assert( - util.isInt(pad as number), - () => `Error in fused depthwiseConv2d: pad must be an integer when ` + - `using dimRoundingMode ${dimRoundingMode} but got pad ${pad}.`); - } - - const convInfo = conv_util.computeConv2DInfo( - x4D.shape, $filter.shape, strides, dilations, pad, dimRoundingMode, - true /* depthwise */); - - let $bias: Tensor; - if (bias != null) { - $bias = convertToTensor(bias, 'bias', 'fused conv2d'); - [$bias] = makeTypesMatch($bias, $x); - - broadcast_util.assertAndGetBroadcastShape(convInfo.outShape, $bias.shape); - } - - let $preluActivationWeights: Tensor; - if (preluActivationWeights != null) { - $preluActivationWeights = convertToTensor( - preluActivationWeights, 'prelu weights', 'fused depthwiseConv2d'); - } - - const grad = (dy: Tensor4D, saved: Tensor[]) => { - util.assert( - conv_util.tupleValuesAreOne(dilations), - () => 'Error in gradient of fused depthwiseConv2d: dilation rates ' + - `greater than 1 are not yet supported. 
Got dilations ` + - `'${dilations}'`); - const [$filter, x4D, y] = saved; - - const dyActivation = getFusedDyActivation(dy, y, activation) as Tensor4D; - - let biasGradient = {}; - if (bias != null) { - biasGradient = {bias: () => getFusedBiasGradient($bias, dyActivation)}; - } - - return Object.assign( - { - x: () => depthwiseConv2dNativeBackpropInput( - (x4D as Tensor4D).shape, dyActivation, $filter as Tensor4D, - convInfo), - filter: () => depthwiseConv2dNativeBackpropFilter( - x4D as Tensor4D, dyActivation, ($filter as Tensor4D).shape, - convInfo), - }, - biasGradient); - }; - - const inputs: { - x: Tensor, - filter: Tensor, - bias?: Tensor, - preluActivationWeights?: Tensor - } = {x: x4D, filter: $filter}; - if (bias != null) { - inputs.bias = $bias; - } - if (preluActivationWeights != null) { - inputs.preluActivationWeights = $preluActivationWeights; - } - - const inputsToSave = [$filter, x4D]; - const outputsToSave = [true]; - const res = ENGINE.runKernelFunc( - (backend, save) => { - const res = backend.fusedDepthwiseConv2D({ - input: x4D, - filter: $filter, - convInfo, - bias: $bias, - activation, - preluActivationWeights: $preluActivationWeights - }); - save([$filter, x4D, res]); - return res; - }, - inputs, grad, 'FusedDepthwiseConv2D', {convInfo, activation}, - inputsToSave, outputsToSave); - if (reshapedTo4D) { - return res.as3D(res.shape[1], res.shape[2], res.shape[3]) as T; - } - return res as T; -} - -export const matMul = op({fusedMatMul_}); -export const conv2d = op({fusedConv2d_}); -export const depthwiseConv2d = op({fusedDepthwiseConv2d_}); - -export {Activation}; +export {Activation, conv2d, depthwiseConv2d, matMul}; diff --git a/tfjs-core/src/ops/fused_util.ts b/tfjs-core/src/ops/fused_util.ts index d9f16133966..ff90c751c0f 100644 --- a/tfjs-core/src/ops/fused_util.ts +++ b/tfjs-core/src/ops/fused_util.ts @@ -16,8 +16,13 @@ */ import {Tensor, Tensor3D, Tensor4D} from '../tensor'; +import * as broadcast_util from './broadcast_util'; import {Conv2DInfo} from './conv_util'; +import {elu} from './elu'; +import {prelu} from './prelu'; +import {relu} from './relu'; +import {relu6} from './relu6'; export type Activation = 'linear'|'relu'|'prelu'|'elu'|'relu6'; @@ -45,3 +50,46 @@ export const shouldFuse = (gradientDepth: number, activation: Activation) => { const gradientMode = gradientDepth > 0; return !gradientMode || activation === 'linear'; }; + +// Returns gradient for fused activation. +export function getFusedDyActivation( + dy: Tensor, y: Tensor, activation: Activation): Tensor { + if (activation == null || activation === 'linear') { + return dy; + } + if (activation === 'relu') { + return dy.mul(y.step()); + } + throw new Error( + `Gradient for activation ${activation} has not been ` + + `implemented yet.`); +} + +// Returns gradient for fused bias. 
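+// A broadcasted bias contributes to every output element it was added to, so
+// its gradient is the incoming gradient summed over the broadcast (reduction)
+// axes and reshaped back to the bias shape.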
+export function getFusedBiasGradient( + bias: Tensor, dyActivation: Tensor): Tensor { + let res = dyActivation; + const reduceAxes = + broadcast_util.getReductionAxes(bias.shape, dyActivation.shape); + if (reduceAxes.length > 0) { + res = res.sum(reduceAxes); + } + return res.reshape(bias.shape); +} + +export function applyActivation( + x: Tensor, activation: Activation, + preluActivationWeights?: Tensor): Tensor { + if (activation === 'linear') { + return x; + } else if (activation === 'relu') { + return relu(x); + } else if (activation === 'elu') { + return elu(x); + } else if (activation === 'relu6') { + return relu6(x); + } else if (activation === 'prelu') { + return prelu(x, preluActivationWeights); + } + throw new Error(`Unknown fused activation ${activation}.`); +} From 2c0cf0dd602c229531a9d19cc02f8d30694ffb03 Mon Sep 17 00:00:00 2001 From: Yannick Assogba Date: Mon, 13 Jul 2020 13:23:13 -0400 Subject: [PATCH 02/12] remove unused kernel interfaces --- tfjs-core/src/kernel_names.ts | 39 ++++++----------------------------- 1 file changed, 6 insertions(+), 33 deletions(-) diff --git a/tfjs-core/src/kernel_names.ts b/tfjs-core/src/kernel_names.ts index efe84f8ac85..2020d63ee8a 100644 --- a/tfjs-core/src/kernel_names.ts +++ b/tfjs-core/src/kernel_names.ts @@ -21,7 +21,6 @@ import {ExplicitPadding} from '../src/ops/conv_util'; import {NamedTensorInfoMap, TensorInfo} from './kernel_registry'; -import {Activation} from './ops/fused_util'; import {DataType, PixelData} from './types'; export const Abs = 'Abs'; @@ -734,36 +733,10 @@ export interface FromPixelsAttrs { numChannels: number; } -export const _FusedMatMul = '_FusedMatMul'; -export type _FusedMatMulInputs = - Pick; -// tslint:disable-next-line: class-name -export interface _FusedMatMulAttrs { - transposeA: number; - transposeB: number; - activation: Activation; -} - -export const FusedConv2D = 'FusedConv2D'; -export type FusedConv2DInputs = - Pick; -export interface FusedConv2DAttrs { - strides: [number, number]|number; - pad: 'valid'|'same'|number|ExplicitPadding; - dataFormat: 'NHWC'|'NCHW'; - dilations: [number, number]|number; - dimRoundingMode: 'floor'|'round'|'ceil'; - activation: Activation; -} - -export const FusedDepthwiseConv2D = 'FusedDepthwiseConv2D'; -export type FusedDepthwiseConv2DInputs = - Pick; -export interface FusedDepthwiseConv2DAttrs { - strides: [number, number]|number; - pad: 'valid'|'same'|number; - dataFormat: 'NHWC'|'NCHW'; - dilations: [number, number]|number; - dimRoundingMode: 'floor'|'round'|'ceil'; - activation: Activation; +export const RotateWithOffset = 'RotateWithOffset'; +export type RotateWithOffsetInputs = Pick; +export interface RotateWithOffsetAttrs { + radians: number; + fillValue: number|[number, number, number]; + center: number|[number, number]; } From c5a81b01b6c25dc2bddb3b91dd754850c20f3ed9 Mon Sep 17 00:00:00 2001 From: Yannick Assogba Date: Mon, 13 Jul 2020 15:00:28 -0400 Subject: [PATCH 03/12] fix circular deps --- tfjs-core/src/backends/backend.ts | 2 +- tfjs-core/src/backends/backend_util.ts | 3 +- tfjs-core/src/ops/fused/conv2d.ts | 3 +- tfjs-core/src/ops/fused/depthwise_conv2d.ts | 3 +- tfjs-core/src/ops/fused/mat_mul.ts | 3 +- tfjs-core/src/ops/fused/types.ts | 40 +++++++++++++++++++++ tfjs-core/src/ops/fused_ops.ts | 2 +- tfjs-core/src/ops/fused_util.ts | 27 ++------------ 8 files changed, 53 insertions(+), 30 deletions(-) create mode 100644 tfjs-core/src/ops/fused/types.ts diff --git a/tfjs-core/src/backends/backend.ts b/tfjs-core/src/backends/backend.ts index 
a93f3552b15..23cf94a0ca5 100644 --- a/tfjs-core/src/backends/backend.ts +++ b/tfjs-core/src/backends/backend.ts @@ -16,7 +16,7 @@ */ import {Conv2DInfo, Conv3DInfo} from '../ops/conv_util'; -import {FusedBatchMatMulConfig, FusedConv2DConfig} from '../ops/fused_util'; +import {FusedBatchMatMulConfig, FusedConv2DConfig} from '../ops/fused/types'; import {Backend, DataId, Scalar, Tensor, Tensor1D, Tensor2D, Tensor3D, Tensor4D, Tensor5D} from '../tensor'; import {BackendValues, DataType, Rank, ShapeMap} from '../types'; diff --git a/tfjs-core/src/backends/backend_util.ts b/tfjs-core/src/backends/backend_util.ts index 6baa8b629f5..6f43b966f12 100644 --- a/tfjs-core/src/backends/backend_util.ts +++ b/tfjs-core/src/backends/backend_util.ts @@ -31,7 +31,8 @@ export * from '../ops/axis_util'; export * from '../ops/broadcast_util'; export * from '../ops/concat_util'; export * from '../ops/conv_util'; -export {Activation, FusedConv2DConfig} from '../ops/fused_util'; +export * from '../ops/fused_util'; +export * from '../ops/fused/types'; export * from '../ops/reduce_util'; export {BackendValues, TypedArray, upcastType, PixelData} from '../types'; diff --git a/tfjs-core/src/ops/fused/conv2d.ts b/tfjs-core/src/ops/fused/conv2d.ts index e3504076aa3..6a051c36ae4 100644 --- a/tfjs-core/src/ops/fused/conv2d.ts +++ b/tfjs-core/src/ops/fused/conv2d.ts @@ -27,9 +27,10 @@ import {conv2d as unfusedConv2d} from '../conv2d'; import {conv2DBackpropFilter} from '../conv2d_backprop_filter'; import {conv2DBackpropInput} from '../conv2d_backprop_input'; import {applyActivation, getFusedBiasGradient, getFusedDyActivation} from '../fused_util'; -import {Activation, shouldFuse} from '../fused_util'; +import {shouldFuse} from '../fused_util'; import * as conv_util from '../ops/../conv_util'; import {op} from '../ops/../operation'; +import {Activation} from './types'; /** * Computes a 2D convolution over the input x, optionally fused with adding a diff --git a/tfjs-core/src/ops/fused/depthwise_conv2d.ts b/tfjs-core/src/ops/fused/depthwise_conv2d.ts index cadefd6d964..4b9b8067399 100644 --- a/tfjs-core/src/ops/fused/depthwise_conv2d.ts +++ b/tfjs-core/src/ops/fused/depthwise_conv2d.ts @@ -28,7 +28,8 @@ import * as broadcast_util from '../broadcast_util'; import {depthwiseConv2d as unfusedDepthwiseConv2d} from '../depthwise_conv2d'; import {depthwiseConv2dNativeBackpropFilter} from '../depthwise_conv2d_native_backprop_filter'; import {depthwiseConv2dNativeBackpropInput} from '../depthwise_conv2d_native_backprop_input'; -import {Activation, applyActivation, getFusedBiasGradient, getFusedDyActivation, shouldFuse} from '../fused_util'; +import {applyActivation, getFusedBiasGradient, getFusedDyActivation, shouldFuse} from '../fused_util'; +import {Activation} from './types'; /** * Computes depthwise 2D convolution, optionally fused with adding a diff --git a/tfjs-core/src/ops/fused/mat_mul.ts b/tfjs-core/src/ops/fused/mat_mul.ts index db0a2ed2f95..66e8a9fcff6 100644 --- a/tfjs-core/src/ops/fused/mat_mul.ts +++ b/tfjs-core/src/ops/fused/mat_mul.ts @@ -24,8 +24,9 @@ import {TensorLike} from '../../types'; import * as util from '../../util'; import {add} from '../add'; import * as broadcast_util from '../broadcast_util'; -import {Activation, applyActivation, getFusedBiasGradient, getFusedDyActivation, shouldFuse} from '../fused_util'; +import {applyActivation, getFusedBiasGradient, getFusedDyActivation, shouldFuse} from '../fused_util'; import {matMul as unfusedMatMul} from '../mat_mul'; +import {Activation} from './types'; /** * 
Computes the dot product of two matrices with optional activation and bias. diff --git a/tfjs-core/src/ops/fused/types.ts b/tfjs-core/src/ops/fused/types.ts new file mode 100644 index 00000000000..ffd5c423a4e --- /dev/null +++ b/tfjs-core/src/ops/fused/types.ts @@ -0,0 +1,40 @@ +/** + * @license + * Copyright 2020 Google Inc. All Rights Reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ============================================================================= + */ + +import {Tensor, Tensor3D, Tensor4D} from '../../tensor'; +import {Conv2DInfo} from '../conv_util'; + +export type FusedConv2DConfig = { + input: Tensor4D, + filter: Tensor4D, + convInfo: Conv2DInfo, + bias?: Tensor, + activation?: Activation, + preluActivationWeights?: Tensor +}; + +export type FusedBatchMatMulConfig = { + a: Tensor3D, + b: Tensor3D, + transposeA: boolean, + transposeB: boolean, + bias?: Tensor, + activation?: Activation, + preluActivationWeights?: Tensor +}; + +export type Activation = 'linear'|'relu'|'prelu'|'elu'|'relu6'; diff --git a/tfjs-core/src/ops/fused_ops.ts b/tfjs-core/src/ops/fused_ops.ts index ba930839e3c..bf458da5c9e 100644 --- a/tfjs-core/src/ops/fused_ops.ts +++ b/tfjs-core/src/ops/fused_ops.ts @@ -18,6 +18,6 @@ import {conv2d} from './fused/conv2d'; import {depthwiseConv2d} from './fused/depthwise_conv2d'; import {matMul} from './fused/mat_mul'; -import {Activation} from './fused_util'; +import {Activation} from './fused/types'; export {Activation, conv2d, depthwiseConv2d, matMul}; diff --git a/tfjs-core/src/ops/fused_util.ts b/tfjs-core/src/ops/fused_util.ts index ff90c751c0f..8784be9af29 100644 --- a/tfjs-core/src/ops/fused_util.ts +++ b/tfjs-core/src/ops/fused_util.ts @@ -15,36 +15,15 @@ * ============================================================================= */ -import {Tensor, Tensor3D, Tensor4D} from '../tensor'; -import * as broadcast_util from './broadcast_util'; +import {Tensor} from '../tensor'; -import {Conv2DInfo} from './conv_util'; +import * as broadcast_util from './broadcast_util'; import {elu} from './elu'; +import {Activation} from './fused/types'; import {prelu} from './prelu'; import {relu} from './relu'; import {relu6} from './relu6'; -export type Activation = 'linear'|'relu'|'prelu'|'elu'|'relu6'; - -export type FusedBatchMatMulConfig = { - a: Tensor3D, - b: Tensor3D, - transposeA: boolean, - transposeB: boolean, - bias?: Tensor, - activation?: Activation, - preluActivationWeights?: Tensor -}; - -export type FusedConv2DConfig = { - input: Tensor4D, - filter: Tensor4D, - convInfo: Conv2DInfo, - bias?: Tensor, - activation?: Activation, - preluActivationWeights?: Tensor -}; - // Whether we should call fused ops. export const shouldFuse = (gradientDepth: number, activation: Activation) => { const gradientMode = gradientDepth > 0; From e737248d91f4bfb93edee5313369211bebae19c3 Mon Sep 17 00:00:00 2001 From: Yannick Assogba Date: Tue, 14 Jul 2020 17:13:51 -0400 Subject: [PATCH 04/12] move fused files to top level. 
delete gradients --- tfjs-core/src/backends/backend.ts | 2 +- tfjs-core/src/backends/backend_util.ts | 2 +- .../ops/{fused/conv2d.ts => fused_conv2d.ts} | 71 +- tfjs-core/src/ops/fused_conv2d_test.ts | 864 ++++++++++ ...se_conv2d.ts => fused_depthwise_conv2d.ts} | 68 +- .../src/ops/fused_depthwise_conv2d_test.ts | 253 +++ .../{fused/mat_mul.ts => fused_mat_mul.ts} | 78 +- tfjs-core/src/ops/fused_mat_mul_test.ts | 310 ++++ tfjs-core/src/ops/fused_ops.ts | 8 +- tfjs-core/src/ops/fused_test.ts | 1391 ----------------- .../ops/{fused/types.ts => fused_types.ts} | 4 +- tfjs-core/src/ops/fused_util.ts | 8 +- tfjs-core/src/tests.ts | 4 +- 13 files changed, 1481 insertions(+), 1582 deletions(-) rename tfjs-core/src/ops/{fused/conv2d.ts => fused_conv2d.ts} (81%) create mode 100644 tfjs-core/src/ops/fused_conv2d_test.ts rename tfjs-core/src/ops/{fused/depthwise_conv2d.ts => fused_depthwise_conv2d.ts} (75%) create mode 100644 tfjs-core/src/ops/fused_depthwise_conv2d_test.ts rename tfjs-core/src/ops/{fused/mat_mul.ts => fused_mat_mul.ts} (67%) create mode 100644 tfjs-core/src/ops/fused_mat_mul_test.ts delete mode 100644 tfjs-core/src/ops/fused_test.ts rename tfjs-core/src/ops/{fused/types.ts => fused_types.ts} (92%) diff --git a/tfjs-core/src/backends/backend.ts b/tfjs-core/src/backends/backend.ts index 23cf94a0ca5..66907f2fa8d 100644 --- a/tfjs-core/src/backends/backend.ts +++ b/tfjs-core/src/backends/backend.ts @@ -16,7 +16,7 @@ */ import {Conv2DInfo, Conv3DInfo} from '../ops/conv_util'; -import {FusedBatchMatMulConfig, FusedConv2DConfig} from '../ops/fused/types'; +import {FusedBatchMatMulConfig, FusedConv2DConfig} from '../ops/fused_types'; import {Backend, DataId, Scalar, Tensor, Tensor1D, Tensor2D, Tensor3D, Tensor4D, Tensor5D} from '../tensor'; import {BackendValues, DataType, Rank, ShapeMap} from '../types'; diff --git a/tfjs-core/src/backends/backend_util.ts b/tfjs-core/src/backends/backend_util.ts index 6f43b966f12..c1302627922 100644 --- a/tfjs-core/src/backends/backend_util.ts +++ b/tfjs-core/src/backends/backend_util.ts @@ -32,7 +32,7 @@ export * from '../ops/broadcast_util'; export * from '../ops/concat_util'; export * from '../ops/conv_util'; export * from '../ops/fused_util'; -export * from '../ops/fused/types'; +export * from '../ops/fused_types'; export * from '../ops/reduce_util'; export {BackendValues, TypedArray, upcastType, PixelData} from '../types'; diff --git a/tfjs-core/src/ops/fused/conv2d.ts b/tfjs-core/src/ops/fused_conv2d.ts similarity index 81% rename from tfjs-core/src/ops/fused/conv2d.ts rename to tfjs-core/src/ops/fused_conv2d.ts index 6a051c36ae4..5d507b6f415 100644 --- a/tfjs-core/src/ops/fused/conv2d.ts +++ b/tfjs-core/src/ops/fused_conv2d.ts @@ -15,22 +15,19 @@ * ============================================================================= */ -import {ENGINE} from '../../engine'; -import {Tensor, Tensor3D, Tensor4D} from '../../tensor'; -import {makeTypesMatch} from '../../tensor_util'; -import {convertToTensor} from '../../tensor_util_env'; -import {TensorLike} from '../../types'; -import * as util from '../../util'; -import {add} from '../add'; -import * as broadcast_util from '../broadcast_util'; -import {conv2d as unfusedConv2d} from '../conv2d'; -import {conv2DBackpropFilter} from '../conv2d_backprop_filter'; -import {conv2DBackpropInput} from '../conv2d_backprop_input'; -import {applyActivation, getFusedBiasGradient, getFusedDyActivation} from '../fused_util'; -import {shouldFuse} from '../fused_util'; -import * as conv_util from '../ops/../conv_util'; 
-import {op} from '../ops/../operation'; -import {Activation} from './types'; +import {ENGINE} from '../engine'; +import {Tensor, Tensor3D, Tensor4D} from '../tensor'; +import {makeTypesMatch} from '../tensor_util'; +import {convertToTensor} from '../tensor_util_env'; +import {TensorLike} from '../types'; +import * as util from '../util'; + +import * as broadcast_util from './broadcast_util'; +import * as conv_util from './conv_util'; +import {Activation} from './fused_types'; +import {op} from './operation'; + + /** * Computes a 2D convolution over the input x, optionally fused with adding a @@ -168,15 +165,6 @@ function fusedConv2d_({ preluActivationWeights?: Tensor }): T { activation = activation || 'linear'; - if (shouldFuse(ENGINE.state.gradientDepth, activation) === false) { - let result = unfusedConv2d( - x, filter, strides, pad, dataFormat, dilations, dimRoundingMode); - if (bias != null) { - result = add(result, bias); - } - - return applyActivation(result, activation, preluActivationWeights) as T; - } const $x = convertToTensor(x, 'x', 'conv2d'); const $filter = convertToTensor(filter, 'filter', 'conv2d'); @@ -233,32 +221,6 @@ function fusedConv2d_({ preluActivationWeights, 'prelu weights', 'fused conv2d'); } - const grad = (dy: Tensor4D, saved: Tensor[]) => { - const [$filter, x4D, y] = saved as [Tensor4D, Tensor4D, Tensor4D]; - - const dyActivation = getFusedDyActivation(dy, y, activation) as Tensor4D; - - util.assert( - conv_util.tupleValuesAreOne(dilations), - () => 'Error in gradient of fused conv2D: ' + - `dilation rates greater than 1 ` + - `are not yet supported in gradients. Got dilations '${dilations}'`); - - let biasGradient = {}; - if (bias != null) { - biasGradient = {bias: () => getFusedBiasGradient($bias, dyActivation)}; - } - - return Object.assign( - { - x: () => conv2DBackpropInput( - x4D.shape, dyActivation, $filter, strides, pad), - filter: () => conv2DBackpropFilter( - x4D, dyActivation, $filter.shape, strides, pad) - }, - biasGradient); - }; - const inputs: { x: Tensor, filter: Tensor, @@ -275,7 +237,7 @@ function fusedConv2d_({ const inputsToSave = [$filter, x4D]; const outputsToSave = [true]; // Save the only output. const res = ENGINE.runKernelFunc( - (backend, save) => { + (backend) => { const res = backend.fusedConv2d({ input: x4D, filter: $filter, @@ -284,11 +246,10 @@ function fusedConv2d_({ activation, preluActivationWeights: $preluActivationWeights }); - save([$filter, x4D, res]); return res; }, - inputs, grad, 'FusedConv2D', {convInfo, activation}, inputsToSave, - outputsToSave); + inputs, null /* grad */, 'FusedConv2D', {convInfo, activation}, + inputsToSave, outputsToSave); if (reshapedTo4D) { return res.as3D(res.shape[1], res.shape[2], res.shape[3]) as T; diff --git a/tfjs-core/src/ops/fused_conv2d_test.ts b/tfjs-core/src/ops/fused_conv2d_test.ts new file mode 100644 index 00000000000..1f2b34cc84d --- /dev/null +++ b/tfjs-core/src/ops/fused_conv2d_test.ts @@ -0,0 +1,864 @@ +/** + * @license + * Copyright 2020 Google LLC. All Rights Reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * ============================================================================= + */ + +import * as tf from '../index'; +import {ALL_ENVS, describeWithFlags} from '../jasmine_util'; +import {expectArraysClose} from '../test_util'; + +function generateCaseInputs(totalSizeTensor: number, totalSizeFilter: number) { + const inp = new Array(totalSizeTensor); + const filt = new Array(totalSizeFilter); + + for (let i = 0; i < totalSizeTensor; i++) { + inp[i] = i * 0.001 - totalSizeTensor * 0.001 / 2; + } + for (let i = 0; i < totalSizeFilter; i++) { + const sign = i % 2 === 0 ? -1 : 1; + filt[i] = i * 0.001 * sign; + } + + return {input: inp, filter: filt}; +} + +describeWithFlags('fused conv2d', ALL_ENVS, () => { + it('basic', async () => { + const inputDepth = 2; + const inShape: [number, number, number, number] = [2, 2, 2, inputDepth]; + const outputDepth = 2; + const fSize = 1; + const pad = 0; + const stride = 1; + + const x = tf.tensor4d( + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], inShape); + const w = + tf.tensor4d([-1, 1, -2, 0.5], [fSize, fSize, inputDepth, outputDepth]); + + const result = tf.fused.conv2d({x, filter: w, strides: stride, pad}); + expect(result.shape).toEqual([2, 2, 2, 2]); + const expected = + [-5, 2, -11, 5, -17, 8, -23, 11, -29, 14, -35, 17, -41, 20, -47, 23]; + + expectArraysClose(await result.data(), expected); + }); + + it('basic with relu', async () => { + const inputDepth = 2; + const inShape: [number, number, number, number] = [2, 2, 2, inputDepth]; + const outputDepth = 2; + const fSize = 1; + const pad = 0; + const stride = 1; + + const x = tf.tensor4d( + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], inShape); + const w = + tf.tensor4d([-1, 1, -2, 0.5], [fSize, fSize, inputDepth, outputDepth]); + + const result = tf.fused.conv2d({ + x, + filter: w, + strides: stride, + pad, + dataFormat: 'NHWC', + dilations: [1, 1], + activation: 'relu' + }); + expect(result.shape).toEqual([2, 2, 2, 2]); + const expected = [0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0, 17, 0, 20, 0, 23]; + + expectArraysClose(await result.data(), expected); + }); + + it('relu with stride 2 x=[1,8,8,16] f=[3,3,16,1] s=[2,2] d=1 p=same', + async () => { + const inputDepth = 16; + const xSize = 8; + const inputShape: [number, number, number, number] = + [1, xSize, xSize, inputDepth]; + const outputDepth = 1; + const fSize = 3; + const pad = 'same'; + const stride: [number, number] = [2, 2]; + + // TODO(annxingyuan): Make this test work with large inputs + // https://github.com/tensorflow/tfjs/issues/3143 + const inputData = []; + for (let i = 0; i < xSize * xSize * inputDepth; i++) { + inputData.push(i % 5); + } + + const wData = []; + for (let i = 0; i < fSize * fSize * inputDepth * outputDepth; i++) { + wData.push(i % 5); + } + + const x = tf.tensor4d(inputData, inputShape); + const w = tf.tensor4d(wData, [fSize, fSize, inputDepth, outputDepth]); + + const result = tf.fused.conv2d({ + x, + filter: w, + strides: stride, + pad, + dataFormat: 'NHWC', + dilations: [1, 1], + activation: 'relu' + }); + expect(result.shape).toEqual([1, 4, 4, 1]); + expectArraysClose(await result.data(), new Float32Array([ + 854, 431, 568, 382, 580, 427, 854, 288, 431, 568, + 580, 289, 285, 570, 285, 258 + ])); + }); + + it('relu bias stride 2 x=[1,8,8,16] f=[3,3,16,1] s=[2,2] d=8 p=same', + async () => { + const inputDepth = 16; + const xSize = 8; + const inputShape: [number, number, number, 
number] = + [1, xSize, xSize, inputDepth]; + const outputDepth = 8; + const fSize = 3; + const pad = 'same'; + const stride: [number, number] = [2, 2]; + + const inputs = generateCaseInputs( + 1 * xSize * xSize * inputDepth, + fSize * fSize * inputDepth * outputDepth); + const x = tf.tensor4d(inputs.input, inputShape); + const w = + tf.tensor4d(inputs.filter, [fSize, fSize, inputDepth, outputDepth]); + const bias = tf.tensor1d([1, 4, 2, 3, 9, 6, 5, 8]); + const result = tf.fused.conv2d({ + x, + filter: w, + strides: stride, + pad, + dataFormat: 'NHWC', + dilations: [1, 1], + activation: 'relu', + bias + }); + expect(result.shape).toEqual([1, 4, 4, 8]); + expectArraysClose(await result.data(), new Float32Array([ + 25.75398063659668, + 0, + 26.857805252075195, + 0, + 33.961631774902344, + 0, + 30.065458297729492, + 0, + 23.118206024169922, + 0, + 24.212820053100586, + 0, + 31.307422637939453, + 0, + 27.402034759521484, + 0, + 20.482431411743164, + 0, + 21.567821502685547, + 0, + 28.653217315673828, + 0, + 24.73861312866211, + 0, + 11.078080177307129, + 0, + 12.130399703979492, + 0, + 19.182720184326172, + 0, + 15.235037803649902, + 0, + 4.6677775382995605, + 0.31717729568481445, + 5.697869777679443, + 0, + 12.727968215942383, + 2.2569849491119385, + 8.758066177368164, + 4.226885795593262, + 2.0319995880126953, + 2.9575586318969727, + 3.052880048751831, + 1.9366796016693115, + 10.073760032653809, + 4.915799617767334, + 6.094639778137207, + 6.89492130279541, + 0, + 5.5979437828063965, + 0.4078875780105591, + 4.586280822753906, + 7.419551849365234, + 7.5746169090271, + 3.43121600151062, + 9.562952041625977, + 0, + 6.404943943023682, + 0, + 5.401776313781738, + 6.5998077392578125, + 8.398608207702637, + 2.602976083755493, + 10.395440101623535, + 0, + 21.440250396728516, + 0, + 20.483882904052734, + 0, + 23.527509689331055, + 0, + 25.571144104003906, + 0, + 24.080629348754883, + 0, + 23.133480072021484, + 0, + 26.186328887939453, + 0, + 28.239177703857422, + 0, + 26.721012115478516, + 0, + 25.783079147338867, + 0, + 28.84514808654785, + 0, + 30.907209396362305, + 0, + 18.914127349853516, + 0, + 17.960111618041992, + 0, + 21.006093978881836, + 0, + 23.052082061767578, + 0, + 17.89089584350586, + 0, + 16.95684814453125, + 0, + 20.022798538208008, + 0, + 22.088754653930664, + 0, + 19.06132698059082, + 0, + 18.133424758911133, + 0, + 21.205520629882812, + 0, + 23.27761459350586, + 0, + 20.23175811767578, + 0, + 19.309999465942383, + 0, + 22.388240814208984, + 0, + 24.46647834777832, + 0, + 13.584352493286133, + 0, + 12.6395845413208, + 0, + 15.694815635681152, + 0, + 17.750045776367188 + ])); + }); + + it('prelu bias stride 2 x=[1,8,8,16] f=[3,3,16,1] s=[2,2] d=8 p=same', + async () => { + const inputDepth = 16; + const xSize = 8; + const inputShape: [number, number, number, number] = + [1, xSize, xSize, inputDepth]; + const outputDepth = 8; + const fSize = 3; + const pad = 'same'; + const stride: [number, number] = [2, 2]; + + const inputs = generateCaseInputs( + 1 * xSize * xSize * inputDepth, + fSize * fSize * inputDepth * outputDepth); + const x = tf.tensor4d(inputs.input, inputShape); + const w = + tf.tensor4d(inputs.filter, [fSize, fSize, inputDepth, outputDepth]); + const bias = tf.tensor1d([1, 4, 2, 3, 9, 6, 5, 8]); + const preluActivationWeights = tf.tensor1d([1, 2, 3, 4, 5, 6, 7, 8]); + + const result = tf.fused.conv2d({ + x, + filter: w, + strides: stride, + pad, + dataFormat: 'NHWC', + dilations: [1, 1], + activation: 'prelu', + preluActivationWeights, + bias + }); + 
expect(result.shape).toEqual([1, 4, 4, 8]); + expectArraysClose( + await result.data(), new Float32Array([ + 25.75398063659668, -41.61178970336914, 26.857805252075195, + -87.63885498046875, 33.961631774902344, -114.0812759399414, + 30.065458297729492, -136.93893432617188, 23.118206024169922, + -36.33102035522461, 24.212820053100586, -77.04048156738281, + 31.307422637939453, -98.12835693359375, 27.402034759521484, + -115.5947265625, 20.482431411743164, -31.050262451171875, + 21.567821502685547, -66.44209289550781, 28.653217315673828, + -82.17544555664062, 24.73861312866211, -94.25041198730469, + 11.078080177307129, -12.208478927612305, 12.130399703979492, + -28.626232147216797, 19.182720184326172, -25.253299713134766, + 15.235037803649902, -18.08960723876953, 4.6677775382995605, + 0.31717729568481445, 5.697869777679443, -2.8516759872436523, + 12.727968215942383, 2.2569849491119385, 8.758066177368164, + 4.226885795593262, 2.0319995880126953, 2.9575586318969727, + 3.052880048751831, 1.9366796016693115, 10.073760032653809, + 4.915799617767334, 6.094639778137207, 6.89492130279541, + -0.6037763357162476, 5.5979437828063965, 0.4078875780105591, + 4.586280822753906, 7.419551849365234, 7.5746169090271, + 3.43121600151062, 9.562952041625977, -1.4065279960632324, + 6.404943943023682, -1.2100803852081299, 5.401776313781738, + 6.5998077392578125, 8.398608207702637, 2.602976083755493, + 10.395440101623535, -16.418434143066406, 21.440250396728516, + -46.38618850708008, 20.483882904052734, -42.52848815917969, + 23.527509689331055, -87.84530639648438, 25.571144104003906, + -19.054208755493164, 24.080629348754883, -54.32115936279297, + 23.133480072021484, -55.79951477050781, 26.186328887939453, + -106.48924255371094, 28.239177703857422, -21.689987182617188, + 26.721012115478516, -62.25614929199219, 25.783079147338867, + -69.070556640625, 28.84514808654785, -125.13325500488281, + 30.907209396362305, -13.891133308410645, 18.914127349853516, + -38.81135940551758, 17.960111618041992, -29.915504455566406, + 21.006093978881836, -70.20361328125, 23.052082061767578, + -12.857919692993164, 17.89089584350586, -35.771610260009766, + 16.95684814453125, -24.949115753173828, 20.022798538208008, + -63.39042282104492, 22.088754653930664, -14.02528190612793, + 19.06132698059082, -39.2921257019043, 18.133424758911133, + -30.847349166870117, 21.205520629882812, -71.69097137451172, + 23.27761459350586, -15.192638397216797, 20.23175811767578, + -42.8126335144043, 19.309999465942383, -36.74560546875, + 22.388240814208984, -79.99152374267578, 24.46647834777832, + -8.556736946105957, 13.584352493286133, -22.835901260375977, + 12.6395845413208, -3.336000442504883, 15.694815635681152, + -33.0570182800293, 17.750045776367188 + ])); + }); + + it('basic with bias', async () => { + const inputDepth = 2; + const inShape: [number, number, number, number] = [2, 2, 2, inputDepth]; + const outputDepth = 2; + const fSize = 1; + const pad = 0; + const stride = 1; + + const x = tf.tensor4d( + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], inShape); + const w = + tf.tensor4d([-1, 1, -2, 0.5], [fSize, fSize, inputDepth, outputDepth]); + + const result = tf.fused.conv2d({ + x, + filter: w, + strides: stride, + pad, + dataFormat: 'NHWC', + dilations: [1, 1], + bias: tf.tensor1d([5, 6]) + }); + expect(result.shape).toEqual([2, 2, 2, 2]); + const expected = + [0, 8, -6, 11, -12, 14, -18, 17, -24, 20, -30, 23, -36, 26, -42, 29]; + + expectArraysClose(await result.data(), expected); + }); + + it('basic with explicit padding', async () => { + 
const inputDepth = 1; + const outputDepth = 1; + const pad = + [[0, 0], [1, 2], [0, 1], [0, 0]] as tf.backend_util.ExplicitPadding; + const stride = 1; + const dataFormat = 'NHWC'; + const dilation = 1; + + const x = tf.tensor3d([1, 2, 3, 4, 5, 6, 7, 8], [4, 2, inputDepth]); + const w = + tf.tensor4d([3, 1, 5, 0, 2, 7, 8, 9], [4, 2, inputDepth, outputDepth]); + + const result = tf.fused.conv2d( + {x, filter: w, strides: stride, pad, dataFormat, dilations: dilation}); + + const resultData = await result.data(); + expect(result.shape).toEqual([4, 2, 1]); + expectArraysClose(resultData, [133, 66, 200, 102, 108, 58, 56, 58]); + }); + + it('basic with elu', async () => { + const inputDepth = 2; + const inShape: [number, number, number, number] = [2, 2, 2, inputDepth]; + const outputDepth = 2; + const fSize = 1; + const pad = 0; + const stride = 1; + + const x = tf.tensor4d( + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], inShape); + const w = + tf.tensor4d([-1, 1, -2, 0.5], [fSize, fSize, inputDepth, outputDepth]); + + const result = tf.fused.conv2d({ + x, + filter: w, + strides: stride, + pad, + dataFormat: 'NHWC', + dilations: [1, 1], + activation: 'elu' + }); + expect(result.shape).toEqual([2, 2, 2, 2]); + const expected = + [-0.99326, 2, -1, 5, -1, 8, -1, 11, -1, 14, -1, 17, -1, 20, -1, 23]; + + expectArraysClose(await result.data(), expected); + }); + + it('basic with prelu', async () => { + const inputDepth = 2; + const inShape: [number, number, number, number] = [2, 2, 2, inputDepth]; + const outputDepth = 2; + const fSize = 1; + const pad = 0; + const stride = 1; + + const x = tf.tensor4d( + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], inShape); + const alpha = tf.tensor3d([0.25, 0.75], [1, 1, 2]); + const w = + tf.tensor4d([-1, 1, -2, 0.5], [fSize, fSize, inputDepth, outputDepth]); + + const result = tf.fused.conv2d({ + x, + filter: w, + strides: stride, + pad, + dataFormat: 'NHWC', + dilations: [1, 1], + activation: 'prelu', + preluActivationWeights: alpha + }); + expect(result.shape).toEqual([2, 2, 2, 2]); + const expected = [ + -1.25, 2, -2.75, 5, -4.25, 8, -5.75, 11, -7.25, 14, -8.75, 17, -10.25, 20, + -11.75, 23 + ]; + + expectArraysClose(await result.data(), expected); + }); + + it('basic with broadcasted bias and relu', async () => { + const inputDepth = 2; + const inShape: [number, number, number, number] = [2, 2, 2, inputDepth]; + const outputDepth = 2; + const fSize = 1; + const pad = 0; + const stride = 1; + + const x = tf.tensor4d( + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], inShape); + const w = + tf.tensor4d([-1, 1, -2, 0.5], [fSize, fSize, inputDepth, outputDepth]); + + const result = tf.fused.conv2d({ + x, + filter: w, + strides: stride, + pad, + dataFormat: 'NHWC', + dilations: [1, 1], + bias: tf.scalar(5), + activation: 'relu' + }); + expect(result.shape).toEqual([2, 2, 2, 2]); + const expected = [0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0, 25, 0, 28]; + + expectArraysClose(await result.data(), expected); + }); + + it('im2row', async () => { + const inputDepth = 1; + const inputShape: [number, number, number] = [4, 4, inputDepth]; + const outputDepth = 3; + const fSize = 1; + const pad = 'same'; + const strides: [number, number] = [2, 2]; + + const x = tf.tensor3d( + [ + 10, 30, 50, 70, 20, 40, 60, 80, -10, -30, -50, -70, -20, -40, -60, -80 + ], + inputShape); + const w = tf.tensor4d([1, 0.5, 1], [fSize, fSize, inputDepth, outputDepth]); + + const result = tf.fused.conv2d({x, filter: w, strides, pad}); + + expectArraysClose( + 
await result.data(), + [10, 5, 10, 50, 25, 50, -10, -5, -10, -50, -25, -50]); + }); + + it('im2row with relu', async () => { + const inputDepth = 1; + const inputShape: [number, number, number] = [4, 4, inputDepth]; + const outputDepth = 3; + const fSize = 1; + const pad = 'same'; + const strides: [number, number] = [2, 2]; + + const x = tf.tensor3d( + [ + 10, 30, 50, 70, 20, 40, 60, 80, -10, -30, -50, -70, -20, -40, -60, -80 + ], + inputShape); + const w = tf.tensor4d([1, 0.5, 1], [fSize, fSize, inputDepth, outputDepth]); + + const result = tf.fused.conv2d({ + x, + filter: w, + strides, + pad, + dataFormat: 'NHWC', + dilations: [1, 1], + activation: 'relu' + }); + + expectArraysClose( + await result.data(), [10, 5, 10, 50, 25, 50, 0, 0, 0, 0, 0, 0]); + }); + + it('im2row with prelu', async () => { + const inputDepth = 1; + const inputShape: [number, number, number] = [4, 4, inputDepth]; + const outputDepth = 3; + const fSize = 1; + const pad = 'same'; + const strides: [number, number] = [2, 2]; + + const x = tf.tensor3d( + [ + 10, 30, 50, 70, 20, 40, 60, 80, -10, -30, -50, -70, -20, -40, -60, -80 + ], + inputShape); + const w = tf.tensor4d([1, 0.5, 1], [fSize, fSize, inputDepth, outputDepth]); + const alpha = tf.tensor3d([0.5], [1, 1, inputDepth]); + + const result = tf.fused.conv2d({ + x, + filter: w, + strides, + pad, + dataFormat: 'NHWC', + dilations: [1, 1], + activation: 'prelu', + preluActivationWeights: alpha + }); + + expectArraysClose( + await result.data(), + [10, 5, 10, 50, 25, 50, -5, -2.5, -5, -25, -12.5, -25]); + }); + + it('pointwise with prelu', async () => { + const inputDepth = 1; + const inputShape: [number, number, number] = [4, 4, inputDepth]; + const outputDepth = 3; + const fSize = 1; + const pad = 'same'; + const strides: [number, number] = [1, 1]; + + const x = tf.tensor3d( + [ + 10, 30, 50, 70, 20, 40, 60, 80, -10, -30, -50, -70, -20, -40, -60, -80 + ], + inputShape); + const w = tf.tensor4d([1, 0.5, 1], [fSize, fSize, inputDepth, outputDepth]); + const alpha = tf.tensor3d([0.5], [1, 1, inputDepth]); + + const result = tf.fused.conv2d({ + x, + filter: w, + strides, + pad, + dataFormat: 'NHWC', + dilations: [1, 1], + activation: 'prelu', + preluActivationWeights: alpha + }); + + expectArraysClose(await result.data(), [ + 10, 5, 10, 30, 15, 30, 50, 25, 50, 70, 35, 70, + 20, 10, 20, 40, 20, 40, 60, 30, 60, 80, 40, 80, + -5, -2.5, -5, -15, -7.5, -15, -25, -12.5, -25, -35, -17.5, -35, + -10, -5, -10, -20, -10, -20, -30, -15, -30, -40, -20, -40 + ]); + }); + + it('im2row with broadcasted bias and relu', async () => { + const inputDepth = 1; + const inputShape: [number, number, number] = [4, 4, inputDepth]; + const outputDepth = 3; + const fSize = 1; + const pad = 'same'; + const strides: [number, number] = [2, 2]; + + const x = tf.tensor3d( + [ + 10, 30, 50, 70, 20, 40, 60, 80, -10, -30, -50, -70, -20, -40, -60, -80 + ], + inputShape); + const w = tf.tensor4d([1, 0.5, 1], [fSize, fSize, inputDepth, outputDepth]); + + const result = tf.fused.conv2d({ + x, + filter: w, + strides, + pad, + dataFormat: 'NHWC', + dilations: [1, 1], + bias: tf.scalar(5), + activation: 'relu' + }); + + expectArraysClose( + await result.data(), [15, 10, 15, 55, 30, 55, 0, 0, 0, 0, 0, 0]); + }); + + // it('backProp input x=[2,3,3,1] f=[2,2,1,1] s=1 p=0', async () => { + // const inputDepth = 1; + // const outputDepth = 1; + // const inputShape: [number, number, number, number] = [2, 3, 3, + // inputDepth]; const filterSize = 2; const strides = 1; const pad = 0; + + // const filterShape: 
[number, number, number, number] = + // [filterSize, filterSize, inputDepth, outputDepth]; + // const filter = tf.tensor4d([-1, 1, -2, 0.5], filterShape); + + // const x = tf.tensor4d( + // [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9], inputShape); + // const dy = tf.tensor4d([3, 1, 2, 0, 3, 1, 2, 0], [2, 2, 2, 1]); + + // const grads = tf.grads( + // (x: tf.Tensor4D) => tf.fused.conv2d({x, filter, strides, pad})); + // const [dx] = grads([x], dy); + + // expect(dx.shape).toEqual(x.shape); + // expectArraysClose( + // await dx.data(), + // [-3, 2, 1, -8, 1.5, 0.5, -4, 1, 0, -3, 2, 1, -8, 1.5, 0.5, -4, 1, + // 0]); + // }); + + // it('gradient x=[2,3,3,1] f=[2,2,1,1] s=1 p=0', async () => { + // const inputDepth = 1; + // const outputDepth = 1; + // const inputShape: [number, number, number, number] = [2, 3, 3, + // inputDepth]; const filterSize = 2; const strides = 1; const pad = 0; + + // const filterShape: [number, number, number, number] = + // [filterSize, filterSize, inputDepth, outputDepth]; + // const filter = tf.tensor4d([-1, 1, -2, 0.5], filterShape); + + // const x = tf.tensor4d( + // [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9], inputShape); + // const dy = tf.tensor4d([3, 1, 2, 0, 3, 1, 2, 0], [2, 2, 2, 1]); + + // const grads = tf.grads( + // (x: tf.Tensor4D, filter: tf.Tensor4D) => + // tf.fused.conv2d({x, filter, strides, pad})); + // const [dx, dfilter] = grads([x, filter], dy); + + // expect(dx.shape).toEqual(x.shape); + // expectArraysClose( + // await dx.data(), + // [-3, 2, 1, -8, 1.5, 0.5, -4, 1, 0, -3, 2, 1, -8, 1.5, 0.5, -4, 1, + // 0]); + + // expect(dfilter.shape).toEqual(filterShape); + // expectArraysClose(await dfilter.data(), [26, 38, 62, 74]); + // }); + + // it('gradient x=[2,3,3,1] f=[2,2,1,1] s=1 p=0 with bias', async () => { + // const inputDepth = 1; + // const outputDepth = 1; + // const inputShape: [number, number, number, number] = [2, 3, 3, + // inputDepth]; const filterSize = 2; const strides = 1; const pad = 0; + + // const filterShape: [number, number, number, number] = + // [filterSize, filterSize, inputDepth, outputDepth]; + // const filter = tf.tensor4d([-1, 1, -2, 0.5], filterShape); + // const bias = tf.ones([2, 2, 2, 1]); + + // const x = tf.tensor4d( + // [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9], inputShape); + // const dy = tf.tensor4d([3, 1, 2, 0, 3, 1, 2, 0], [2, 2, 2, 1]); + + // const fusedGrads = + // tf.grads((x: tf.Tensor4D, w: tf.Tensor4D, b) => tf.fused.conv2d({ + // x, + // filter: w, + // strides, + // pad, + // dataFormat: 'NHWC', + // dilations: [1, 1], + // bias: b + // })); + // const [dxFused, dfilterFused, dbiasFused] = + // fusedGrads([x, filter, bias], dy); + + // const grads = tf.grads((x: tf.Tensor4D, filter: tf.Tensor4D, bias) => { + // const conv = tf.conv2d(x, filter, strides, pad); + // const sum = tf.add(conv, bias); + // return sum; + // }); + // const [dx, dfilter, dbias] = grads([x, filter, bias], dy); + + // expectArraysClose(await dxFused.array(), await dx.array()); + // expectArraysClose(await dfilterFused.array(), await dfilter.array()); + // expectArraysClose(await dbiasFused.array(), await dbias.array()); + // }); + + // it('gradient x=[2,3,3,1] f=[2,2,1,1] s=1 p=0 with bias and relu', + // async () => { + // const inputDepth = 1; + // const outputDepth = 1; + // const inputShape: [number, number, number, number] = + // [2, 3, 3, inputDepth]; + // const filterSize = 2; + // const strides = 1; + // const pad = 0; + + // const filterShape: [number, number, number, number] = 
+ // [filterSize, filterSize, inputDepth, outputDepth]; + // const filter = tf.tensor4d([-1, 1, -2, 0.5], filterShape); + // const bias = tf.ones([2, 2, 2, 1]); + + // const x = tf.tensor4d( + // [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9], + // inputShape); + // const dy = tf.tensor4d([3, 1, 2, 0, 3, 1, 2, 0], [2, 2, 2, 1]); + + // const fusedGrads = + // tf.grads((x: tf.Tensor4D, w: tf.Tensor4D, b) => tf.fused.conv2d({ + // x, + // filter: w, + // strides, + // pad, + // dataFormat: 'NHWC', + // dilations: [1, 1], + // bias: b, + // activation: 'relu' + // })); + // const [dxFused, dfilterFused, dbiasFused] = + // fusedGrads([x, filter, bias], dy); + + // const grads = tf.grads((x: tf.Tensor4D, filter: tf.Tensor4D, bias) + // => { + // const conv = tf.conv2d(x, filter, strides, pad); + // const sum = tf.add(conv, bias); + // return tf.relu(sum); + // }); + // const [dx, dfilter, dbias] = grads([x, filter, bias], dy); + + // expectArraysClose(await dxFused.array(), await dx.array()); + // expectArraysClose(await dfilterFused.array(), await dfilter.array()); + // expectArraysClose(await dbiasFused.array(), await dbias.array()); + // }); + + // it('gradient x=[2,3,3,1] f=[2,2,1,1] s=1 p=0 with bias and elu', async () + // => { + // const inputDepth = 1; + // const outputDepth = 1; + // const inputShape: [number, number, number, number] = [2, 3, 3, + // inputDepth]; const filterSize = 2; const strides = 1; const pad = 0; + + // const filterShape: [number, number, number, number] = + // [filterSize, filterSize, inputDepth, outputDepth]; + // const filter = tf.tensor4d([-1, 1, -2, 0.5], filterShape); + // const bias = tf.ones([2, 2, 2, 1]); + + // const x = tf.tensor4d( + // [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9], inputShape); + // const dy = tf.tensor4d([3, 1, 2, 0, 3, 1, 2, 0], [2, 2, 2, 1]); + + // const fusedGrads = + // tf.grads((x: tf.Tensor4D, w: tf.Tensor4D, b) => tf.fused.conv2d({ + // x, + // filter: w, + // strides, + // pad, + // dataFormat: 'NHWC', + // dilations: [1, 1], + // bias: b, + // activation: 'elu' + // })); + // const [dxFused, dfilterFused, dbiasFused] = + // fusedGrads([x, filter, bias], dy); + + // const grads = tf.grads((x: tf.Tensor4D, filter: tf.Tensor4D, bias) => { + // const conv = tf.conv2d(x, filter, strides, pad); + // const sum = tf.add(conv, bias); + // return tf.elu(sum); + // }); + // const [dx, dfilter, dbias] = grads([x, filter, bias], dy); + + // expectArraysClose(await dxFused.array(), await dx.array()); + // expectArraysClose(await dfilterFused.array(), await dfilter.array()); + // expectArraysClose(await dbiasFused.array(), await dbias.array()); + // }); + + // it('fused matmul with relu6 and gradients', async () => { + // const a = tf.tensor2d([1, 2, 3, 10, 20, -30], [2, 3]); + // const b = tf.tensor2d([2, 3, 4, -1, 2, 3], [3, 2]); + // const dy = tf.tensor2d([1, 10, 20, 30], [2, 2]); + // const transposeA = false; + // const transposeB = false; + + // const fusedGrads = tf.grads((a, b) => { + // return tf.fused.matMul( + // {a, b, transposeA, transposeB, bias: null, activation: 'relu6'}); + // }); + // const [fusedDa, fusedDb] = fusedGrads([a, b], dy); + + // const grads = tf.grads((a, b) => { + // const prod = tf.matMul(a, b, transposeA, transposeB); + // return tf.relu6(prod); + // }); + // const [da, db] = grads([a, b], dy); + + // expectArraysClose(await da.array(), await fusedDa.array()); + // expectArraysClose(await db.data(), await fusedDb.array()); + // }); +}); diff --git 
a/tfjs-core/src/ops/fused/depthwise_conv2d.ts b/tfjs-core/src/ops/fused_depthwise_conv2d.ts similarity index 75% rename from tfjs-core/src/ops/fused/depthwise_conv2d.ts rename to tfjs-core/src/ops/fused_depthwise_conv2d.ts index 4b9b8067399..64180ac8ab0 100644 --- a/tfjs-core/src/ops/fused/depthwise_conv2d.ts +++ b/tfjs-core/src/ops/fused_depthwise_conv2d.ts @@ -15,21 +15,17 @@ * ============================================================================= */ -import {ENGINE} from '../../engine'; -import * as conv_util from '../../ops/conv_util'; -import {op} from '../../ops/operation'; -import {Tensor, Tensor3D, Tensor4D} from '../../tensor'; -import {makeTypesMatch} from '../../tensor_util'; -import {convertToTensor} from '../../tensor_util_env'; -import {TensorLike} from '../../types'; -import * as util from '../../util'; -import {add} from '../add'; -import * as broadcast_util from '../broadcast_util'; -import {depthwiseConv2d as unfusedDepthwiseConv2d} from '../depthwise_conv2d'; -import {depthwiseConv2dNativeBackpropFilter} from '../depthwise_conv2d_native_backprop_filter'; -import {depthwiseConv2dNativeBackpropInput} from '../depthwise_conv2d_native_backprop_input'; -import {applyActivation, getFusedBiasGradient, getFusedDyActivation, shouldFuse} from '../fused_util'; -import {Activation} from './types'; +import {ENGINE} from '../engine'; +import {Tensor, Tensor3D, Tensor4D} from '../tensor'; +import {makeTypesMatch} from '../tensor_util'; +import {convertToTensor} from '../tensor_util_env'; +import {TensorLike} from '../types'; +import * as util from '../util'; + +import * as broadcast_util from './broadcast_util'; +import * as conv_util from './conv_util'; +import {Activation} from './fused_types'; +import {op} from './operation'; /** * Computes depthwise 2D convolution, optionally fused with adding a @@ -104,16 +100,6 @@ function fusedDepthwiseConv2d_({ activation?: Activation, preluActivationWeights?: Tensor }): T { - if (shouldFuse(ENGINE.state.gradientDepth, activation) === false) { - let result = unfusedDepthwiseConv2d( - x, filter, strides, pad, dataFormat, dilations, dimRoundingMode); - if (bias != null) { - result = add(result, bias); - } - - return applyActivation(result, activation, preluActivationWeights) as T; - } - const $x = convertToTensor(x, 'x', 'depthwiseConv2d'); const $filter = convertToTensor(filter, 'filter', 'depthwiseConv2d'); @@ -170,33 +156,6 @@ function fusedDepthwiseConv2d_({ preluActivationWeights, 'prelu weights', 'fused depthwiseConv2d'); } - const grad = (dy: Tensor4D, saved: Tensor[]) => { - util.assert( - conv_util.tupleValuesAreOne(dilations), - () => 'Error in gradient of fused depthwiseConv2d: dilation rates ' + - `greater than 1 are not yet supported. 
Got dilations ` + - `'${dilations}'`); - const [$filter, x4D, y] = saved; - - const dyActivation = getFusedDyActivation(dy, y, activation) as Tensor4D; - - let biasGradient = {}; - if (bias != null) { - biasGradient = {bias: () => getFusedBiasGradient($bias, dyActivation)}; - } - - return Object.assign( - { - x: () => depthwiseConv2dNativeBackpropInput( - (x4D as Tensor4D).shape, dyActivation, $filter as Tensor4D, - convInfo), - filter: () => depthwiseConv2dNativeBackpropFilter( - x4D as Tensor4D, dyActivation, ($filter as Tensor4D).shape, - convInfo), - }, - biasGradient); - }; - const inputs: { x: Tensor, filter: Tensor, @@ -213,7 +172,7 @@ function fusedDepthwiseConv2d_({ const inputsToSave = [$filter, x4D]; const outputsToSave = [true]; const res = ENGINE.runKernelFunc( - (backend, save) => { + (backend) => { const res = backend.fusedDepthwiseConv2D({ input: x4D, filter: $filter, @@ -222,10 +181,9 @@ function fusedDepthwiseConv2d_({ activation, preluActivationWeights: $preluActivationWeights }); - save([$filter, x4D, res]); return res; }, - inputs, grad, 'FusedDepthwiseConv2D', {convInfo, activation}, + inputs, null /* grad */, 'FusedDepthwiseConv2D', {convInfo, activation}, inputsToSave, outputsToSave); if (reshapedTo4D) { return res.as3D(res.shape[1], res.shape[2], res.shape[3]) as T; diff --git a/tfjs-core/src/ops/fused_depthwise_conv2d_test.ts b/tfjs-core/src/ops/fused_depthwise_conv2d_test.ts new file mode 100644 index 00000000000..49e318a7844 --- /dev/null +++ b/tfjs-core/src/ops/fused_depthwise_conv2d_test.ts @@ -0,0 +1,253 @@ +/** + * @license + * Copyright 2020 Google LLC. All Rights Reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * ============================================================================= + */ + +import * as tf from '../index'; +import {ALL_ENVS, describeWithFlags} from '../jasmine_util'; +import {expectArraysClose} from '../test_util'; + +describeWithFlags('fused depthwiseConv2D', ALL_ENVS, () => { + it('basic', async () => { + const fSize = 2; + const pad = 'valid'; + const strides = 1; + const chMul = 1; + const inDepth = 1; + + const x = tf.tensor4d( + [ + 0.230664, 0.987388, 0.0685208, 0.419224, 0.887861, 0.731641, + 0.0741907, 0.409265, 0.351377 + ], + [1, 3, 3, inDepth]); + const w = tf.tensor4d( + [-0.303873, -0.229223, 0.144333, 0.803373], + [fSize, fSize, inDepth, chMul], + ); + + const result = tf.fused.depthwiseConv2d({x, filter: w, strides, pad}); + expect(result.shape).toEqual([1, 2, 2, 1]); + const expected = [0.47737, 0.40018, 0.00859, -0.09615]; + expectArraysClose(await result.data(), expected); + }); + + it('basic with relu', async () => { + const fSize = 2; + const pad = 'valid'; + const strides = 1; + const chMul = 1; + const inDepth = 1; + + const x = tf.tensor4d( + [ + 0.230664, 0.987388, 0.0685208, 0.419224, 0.887861, 0.731641, + 0.0741907, 0.409265, 0.351377 + ], + [1, 3, 3, inDepth]); + const w = tf.tensor4d( + [-0.303873, -0.229223, 0.144333, 0.803373], + [fSize, fSize, inDepth, chMul], + ); + + const result = tf.fused.depthwiseConv2d( + {x, filter: w, strides, pad, activation: 'relu'}); + expect(result.shape).toEqual([1, 2, 2, 1]); + const expected = [0.47737, 0.40018, 0.00859, 0]; + expectArraysClose(await result.data(), expected); + }); + + it('basic with broadcasted bias and relu', async () => { + const fSize = 2; + const pad = 'valid'; + const strides = 1; + const chMul = 1; + const inDepth = 1; + + const x = tf.tensor4d( + [ + 0.230664, 0.987388, 0.0685208, 0.419224, 0.887861, 0.731641, + 0.0741907, 0.409265, 0.351377 + ], + [1, 3, 3, inDepth]); + const w = tf.tensor4d( + [-0.303873, -0.229223, 0.144333, 0.803373], + [fSize, fSize, inDepth, chMul], + ); + + const result = tf.fused.depthwiseConv2d( + {x, filter: w, strides, pad, bias: tf.scalar(1), activation: 'relu'}); + expect(result.shape).toEqual([1, 2, 2, 1]); + const expected = [1.47737, 1.40018, 1.00859, 0.90385]; + expectArraysClose(await result.data(), expected); + }); + + it('prelu', async () => { + const fSize = 3; + const pad = 'valid'; + const strides = 1; + const chMul = 1; + const inDepth = 1; + + const x = tf.tensor4d( + [ + 0.149194, 0.089009, 0.654891, 0.083324, 0.537043, 0.644331, 0.563037, + 0.211859, 0.633501, 0.186427, 0.777034, 0.50001, 0.607341, 0.95303, + 0.696479, 0.050387, 0.62045, 0.728049, 0.028043, 0.437009, 0.712881, + 0.741935, 0.974474, 0.621102, 0.171411 + ], + [1, 5, 5, inDepth]); + const alpha = tf.tensor4d( + [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], [1, 3, 3, 1]); + const w = tf.tensor4d( + [ + -0.125386, -0.975199, -0.640437, -0.281895, -0.990968, -0.347208, + -0.889702, -0.180695, -0.691992 + ], + [fSize, fSize, inDepth, chMul], + ); + + const result = tf.fused.depthwiseConv2d({ + x, + filter: w, + strides, + pad, + activation: 'prelu', + preluActivationWeights: alpha + }); + expect(result.shape).toEqual([1, 3, 3, 1]); + const expected = [ + -0.25400, -0.50118, -0.73622, -0.94068, -1.2298, -1.84585, -2.3089, + -2.7499, -2.64077 + ]; + expectArraysClose(await result.data(), expected); + }); + + // it('gradient x=[2,3,3,1] f=[2,2,1,1] s=1 p=0', async () => { + // const inputDepth = 1; + // const outputDepth = 1; + // const inputShape: [number, number, number, number] = 
[2, 3, 3, + // inputDepth]; const filterSize = 2; const strides = 1; const pad = 0; + + // const filterShape: [number, number, number, number] = + // [filterSize, filterSize, inputDepth, outputDepth]; + // const filter = tf.tensor4d([-1, 1, -2, 0.5], filterShape); + + // const x = tf.tensor4d( + // [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9], inputShape); + // const dy = tf.tensor4d([3, 1, 2, 0, 3, 1, 2, 0], [2, 2, 2, 1]); + + // const grads = tf.grads( + // (x: tf.Tensor4D, filter: tf.Tensor4D) => + // tf.fused.depthwiseConv2d({x, filter, strides, pad})); + // const [dx, dfilter] = grads([x, filter], dy); + + // expect(dx.shape).toEqual(x.shape); + // expectArraysClose( + // await dx.data(), + // [-3, 2, 1, -8, 1.5, 0.5, -4, 1, 0, -3, 2, 1, -8, 1.5, 0.5, -4, 1, + // 0]); + + // expect(dfilter.shape).toEqual(filterShape); + // expectArraysClose(await dfilter.data(), [26, 38, 62, 74]); + // }); + + // it('gradient x=[2,3,3,1] f=[2,2,1,1] s=1 p=0 with bias', async () => { + // const inputDepth = 1; + // const outputDepth = 1; + // const inputShape: [number, number, number, number] = [2, 3, 3, + // inputDepth]; const filterSize = 2; const strides = 1; const pad = 0; + + // const filterShape: [number, number, number, number] = + // [filterSize, filterSize, inputDepth, outputDepth]; + // const filter = tf.tensor4d([-1, 1, -2, 0.5], filterShape); + // const bias = tf.ones([2, 2, 2, 1]); + + // const x = tf.tensor4d( + // [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9], inputShape); + // const dy = tf.tensor4d([3, 1, 2, 0, 3, 1, 2, 0], [2, 2, 2, 1]); + + // const fusedGrads = tf.grads( + // (x: tf.Tensor4D, w: tf.Tensor4D, b) => tf.fused.depthwiseConv2d({ + // x, + // filter: w, + // strides, + // pad, + // dataFormat: 'NHWC', + // dilations: [1, 1], + // bias: b + // })); + // const [dxFused, dfilterFused, dbiasFused] = + // fusedGrads([x, filter, bias], dy); + + // const grads = tf.grads((x: tf.Tensor4D, filter: tf.Tensor4D, bias) => { + // const conv = tf.depthwiseConv2d(x, filter, strides, pad); + // const sum = tf.add(conv, bias); + // return sum; + // }); + // const [dx, dfilter, dbias] = grads([x, filter, bias], dy); + + // expectArraysClose(await dxFused.array(), await dx.array()); + // expectArraysClose(await dfilterFused.array(), await dfilter.array()); + // expectArraysClose(await dbiasFused.array(), await dbias.array()); + // }); + + // it('gradient x=[2,3,3,1] f=[2,2,1,1] s=1 p=0 with bias and activation', + // async () => { + // const inputDepth = 1; + // const outputDepth = 1; + // const inputShape: [number, number, number, number] = + // [2, 3, 3, inputDepth]; + // const filterSize = 2; + // const strides = 1; + // const pad = 0; + + // const filterShape: [number, number, number, number] = + // [filterSize, filterSize, inputDepth, outputDepth]; + // const filter = tf.tensor4d([-1, 1, -2, 0.5], filterShape); + // const bias = tf.ones([2, 2, 2, 1]); + + // const x = tf.tensor4d( + // [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9], + // inputShape); + // const dy = tf.tensor4d([3, 1, 2, 0, 3, 1, 2, 0], [2, 2, 2, 1]); + + // const fusedGrads = tf.grads( + // (x: tf.Tensor4D, w: tf.Tensor4D, b) => tf.fused.depthwiseConv2d({ + // x, + // filter: w, + // strides, + // pad, + // dataFormat: 'NHWC', + // dilations: [1, 1], + // bias: b, + // activation: 'relu' + // })); + // const [dxFused, dfilterFused, dbiasFused] = + // fusedGrads([x, filter, bias], dy); + + // const grads = tf.grads((x: tf.Tensor4D, filter: tf.Tensor4D, bias) + // => { + // const conv = 
tf.depthwiseConv2d(x, filter, strides, pad); + // const sum = tf.add(conv, bias); + // return tf.relu(sum); + // }); + // const [dx, dfilter, dbias] = grads([x, filter, bias], dy); + + // expectArraysClose(await dxFused.array(), await dx.array()); + // expectArraysClose(await dfilterFused.array(), await dfilter.array()); + // expectArraysClose(await dbiasFused.array(), await dbias.array()); + // }); +}); diff --git a/tfjs-core/src/ops/fused/mat_mul.ts b/tfjs-core/src/ops/fused_mat_mul.ts similarity index 67% rename from tfjs-core/src/ops/fused/mat_mul.ts rename to tfjs-core/src/ops/fused_mat_mul.ts index 66e8a9fcff6..3340bcfa102 100644 --- a/tfjs-core/src/ops/fused/mat_mul.ts +++ b/tfjs-core/src/ops/fused_mat_mul.ts @@ -15,18 +15,16 @@ * ============================================================================= */ -import {ENGINE} from '../../engine'; -import {op} from '../../ops/operation'; -import {Tensor, Tensor3D} from '../../tensor'; -import {makeTypesMatch} from '../../tensor_util'; -import {convertToTensor} from '../../tensor_util_env'; -import {TensorLike} from '../../types'; -import * as util from '../../util'; -import {add} from '../add'; -import * as broadcast_util from '../broadcast_util'; -import {applyActivation, getFusedBiasGradient, getFusedDyActivation, shouldFuse} from '../fused_util'; -import {matMul as unfusedMatMul} from '../mat_mul'; -import {Activation} from './types'; +import {ENGINE} from '../engine'; +import {Tensor} from '../tensor'; +import {makeTypesMatch} from '../tensor_util'; +import {convertToTensor} from '../tensor_util_env'; +import {TensorLike} from '../types'; +import * as util from '../util'; + +import * as broadcast_util from './broadcast_util'; +import {Activation} from './fused_types'; +import {op} from './operation'; /** * Computes the dot product of two matrices with optional activation and bias. 
@@ -65,15 +63,6 @@ function fusedMatMul_({ activation?: Activation, preluActivationWeights?: Tensor }): T { - if (shouldFuse(ENGINE.state.gradientDepth, activation) === false) { - let result = unfusedMatMul(a, b, transposeA, transposeB); - if (bias != null) { - result = add(result, bias); - } - - return applyActivation(result, activation, preluActivationWeights) as T; - } - let $a = convertToTensor(a, 'a', 'fused matMul'); let $b = convertToTensor(b, 'b', 'fused matMul'); [$a, $b] = makeTypesMatch($a, $b); @@ -133,46 +122,6 @@ function fusedMatMul_({ preluActivationWeights, 'prelu weights', 'fused matMul'); } - const grad = (dy: Tensor3D, saved: Tensor[]) => { - const [a3D, b3D, y] = saved; - const dyActivation = getFusedDyActivation(dy, y, activation); - - let biasGradient = {}; - if (bias != null) { - biasGradient = {bias: () => getFusedBiasGradient($bias, dyActivation)}; - } - - if (!transposeA && !transposeB) { - return Object.assign( - { - a: () => dyActivation.matMul(b3D as Tensor3D, false, true), - b: () => a3D.matMul(dyActivation, true, false) - }, - biasGradient); - } else if (!transposeA && transposeB) { - return Object.assign( - { - a: () => dyActivation.matMul(b3D as Tensor3D, false, false), - b: () => dyActivation.matMul(a3D as Tensor3D, true, false) - }, - biasGradient); - } else if (transposeA && !transposeB) { - return Object.assign( - { - a: () => b3D.matMul(dyActivation, false, true), - b: () => a3D.matMul(dyActivation, false, false) - }, - biasGradient); - } else { - return Object.assign( - { - a: () => b3D.matMul(dyActivation, true, true), - b: () => dyActivation.matMul(a3D as Tensor3D, true, true) - }, - biasGradient); - } - }; - const inputs: {a: Tensor, b: Tensor, bias?: Tensor, @@ -188,7 +137,7 @@ function fusedMatMul_({ const outputsToSave = [true]; const res = ENGINE.runKernelFunc( - (backend, save) => { + (backend) => { const y = backend.fusedBatchMatMul({ a: a3D, b: b3D, @@ -198,11 +147,10 @@ function fusedMatMul_({ activation, preluActivationWeights: $preluActivationWeights }); - save([a3D, b3D, y]); return y; }, - inputs, grad, '_FusedMatMul', {transposeA, transposeB, activation}, - inputsToSave, outputsToSave); + inputs, null /* grad */, '_FusedMatMul', + {transposeA, transposeB, activation}, inputsToSave, outputsToSave); return res.reshape(outShape); } diff --git a/tfjs-core/src/ops/fused_mat_mul_test.ts b/tfjs-core/src/ops/fused_mat_mul_test.ts new file mode 100644 index 00000000000..c85123cde11 --- /dev/null +++ b/tfjs-core/src/ops/fused_mat_mul_test.ts @@ -0,0 +1,310 @@ +/** + * @license + * Copyright 2020 Google LLC. All Rights Reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * ============================================================================= + */ + +import * as tf from '../index'; +import {ALL_ENVS, describeWithFlags} from '../jasmine_util'; +import {expectArraysClose} from '../test_util'; + +describeWithFlags('fused matmul', ALL_ENVS, () => { + it('fused A x B', async () => { + const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]); + const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]); + + const c = tf.fused.matMul({a, b}); + + expect(c.shape).toEqual([2, 2]); + expectArraysClose(await c.data(), [0, 8, -3, 20]); + }); + + it('fused A x B with relu', async () => { + const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]); + const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]); + const transposeA = false; + const transposeB = false; + + const c = tf.fused.matMul( + {a, b, transposeA, transposeB, bias: null, activation: 'relu'}); + + expect(c.shape).toEqual([2, 2]); + expectArraysClose(await c.data(), [0, 8, 0, 20]); + }); + + it('fused A x B with elu', async () => { + const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]); + const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]); + const transposeA = false; + const transposeB = false; + + const c = tf.fused.matMul( + {a, b, transposeA, transposeB, bias: null, activation: 'elu'}); + + expect(c.shape).toEqual([2, 2]); + expectArraysClose(await c.data(), [0, 8, -0.9502, 20]); + }); + + it('fused A x B with relu6', async () => { + const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]); + const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]); + const transposeA = false; + const transposeB = false; + + const c = tf.fused.matMul( + {a, b, transposeA, transposeB, bias: null, activation: 'relu6'}); + + expect(c.shape).toEqual([2, 2]); + expectArraysClose(await c.data(), [0, 6, 0, 6]); + }); + + it('fused A x B with prelu', async () => { + const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]); + const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]); + const alpha = tf.tensor2d([0.5, 0.5], [1, 2]); + const transposeA = false; + const transposeB = false; + + const c = tf.fused.matMul({ + a, + b, + transposeA, + transposeB, + bias: null, + activation: 'prelu', + preluActivationWeights: alpha + }); + + expect(c.shape).toEqual([2, 2]); + expectArraysClose(await c.data(), [0, 8, -1.5, 20]); + }); + + it('fused A x B with relu transpose', async () => { + const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]); + const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [2, 3]); + const transposeA = false; + const transposeB = true; + + const c = tf.fused.matMul( + {a, b, transposeA, transposeB, bias: null, activation: 'relu'}); + + expect(c.shape).toEqual([2, 2]); + expectArraysClose(await c.data(), [0, 9, 0, 24]); + }); + + it('fused A x B with 2d bias and relu', async () => { + const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]); + const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]); + const c = tf.tensor2d([1, 1, 1, 1], [2, 2]); + const transposeA = false; + const transposeB = false; + + const d = tf.fused.matMul( + {a, b, transposeA, transposeB, bias: c, activation: 'relu'}); + + expect(d.shape).toEqual([2, 2]); + expectArraysClose(await d.data(), [1, 9, 0, 21]); + }); + + it('fused A x B with relu and broadcasted bias', async () => { + const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]); + const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]); + const c = tf.tensor1d([1, 1]); + const act: tf.fused.Activation = 'relu'; + const transposeA = false; + const transposeB = false; + + const d = tf.fused.matMul( + {a, b, transposeA, transposeB, bias: c, activation: act}); + + 
expect(d.shape).toEqual([2, 2]); + expectArraysClose(await d.data(), [1, 9, 0, 21]); + }); + + it('fused A x B with elu and broadcasted bias', async () => { + const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]); + const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]); + const c = tf.tensor1d([1, 1]); + const act: tf.fused.Activation = 'elu'; + const transposeA = false; + const transposeB = false; + + const d = tf.fused.matMul( + {a, b, transposeA, transposeB, bias: c, activation: act}); + + expect(d.shape).toEqual([2, 2]); + expectArraysClose(await d.data(), [1, 9, -0.8647, 21]); + }); + + it('fused A x B with relu and broadcasted bias different rank', async () => { + const a = tf.tensor3d([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [2, 2, 3]); + const b = tf.tensor3d([0, 1, -3, 2, 2, 1, 0, 1, -3, 2, 2, 1], [2, 3, 2]); + const c = tf.tensor2d([1, 2], [1, 2]); + const act: tf.fused.Activation = 'relu'; + const transposeA = false; + const transposeB = false; + + const d = tf.fused.matMul( + {a, b, transposeA, transposeB, bias: c, activation: act}); + + expect(d.shape).toEqual([2, 2, 2]); + expectArraysClose(await d.data(), [2, 6, 0, 18, 0, 30, 0, 42]); + }); + + it('fused A x B with 2d bias only', async () => { + const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]); + const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]); + const c = tf.tensor2d([1, 1, 1, 1], [2, 2]); + const transposeA = false; + const transposeB = false; + + const d = tf.fused.matMul( + {a, b, transposeA, transposeB, bias: c, activation: 'linear'}); + + expect(d.shape).toEqual([2, 2]); + expectArraysClose(await d.data(), [1, 9, -2, 21]); + }); + + // it('fused A x B with relu gradient', async () => { + // const a = tf.tensor2d([1, 2, 3, 10, 20, -30], [2, 3]); + // const b = tf.tensor2d([2, 3, 4, -1, 2, 3], [3, 2]); + // const dy = tf.tensor2d([1, 10, 20, 30], [2, 2]); + // const transposeA = false; + // const transposeB = false; + + // const grads = tf.grads((a, b) => { + // const prod = tf.matMul(a, b, transposeA, transposeB); + // return tf.relu(prod); + // }); + + // const fusedGrads = tf.grads((a, b) => { + // return tf.fused.matMul( + // {a, b, transposeA, transposeB, bias: null, activation: 'relu'}); + // }); + + // const [da, db] = grads([a, b], dy); + // const [fusedDa, fusedDb] = fusedGrads([a, b], dy); + // expectArraysClose(await da.array(), await fusedDa.array()); + // expectArraysClose(await db.data(), await fusedDb.array()); + // }); + + // it('gradient with clones A x B with relu', () => { + // const a = tf.tensor2d([1, 2, 3, 10, 20, -30], [2, 3]); + // const b = tf.tensor2d([2, 3, 4, -1, 2, 3], [3, 2]); + // const dy = tf.tensor2d([1, 10, 20, 30], [2, 2]); + // const transposeA = false; + // const transposeB = false; + + // const fusedGrads = tf.grads((a, b) => { + // return tf.fused + // .matMul({ + // a: a.clone(), + // b: b.clone(), + // transposeA, + // transposeB, + // bias: null, + // activation: 'relu' + // }) + // .clone(); + // }); + + // const [fusedDa, fusedDb] = fusedGrads([a, b], dy); + // expect(fusedDa.shape).toEqual(a.shape); + // expect(fusedDb.shape).toEqual(b.shape); + // }); + + // it('fused A x B with relu bias gradient', async () => { + // const a = tf.tensor2d([1, 2, 3, 10, 20, -30], [2, 3]); + // const b = tf.tensor2d([2, 3, 4, -1, 2, 3], [3, 2]); + // const c = tf.tensor2d([1, 1, 1, 1], [2, 2]); + // const transposeA = false; + // const transposeB = false; + + // const dy = tf.tensor2d([1, 10, 20, 30], [2, 2]); + + // const grads = tf.grads((a, b, c) => { + // const prod = tf.matMul(a, b, 
transposeA, transposeB); + // const sum = tf.add(prod, c); + // return tf.relu(sum); + // }); + + // const fusedGrads = tf.grads((a, b, c) => { + // return tf.fused.matMul( + // {a, b, transposeA, transposeB, bias: c, activation: 'relu'}); + // }); + + // const [da, db, dc] = grads([a, b, c], dy); + // const [fusedDa, fusedDb, fusedDc] = fusedGrads([a, b, c], dy); + + // expectArraysClose(await da.array(), await fusedDa.array()); + // expectArraysClose(await db.array(), await fusedDb.array()); + // expectArraysClose(await dc.array(), await fusedDc.array()); + // }); + + // it('fused A x B with relu bias gradient transpose', async () => { + // const a = tf.tensor2d([1, 2, 3, 10, 20, -30], [3, 2]); + // const b = tf.tensor2d([2, 3, 4, -1, 2, 3], [3, 2]); + // const c = tf.tensor2d([1, 1, 1, 1], [2, 2]); + // const transposeA = true; + // const transposeB = false; + + // const dy = tf.tensor2d([1, 10, 20, 30], [2, 2]); + + // const grads = tf.grads((a, b, c) => { + // const prod = tf.matMul(a, b, transposeA, transposeB); + // const sum = tf.add(prod, c); + // return tf.relu(sum); + // }); + + // const fusedGrads = tf.grads((a, b, c) => { + // return tf.fused.matMul( + // {a, b, transposeA, transposeB, bias: c, activation: 'relu'}); + // }); + + // const [da, db, dc] = grads([a, b, c], dy); + // const [fusedDa, fusedDb, fusedDc] = fusedGrads([a, b, c], dy); + + // expectArraysClose(await da.array(), await fusedDa.array()); + // expectArraysClose(await db.array(), await fusedDb.array()); + // expectArraysClose(await dc.array(), await fusedDc.array()); + // }); + + // it('fused A x B with relu and broadcasted bias gradient', async () => { + // const a = tf.tensor2d([1, 2, 3, 10, 20, -30], [2, 3]); + // const b = tf.tensor2d([2, 3, 4, -1, 2, 3], [3, 2]); + // const c = tf.tensor2d([[1]]); + // const transposeA = false; + // const transposeB = false; + + // const dy = tf.tensor2d([1, 10, 20, 30], [2, 2]); + + // const grads = tf.grads((a, b, c) => { + // const prod = tf.matMul(a, b, transposeA, transposeB); + // const sum = tf.add(prod, c); + // return tf.relu(sum); + // }); + + // const fusedGrads = tf.grads((a, b, c) => { + // return tf.fused.matMul( + // {a, b, transposeA, transposeB, bias: c, activation: 'relu'}); + // }); + + // const [da, db, dc] = grads([a, b, c], dy); + // const [fusedDa, fusedDb, fusedDc] = fusedGrads([a, b, c], dy); + + // expectArraysClose(await da.array(), await fusedDa.array()); + // expectArraysClose(await db.array(), await fusedDb.array()); + // expectArraysClose(await dc.array(), await fusedDc.array()); + // }); +}); diff --git a/tfjs-core/src/ops/fused_ops.ts b/tfjs-core/src/ops/fused_ops.ts index bf458da5c9e..32d8b26770c 100644 --- a/tfjs-core/src/ops/fused_ops.ts +++ b/tfjs-core/src/ops/fused_ops.ts @@ -15,9 +15,9 @@ * ============================================================================= */ -import {conv2d} from './fused/conv2d'; -import {depthwiseConv2d} from './fused/depthwise_conv2d'; -import {matMul} from './fused/mat_mul'; -import {Activation} from './fused/types'; +import {conv2d} from './fused_conv2d'; +import {depthwiseConv2d} from './fused_depthwise_conv2d'; +import {matMul} from './fused_mat_mul'; +import {Activation} from './fused_types'; export {Activation, conv2d, depthwiseConv2d, matMul}; diff --git a/tfjs-core/src/ops/fused_test.ts b/tfjs-core/src/ops/fused_test.ts deleted file mode 100644 index 7003c3070de..00000000000 --- a/tfjs-core/src/ops/fused_test.ts +++ /dev/null @@ -1,1391 +0,0 @@ -/** - * @license - * Copyright 2019 Google LLC. 
All Rights Reserved. - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * ============================================================================= - */ - -import * as tf from '../index'; -import {ALL_ENVS, describeWithFlags} from '../jasmine_util'; -import {expectArraysClose} from '../test_util'; - -function generateCaseInputs(totalSizeTensor: number, totalSizeFilter: number) { - const inp = new Array(totalSizeTensor); - const filt = new Array(totalSizeFilter); - - for (let i = 0; i < totalSizeTensor; i++) { - inp[i] = i * 0.001 - totalSizeTensor * 0.001 / 2; - } - for (let i = 0; i < totalSizeFilter; i++) { - const sign = i % 2 === 0 ? -1 : 1; - filt[i] = i * 0.001 * sign; - } - - return {input: inp, filter: filt}; -} - -describeWithFlags('fused matmul', ALL_ENVS, () => { - it('fused A x B', async () => { - const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]); - const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]); - - const c = tf.fused.matMul({a, b}); - - expect(c.shape).toEqual([2, 2]); - expectArraysClose(await c.data(), [0, 8, -3, 20]); - }); - - it('fused A x B with relu', async () => { - const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]); - const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]); - const transposeA = false; - const transposeB = false; - - const c = tf.fused.matMul( - {a, b, transposeA, transposeB, bias: null, activation: 'relu'}); - - expect(c.shape).toEqual([2, 2]); - expectArraysClose(await c.data(), [0, 8, 0, 20]); - }); - - it('fused A x B with elu', async () => { - const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]); - const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]); - const transposeA = false; - const transposeB = false; - - const c = tf.fused.matMul( - {a, b, transposeA, transposeB, bias: null, activation: 'elu'}); - - expect(c.shape).toEqual([2, 2]); - expectArraysClose(await c.data(), [0, 8, -0.9502, 20]); - }); - - it('fused A x B with relu6', async () => { - const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]); - const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]); - const transposeA = false; - const transposeB = false; - - const c = tf.fused.matMul( - {a, b, transposeA, transposeB, bias: null, activation: 'relu6'}); - - expect(c.shape).toEqual([2, 2]); - expectArraysClose(await c.data(), [0, 6, 0, 6]); - }); - - it('fused A x B with prelu', async () => { - const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]); - const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]); - const alpha = tf.tensor2d([0.5, 0.5], [1, 2]); - const transposeA = false; - const transposeB = false; - - const c = tf.fused.matMul({ - a, - b, - transposeA, - transposeB, - bias: null, - activation: 'prelu', - preluActivationWeights: alpha - }); - - expect(c.shape).toEqual([2, 2]); - expectArraysClose(await c.data(), [0, 8, -1.5, 20]); - }); - - it('fused A x B with relu transpose', async () => { - const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]); - const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [2, 3]); - const transposeA = false; - const transposeB = true; - - const c = tf.fused.matMul( - {a, b, 
transposeA, transposeB, bias: null, activation: 'relu'}); - - expect(c.shape).toEqual([2, 2]); - expectArraysClose(await c.data(), [0, 9, 0, 24]); - }); - - it('fused A x B with 2d bias and relu', async () => { - const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]); - const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]); - const c = tf.tensor2d([1, 1, 1, 1], [2, 2]); - const transposeA = false; - const transposeB = false; - - const d = tf.fused.matMul( - {a, b, transposeA, transposeB, bias: c, activation: 'relu'}); - - expect(d.shape).toEqual([2, 2]); - expectArraysClose(await d.data(), [1, 9, 0, 21]); - }); - - it('fused A x B with relu and broadcasted bias', async () => { - const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]); - const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]); - const c = tf.tensor1d([1, 1]); - const act: tf.fused.Activation = 'relu'; - const transposeA = false; - const transposeB = false; - - const d = tf.fused.matMul( - {a, b, transposeA, transposeB, bias: c, activation: act}); - - expect(d.shape).toEqual([2, 2]); - expectArraysClose(await d.data(), [1, 9, 0, 21]); - }); - - it('fused A x B with elu and broadcasted bias', async () => { - const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]); - const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]); - const c = tf.tensor1d([1, 1]); - const act: tf.fused.Activation = 'elu'; - const transposeA = false; - const transposeB = false; - - const d = tf.fused.matMul( - {a, b, transposeA, transposeB, bias: c, activation: act}); - - expect(d.shape).toEqual([2, 2]); - expectArraysClose(await d.data(), [1, 9, -0.8647, 21]); - }); - - it('fused A x B with relu and broadcasted bias different rank', async () => { - const a = tf.tensor3d([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [2, 2, 3]); - const b = tf.tensor3d([0, 1, -3, 2, 2, 1, 0, 1, -3, 2, 2, 1], [2, 3, 2]); - const c = tf.tensor2d([1, 2], [1, 2]); - const act: tf.fused.Activation = 'relu'; - const transposeA = false; - const transposeB = false; - - const d = tf.fused.matMul( - {a, b, transposeA, transposeB, bias: c, activation: act}); - - expect(d.shape).toEqual([2, 2, 2]); - expectArraysClose(await d.data(), [2, 6, 0, 18, 0, 30, 0, 42]); - }); - - it('fused A x B with 2d bias only', async () => { - const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]); - const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]); - const c = tf.tensor2d([1, 1, 1, 1], [2, 2]); - const transposeA = false; - const transposeB = false; - - const d = tf.fused.matMul( - {a, b, transposeA, transposeB, bias: c, activation: 'linear'}); - - expect(d.shape).toEqual([2, 2]); - expectArraysClose(await d.data(), [1, 9, -2, 21]); - }); - - it('fused A x B with relu gradient', async () => { - const a = tf.tensor2d([1, 2, 3, 10, 20, -30], [2, 3]); - const b = tf.tensor2d([2, 3, 4, -1, 2, 3], [3, 2]); - const dy = tf.tensor2d([1, 10, 20, 30], [2, 2]); - const transposeA = false; - const transposeB = false; - - const grads = tf.grads((a, b) => { - const prod = tf.matMul(a, b, transposeA, transposeB); - return tf.relu(prod); - }); - - const fusedGrads = tf.grads((a, b) => { - return tf.fused.matMul( - {a, b, transposeA, transposeB, bias: null, activation: 'relu'}); - }); - - const [da, db] = grads([a, b], dy); - const [fusedDa, fusedDb] = fusedGrads([a, b], dy); - expectArraysClose(await da.array(), await fusedDa.array()); - expectArraysClose(await db.data(), await fusedDb.array()); - }); - - it('gradient with clones A x B with relu', () => { - const a = tf.tensor2d([1, 2, 3, 10, 20, -30], [2, 3]); - const b = tf.tensor2d([2, 3, 4, -1, 2, 
3], [3, 2]); - const dy = tf.tensor2d([1, 10, 20, 30], [2, 2]); - const transposeA = false; - const transposeB = false; - - const fusedGrads = tf.grads((a, b) => { - return tf.fused - .matMul({ - a: a.clone(), - b: b.clone(), - transposeA, - transposeB, - bias: null, - activation: 'relu' - }) - .clone(); - }); - - const [fusedDa, fusedDb] = fusedGrads([a, b], dy); - expect(fusedDa.shape).toEqual(a.shape); - expect(fusedDb.shape).toEqual(b.shape); - }); - - it('fused A x B with relu bias gradient', async () => { - const a = tf.tensor2d([1, 2, 3, 10, 20, -30], [2, 3]); - const b = tf.tensor2d([2, 3, 4, -1, 2, 3], [3, 2]); - const c = tf.tensor2d([1, 1, 1, 1], [2, 2]); - const transposeA = false; - const transposeB = false; - - const dy = tf.tensor2d([1, 10, 20, 30], [2, 2]); - - const grads = tf.grads((a, b, c) => { - const prod = tf.matMul(a, b, transposeA, transposeB); - const sum = tf.add(prod, c); - return tf.relu(sum); - }); - - const fusedGrads = tf.grads((a, b, c) => { - return tf.fused.matMul( - {a, b, transposeA, transposeB, bias: c, activation: 'relu'}); - }); - - const [da, db, dc] = grads([a, b, c], dy); - const [fusedDa, fusedDb, fusedDc] = fusedGrads([a, b, c], dy); - - expectArraysClose(await da.array(), await fusedDa.array()); - expectArraysClose(await db.array(), await fusedDb.array()); - expectArraysClose(await dc.array(), await fusedDc.array()); - }); - - it('fused A x B with relu bias gradient transpose', async () => { - const a = tf.tensor2d([1, 2, 3, 10, 20, -30], [3, 2]); - const b = tf.tensor2d([2, 3, 4, -1, 2, 3], [3, 2]); - const c = tf.tensor2d([1, 1, 1, 1], [2, 2]); - const transposeA = true; - const transposeB = false; - - const dy = tf.tensor2d([1, 10, 20, 30], [2, 2]); - - const grads = tf.grads((a, b, c) => { - const prod = tf.matMul(a, b, transposeA, transposeB); - const sum = tf.add(prod, c); - return tf.relu(sum); - }); - - const fusedGrads = tf.grads((a, b, c) => { - return tf.fused.matMul( - {a, b, transposeA, transposeB, bias: c, activation: 'relu'}); - }); - - const [da, db, dc] = grads([a, b, c], dy); - const [fusedDa, fusedDb, fusedDc] = fusedGrads([a, b, c], dy); - - expectArraysClose(await da.array(), await fusedDa.array()); - expectArraysClose(await db.array(), await fusedDb.array()); - expectArraysClose(await dc.array(), await fusedDc.array()); - }); - - it('fused A x B with relu and broadcasted bias gradient', async () => { - const a = tf.tensor2d([1, 2, 3, 10, 20, -30], [2, 3]); - const b = tf.tensor2d([2, 3, 4, -1, 2, 3], [3, 2]); - const c = tf.tensor2d([[1]]); - const transposeA = false; - const transposeB = false; - - const dy = tf.tensor2d([1, 10, 20, 30], [2, 2]); - - const grads = tf.grads((a, b, c) => { - const prod = tf.matMul(a, b, transposeA, transposeB); - const sum = tf.add(prod, c); - return tf.relu(sum); - }); - - const fusedGrads = tf.grads((a, b, c) => { - return tf.fused.matMul( - {a, b, transposeA, transposeB, bias: c, activation: 'relu'}); - }); - - const [da, db, dc] = grads([a, b, c], dy); - const [fusedDa, fusedDb, fusedDc] = fusedGrads([a, b, c], dy); - - expectArraysClose(await da.array(), await fusedDa.array()); - expectArraysClose(await db.array(), await fusedDb.array()); - expectArraysClose(await dc.array(), await fusedDc.array()); - }); -}); - -describeWithFlags('fused depthwiseConv2D', ALL_ENVS, () => { - it('basic', async () => { - const fSize = 2; - const pad = 'valid'; - const strides = 1; - const chMul = 1; - const inDepth = 1; - - const x = tf.tensor4d( - [ - 0.230664, 0.987388, 0.0685208, 0.419224, 0.887861, 
0.731641, - 0.0741907, 0.409265, 0.351377 - ], - [1, 3, 3, inDepth]); - const w = tf.tensor4d( - [-0.303873, -0.229223, 0.144333, 0.803373], - [fSize, fSize, inDepth, chMul], - ); - - const result = tf.fused.depthwiseConv2d({x, filter: w, strides, pad}); - expect(result.shape).toEqual([1, 2, 2, 1]); - const expected = [0.47737, 0.40018, 0.00859, -0.09615]; - expectArraysClose(await result.data(), expected); - }); - - it('basic with relu', async () => { - const fSize = 2; - const pad = 'valid'; - const strides = 1; - const chMul = 1; - const inDepth = 1; - - const x = tf.tensor4d( - [ - 0.230664, 0.987388, 0.0685208, 0.419224, 0.887861, 0.731641, - 0.0741907, 0.409265, 0.351377 - ], - [1, 3, 3, inDepth]); - const w = tf.tensor4d( - [-0.303873, -0.229223, 0.144333, 0.803373], - [fSize, fSize, inDepth, chMul], - ); - - const result = tf.fused.depthwiseConv2d( - {x, filter: w, strides, pad, activation: 'relu'}); - expect(result.shape).toEqual([1, 2, 2, 1]); - const expected = [0.47737, 0.40018, 0.00859, 0]; - expectArraysClose(await result.data(), expected); - }); - - it('basic with broadcasted bias and relu', async () => { - const fSize = 2; - const pad = 'valid'; - const strides = 1; - const chMul = 1; - const inDepth = 1; - - const x = tf.tensor4d( - [ - 0.230664, 0.987388, 0.0685208, 0.419224, 0.887861, 0.731641, - 0.0741907, 0.409265, 0.351377 - ], - [1, 3, 3, inDepth]); - const w = tf.tensor4d( - [-0.303873, -0.229223, 0.144333, 0.803373], - [fSize, fSize, inDepth, chMul], - ); - - const result = tf.fused.depthwiseConv2d( - {x, filter: w, strides, pad, bias: tf.scalar(1), activation: 'relu'}); - expect(result.shape).toEqual([1, 2, 2, 1]); - const expected = [1.47737, 1.40018, 1.00859, 0.90385]; - expectArraysClose(await result.data(), expected); - }); - - it('prelu', async () => { - const fSize = 3; - const pad = 'valid'; - const strides = 1; - const chMul = 1; - const inDepth = 1; - - const x = tf.tensor4d( - [ - 0.149194, 0.089009, 0.654891, 0.083324, 0.537043, 0.644331, 0.563037, - 0.211859, 0.633501, 0.186427, 0.777034, 0.50001, 0.607341, 0.95303, - 0.696479, 0.050387, 0.62045, 0.728049, 0.028043, 0.437009, 0.712881, - 0.741935, 0.974474, 0.621102, 0.171411 - ], - [1, 5, 5, inDepth]); - const alpha = tf.tensor4d( - [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], [1, 3, 3, 1]); - const w = tf.tensor4d( - [ - -0.125386, -0.975199, -0.640437, -0.281895, -0.990968, -0.347208, - -0.889702, -0.180695, -0.691992 - ], - [fSize, fSize, inDepth, chMul], - ); - - const result = tf.fused.depthwiseConv2d({ - x, - filter: w, - strides, - pad, - activation: 'prelu', - preluActivationWeights: alpha - }); - expect(result.shape).toEqual([1, 3, 3, 1]); - const expected = [ - -0.25400, -0.50118, -0.73622, -0.94068, -1.2298, -1.84585, -2.3089, - -2.7499, -2.64077 - ]; - expectArraysClose(await result.data(), expected); - }); - - it('gradient x=[2,3,3,1] f=[2,2,1,1] s=1 p=0', async () => { - const inputDepth = 1; - const outputDepth = 1; - const inputShape: [number, number, number, number] = [2, 3, 3, inputDepth]; - const filterSize = 2; - const strides = 1; - const pad = 0; - - const filterShape: [number, number, number, number] = - [filterSize, filterSize, inputDepth, outputDepth]; - const filter = tf.tensor4d([-1, 1, -2, 0.5], filterShape); - - const x = tf.tensor4d( - [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9], inputShape); - const dy = tf.tensor4d([3, 1, 2, 0, 3, 1, 2, 0], [2, 2, 2, 1]); - - const grads = tf.grads( - (x: tf.Tensor4D, filter: tf.Tensor4D) => - tf.fused.depthwiseConv2d({x, 
filter, strides, pad})); - const [dx, dfilter] = grads([x, filter], dy); - - expect(dx.shape).toEqual(x.shape); - expectArraysClose( - await dx.data(), - [-3, 2, 1, -8, 1.5, 0.5, -4, 1, 0, -3, 2, 1, -8, 1.5, 0.5, -4, 1, 0]); - - expect(dfilter.shape).toEqual(filterShape); - expectArraysClose(await dfilter.data(), [26, 38, 62, 74]); - }); - - it('gradient x=[2,3,3,1] f=[2,2,1,1] s=1 p=0 with bias', async () => { - const inputDepth = 1; - const outputDepth = 1; - const inputShape: [number, number, number, number] = [2, 3, 3, inputDepth]; - const filterSize = 2; - const strides = 1; - const pad = 0; - - const filterShape: [number, number, number, number] = - [filterSize, filterSize, inputDepth, outputDepth]; - const filter = tf.tensor4d([-1, 1, -2, 0.5], filterShape); - const bias = tf.ones([2, 2, 2, 1]); - - const x = tf.tensor4d( - [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9], inputShape); - const dy = tf.tensor4d([3, 1, 2, 0, 3, 1, 2, 0], [2, 2, 2, 1]); - - const fusedGrads = tf.grads( - (x: tf.Tensor4D, w: tf.Tensor4D, b) => tf.fused.depthwiseConv2d({ - x, - filter: w, - strides, - pad, - dataFormat: 'NHWC', - dilations: [1, 1], - bias: b - })); - const [dxFused, dfilterFused, dbiasFused] = - fusedGrads([x, filter, bias], dy); - - const grads = tf.grads((x: tf.Tensor4D, filter: tf.Tensor4D, bias) => { - const conv = tf.depthwiseConv2d(x, filter, strides, pad); - const sum = tf.add(conv, bias); - return sum; - }); - const [dx, dfilter, dbias] = grads([x, filter, bias], dy); - - expectArraysClose(await dxFused.array(), await dx.array()); - expectArraysClose(await dfilterFused.array(), await dfilter.array()); - expectArraysClose(await dbiasFused.array(), await dbias.array()); - }); - - it('gradient x=[2,3,3,1] f=[2,2,1,1] s=1 p=0 with bias and activation', - async () => { - const inputDepth = 1; - const outputDepth = 1; - const inputShape: [number, number, number, number] = - [2, 3, 3, inputDepth]; - const filterSize = 2; - const strides = 1; - const pad = 0; - - const filterShape: [number, number, number, number] = - [filterSize, filterSize, inputDepth, outputDepth]; - const filter = tf.tensor4d([-1, 1, -2, 0.5], filterShape); - const bias = tf.ones([2, 2, 2, 1]); - - const x = tf.tensor4d( - [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9], inputShape); - const dy = tf.tensor4d([3, 1, 2, 0, 3, 1, 2, 0], [2, 2, 2, 1]); - - const fusedGrads = tf.grads( - (x: tf.Tensor4D, w: tf.Tensor4D, b) => tf.fused.depthwiseConv2d({ - x, - filter: w, - strides, - pad, - dataFormat: 'NHWC', - dilations: [1, 1], - bias: b, - activation: 'relu' - })); - const [dxFused, dfilterFused, dbiasFused] = - fusedGrads([x, filter, bias], dy); - - const grads = tf.grads((x: tf.Tensor4D, filter: tf.Tensor4D, bias) => { - const conv = tf.depthwiseConv2d(x, filter, strides, pad); - const sum = tf.add(conv, bias); - return tf.relu(sum); - }); - const [dx, dfilter, dbias] = grads([x, filter, bias], dy); - - expectArraysClose(await dxFused.array(), await dx.array()); - expectArraysClose(await dfilterFused.array(), await dfilter.array()); - expectArraysClose(await dbiasFused.array(), await dbias.array()); - }); -}); - -describeWithFlags('fused conv2d', ALL_ENVS, () => { - it('basic', async () => { - const inputDepth = 2; - const inShape: [number, number, number, number] = [2, 2, 2, inputDepth]; - const outputDepth = 2; - const fSize = 1; - const pad = 0; - const stride = 1; - - const x = tf.tensor4d( - [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], inShape); - const w = - tf.tensor4d([-1, 1, -2, 
0.5], [fSize, fSize, inputDepth, outputDepth]); - - const result = tf.fused.conv2d({x, filter: w, strides: stride, pad}); - expect(result.shape).toEqual([2, 2, 2, 2]); - const expected = - [-5, 2, -11, 5, -17, 8, -23, 11, -29, 14, -35, 17, -41, 20, -47, 23]; - - expectArraysClose(await result.data(), expected); - }); - - it('basic with relu', async () => { - const inputDepth = 2; - const inShape: [number, number, number, number] = [2, 2, 2, inputDepth]; - const outputDepth = 2; - const fSize = 1; - const pad = 0; - const stride = 1; - - const x = tf.tensor4d( - [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], inShape); - const w = - tf.tensor4d([-1, 1, -2, 0.5], [fSize, fSize, inputDepth, outputDepth]); - - const result = tf.fused.conv2d({ - x, - filter: w, - strides: stride, - pad, - dataFormat: 'NHWC', - dilations: [1, 1], - activation: 'relu' - }); - expect(result.shape).toEqual([2, 2, 2, 2]); - const expected = [0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0, 17, 0, 20, 0, 23]; - - expectArraysClose(await result.data(), expected); - }); - - it('relu with stride 2 x=[1,8,8,16] f=[3,3,16,1] s=[2,2] d=1 p=same', - async () => { - const inputDepth = 16; - const xSize = 8; - const inputShape: [number, number, number, number] = - [1, xSize, xSize, inputDepth]; - const outputDepth = 1; - const fSize = 3; - const pad = 'same'; - const stride: [number, number] = [2, 2]; - - // TODO(annxingyuan): Make this test work with large inputs - // https://github.com/tensorflow/tfjs/issues/3143 - const inputData = []; - for (let i = 0; i < xSize * xSize * inputDepth; i++) { - inputData.push(i % 5); - } - - const wData = []; - for (let i = 0; i < fSize * fSize * inputDepth * outputDepth; i++) { - wData.push(i % 5); - } - - const x = tf.tensor4d(inputData, inputShape); - const w = tf.tensor4d(wData, [fSize, fSize, inputDepth, outputDepth]); - - const result = tf.fused.conv2d({ - x, - filter: w, - strides: stride, - pad, - dataFormat: 'NHWC', - dilations: [1, 1], - activation: 'relu' - }); - expect(result.shape).toEqual([1, 4, 4, 1]); - expectArraysClose(await result.data(), new Float32Array([ - 854, 431, 568, 382, 580, 427, 854, 288, 431, 568, - 580, 289, 285, 570, 285, 258 - ])); - }); - - it('relu bias stride 2 x=[1,8,8,16] f=[3,3,16,1] s=[2,2] d=8 p=same', - async () => { - const inputDepth = 16; - const xSize = 8; - const inputShape: [number, number, number, number] = - [1, xSize, xSize, inputDepth]; - const outputDepth = 8; - const fSize = 3; - const pad = 'same'; - const stride: [number, number] = [2, 2]; - - const inputs = generateCaseInputs( - 1 * xSize * xSize * inputDepth, - fSize * fSize * inputDepth * outputDepth); - const x = tf.tensor4d(inputs.input, inputShape); - const w = - tf.tensor4d(inputs.filter, [fSize, fSize, inputDepth, outputDepth]); - const bias = tf.tensor1d([1, 4, 2, 3, 9, 6, 5, 8]); - const result = tf.fused.conv2d({ - x, - filter: w, - strides: stride, - pad, - dataFormat: 'NHWC', - dilations: [1, 1], - activation: 'relu', - bias - }); - expect(result.shape).toEqual([1, 4, 4, 8]); - expectArraysClose(await result.data(), new Float32Array([ - 25.75398063659668, - 0, - 26.857805252075195, - 0, - 33.961631774902344, - 0, - 30.065458297729492, - 0, - 23.118206024169922, - 0, - 24.212820053100586, - 0, - 31.307422637939453, - 0, - 27.402034759521484, - 0, - 20.482431411743164, - 0, - 21.567821502685547, - 0, - 28.653217315673828, - 0, - 24.73861312866211, - 0, - 11.078080177307129, - 0, - 12.130399703979492, - 0, - 19.182720184326172, - 0, - 15.235037803649902, - 0, - 4.6677775382995605, 
- 0.31717729568481445, - 5.697869777679443, - 0, - 12.727968215942383, - 2.2569849491119385, - 8.758066177368164, - 4.226885795593262, - 2.0319995880126953, - 2.9575586318969727, - 3.052880048751831, - 1.9366796016693115, - 10.073760032653809, - 4.915799617767334, - 6.094639778137207, - 6.89492130279541, - 0, - 5.5979437828063965, - 0.4078875780105591, - 4.586280822753906, - 7.419551849365234, - 7.5746169090271, - 3.43121600151062, - 9.562952041625977, - 0, - 6.404943943023682, - 0, - 5.401776313781738, - 6.5998077392578125, - 8.398608207702637, - 2.602976083755493, - 10.395440101623535, - 0, - 21.440250396728516, - 0, - 20.483882904052734, - 0, - 23.527509689331055, - 0, - 25.571144104003906, - 0, - 24.080629348754883, - 0, - 23.133480072021484, - 0, - 26.186328887939453, - 0, - 28.239177703857422, - 0, - 26.721012115478516, - 0, - 25.783079147338867, - 0, - 28.84514808654785, - 0, - 30.907209396362305, - 0, - 18.914127349853516, - 0, - 17.960111618041992, - 0, - 21.006093978881836, - 0, - 23.052082061767578, - 0, - 17.89089584350586, - 0, - 16.95684814453125, - 0, - 20.022798538208008, - 0, - 22.088754653930664, - 0, - 19.06132698059082, - 0, - 18.133424758911133, - 0, - 21.205520629882812, - 0, - 23.27761459350586, - 0, - 20.23175811767578, - 0, - 19.309999465942383, - 0, - 22.388240814208984, - 0, - 24.46647834777832, - 0, - 13.584352493286133, - 0, - 12.6395845413208, - 0, - 15.694815635681152, - 0, - 17.750045776367188 - ])); - }); - - it('prelu bias stride 2 x=[1,8,8,16] f=[3,3,16,1] s=[2,2] d=8 p=same', - async () => { - const inputDepth = 16; - const xSize = 8; - const inputShape: [number, number, number, number] = - [1, xSize, xSize, inputDepth]; - const outputDepth = 8; - const fSize = 3; - const pad = 'same'; - const stride: [number, number] = [2, 2]; - - const inputs = generateCaseInputs( - 1 * xSize * xSize * inputDepth, - fSize * fSize * inputDepth * outputDepth); - const x = tf.tensor4d(inputs.input, inputShape); - const w = - tf.tensor4d(inputs.filter, [fSize, fSize, inputDepth, outputDepth]); - const bias = tf.tensor1d([1, 4, 2, 3, 9, 6, 5, 8]); - const preluActivationWeights = tf.tensor1d([1, 2, 3, 4, 5, 6, 7, 8]); - - const result = tf.fused.conv2d({ - x, - filter: w, - strides: stride, - pad, - dataFormat: 'NHWC', - dilations: [1, 1], - activation: 'prelu', - preluActivationWeights, - bias - }); - expect(result.shape).toEqual([1, 4, 4, 8]); - expectArraysClose( - await result.data(), new Float32Array([ - 25.75398063659668, -41.61178970336914, 26.857805252075195, - -87.63885498046875, 33.961631774902344, -114.0812759399414, - 30.065458297729492, -136.93893432617188, 23.118206024169922, - -36.33102035522461, 24.212820053100586, -77.04048156738281, - 31.307422637939453, -98.12835693359375, 27.402034759521484, - -115.5947265625, 20.482431411743164, -31.050262451171875, - 21.567821502685547, -66.44209289550781, 28.653217315673828, - -82.17544555664062, 24.73861312866211, -94.25041198730469, - 11.078080177307129, -12.208478927612305, 12.130399703979492, - -28.626232147216797, 19.182720184326172, -25.253299713134766, - 15.235037803649902, -18.08960723876953, 4.6677775382995605, - 0.31717729568481445, 5.697869777679443, -2.8516759872436523, - 12.727968215942383, 2.2569849491119385, 8.758066177368164, - 4.226885795593262, 2.0319995880126953, 2.9575586318969727, - 3.052880048751831, 1.9366796016693115, 10.073760032653809, - 4.915799617767334, 6.094639778137207, 6.89492130279541, - -0.6037763357162476, 5.5979437828063965, 0.4078875780105591, - 4.586280822753906, 
7.419551849365234, 7.5746169090271, - 3.43121600151062, 9.562952041625977, -1.4065279960632324, - 6.404943943023682, -1.2100803852081299, 5.401776313781738, - 6.5998077392578125, 8.398608207702637, 2.602976083755493, - 10.395440101623535, -16.418434143066406, 21.440250396728516, - -46.38618850708008, 20.483882904052734, -42.52848815917969, - 23.527509689331055, -87.84530639648438, 25.571144104003906, - -19.054208755493164, 24.080629348754883, -54.32115936279297, - 23.133480072021484, -55.79951477050781, 26.186328887939453, - -106.48924255371094, 28.239177703857422, -21.689987182617188, - 26.721012115478516, -62.25614929199219, 25.783079147338867, - -69.070556640625, 28.84514808654785, -125.13325500488281, - 30.907209396362305, -13.891133308410645, 18.914127349853516, - -38.81135940551758, 17.960111618041992, -29.915504455566406, - 21.006093978881836, -70.20361328125, 23.052082061767578, - -12.857919692993164, 17.89089584350586, -35.771610260009766, - 16.95684814453125, -24.949115753173828, 20.022798538208008, - -63.39042282104492, 22.088754653930664, -14.02528190612793, - 19.06132698059082, -39.2921257019043, 18.133424758911133, - -30.847349166870117, 21.205520629882812, -71.69097137451172, - 23.27761459350586, -15.192638397216797, 20.23175811767578, - -42.8126335144043, 19.309999465942383, -36.74560546875, - 22.388240814208984, -79.99152374267578, 24.46647834777832, - -8.556736946105957, 13.584352493286133, -22.835901260375977, - 12.6395845413208, -3.336000442504883, 15.694815635681152, - -33.0570182800293, 17.750045776367188 - ])); - }); - - it('basic with bias', async () => { - const inputDepth = 2; - const inShape: [number, number, number, number] = [2, 2, 2, inputDepth]; - const outputDepth = 2; - const fSize = 1; - const pad = 0; - const stride = 1; - - const x = tf.tensor4d( - [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], inShape); - const w = - tf.tensor4d([-1, 1, -2, 0.5], [fSize, fSize, inputDepth, outputDepth]); - - const result = tf.fused.conv2d({ - x, - filter: w, - strides: stride, - pad, - dataFormat: 'NHWC', - dilations: [1, 1], - bias: tf.tensor1d([5, 6]) - }); - expect(result.shape).toEqual([2, 2, 2, 2]); - const expected = - [0, 8, -6, 11, -12, 14, -18, 17, -24, 20, -30, 23, -36, 26, -42, 29]; - - expectArraysClose(await result.data(), expected); - }); - - it('basic with explicit padding', async () => { - const inputDepth = 1; - const outputDepth = 1; - const pad = - [[0, 0], [1, 2], [0, 1], [0, 0]] as tf.backend_util.ExplicitPadding; - const stride = 1; - const dataFormat = 'NHWC'; - const dilation = 1; - - const x = tf.tensor3d([1, 2, 3, 4, 5, 6, 7, 8], [4, 2, inputDepth]); - const w = - tf.tensor4d([3, 1, 5, 0, 2, 7, 8, 9], [4, 2, inputDepth, outputDepth]); - - const result = tf.fused.conv2d( - {x, filter: w, strides: stride, pad, dataFormat, dilations: dilation}); - - const resultData = await result.data(); - expect(result.shape).toEqual([4, 2, 1]); - expectArraysClose(resultData, [133, 66, 200, 102, 108, 58, 56, 58]); - }); - - it('basic with elu', async () => { - const inputDepth = 2; - const inShape: [number, number, number, number] = [2, 2, 2, inputDepth]; - const outputDepth = 2; - const fSize = 1; - const pad = 0; - const stride = 1; - - const x = tf.tensor4d( - [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], inShape); - const w = - tf.tensor4d([-1, 1, -2, 0.5], [fSize, fSize, inputDepth, outputDepth]); - - const result = tf.fused.conv2d({ - x, - filter: w, - strides: stride, - pad, - dataFormat: 'NHWC', - dilations: [1, 1], - activation: 
'elu' - }); - expect(result.shape).toEqual([2, 2, 2, 2]); - const expected = - [-0.99326, 2, -1, 5, -1, 8, -1, 11, -1, 14, -1, 17, -1, 20, -1, 23]; - - expectArraysClose(await result.data(), expected); - }); - - it('basic with prelu', async () => { - const inputDepth = 2; - const inShape: [number, number, number, number] = [2, 2, 2, inputDepth]; - const outputDepth = 2; - const fSize = 1; - const pad = 0; - const stride = 1; - - const x = tf.tensor4d( - [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], inShape); - const alpha = tf.tensor3d([0.25, 0.75], [1, 1, 2]); - const w = - tf.tensor4d([-1, 1, -2, 0.5], [fSize, fSize, inputDepth, outputDepth]); - - const result = tf.fused.conv2d({ - x, - filter: w, - strides: stride, - pad, - dataFormat: 'NHWC', - dilations: [1, 1], - activation: 'prelu', - preluActivationWeights: alpha - }); - expect(result.shape).toEqual([2, 2, 2, 2]); - const expected = [ - -1.25, 2, -2.75, 5, -4.25, 8, -5.75, 11, -7.25, 14, -8.75, 17, -10.25, 20, - -11.75, 23 - ]; - - expectArraysClose(await result.data(), expected); - }); - - it('basic with broadcasted bias and relu', async () => { - const inputDepth = 2; - const inShape: [number, number, number, number] = [2, 2, 2, inputDepth]; - const outputDepth = 2; - const fSize = 1; - const pad = 0; - const stride = 1; - - const x = tf.tensor4d( - [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], inShape); - const w = - tf.tensor4d([-1, 1, -2, 0.5], [fSize, fSize, inputDepth, outputDepth]); - - const result = tf.fused.conv2d({ - x, - filter: w, - strides: stride, - pad, - dataFormat: 'NHWC', - dilations: [1, 1], - bias: tf.scalar(5), - activation: 'relu' - }); - expect(result.shape).toEqual([2, 2, 2, 2]); - const expected = [0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0, 25, 0, 28]; - - expectArraysClose(await result.data(), expected); - }); - - it('im2row', async () => { - const inputDepth = 1; - const inputShape: [number, number, number] = [4, 4, inputDepth]; - const outputDepth = 3; - const fSize = 1; - const pad = 'same'; - const strides: [number, number] = [2, 2]; - - const x = tf.tensor3d( - [ - 10, 30, 50, 70, 20, 40, 60, 80, -10, -30, -50, -70, -20, -40, -60, -80 - ], - inputShape); - const w = tf.tensor4d([1, 0.5, 1], [fSize, fSize, inputDepth, outputDepth]); - - const result = tf.fused.conv2d({x, filter: w, strides, pad}); - - expectArraysClose( - await result.data(), - [10, 5, 10, 50, 25, 50, -10, -5, -10, -50, -25, -50]); - }); - - it('im2row with relu', async () => { - const inputDepth = 1; - const inputShape: [number, number, number] = [4, 4, inputDepth]; - const outputDepth = 3; - const fSize = 1; - const pad = 'same'; - const strides: [number, number] = [2, 2]; - - const x = tf.tensor3d( - [ - 10, 30, 50, 70, 20, 40, 60, 80, -10, -30, -50, -70, -20, -40, -60, -80 - ], - inputShape); - const w = tf.tensor4d([1, 0.5, 1], [fSize, fSize, inputDepth, outputDepth]); - - const result = tf.fused.conv2d({ - x, - filter: w, - strides, - pad, - dataFormat: 'NHWC', - dilations: [1, 1], - activation: 'relu' - }); - - expectArraysClose( - await result.data(), [10, 5, 10, 50, 25, 50, 0, 0, 0, 0, 0, 0]); - }); - - it('im2row with prelu', async () => { - const inputDepth = 1; - const inputShape: [number, number, number] = [4, 4, inputDepth]; - const outputDepth = 3; - const fSize = 1; - const pad = 'same'; - const strides: [number, number] = [2, 2]; - - const x = tf.tensor3d( - [ - 10, 30, 50, 70, 20, 40, 60, 80, -10, -30, -50, -70, -20, -40, -60, -80 - ], - inputShape); - const w = tf.tensor4d([1, 0.5, 1], 
[fSize, fSize, inputDepth, outputDepth]); - const alpha = tf.tensor3d([0.5], [1, 1, inputDepth]); - - const result = tf.fused.conv2d({ - x, - filter: w, - strides, - pad, - dataFormat: 'NHWC', - dilations: [1, 1], - activation: 'prelu', - preluActivationWeights: alpha - }); - - expectArraysClose( - await result.data(), - [10, 5, 10, 50, 25, 50, -5, -2.5, -5, -25, -12.5, -25]); - }); - - it('pointwise with prelu', async () => { - const inputDepth = 1; - const inputShape: [number, number, number] = [4, 4, inputDepth]; - const outputDepth = 3; - const fSize = 1; - const pad = 'same'; - const strides: [number, number] = [1, 1]; - - const x = tf.tensor3d( - [ - 10, 30, 50, 70, 20, 40, 60, 80, -10, -30, -50, -70, -20, -40, -60, -80 - ], - inputShape); - const w = tf.tensor4d([1, 0.5, 1], [fSize, fSize, inputDepth, outputDepth]); - const alpha = tf.tensor3d([0.5], [1, 1, inputDepth]); - - const result = tf.fused.conv2d({ - x, - filter: w, - strides, - pad, - dataFormat: 'NHWC', - dilations: [1, 1], - activation: 'prelu', - preluActivationWeights: alpha - }); - - expectArraysClose(await result.data(), [ - 10, 5, 10, 30, 15, 30, 50, 25, 50, 70, 35, 70, - 20, 10, 20, 40, 20, 40, 60, 30, 60, 80, 40, 80, - -5, -2.5, -5, -15, -7.5, -15, -25, -12.5, -25, -35, -17.5, -35, - -10, -5, -10, -20, -10, -20, -30, -15, -30, -40, -20, -40 - ]); - }); - - it('im2row with broadcasted bias and relu', async () => { - const inputDepth = 1; - const inputShape: [number, number, number] = [4, 4, inputDepth]; - const outputDepth = 3; - const fSize = 1; - const pad = 'same'; - const strides: [number, number] = [2, 2]; - - const x = tf.tensor3d( - [ - 10, 30, 50, 70, 20, 40, 60, 80, -10, -30, -50, -70, -20, -40, -60, -80 - ], - inputShape); - const w = tf.tensor4d([1, 0.5, 1], [fSize, fSize, inputDepth, outputDepth]); - - const result = tf.fused.conv2d({ - x, - filter: w, - strides, - pad, - dataFormat: 'NHWC', - dilations: [1, 1], - bias: tf.scalar(5), - activation: 'relu' - }); - - expectArraysClose( - await result.data(), [15, 10, 15, 55, 30, 55, 0, 0, 0, 0, 0, 0]); - }); - - it('backProp input x=[2,3,3,1] f=[2,2,1,1] s=1 p=0', async () => { - const inputDepth = 1; - const outputDepth = 1; - const inputShape: [number, number, number, number] = [2, 3, 3, inputDepth]; - const filterSize = 2; - const strides = 1; - const pad = 0; - - const filterShape: [number, number, number, number] = - [filterSize, filterSize, inputDepth, outputDepth]; - const filter = tf.tensor4d([-1, 1, -2, 0.5], filterShape); - - const x = tf.tensor4d( - [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9], inputShape); - const dy = tf.tensor4d([3, 1, 2, 0, 3, 1, 2, 0], [2, 2, 2, 1]); - - const grads = tf.grads( - (x: tf.Tensor4D) => tf.fused.conv2d({x, filter, strides, pad})); - const [dx] = grads([x], dy); - - expect(dx.shape).toEqual(x.shape); - expectArraysClose( - await dx.data(), - [-3, 2, 1, -8, 1.5, 0.5, -4, 1, 0, -3, 2, 1, -8, 1.5, 0.5, -4, 1, 0]); - }); - - it('gradient x=[2,3,3,1] f=[2,2,1,1] s=1 p=0', async () => { - const inputDepth = 1; - const outputDepth = 1; - const inputShape: [number, number, number, number] = [2, 3, 3, inputDepth]; - const filterSize = 2; - const strides = 1; - const pad = 0; - - const filterShape: [number, number, number, number] = - [filterSize, filterSize, inputDepth, outputDepth]; - const filter = tf.tensor4d([-1, 1, -2, 0.5], filterShape); - - const x = tf.tensor4d( - [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9], inputShape); - const dy = tf.tensor4d([3, 1, 2, 0, 3, 1, 2, 0], [2, 2, 2, 1]); - - 
const grads = tf.grads( - (x: tf.Tensor4D, filter: tf.Tensor4D) => - tf.fused.conv2d({x, filter, strides, pad})); - const [dx, dfilter] = grads([x, filter], dy); - - expect(dx.shape).toEqual(x.shape); - expectArraysClose( - await dx.data(), - [-3, 2, 1, -8, 1.5, 0.5, -4, 1, 0, -3, 2, 1, -8, 1.5, 0.5, -4, 1, 0]); - - expect(dfilter.shape).toEqual(filterShape); - expectArraysClose(await dfilter.data(), [26, 38, 62, 74]); - }); - - it('gradient x=[2,3,3,1] f=[2,2,1,1] s=1 p=0 with bias', async () => { - const inputDepth = 1; - const outputDepth = 1; - const inputShape: [number, number, number, number] = [2, 3, 3, inputDepth]; - const filterSize = 2; - const strides = 1; - const pad = 0; - - const filterShape: [number, number, number, number] = - [filterSize, filterSize, inputDepth, outputDepth]; - const filter = tf.tensor4d([-1, 1, -2, 0.5], filterShape); - const bias = tf.ones([2, 2, 2, 1]); - - const x = tf.tensor4d( - [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9], inputShape); - const dy = tf.tensor4d([3, 1, 2, 0, 3, 1, 2, 0], [2, 2, 2, 1]); - - const fusedGrads = - tf.grads((x: tf.Tensor4D, w: tf.Tensor4D, b) => tf.fused.conv2d({ - x, - filter: w, - strides, - pad, - dataFormat: 'NHWC', - dilations: [1, 1], - bias: b - })); - const [dxFused, dfilterFused, dbiasFused] = - fusedGrads([x, filter, bias], dy); - - const grads = tf.grads((x: tf.Tensor4D, filter: tf.Tensor4D, bias) => { - const conv = tf.conv2d(x, filter, strides, pad); - const sum = tf.add(conv, bias); - return sum; - }); - const [dx, dfilter, dbias] = grads([x, filter, bias], dy); - - expectArraysClose(await dxFused.array(), await dx.array()); - expectArraysClose(await dfilterFused.array(), await dfilter.array()); - expectArraysClose(await dbiasFused.array(), await dbias.array()); - }); - - it('gradient x=[2,3,3,1] f=[2,2,1,1] s=1 p=0 with bias and relu', - async () => { - const inputDepth = 1; - const outputDepth = 1; - const inputShape: [number, number, number, number] = - [2, 3, 3, inputDepth]; - const filterSize = 2; - const strides = 1; - const pad = 0; - - const filterShape: [number, number, number, number] = - [filterSize, filterSize, inputDepth, outputDepth]; - const filter = tf.tensor4d([-1, 1, -2, 0.5], filterShape); - const bias = tf.ones([2, 2, 2, 1]); - - const x = tf.tensor4d( - [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9], inputShape); - const dy = tf.tensor4d([3, 1, 2, 0, 3, 1, 2, 0], [2, 2, 2, 1]); - - const fusedGrads = - tf.grads((x: tf.Tensor4D, w: tf.Tensor4D, b) => tf.fused.conv2d({ - x, - filter: w, - strides, - pad, - dataFormat: 'NHWC', - dilations: [1, 1], - bias: b, - activation: 'relu' - })); - const [dxFused, dfilterFused, dbiasFused] = - fusedGrads([x, filter, bias], dy); - - const grads = tf.grads((x: tf.Tensor4D, filter: tf.Tensor4D, bias) => { - const conv = tf.conv2d(x, filter, strides, pad); - const sum = tf.add(conv, bias); - return tf.relu(sum); - }); - const [dx, dfilter, dbias] = grads([x, filter, bias], dy); - - expectArraysClose(await dxFused.array(), await dx.array()); - expectArraysClose(await dfilterFused.array(), await dfilter.array()); - expectArraysClose(await dbiasFused.array(), await dbias.array()); - }); - - it('gradient x=[2,3,3,1] f=[2,2,1,1] s=1 p=0 with bias and elu', async () => { - const inputDepth = 1; - const outputDepth = 1; - const inputShape: [number, number, number, number] = [2, 3, 3, inputDepth]; - const filterSize = 2; - const strides = 1; - const pad = 0; - - const filterShape: [number, number, number, number] = - [filterSize, filterSize, 
inputDepth, outputDepth]; - const filter = tf.tensor4d([-1, 1, -2, 0.5], filterShape); - const bias = tf.ones([2, 2, 2, 1]); - - const x = tf.tensor4d( - [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9], inputShape); - const dy = tf.tensor4d([3, 1, 2, 0, 3, 1, 2, 0], [2, 2, 2, 1]); - - const fusedGrads = - tf.grads((x: tf.Tensor4D, w: tf.Tensor4D, b) => tf.fused.conv2d({ - x, - filter: w, - strides, - pad, - dataFormat: 'NHWC', - dilations: [1, 1], - bias: b, - activation: 'elu' - })); - const [dxFused, dfilterFused, dbiasFused] = - fusedGrads([x, filter, bias], dy); - - const grads = tf.grads((x: tf.Tensor4D, filter: tf.Tensor4D, bias) => { - const conv = tf.conv2d(x, filter, strides, pad); - const sum = tf.add(conv, bias); - return tf.elu(sum); - }); - const [dx, dfilter, dbias] = grads([x, filter, bias], dy); - - expectArraysClose(await dxFused.array(), await dx.array()); - expectArraysClose(await dfilterFused.array(), await dfilter.array()); - expectArraysClose(await dbiasFused.array(), await dbias.array()); - }); - - it('fused matmul with relu6 and gradients', async () => { - const a = tf.tensor2d([1, 2, 3, 10, 20, -30], [2, 3]); - const b = tf.tensor2d([2, 3, 4, -1, 2, 3], [3, 2]); - const dy = tf.tensor2d([1, 10, 20, 30], [2, 2]); - const transposeA = false; - const transposeB = false; - - const fusedGrads = tf.grads((a, b) => { - return tf.fused.matMul( - {a, b, transposeA, transposeB, bias: null, activation: 'relu6'}); - }); - const [fusedDa, fusedDb] = fusedGrads([a, b], dy); - - const grads = tf.grads((a, b) => { - const prod = tf.matMul(a, b, transposeA, transposeB); - return tf.relu6(prod); - }); - const [da, db] = grads([a, b], dy); - - expectArraysClose(await da.array(), await fusedDa.array()); - expectArraysClose(await db.data(), await fusedDb.array()); - }); -}); diff --git a/tfjs-core/src/ops/fused/types.ts b/tfjs-core/src/ops/fused_types.ts similarity index 92% rename from tfjs-core/src/ops/fused/types.ts rename to tfjs-core/src/ops/fused_types.ts index ffd5c423a4e..894e2708869 100644 --- a/tfjs-core/src/ops/fused/types.ts +++ b/tfjs-core/src/ops/fused_types.ts @@ -15,8 +15,8 @@ * ============================================================================= */ -import {Tensor, Tensor3D, Tensor4D} from '../../tensor'; -import {Conv2DInfo} from '../conv_util'; +import {Tensor, Tensor3D, Tensor4D} from '../tensor'; +import {Conv2DInfo} from './conv_util'; export type FusedConv2DConfig = { input: Tensor4D, diff --git a/tfjs-core/src/ops/fused_util.ts b/tfjs-core/src/ops/fused_util.ts index 8784be9af29..1250b3460c5 100644 --- a/tfjs-core/src/ops/fused_util.ts +++ b/tfjs-core/src/ops/fused_util.ts @@ -19,17 +19,11 @@ import {Tensor} from '../tensor'; import * as broadcast_util from './broadcast_util'; import {elu} from './elu'; -import {Activation} from './fused/types'; +import {Activation} from './fused_types'; import {prelu} from './prelu'; import {relu} from './relu'; import {relu6} from './relu6'; -// Whether we should call fused ops. -export const shouldFuse = (gradientDepth: number, activation: Activation) => { - const gradientMode = gradientDepth > 0; - return !gradientMode || activation === 'linear'; -}; - // Returns gradient for fused activation. 
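 // The fused conv2d/matMul gradients call this to map dy (taken w.r.t. the
 // activated output y) back to a gradient w.r.t. the pre-activation value
 // before invoking the conv/matMul backprop kernels.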
export function getFusedDyActivation( dy: Tensor, y: Tensor, activation: Activation): Tensor { diff --git a/tfjs-core/src/tests.ts b/tfjs-core/src/tests.ts index 5b2cbfcbaff..0ed5690dd44 100644 --- a/tfjs-core/src/tests.ts +++ b/tfjs-core/src/tests.ts @@ -102,7 +102,9 @@ import './ops/fill_test'; import './ops/floor_test'; import './ops/frame_test'; import './ops/from_pixels_test'; -import './ops/fused_test'; +import './ops/fused_conv2d_test'; +import './ops/fused_depthwise_conv2d_test'; +import './ops/fused_mat_mul_test'; import './ops/gather_nd_test'; import './ops/gather_test'; import './ops/gram_schmidt_test'; From bb2e13e498e012603c3a2ecf2354e44bd66f0104 Mon Sep 17 00:00:00 2001 From: Yannick Assogba Date: Tue, 14 Jul 2020 17:56:56 -0400 Subject: [PATCH 05/12] modularise fused ops --- tfjs-core/src/kernel_names.ts | 48 +++++++++++++++++ tfjs-core/src/ops/fused_conv2d.ts | 57 ++++++++++----------- tfjs-core/src/ops/fused_depthwise_conv2d.ts | 55 ++++++++++---------- tfjs-core/src/ops/fused_mat_mul.ts | 55 ++++++++++---------- 4 files changed, 129 insertions(+), 86 deletions(-) diff --git a/tfjs-core/src/kernel_names.ts b/tfjs-core/src/kernel_names.ts index 2020d63ee8a..64fb16e672a 100644 --- a/tfjs-core/src/kernel_names.ts +++ b/tfjs-core/src/kernel_names.ts @@ -21,6 +21,7 @@ import {ExplicitPadding} from '../src/ops/conv_util'; import {NamedTensorInfoMap, TensorInfo} from './kernel_registry'; +import {Activation} from './ops/fused_types'; import {DataType, PixelData} from './types'; export const Abs = 'Abs'; @@ -740,3 +741,50 @@ export interface RotateWithOffsetAttrs { fillValue: number|[number, number, number]; center: number|[number, number]; } + +export const _FusedMatMul = '_FusedMatMul'; +// tslint:disable-next-line: class-name +export interface _FusedMatMulInputs extends NamedTensorInfoMap { + a: TensorInfo; + b: TensorInfo; + bias?: TensorInfo; + preluActivationWeights?: TensorInfo; +} +// tslint:disable-next-line: class-name +export interface _FusedMatMulAttrs { + transposeA: boolean; + transposeB: boolean; + activation: Activation; +} + +export const FusedConv2D = 'FusedConv2D'; +export interface FusedConv2DInputs extends NamedTensorInfoMap { + x: TensorInfo; + filter: TensorInfo; + bias?: TensorInfo; + preluActivationWeights?: TensorInfo; +} +export interface FusedConv2DAttrs { + strides: [number, number]|number; + pad: 'valid'|'same'|number|ExplicitPadding; + dataFormat: 'NHWC'|'NCHW'; + dilations: [number, number]|number; + dimRoundingMode: 'floor'|'round'|'ceil'; + activation: Activation; +} + +export const FusedDepthwiseConv2D = 'FusedDepthwiseConv2D'; +export interface FusedDepthwiseConv2DInputs extends NamedTensorInfoMap { + x: TensorInfo; + filter: TensorInfo; + bias?: TensorInfo; + preluActivationWeights?: TensorInfo; +} +export interface FusedDepthwiseConv2DAttrs { + strides: [number, number]|number; + pad: 'valid'|'same'|number; + dataFormat: 'NHWC'|'NCHW'; + dilations: [number, number]|number; + dimRoundingMode: 'floor'|'round'|'ceil'; + activation: Activation; +} diff --git a/tfjs-core/src/ops/fused_conv2d.ts b/tfjs-core/src/ops/fused_conv2d.ts index 5d507b6f415..e9c35c0a87f 100644 --- a/tfjs-core/src/ops/fused_conv2d.ts +++ b/tfjs-core/src/ops/fused_conv2d.ts @@ -15,8 +15,11 @@ * ============================================================================= */ -import {ENGINE} from '../engine'; +import {ENGINE, ForwardFunc} from '../engine'; +import {FusedConv2D, FusedConv2DAttrs, FusedConv2DInputs} from '../kernel_names'; +import {NamedAttrMap} from 
'../kernel_registry'; import {Tensor, Tensor3D, Tensor4D} from '../tensor'; +import {NamedTensorMap} from '../tensor_types'; import {makeTypesMatch} from '../tensor_util'; import {convertToTensor} from '../tensor_util_env'; import {TensorLike} from '../types'; @@ -27,8 +30,6 @@ import * as conv_util from './conv_util'; import {Activation} from './fused_types'; import {op} from './operation'; - - /** * Computes a 2D convolution over the input x, optionally fused with adding a * bias and applying an activation. @@ -221,35 +222,31 @@ function fusedConv2d_({ preluActivationWeights, 'prelu weights', 'fused conv2d'); } - const inputs: { - x: Tensor, - filter: Tensor, - bias?: Tensor, - preluActivationWeights?: Tensor - } = {x: x4D, filter: $filter}; - if (bias != null) { - inputs.bias = $bias; - } - if (preluActivationWeights != null) { - inputs.preluActivationWeights = $preluActivationWeights; - } + const inputs: FusedConv2DInputs = { + x: x4D, + filter: $filter, + bias: $bias, + preluActivationWeights: $preluActivationWeights + }; + + const forward: ForwardFunc = (backend) => { + const res = backend.fusedConv2d({ + input: x4D, + filter: $filter, + convInfo, + bias: $bias, + activation, + preluActivationWeights: $preluActivationWeights + }); + return res; + }; + + const attrs: FusedConv2DAttrs = + {strides, pad, dataFormat, dilations, dimRoundingMode, activation}; - const inputsToSave = [$filter, x4D]; - const outputsToSave = [true]; // Save the only output. const res = ENGINE.runKernelFunc( - (backend) => { - const res = backend.fusedConv2d({ - input: x4D, - filter: $filter, - convInfo, - bias: $bias, - activation, - preluActivationWeights: $preluActivationWeights - }); - return res; - }, - inputs, null /* grad */, 'FusedConv2D', {convInfo, activation}, - inputsToSave, outputsToSave); + forward, inputs as {} as NamedTensorMap, null /* grad */, FusedConv2D, + attrs as {} as NamedAttrMap); if (reshapedTo4D) { return res.as3D(res.shape[1], res.shape[2], res.shape[3]) as T; diff --git a/tfjs-core/src/ops/fused_depthwise_conv2d.ts b/tfjs-core/src/ops/fused_depthwise_conv2d.ts index 64180ac8ab0..f253757bf3d 100644 --- a/tfjs-core/src/ops/fused_depthwise_conv2d.ts +++ b/tfjs-core/src/ops/fused_depthwise_conv2d.ts @@ -15,8 +15,11 @@ * ============================================================================= */ -import {ENGINE} from '../engine'; +import {ENGINE, ForwardFunc} from '../engine'; +import {FusedDepthwiseConv2D, FusedDepthwiseConv2DAttrs, FusedDepthwiseConv2DInputs} from '../kernel_names'; +import {NamedAttrMap} from '../kernel_registry'; import {Tensor, Tensor3D, Tensor4D} from '../tensor'; +import {NamedTensorMap} from '../tensor_types'; import {makeTypesMatch} from '../tensor_util'; import {convertToTensor} from '../tensor_util_env'; import {TensorLike} from '../types'; @@ -156,35 +159,31 @@ function fusedDepthwiseConv2d_({ preluActivationWeights, 'prelu weights', 'fused depthwiseConv2d'); } - const inputs: { - x: Tensor, - filter: Tensor, - bias?: Tensor, - preluActivationWeights?: Tensor - } = {x: x4D, filter: $filter}; - if (bias != null) { - inputs.bias = $bias; - } - if (preluActivationWeights != null) { - inputs.preluActivationWeights = $preluActivationWeights; - } + const forward: ForwardFunc = (backend) => { + const res = backend.fusedDepthwiseConv2D({ + input: x4D, + filter: $filter, + convInfo, + bias: $bias, + activation, + preluActivationWeights: $preluActivationWeights + }); + return res; + }; + + const inputs: FusedDepthwiseConv2DInputs = { + x: x4D, + filter: $filter, + 
bias: $bias, + preluActivationWeights: $preluActivationWeights + }; + const attrs: FusedDepthwiseConv2DAttrs = + {strides, pad, dataFormat, dilations, dimRoundingMode, activation}; - const inputsToSave = [$filter, x4D]; - const outputsToSave = [true]; const res = ENGINE.runKernelFunc( - (backend) => { - const res = backend.fusedDepthwiseConv2D({ - input: x4D, - filter: $filter, - convInfo, - bias: $bias, - activation, - preluActivationWeights: $preluActivationWeights - }); - return res; - }, - inputs, null /* grad */, 'FusedDepthwiseConv2D', {convInfo, activation}, - inputsToSave, outputsToSave); + forward, inputs as {} as NamedTensorMap, null /* grad */, + FusedDepthwiseConv2D, attrs as {} as NamedAttrMap); + if (reshapedTo4D) { return res.as3D(res.shape[1], res.shape[2], res.shape[3]) as T; } diff --git a/tfjs-core/src/ops/fused_mat_mul.ts b/tfjs-core/src/ops/fused_mat_mul.ts index 3340bcfa102..a4254ea959b 100644 --- a/tfjs-core/src/ops/fused_mat_mul.ts +++ b/tfjs-core/src/ops/fused_mat_mul.ts @@ -15,8 +15,11 @@ * ============================================================================= */ -import {ENGINE} from '../engine'; +import {ENGINE, ForwardFunc} from '../engine'; +import {_FusedMatMul, _FusedMatMulAttrs, _FusedMatMulInputs} from '../kernel_names'; +import {NamedAttrMap} from '../kernel_registry'; import {Tensor} from '../tensor'; +import {NamedTensorMap} from '../tensor_types'; import {makeTypesMatch} from '../tensor_util'; import {convertToTensor} from '../tensor_util_env'; import {TensorLike} from '../types'; @@ -122,35 +125,31 @@ function fusedMatMul_({ preluActivationWeights, 'prelu weights', 'fused matMul'); } - const inputs: - {a: Tensor, b: Tensor, - bias?: Tensor, - preluActivationWeights?: Tensor} = {a: a3D, b: b3D}; - if (bias != null) { - inputs.bias = $bias; - } - if (preluActivationWeights != null) { - inputs.preluActivationWeights = $preluActivationWeights; - } - - const inputsToSave = [a3D, b3D]; - const outputsToSave = [true]; + const forward: ForwardFunc = (backend) => { + const y = backend.fusedBatchMatMul({ + a: a3D, + b: b3D, + transposeA, + transposeB, + bias: $bias, + activation, + preluActivationWeights: $preluActivationWeights + }); + return y; + }; + + const inputs: _FusedMatMulInputs = { + a: a3D, + b: b3D, + bias: $bias, + preluActivationWeights: $preluActivationWeights + }; + const attrs: _FusedMatMulAttrs = {transposeA, transposeB, activation}; const res = ENGINE.runKernelFunc( - (backend) => { - const y = backend.fusedBatchMatMul({ - a: a3D, - b: b3D, - transposeA, - transposeB, - bias: $bias, - activation, - preluActivationWeights: $preluActivationWeights - }); - return y; - }, - inputs, null /* grad */, '_FusedMatMul', - {transposeA, transposeB, activation}, inputsToSave, outputsToSave); + forward, inputs as {} as NamedTensorMap, null /* grad */, _FusedMatMul, + attrs as {} as NamedAttrMap); + return res.reshape(outShape); } From d541877eaf45411b2b51a3401174ee104fe37f6e Mon Sep 17 00:00:00 2001 From: Yannick Assogba Date: Thu, 16 Jul 2020 22:59:14 -0400 Subject: [PATCH 06/12] restore gradients to fused ops with customGrad --- tfjs-core/src/ops/fused_conv2d.ts | 94 ++++- tfjs-core/src/ops/fused_conv2d_test.ts | 390 +++++++++--------- tfjs-core/src/ops/fused_depthwise_conv2d.ts | 81 +++- .../src/ops/fused_depthwise_conv2d_test.ts | 231 +++++------ tfjs-core/src/ops/fused_mat_mul.ts | 82 +++- tfjs-core/src/ops/fused_mat_mul_test.ts | 285 +++++++------ tfjs-core/src/ops/fused_util.ts | 6 + 7 files changed, 691 insertions(+), 478 deletions(-) diff 
--git a/tfjs-core/src/ops/fused_conv2d.ts b/tfjs-core/src/ops/fused_conv2d.ts index e9c35c0a87f..00317430a20 100644 --- a/tfjs-core/src/ops/fused_conv2d.ts +++ b/tfjs-core/src/ops/fused_conv2d.ts @@ -16,18 +16,24 @@ */ import {ENGINE, ForwardFunc} from '../engine'; +import {customGrad} from '../gradients'; import {FusedConv2D, FusedConv2DAttrs, FusedConv2DInputs} from '../kernel_names'; import {NamedAttrMap} from '../kernel_registry'; +import {conv2DBackpropFilter} from '../ops/conv2d_backprop_filter'; +import {conv2DBackpropInput} from '../ops/conv2d_backprop_input'; import {Tensor, Tensor3D, Tensor4D} from '../tensor'; -import {NamedTensorMap} from '../tensor_types'; +import {GradSaveFunc, NamedTensorMap} from '../tensor_types'; import {makeTypesMatch} from '../tensor_util'; import {convertToTensor} from '../tensor_util_env'; import {TensorLike} from '../types'; import * as util from '../util'; +import {add} from './add'; import * as broadcast_util from './broadcast_util'; +import {conv2d as unfusedConv2d} from './conv2d'; import * as conv_util from './conv_util'; import {Activation} from './fused_types'; +import {applyActivation, getFusedBiasGradient, getFusedDyActivation, shouldFuse} from './fused_util'; import {op} from './operation'; /** @@ -167,6 +173,16 @@ function fusedConv2d_({ }): T { activation = activation || 'linear'; + if (shouldFuse(ENGINE.state.gradientDepth, activation) === false) { + let result = unfusedConv2d( + x, filter, strides, pad, dataFormat, dilations, dimRoundingMode); + if (bias != null) { + result = add(result, bias); + } + + return applyActivation(result, activation, preluActivationWeights) as T; + } + const $x = convertToTensor(x, 'x', 'conv2d'); const $filter = convertToTensor(filter, 'filter', 'conv2d'); @@ -222,11 +238,29 @@ function fusedConv2d_({ preluActivationWeights, 'prelu weights', 'fused conv2d'); } - const inputs: FusedConv2DInputs = { - x: x4D, - filter: $filter, - bias: $bias, - preluActivationWeights: $preluActivationWeights + const grad = (dy: Tensor4D, saved: Tensor[]) => { + const [$filter, x4D, y, $bias] = + saved as [Tensor4D, Tensor4D, Tensor4D, Tensor]; + + const dyActivation = getFusedDyActivation(dy, y, activation) as Tensor4D; + + util.assert( + conv_util.tupleValuesAreOne(dilations), + () => 'Error in gradient of fused conv2D: ' + + `dilation rates greater than 1 ` + + `are not yet supported in gradients. Got dilations '${dilations}'`); + + const xDer = + conv2DBackpropInput(x4D.shape, dyActivation, $filter, strides, pad); + const filterDer = + conv2DBackpropFilter(x4D, dyActivation, $filter.shape, strides, pad); + const der: Tensor[] = [xDer, filterDer]; + + if ($bias != null) { + const biasDer = getFusedBiasGradient($bias, dyActivation); + der.push(biasDer); + } + return der; }; const forward: ForwardFunc = (backend) => { @@ -241,17 +275,51 @@ function fusedConv2d_({ return res; }; + const inputs: FusedConv2DInputs = { + x: x4D, + filter: $filter, + bias: $bias, + preluActivationWeights: $preluActivationWeights + }; + const attrs: FusedConv2DAttrs = {strides, pad, dataFormat, dilations, dimRoundingMode, activation}; - const res = ENGINE.runKernelFunc( - forward, inputs as {} as NamedTensorMap, null /* grad */, FusedConv2D, - attrs as {} as NamedAttrMap); + // Depending on the the params passed in we will have different number of + // inputs and thus a a different number of elements in the gradient. 
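+  // With no bias we wrap (x4D, $filter) in customGrad and gradFunc returns
+  // [dx, dfilter]; with a bias we also pass $bias through so gradFunc can
+  // append the bias gradient computed by getFusedBiasGradient.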
+ if (bias == null) { + const customOp = + customGrad((x4D: Tensor4D, filter: Tensor4D, save: GradSaveFunc) => { + let res = ENGINE.runKernelFunc( + forward, inputs as {} as NamedTensorMap, null /* grad */, + FusedConv2D, attrs as {} as NamedAttrMap); - if (reshapedTo4D) { - return res.as3D(res.shape[1], res.shape[2], res.shape[3]) as T; - } + save([filter, x4D, res]); - return res as T; + if (reshapedTo4D) { + res = res.as3D(res.shape[1], res.shape[2], res.shape[3]) as T; + } + + return {value: res, gradFunc: grad}; + }); + return customOp(x4D, $filter) as T; + } else { + const customOpWithBias = customGrad( + (x4D: Tensor4D, filter: Tensor4D, bias: Tensor, save: GradSaveFunc) => { + let res = ENGINE.runKernelFunc( + forward, inputs as {} as NamedTensorMap, null /* grad */, + FusedConv2D, attrs as {} as NamedAttrMap); + + save([filter, x4D, res, bias]); + + if (reshapedTo4D) { + res = res.as3D(res.shape[1], res.shape[2], res.shape[3]) as T; + } + + return {value: res, gradFunc: grad}; + }); + + return customOpWithBias(x4D, $filter, $bias) as T; + } } export const conv2d = op({fusedConv2d_}); diff --git a/tfjs-core/src/ops/fused_conv2d_test.ts b/tfjs-core/src/ops/fused_conv2d_test.ts index 1f2b34cc84d..fa20cd0d87d 100644 --- a/tfjs-core/src/ops/fused_conv2d_test.ts +++ b/tfjs-core/src/ops/fused_conv2d_test.ts @@ -656,209 +656,189 @@ describeWithFlags('fused conv2d', ALL_ENVS, () => { await result.data(), [15, 10, 15, 55, 30, 55, 0, 0, 0, 0, 0, 0]); }); - // it('backProp input x=[2,3,3,1] f=[2,2,1,1] s=1 p=0', async () => { - // const inputDepth = 1; - // const outputDepth = 1; - // const inputShape: [number, number, number, number] = [2, 3, 3, - // inputDepth]; const filterSize = 2; const strides = 1; const pad = 0; - - // const filterShape: [number, number, number, number] = - // [filterSize, filterSize, inputDepth, outputDepth]; - // const filter = tf.tensor4d([-1, 1, -2, 0.5], filterShape); - - // const x = tf.tensor4d( - // [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9], inputShape); - // const dy = tf.tensor4d([3, 1, 2, 0, 3, 1, 2, 0], [2, 2, 2, 1]); - - // const grads = tf.grads( - // (x: tf.Tensor4D) => tf.fused.conv2d({x, filter, strides, pad})); - // const [dx] = grads([x], dy); - - // expect(dx.shape).toEqual(x.shape); - // expectArraysClose( - // await dx.data(), - // [-3, 2, 1, -8, 1.5, 0.5, -4, 1, 0, -3, 2, 1, -8, 1.5, 0.5, -4, 1, - // 0]); - // }); - - // it('gradient x=[2,3,3,1] f=[2,2,1,1] s=1 p=0', async () => { - // const inputDepth = 1; - // const outputDepth = 1; - // const inputShape: [number, number, number, number] = [2, 3, 3, - // inputDepth]; const filterSize = 2; const strides = 1; const pad = 0; - - // const filterShape: [number, number, number, number] = - // [filterSize, filterSize, inputDepth, outputDepth]; - // const filter = tf.tensor4d([-1, 1, -2, 0.5], filterShape); - - // const x = tf.tensor4d( - // [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9], inputShape); - // const dy = tf.tensor4d([3, 1, 2, 0, 3, 1, 2, 0], [2, 2, 2, 1]); - - // const grads = tf.grads( - // (x: tf.Tensor4D, filter: tf.Tensor4D) => - // tf.fused.conv2d({x, filter, strides, pad})); - // const [dx, dfilter] = grads([x, filter], dy); - - // expect(dx.shape).toEqual(x.shape); - // expectArraysClose( - // await dx.data(), - // [-3, 2, 1, -8, 1.5, 0.5, -4, 1, 0, -3, 2, 1, -8, 1.5, 0.5, -4, 1, - // 0]); - - // expect(dfilter.shape).toEqual(filterShape); - // expectArraysClose(await dfilter.data(), [26, 38, 62, 74]); - // }); - - // it('gradient x=[2,3,3,1] f=[2,2,1,1] s=1 p=0 
with bias', async () => { - // const inputDepth = 1; - // const outputDepth = 1; - // const inputShape: [number, number, number, number] = [2, 3, 3, - // inputDepth]; const filterSize = 2; const strides = 1; const pad = 0; - - // const filterShape: [number, number, number, number] = - // [filterSize, filterSize, inputDepth, outputDepth]; - // const filter = tf.tensor4d([-1, 1, -2, 0.5], filterShape); - // const bias = tf.ones([2, 2, 2, 1]); - - // const x = tf.tensor4d( - // [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9], inputShape); - // const dy = tf.tensor4d([3, 1, 2, 0, 3, 1, 2, 0], [2, 2, 2, 1]); - - // const fusedGrads = - // tf.grads((x: tf.Tensor4D, w: tf.Tensor4D, b) => tf.fused.conv2d({ - // x, - // filter: w, - // strides, - // pad, - // dataFormat: 'NHWC', - // dilations: [1, 1], - // bias: b - // })); - // const [dxFused, dfilterFused, dbiasFused] = - // fusedGrads([x, filter, bias], dy); - - // const grads = tf.grads((x: tf.Tensor4D, filter: tf.Tensor4D, bias) => { - // const conv = tf.conv2d(x, filter, strides, pad); - // const sum = tf.add(conv, bias); - // return sum; - // }); - // const [dx, dfilter, dbias] = grads([x, filter, bias], dy); - - // expectArraysClose(await dxFused.array(), await dx.array()); - // expectArraysClose(await dfilterFused.array(), await dfilter.array()); - // expectArraysClose(await dbiasFused.array(), await dbias.array()); - // }); - - // it('gradient x=[2,3,3,1] f=[2,2,1,1] s=1 p=0 with bias and relu', - // async () => { - // const inputDepth = 1; - // const outputDepth = 1; - // const inputShape: [number, number, number, number] = - // [2, 3, 3, inputDepth]; - // const filterSize = 2; - // const strides = 1; - // const pad = 0; - - // const filterShape: [number, number, number, number] = - // [filterSize, filterSize, inputDepth, outputDepth]; - // const filter = tf.tensor4d([-1, 1, -2, 0.5], filterShape); - // const bias = tf.ones([2, 2, 2, 1]); - - // const x = tf.tensor4d( - // [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9], - // inputShape); - // const dy = tf.tensor4d([3, 1, 2, 0, 3, 1, 2, 0], [2, 2, 2, 1]); - - // const fusedGrads = - // tf.grads((x: tf.Tensor4D, w: tf.Tensor4D, b) => tf.fused.conv2d({ - // x, - // filter: w, - // strides, - // pad, - // dataFormat: 'NHWC', - // dilations: [1, 1], - // bias: b, - // activation: 'relu' - // })); - // const [dxFused, dfilterFused, dbiasFused] = - // fusedGrads([x, filter, bias], dy); - - // const grads = tf.grads((x: tf.Tensor4D, filter: tf.Tensor4D, bias) - // => { - // const conv = tf.conv2d(x, filter, strides, pad); - // const sum = tf.add(conv, bias); - // return tf.relu(sum); - // }); - // const [dx, dfilter, dbias] = grads([x, filter, bias], dy); - - // expectArraysClose(await dxFused.array(), await dx.array()); - // expectArraysClose(await dfilterFused.array(), await dfilter.array()); - // expectArraysClose(await dbiasFused.array(), await dbias.array()); - // }); - - // it('gradient x=[2,3,3,1] f=[2,2,1,1] s=1 p=0 with bias and elu', async () - // => { - // const inputDepth = 1; - // const outputDepth = 1; - // const inputShape: [number, number, number, number] = [2, 3, 3, - // inputDepth]; const filterSize = 2; const strides = 1; const pad = 0; - - // const filterShape: [number, number, number, number] = - // [filterSize, filterSize, inputDepth, outputDepth]; - // const filter = tf.tensor4d([-1, 1, -2, 0.5], filterShape); - // const bias = tf.ones([2, 2, 2, 1]); - - // const x = tf.tensor4d( - // [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9], inputShape); - 
// const dy = tf.tensor4d([3, 1, 2, 0, 3, 1, 2, 0], [2, 2, 2, 1]); - - // const fusedGrads = - // tf.grads((x: tf.Tensor4D, w: tf.Tensor4D, b) => tf.fused.conv2d({ - // x, - // filter: w, - // strides, - // pad, - // dataFormat: 'NHWC', - // dilations: [1, 1], - // bias: b, - // activation: 'elu' - // })); - // const [dxFused, dfilterFused, dbiasFused] = - // fusedGrads([x, filter, bias], dy); - - // const grads = tf.grads((x: tf.Tensor4D, filter: tf.Tensor4D, bias) => { - // const conv = tf.conv2d(x, filter, strides, pad); - // const sum = tf.add(conv, bias); - // return tf.elu(sum); - // }); - // const [dx, dfilter, dbias] = grads([x, filter, bias], dy); - - // expectArraysClose(await dxFused.array(), await dx.array()); - // expectArraysClose(await dfilterFused.array(), await dfilter.array()); - // expectArraysClose(await dbiasFused.array(), await dbias.array()); - // }); - - // it('fused matmul with relu6 and gradients', async () => { - // const a = tf.tensor2d([1, 2, 3, 10, 20, -30], [2, 3]); - // const b = tf.tensor2d([2, 3, 4, -1, 2, 3], [3, 2]); - // const dy = tf.tensor2d([1, 10, 20, 30], [2, 2]); - // const transposeA = false; - // const transposeB = false; - - // const fusedGrads = tf.grads((a, b) => { - // return tf.fused.matMul( - // {a, b, transposeA, transposeB, bias: null, activation: 'relu6'}); - // }); - // const [fusedDa, fusedDb] = fusedGrads([a, b], dy); - - // const grads = tf.grads((a, b) => { - // const prod = tf.matMul(a, b, transposeA, transposeB); - // return tf.relu6(prod); - // }); - // const [da, db] = grads([a, b], dy); - - // expectArraysClose(await da.array(), await fusedDa.array()); - // expectArraysClose(await db.data(), await fusedDb.array()); - // }); + it('backProp input x=[2,3,3,1] f=[2,2,1,1] s=1 p=0', async () => { + const inputDepth = 1; + const outputDepth = 1; + const inputShape: [number, number, number, number] = [2, 3, 3, inputDepth]; + const filterSize = 2; + const strides = 1; + const pad = 0; + + const filterShape: [number, number, number, number] = + [filterSize, filterSize, inputDepth, outputDepth]; + const filter = tf.tensor4d([-1, 1, -2, 0.5], filterShape); + + const x = tf.tensor4d( + [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9], inputShape); + const dy = tf.tensor4d([3, 1, 2, 0, 3, 1, 2, 0], [2, 2, 2, 1]); + + const grads = tf.grads( + (x: tf.Tensor4D) => tf.fused.conv2d({x, filter, strides, pad})); + const [dx] = grads([x], dy); + + expect(dx.shape).toEqual(x.shape); + expectArraysClose( + await dx.data(), + [-3, 2, 1, -8, 1.5, 0.5, -4, 1, 0, -3, 2, 1, -8, 1.5, 0.5, -4, 1, 0]); + }); + + it('gradient x=[2,3,3,1] f=[2,2,1,1] s=1 p=0', async () => { + const inputDepth = 1; + const outputDepth = 1; + const inputShape: [number, number, number, number] = [2, 3, 3, inputDepth]; + const filterSize = 2; + const strides = 1; + const pad = 0; + + const filterShape: [number, number, number, number] = + [filterSize, filterSize, inputDepth, outputDepth]; + const filter = tf.tensor4d([-1, 1, -2, 0.5], filterShape); + + const x = tf.tensor4d( + [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9], inputShape); + const dy = tf.tensor4d([3, 1, 2, 0, 3, 1, 2, 0], [2, 2, 2, 1]); + + const grads = tf.grads( + (x: tf.Tensor4D, filter: tf.Tensor4D) => + tf.fused.conv2d({x, filter, strides, pad})); + const [dx, dfilter] = grads([x, filter], dy); + + expect(dx.shape).toEqual(x.shape); + expectArraysClose( + await dx.data(), + [-3, 2, 1, -8, 1.5, 0.5, -4, 1, 0, -3, 2, 1, -8, 1.5, 0.5, -4, 1, 0]); + + expect(dfilter.shape).toEqual(filterShape); + 
expectArraysClose(await dfilter.data(), [26, 38, 62, 74]); + }); + + it('gradient x=[2,3,3,1] f=[2,2,1,1] s=1 p=0 with bias', async () => { + const inputDepth = 1; + const outputDepth = 1; + const inputShape: [number, number, number, number] = [2, 3, 3, inputDepth]; + const filterSize = 2; + const strides = 1; + const pad = 0; + + const filterShape: [number, number, number, number] = + [filterSize, filterSize, inputDepth, outputDepth]; + const filter = tf.tensor4d([-1, 1, -2, 0.5], filterShape); + const bias = tf.ones([2, 2, 2, 1]); + + const x = tf.tensor4d( + [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9], inputShape); + const dy = tf.tensor4d([3, 1, 2, 0, 3, 1, 2, 0], [2, 2, 2, 1]); + + const fusedGrads = + tf.grads((x: tf.Tensor4D, w: tf.Tensor4D, b) => tf.fused.conv2d({ + x, + filter: w, + strides, + pad, + dataFormat: 'NHWC', + dilations: [1, 1], + bias: b + })); + const [dxFused, dfilterFused, dbiasFused] = + fusedGrads([x, filter, bias], dy); + + const grads = tf.grads((x: tf.Tensor4D, filter: tf.Tensor4D, bias) => { + const conv = tf.conv2d(x, filter, strides, pad); + const sum = tf.add(conv, bias); + return sum; + }); + const [dx, dfilter, dbias] = grads([x, filter, bias], dy); + + expectArraysClose(await dxFused.array(), await dx.array()); + expectArraysClose(await dfilterFused.array(), await dfilter.array()); + expectArraysClose(await dbiasFused.array(), await dbias.array()); + }); + + it('gradient x=[2,3,3,1] f=[2,2,1,1] s=1 p=0 with bias and relu', + async () => { + const inputDepth = 1; + const outputDepth = 1; + const inputShape: [number, number, number, number] = + [2, 3, 3, inputDepth]; + const filterSize = 2; + const strides = 1; + const pad = 0; + + const filterShape: [number, number, number, number] = + [filterSize, filterSize, inputDepth, outputDepth]; + const filter = tf.tensor4d([-1, 1, -2, 0.5], filterShape); + const bias = tf.ones([2, 2, 2, 1]); + + const x = tf.tensor4d( + [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9], inputShape); + const dy = tf.tensor4d([3, 1, 2, 0, 3, 1, 2, 0], [2, 2, 2, 1]); + + const fusedGrads = + tf.grads((x: tf.Tensor4D, w: tf.Tensor4D, b) => tf.fused.conv2d({ + x, + filter: w, + strides, + pad, + dataFormat: 'NHWC', + dilations: [1, 1], + bias: b, + activation: 'relu' + })); + const [dxFused, dfilterFused, dbiasFused] = + fusedGrads([x, filter, bias], dy); + + const grads = tf.grads((x: tf.Tensor4D, filter: tf.Tensor4D, bias) => { + const conv = tf.conv2d(x, filter, strides, pad); + const sum = tf.add(conv, bias); + return tf.relu(sum); + }); + const [dx, dfilter, dbias] = grads([x, filter, bias], dy); + + expectArraysClose(await dxFused.array(), await dx.array()); + expectArraysClose(await dfilterFused.array(), await dfilter.array()); + expectArraysClose(await dbiasFused.array(), await dbias.array()); + }); + + it('gradient x=[2,3,3,1] f=[2,2,1,1] s=1 p=0 with bias and elu', async () => { + const inputDepth = 1; + const outputDepth = 1; + const inputShape: [number, number, number, number] = [2, 3, 3, inputDepth]; + const filterSize = 2; + const strides = 1; + const pad = 0; + + const filterShape: [number, number, number, number] = + [filterSize, filterSize, inputDepth, outputDepth]; + const filter = tf.tensor4d([-1, 1, -2, 0.5], filterShape); + const bias = tf.ones([2, 2, 2, 1]); + + const x = tf.tensor4d( + [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9], inputShape); + const dy = tf.tensor4d([3, 1, 2, 0, 3, 1, 2, 0], [2, 2, 2, 1]); + + const fusedGrads = + tf.grads((x: tf.Tensor4D, w: tf.Tensor4D, b) => 
tf.fused.conv2d({ + x, + filter: w, + strides, + pad, + dataFormat: 'NHWC', + dilations: [1, 1], + bias: b, + activation: 'elu' + })); + const [dxFused, dfilterFused, dbiasFused] = + fusedGrads([x, filter, bias], dy); + + const grads = tf.grads((x: tf.Tensor4D, filter: tf.Tensor4D, bias) => { + const conv = tf.conv2d(x, filter, strides, pad); + const sum = tf.add(conv, bias); + return tf.elu(sum); + }); + const [dx, dfilter, dbias] = grads([x, filter, bias], dy); + + expectArraysClose(await dxFused.array(), await dx.array()); + expectArraysClose(await dfilterFused.array(), await dfilter.array()); + expectArraysClose(await dbiasFused.array(), await dbias.array()); + }); }); diff --git a/tfjs-core/src/ops/fused_depthwise_conv2d.ts b/tfjs-core/src/ops/fused_depthwise_conv2d.ts index f253757bf3d..8b6f9f49b29 100644 --- a/tfjs-core/src/ops/fused_depthwise_conv2d.ts +++ b/tfjs-core/src/ops/fused_depthwise_conv2d.ts @@ -16,18 +16,24 @@ */ import {ENGINE, ForwardFunc} from '../engine'; +import {customGrad} from '../gradients'; import {FusedDepthwiseConv2D, FusedDepthwiseConv2DAttrs, FusedDepthwiseConv2DInputs} from '../kernel_names'; import {NamedAttrMap} from '../kernel_registry'; import {Tensor, Tensor3D, Tensor4D} from '../tensor'; -import {NamedTensorMap} from '../tensor_types'; +import {GradSaveFunc, NamedTensorMap} from '../tensor_types'; import {makeTypesMatch} from '../tensor_util'; import {convertToTensor} from '../tensor_util_env'; import {TensorLike} from '../types'; import * as util from '../util'; +import {add} from './add'; import * as broadcast_util from './broadcast_util'; import * as conv_util from './conv_util'; +import {depthwiseConv2d as unfusedDepthwiseConv2d} from './depthwise_conv2d'; +import {depthwiseConv2dNativeBackpropFilter} from './depthwise_conv2d_native_backprop_filter'; +import {depthwiseConv2dNativeBackpropInput} from './depthwise_conv2d_native_backprop_input'; import {Activation} from './fused_types'; +import {applyActivation, getFusedBiasGradient, getFusedDyActivation, shouldFuse} from './fused_util'; import {op} from './operation'; /** @@ -103,6 +109,16 @@ function fusedDepthwiseConv2d_({ activation?: Activation, preluActivationWeights?: Tensor }): T { + if (shouldFuse(ENGINE.state.gradientDepth, activation) === false) { + let result = unfusedDepthwiseConv2d( + x, filter, strides, pad, dataFormat, dilations, dimRoundingMode); + if (bias != null) { + result = add(result, bias); + } + + return applyActivation(result, activation, preluActivationWeights) as T; + } + const $x = convertToTensor(x, 'x', 'depthwiseConv2d'); const $filter = convertToTensor(filter, 'filter', 'depthwiseConv2d'); @@ -159,6 +175,29 @@ function fusedDepthwiseConv2d_({ preluActivationWeights, 'prelu weights', 'fused depthwiseConv2d'); } + const grad = (dy: Tensor4D, saved: Tensor[]) => { + util.assert( + conv_util.tupleValuesAreOne(dilations), + () => 'Error in gradient of fused depthwiseConv2d: dilation rates ' + + `greater than 1 are not yet supported. 
Got dilations ` + + `'${dilations}'`); + const [$filter, x4D, y, bias] = saved; + + const dyActivation = getFusedDyActivation(dy, y, activation) as Tensor4D; + + const xDer = depthwiseConv2dNativeBackpropInput( + (x4D as Tensor4D).shape, dyActivation, $filter as Tensor4D, convInfo); + const filterDer = depthwiseConv2dNativeBackpropFilter( + x4D as Tensor4D, dyActivation, ($filter as Tensor4D).shape, convInfo); + + if (bias != null) { + const biasDer = getFusedBiasGradient($bias, dyActivation); + return [xDer, filterDer, biasDer]; + } else { + return [xDer, filterDer]; + } + }; + const forward: ForwardFunc = (backend) => { const res = backend.fusedDepthwiseConv2D({ input: x4D, @@ -180,13 +219,41 @@ function fusedDepthwiseConv2d_({ const attrs: FusedDepthwiseConv2DAttrs = {strides, pad, dataFormat, dilations, dimRoundingMode, activation}; - const res = ENGINE.runKernelFunc( - forward, inputs as {} as NamedTensorMap, null /* grad */, - FusedDepthwiseConv2D, attrs as {} as NamedAttrMap); + // Depending on the the params passed in we will have different number of + // inputs and thus a a different number of elements in the gradient. + if (bias == null) { + const customOp = + customGrad((x4D: Tensor4D, filter: Tensor4D, save: GradSaveFunc) => { + let res = ENGINE.runKernelFunc( + forward, inputs as {} as NamedTensorMap, null /* grad */, + FusedDepthwiseConv2D, attrs as {} as NamedAttrMap); + + save([filter, x4D, res]); + + if (reshapedTo4D) { + res = res.as3D(res.shape[1], res.shape[2], res.shape[3]) as T; + } + + return {value: res, gradFunc: grad}; + }); + return customOp(x4D, $filter) as T; + } else { + const customOpWithBias = customGrad( + (x4D: Tensor4D, filter: Tensor4D, bias: Tensor, save: GradSaveFunc) => { + let res = ENGINE.runKernelFunc( + forward, inputs as {} as NamedTensorMap, null /* grad */, + FusedDepthwiseConv2D, attrs as {} as NamedAttrMap); + + save([filter, x4D, res, bias]); + + if (reshapedTo4D) { + res = res.as3D(res.shape[1], res.shape[2], res.shape[3]) as T; + } + + return {value: res, gradFunc: grad}; + }); - if (reshapedTo4D) { - return res.as3D(res.shape[1], res.shape[2], res.shape[3]) as T; + return customOpWithBias(x4D, $filter, $bias) as T; } - return res as T; } export const depthwiseConv2d = op({fusedDepthwiseConv2d_}); diff --git a/tfjs-core/src/ops/fused_depthwise_conv2d_test.ts b/tfjs-core/src/ops/fused_depthwise_conv2d_test.ts index 49e318a7844..ada8531b9c8 100644 --- a/tfjs-core/src/ops/fused_depthwise_conv2d_test.ts +++ b/tfjs-core/src/ops/fused_depthwise_conv2d_test.ts @@ -135,119 +135,120 @@ describeWithFlags('fused depthwiseConv2D', ALL_ENVS, () => { expectArraysClose(await result.data(), expected); }); - // it('gradient x=[2,3,3,1] f=[2,2,1,1] s=1 p=0', async () => { - // const inputDepth = 1; - // const outputDepth = 1; - // const inputShape: [number, number, number, number] = [2, 3, 3, - // inputDepth]; const filterSize = 2; const strides = 1; const pad = 0; - - // const filterShape: [number, number, number, number] = - // [filterSize, filterSize, inputDepth, outputDepth]; - // const filter = tf.tensor4d([-1, 1, -2, 0.5], filterShape); - - // const x = tf.tensor4d( - // [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9], inputShape); - // const dy = tf.tensor4d([3, 1, 2, 0, 3, 1, 2, 0], [2, 2, 2, 1]); - - // const grads = tf.grads( - // (x: tf.Tensor4D, filter: tf.Tensor4D) => - // tf.fused.depthwiseConv2d({x, filter, strides, pad})); - // const [dx, dfilter] = grads([x, filter], dy); - - // expect(dx.shape).toEqual(x.shape); - // 
expectArraysClose( - // await dx.data(), - // [-3, 2, 1, -8, 1.5, 0.5, -4, 1, 0, -3, 2, 1, -8, 1.5, 0.5, -4, 1, - // 0]); - - // expect(dfilter.shape).toEqual(filterShape); - // expectArraysClose(await dfilter.data(), [26, 38, 62, 74]); - // }); - - // it('gradient x=[2,3,3,1] f=[2,2,1,1] s=1 p=0 with bias', async () => { - // const inputDepth = 1; - // const outputDepth = 1; - // const inputShape: [number, number, number, number] = [2, 3, 3, - // inputDepth]; const filterSize = 2; const strides = 1; const pad = 0; - - // const filterShape: [number, number, number, number] = - // [filterSize, filterSize, inputDepth, outputDepth]; - // const filter = tf.tensor4d([-1, 1, -2, 0.5], filterShape); - // const bias = tf.ones([2, 2, 2, 1]); - - // const x = tf.tensor4d( - // [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9], inputShape); - // const dy = tf.tensor4d([3, 1, 2, 0, 3, 1, 2, 0], [2, 2, 2, 1]); - - // const fusedGrads = tf.grads( - // (x: tf.Tensor4D, w: tf.Tensor4D, b) => tf.fused.depthwiseConv2d({ - // x, - // filter: w, - // strides, - // pad, - // dataFormat: 'NHWC', - // dilations: [1, 1], - // bias: b - // })); - // const [dxFused, dfilterFused, dbiasFused] = - // fusedGrads([x, filter, bias], dy); - - // const grads = tf.grads((x: tf.Tensor4D, filter: tf.Tensor4D, bias) => { - // const conv = tf.depthwiseConv2d(x, filter, strides, pad); - // const sum = tf.add(conv, bias); - // return sum; - // }); - // const [dx, dfilter, dbias] = grads([x, filter, bias], dy); - - // expectArraysClose(await dxFused.array(), await dx.array()); - // expectArraysClose(await dfilterFused.array(), await dfilter.array()); - // expectArraysClose(await dbiasFused.array(), await dbias.array()); - // }); - - // it('gradient x=[2,3,3,1] f=[2,2,1,1] s=1 p=0 with bias and activation', - // async () => { - // const inputDepth = 1; - // const outputDepth = 1; - // const inputShape: [number, number, number, number] = - // [2, 3, 3, inputDepth]; - // const filterSize = 2; - // const strides = 1; - // const pad = 0; - - // const filterShape: [number, number, number, number] = - // [filterSize, filterSize, inputDepth, outputDepth]; - // const filter = tf.tensor4d([-1, 1, -2, 0.5], filterShape); - // const bias = tf.ones([2, 2, 2, 1]); - - // const x = tf.tensor4d( - // [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9], - // inputShape); - // const dy = tf.tensor4d([3, 1, 2, 0, 3, 1, 2, 0], [2, 2, 2, 1]); - - // const fusedGrads = tf.grads( - // (x: tf.Tensor4D, w: tf.Tensor4D, b) => tf.fused.depthwiseConv2d({ - // x, - // filter: w, - // strides, - // pad, - // dataFormat: 'NHWC', - // dilations: [1, 1], - // bias: b, - // activation: 'relu' - // })); - // const [dxFused, dfilterFused, dbiasFused] = - // fusedGrads([x, filter, bias], dy); - - // const grads = tf.grads((x: tf.Tensor4D, filter: tf.Tensor4D, bias) - // => { - // const conv = tf.depthwiseConv2d(x, filter, strides, pad); - // const sum = tf.add(conv, bias); - // return tf.relu(sum); - // }); - // const [dx, dfilter, dbias] = grads([x, filter, bias], dy); - - // expectArraysClose(await dxFused.array(), await dx.array()); - // expectArraysClose(await dfilterFused.array(), await dfilter.array()); - // expectArraysClose(await dbiasFused.array(), await dbias.array()); - // }); + it('gradient x=[2,3,3,1] f=[2,2,1,1] s=1 p=0', async () => { + const inputDepth = 1; + const outputDepth = 1; + const inputShape: [number, number, number, number] = [2, 3, 3, inputDepth]; + const filterSize = 2; + const strides = 1; + const pad = 0; + + const filterShape: 
[number, number, number, number] = + [filterSize, filterSize, inputDepth, outputDepth]; + const filter = tf.tensor4d([-1, 1, -2, 0.5], filterShape); + + const x = tf.tensor4d( + [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9], inputShape); + const dy = tf.tensor4d([3, 1, 2, 0, 3, 1, 2, 0], [2, 2, 2, 1]); + + const grads = tf.grads( + (x: tf.Tensor4D, filter: tf.Tensor4D) => + tf.fused.depthwiseConv2d({x, filter, strides, pad})); + const [dx, dfilter] = grads([x, filter], dy); + + expect(dx.shape).toEqual(x.shape); + expectArraysClose( + await dx.data(), + [-3, 2, 1, -8, 1.5, 0.5, -4, 1, 0, -3, 2, 1, -8, 1.5, 0.5, -4, 1, 0]); + + expect(dfilter.shape).toEqual(filterShape); + expectArraysClose(await dfilter.data(), [26, 38, 62, 74]); + }); + + it('gradient x=[2,3,3,1] f=[2,2,1,1] s=1 p=0 with bias', async () => { + const inputDepth = 1; + const outputDepth = 1; + const inputShape: [number, number, number, number] = [2, 3, 3, inputDepth]; + const filterSize = 2; + const strides = 1; + const pad = 0; + + const filterShape: [number, number, number, number] = + [filterSize, filterSize, inputDepth, outputDepth]; + const filter = tf.tensor4d([-1, 1, -2, 0.5], filterShape); + const bias = tf.ones([2, 2, 2, 1]); + + const x = tf.tensor4d( + [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9], inputShape); + const dy = tf.tensor4d([3, 1, 2, 0, 3, 1, 2, 0], [2, 2, 2, 1]); + + const fusedGrads = tf.grads( + (x: tf.Tensor4D, w: tf.Tensor4D, b) => tf.fused.depthwiseConv2d({ + x, + filter: w, + strides, + pad, + dataFormat: 'NHWC', + dilations: [1, 1], + bias: b + })); + const [dxFused, dfilterFused, dbiasFused] = + fusedGrads([x, filter, bias], dy); + + const grads = tf.grads((x: tf.Tensor4D, filter: tf.Tensor4D, bias) => { + const conv = tf.depthwiseConv2d(x, filter, strides, pad); + const sum = tf.add(conv, bias); + return sum; + }); + const [dx, dfilter, dbias] = grads([x, filter, bias], dy); + + expectArraysClose(await dxFused.array(), await dx.array()); + expectArraysClose(await dfilterFused.array(), await dfilter.array()); + expectArraysClose(await dbiasFused.array(), await dbias.array()); + }); + + it('gradient x=[2,3,3,1] f=[2,2,1,1] s=1 p=0 with bias and activation', + async () => { + const inputDepth = 1; + const outputDepth = 1; + const inputShape: [number, number, number, number] = + [2, 3, 3, inputDepth]; + const filterSize = 2; + const strides = 1; + const pad = 0; + + const filterShape: [number, number, number, number] = + [filterSize, filterSize, inputDepth, outputDepth]; + const filter = tf.tensor4d([-1, 1, -2, 0.5], filterShape); + const bias = tf.ones([2, 2, 2, 1]); + + const x = tf.tensor4d( + [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9], inputShape); + const dy = tf.tensor4d([3, 1, 2, 0, 3, 1, 2, 0], [2, 2, 2, 1]); + + const fusedGrads = tf.grads( + (x: tf.Tensor4D, w: tf.Tensor4D, b) => tf.fused.depthwiseConv2d({ + x, + filter: w, + strides, + pad, + dataFormat: 'NHWC', + dilations: [1, 1], + bias: b, + activation: 'relu' + })); + const [dxFused, dfilterFused, dbiasFused] = + fusedGrads([x, filter, bias], dy); + + const grads = tf.grads((x: tf.Tensor4D, filter: tf.Tensor4D, bias) => { + const conv = tf.depthwiseConv2d(x, filter, strides, pad); + const sum = tf.add(conv, bias); + return tf.relu(sum); + }); + const [dx, dfilter, dbias] = grads([x, filter, bias], dy); + + expectArraysClose(await dxFused.array(), await dx.array()); + expectArraysClose(await dfilterFused.array(), await dfilter.array()); + expectArraysClose(await dbiasFused.array(), await 
dbias.array()); + }); }); diff --git a/tfjs-core/src/ops/fused_mat_mul.ts b/tfjs-core/src/ops/fused_mat_mul.ts index a4254ea959b..b2cf41505ec 100644 --- a/tfjs-core/src/ops/fused_mat_mul.ts +++ b/tfjs-core/src/ops/fused_mat_mul.ts @@ -16,17 +16,21 @@ */ import {ENGINE, ForwardFunc} from '../engine'; +import {customGrad} from '../gradients'; import {_FusedMatMul, _FusedMatMulAttrs, _FusedMatMulInputs} from '../kernel_names'; import {NamedAttrMap} from '../kernel_registry'; -import {Tensor} from '../tensor'; -import {NamedTensorMap} from '../tensor_types'; +import {Tensor, Tensor3D} from '../tensor'; +import {GradSaveFunc, NamedTensorMap} from '../tensor_types'; import {makeTypesMatch} from '../tensor_util'; import {convertToTensor} from '../tensor_util_env'; import {TensorLike} from '../types'; import * as util from '../util'; +import {add} from './add'; import * as broadcast_util from './broadcast_util'; import {Activation} from './fused_types'; +import {applyActivation, getFusedBiasGradient, getFusedDyActivation, shouldFuse} from './fused_util'; +import {matMul as unfusedMatMul} from './mat_mul'; import {op} from './operation'; /** @@ -66,6 +70,15 @@ function fusedMatMul_({ activation?: Activation, preluActivationWeights?: Tensor }): T { + if (shouldFuse(ENGINE.state.gradientDepth, activation) === false) { + let result = unfusedMatMul(a, b, transposeA, transposeB); + if (bias != null) { + result = add(result, bias); + } + + return applyActivation(result, activation, preluActivationWeights) as T; + } + let $a = convertToTensor(a, 'a', 'fused matMul'); let $b = convertToTensor(b, 'b', 'fused matMul'); [$a, $b] = makeTypesMatch($a, $b); @@ -125,6 +138,38 @@ function fusedMatMul_({ preluActivationWeights, 'prelu weights', 'fused matMul'); } + const grad = (dy: Tensor3D, saved: Tensor[]) => { + const [a3D, b3D, y, bias] = saved; + const dyActivation = getFusedDyActivation(dy, y, activation); + + let aDer: Tensor; + let bDer: Tensor; + + if (!transposeA && !transposeB) { + aDer = dyActivation.matMul(b3D as Tensor3D, false, true); + bDer = a3D.matMul(dyActivation, true, false); + + } else if (!transposeA && transposeB) { + aDer = dyActivation.matMul(b3D as Tensor3D, false, false); + bDer = dyActivation.matMul(a3D as Tensor3D, true, false); + + } else if (transposeA && !transposeB) { + aDer = b3D.matMul(dyActivation, false, true); + bDer = a3D.matMul(dyActivation, false, false); + + } else { + aDer = b3D.matMul(dyActivation, true, true); + bDer = dyActivation.matMul(a3D as Tensor3D, true, true); + } + + if (bias != null) { + const biasDer = getFusedBiasGradient(bias, dyActivation); + return [aDer, bDer, biasDer]; + } else { + return [aDer, bDer]; + } + }; + const forward: ForwardFunc = (backend) => { const y = backend.fusedBatchMatMul({ a: a3D, @@ -146,11 +191,34 @@ function fusedMatMul_({ }; const attrs: _FusedMatMulAttrs = {transposeA, transposeB, activation}; - const res = ENGINE.runKernelFunc( - forward, inputs as {} as NamedTensorMap, null /* grad */, _FusedMatMul, - attrs as {} as NamedAttrMap); - - return res.reshape(outShape); + // Depending on the the params passed in we will have different number of + // inputs and thus a a different number of elements in the gradient. 
+ if (bias == null) { + const customOp = + customGrad((a: Tensor3D, b: Tensor3D, save: GradSaveFunc) => { + const res = ENGINE.runKernelFunc( + forward, inputs as {} as NamedTensorMap, null /* grad */, + _FusedMatMul, attrs as {} as NamedAttrMap); + + save([a, b, res]); + + return {value: res.reshape(outShape), gradFunc: grad}; + }); + return customOp(a3D, b3D) as T; + } else { + const customOpWithBias = customGrad( + (a: Tensor3D, b: Tensor3D, bias: Tensor, save: GradSaveFunc) => { + const res = ENGINE.runKernelFunc( + forward, inputs as {} as NamedTensorMap, null /* grad */, + _FusedMatMul, attrs as {} as NamedAttrMap); + + save([a, b, res, bias]); + + return {value: res.reshape(outShape), gradFunc: grad}; + }); + + return customOpWithBias(a3D, b3D, $bias) as T; + } } export const matMul = op({fusedMatMul_}); diff --git a/tfjs-core/src/ops/fused_mat_mul_test.ts b/tfjs-core/src/ops/fused_mat_mul_test.ts index c85123cde11..cb15f706b42 100644 --- a/tfjs-core/src/ops/fused_mat_mul_test.ts +++ b/tfjs-core/src/ops/fused_mat_mul_test.ts @@ -176,135 +176,158 @@ describeWithFlags('fused matmul', ALL_ENVS, () => { expectArraysClose(await d.data(), [1, 9, -2, 21]); }); - // it('fused A x B with relu gradient', async () => { - // const a = tf.tensor2d([1, 2, 3, 10, 20, -30], [2, 3]); - // const b = tf.tensor2d([2, 3, 4, -1, 2, 3], [3, 2]); - // const dy = tf.tensor2d([1, 10, 20, 30], [2, 2]); - // const transposeA = false; - // const transposeB = false; - - // const grads = tf.grads((a, b) => { - // const prod = tf.matMul(a, b, transposeA, transposeB); - // return tf.relu(prod); - // }); - - // const fusedGrads = tf.grads((a, b) => { - // return tf.fused.matMul( - // {a, b, transposeA, transposeB, bias: null, activation: 'relu'}); - // }); - - // const [da, db] = grads([a, b], dy); - // const [fusedDa, fusedDb] = fusedGrads([a, b], dy); - // expectArraysClose(await da.array(), await fusedDa.array()); - // expectArraysClose(await db.data(), await fusedDb.array()); - // }); - - // it('gradient with clones A x B with relu', () => { - // const a = tf.tensor2d([1, 2, 3, 10, 20, -30], [2, 3]); - // const b = tf.tensor2d([2, 3, 4, -1, 2, 3], [3, 2]); - // const dy = tf.tensor2d([1, 10, 20, 30], [2, 2]); - // const transposeA = false; - // const transposeB = false; - - // const fusedGrads = tf.grads((a, b) => { - // return tf.fused - // .matMul({ - // a: a.clone(), - // b: b.clone(), - // transposeA, - // transposeB, - // bias: null, - // activation: 'relu' - // }) - // .clone(); - // }); - - // const [fusedDa, fusedDb] = fusedGrads([a, b], dy); - // expect(fusedDa.shape).toEqual(a.shape); - // expect(fusedDb.shape).toEqual(b.shape); - // }); - - // it('fused A x B with relu bias gradient', async () => { - // const a = tf.tensor2d([1, 2, 3, 10, 20, -30], [2, 3]); - // const b = tf.tensor2d([2, 3, 4, -1, 2, 3], [3, 2]); - // const c = tf.tensor2d([1, 1, 1, 1], [2, 2]); - // const transposeA = false; - // const transposeB = false; - - // const dy = tf.tensor2d([1, 10, 20, 30], [2, 2]); - - // const grads = tf.grads((a, b, c) => { - // const prod = tf.matMul(a, b, transposeA, transposeB); - // const sum = tf.add(prod, c); - // return tf.relu(sum); - // }); - - // const fusedGrads = tf.grads((a, b, c) => { - // return tf.fused.matMul( - // {a, b, transposeA, transposeB, bias: c, activation: 'relu'}); - // }); - - // const [da, db, dc] = grads([a, b, c], dy); - // const [fusedDa, fusedDb, fusedDc] = fusedGrads([a, b, c], dy); - - // expectArraysClose(await da.array(), await fusedDa.array()); - // 
expectArraysClose(await db.array(), await fusedDb.array()); - // expectArraysClose(await dc.array(), await fusedDc.array()); - // }); - - // it('fused A x B with relu bias gradient transpose', async () => { - // const a = tf.tensor2d([1, 2, 3, 10, 20, -30], [3, 2]); - // const b = tf.tensor2d([2, 3, 4, -1, 2, 3], [3, 2]); - // const c = tf.tensor2d([1, 1, 1, 1], [2, 2]); - // const transposeA = true; - // const transposeB = false; - - // const dy = tf.tensor2d([1, 10, 20, 30], [2, 2]); - - // const grads = tf.grads((a, b, c) => { - // const prod = tf.matMul(a, b, transposeA, transposeB); - // const sum = tf.add(prod, c); - // return tf.relu(sum); - // }); - - // const fusedGrads = tf.grads((a, b, c) => { - // return tf.fused.matMul( - // {a, b, transposeA, transposeB, bias: c, activation: 'relu'}); - // }); - - // const [da, db, dc] = grads([a, b, c], dy); - // const [fusedDa, fusedDb, fusedDc] = fusedGrads([a, b, c], dy); - - // expectArraysClose(await da.array(), await fusedDa.array()); - // expectArraysClose(await db.array(), await fusedDb.array()); - // expectArraysClose(await dc.array(), await fusedDc.array()); - // }); - - // it('fused A x B with relu and broadcasted bias gradient', async () => { - // const a = tf.tensor2d([1, 2, 3, 10, 20, -30], [2, 3]); - // const b = tf.tensor2d([2, 3, 4, -1, 2, 3], [3, 2]); - // const c = tf.tensor2d([[1]]); - // const transposeA = false; - // const transposeB = false; - - // const dy = tf.tensor2d([1, 10, 20, 30], [2, 2]); - - // const grads = tf.grads((a, b, c) => { - // const prod = tf.matMul(a, b, transposeA, transposeB); - // const sum = tf.add(prod, c); - // return tf.relu(sum); - // }); - - // const fusedGrads = tf.grads((a, b, c) => { - // return tf.fused.matMul( - // {a, b, transposeA, transposeB, bias: c, activation: 'relu'}); - // }); - - // const [da, db, dc] = grads([a, b, c], dy); - // const [fusedDa, fusedDb, fusedDc] = fusedGrads([a, b, c], dy); - - // expectArraysClose(await da.array(), await fusedDa.array()); - // expectArraysClose(await db.array(), await fusedDb.array()); - // expectArraysClose(await dc.array(), await fusedDc.array()); - // }); + it('fused A x B with relu gradient', async () => { + const a = tf.tensor2d([1, 2, 3, 10, 20, -30], [2, 3]); + const b = tf.tensor2d([2, 3, 4, -1, 2, 3], [3, 2]); + const dy = tf.tensor2d([1, 10, 20, 30], [2, 2]); + const transposeA = false; + const transposeB = false; + + const grads = tf.grads((a, b) => { + const prod = tf.matMul(a, b, transposeA, transposeB); + return tf.relu(prod); + }); + + const fusedGrads = tf.grads((a, b) => { + return tf.fused.matMul( + {a, b, transposeA, transposeB, bias: null, activation: 'relu'}); + }); + + const [da, db] = grads([a, b], dy); + const [fusedDa, fusedDb] = fusedGrads([a, b], dy); + expectArraysClose(await da.array(), await fusedDa.array()); + expectArraysClose(await db.data(), await fusedDb.array()); + }); + + it('gradient with clones A x B with relu', () => { + const a = tf.tensor2d([1, 2, 3, 10, 20, -30], [2, 3]); + const b = tf.tensor2d([2, 3, 4, -1, 2, 3], [3, 2]); + const dy = tf.tensor2d([1, 10, 20, 30], [2, 2]); + const transposeA = false; + const transposeB = false; + + const fusedGrads = tf.grads((a, b) => { + return tf.fused + .matMul({ + a: a.clone(), + b: b.clone(), + transposeA, + transposeB, + bias: null, + activation: 'relu' + }) + .clone(); + }); + + const [fusedDa, fusedDb] = fusedGrads([a, b], dy); + expect(fusedDa.shape).toEqual(a.shape); + expect(fusedDb.shape).toEqual(b.shape); + }); + + it('fused A x B with relu bias 
gradient', async () => { + const a = tf.tensor2d([1, 2, 3, 10, 20, -30], [2, 3]); + const b = tf.tensor2d([2, 3, 4, -1, 2, 3], [3, 2]); + const c = tf.tensor2d([1, 1, 1, 1], [2, 2]); + const transposeA = false; + const transposeB = false; + + const dy = tf.tensor2d([1, 10, 20, 30], [2, 2]); + + const grads = tf.grads((a, b, c) => { + const prod = tf.matMul(a, b, transposeA, transposeB); + const sum = tf.add(prod, c); + return tf.relu(sum); + }); + + const fusedGrads = tf.grads((a, b, c) => { + return tf.fused.matMul( + {a, b, transposeA, transposeB, bias: c, activation: 'relu'}); + }); + + const [da, db, dc] = grads([a, b, c], dy); + const [fusedDa, fusedDb, fusedDc] = fusedGrads([a, b, c], dy); + + expectArraysClose(await da.array(), await fusedDa.array()); + expectArraysClose(await db.array(), await fusedDb.array()); + expectArraysClose(await dc.array(), await fusedDc.array()); + }); + + it('fused A x B with relu bias gradient transpose', async () => { + const a = tf.tensor2d([1, 2, 3, 10, 20, -30], [3, 2]); + const b = tf.tensor2d([2, 3, 4, -1, 2, 3], [3, 2]); + const c = tf.tensor2d([1, 1, 1, 1], [2, 2]); + const transposeA = true; + const transposeB = false; + + const dy = tf.tensor2d([1, 10, 20, 30], [2, 2]); + + const grads = tf.grads((a, b, c) => { + const prod = tf.matMul(a, b, transposeA, transposeB); + const sum = tf.add(prod, c); + return tf.relu(sum); + }); + + const fusedGrads = tf.grads((a, b, c) => { + return tf.fused.matMul( + {a, b, transposeA, transposeB, bias: c, activation: 'relu'}); + }); + + const [da, db, dc] = grads([a, b, c], dy); + const [fusedDa, fusedDb, fusedDc] = fusedGrads([a, b, c], dy); + + expectArraysClose(await da.array(), await fusedDa.array()); + expectArraysClose(await db.array(), await fusedDb.array()); + expectArraysClose(await dc.array(), await fusedDc.array()); + }); + + it('fused A x B with relu and broadcasted bias gradient', async () => { + const a = tf.tensor2d([1, 2, 3, 10, 20, -30], [2, 3]); + const b = tf.tensor2d([2, 3, 4, -1, 2, 3], [3, 2]); + const c = tf.tensor2d([[1]]); + const transposeA = false; + const transposeB = false; + + const dy = tf.tensor2d([1, 10, 20, 30], [2, 2]); + + const grads = tf.grads((a, b, c) => { + const prod = tf.matMul(a, b, transposeA, transposeB); + const sum = tf.add(prod, c); + return tf.relu(sum); + }); + + const fusedGrads = tf.grads((a, b, c) => { + return tf.fused.matMul( + {a, b, transposeA, transposeB, bias: c, activation: 'relu'}); + }); + + const [da, db, dc] = grads([a, b, c], dy); + const [fusedDa, fusedDb, fusedDc] = fusedGrads([a, b, c], dy); + + expectArraysClose(await da.array(), await fusedDa.array()); + expectArraysClose(await db.array(), await fusedDb.array()); + expectArraysClose(await dc.array(), await fusedDc.array()); + }); + + it('fused matmul with relu6 and gradients', async () => { + const a = tf.tensor2d([1, 2, 3, 10, 20, -30], [2, 3]); + const b = tf.tensor2d([2, 3, 4, -1, 2, 3], [3, 2]); + const dy = tf.tensor2d([1, 10, 20, 30], [2, 2]); + const transposeA = false; + const transposeB = false; + + const fusedGrads = tf.grads((a, b) => { + return tf.fused.matMul( + {a, b, transposeA, transposeB, bias: null, activation: 'relu6'}); + }); + const [fusedDa, fusedDb] = fusedGrads([a, b], dy); + + const grads = tf.grads((a, b) => { + const prod = tf.matMul(a, b, transposeA, transposeB); + return tf.relu6(prod); + }); + const [da, db] = grads([a, b], dy); + + expectArraysClose(await da.array(), await fusedDa.array()); + expectArraysClose(await db.data(), await fusedDb.array()); + }); }); 
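[Editor's note] The fused_util.ts hunk that follows introduces the shouldFuse gate used by all three ops in this patch: fuse only when no gradient is being recorded or when the activation is linear, otherwise fall back to a composition of unfused ops that autograd already knows how to differentiate. A minimal TypeScript sketch of that caller-side pattern (illustration only, not part of the patch; `insideGradient` is a hypothetical stand-in for the engine's private gradientDepth counter):

import * as tf from '@tensorflow/tfjs-core';

function matMulMaybeFused(
    a: tf.Tensor2D, b: tf.Tensor2D, bias: tf.Tensor|null,
    activation: 'linear'|'relu', insideGradient: boolean): tf.Tensor {
  // Mirrors shouldFuse: fusing is safe outside a gradient tape, or for a
  // linear activation whose gradient needs no special handling.
  const canFuse = !insideGradient || activation === 'linear';
  if (!canFuse) {
    // Unfused composition: matMul -> add bias -> activation.
    let out: tf.Tensor = tf.matMul(a, b);
    if (bias != null) {
      out = tf.add(out, bias);
    }
    return activation === 'relu' ? tf.relu(out) : out;
  }
  // Fused kernel path.
  return tf.fused.matMul({a, b, bias, activation});
}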
diff --git a/tfjs-core/src/ops/fused_util.ts b/tfjs-core/src/ops/fused_util.ts index 1250b3460c5..206cdaa4913 100644 --- a/tfjs-core/src/ops/fused_util.ts +++ b/tfjs-core/src/ops/fused_util.ts @@ -66,3 +66,9 @@ export function applyActivation( } throw new Error(`Unknown fused activation ${activation}.`); } + +// Whether we should call fused ops. +export const shouldFuse = (gradientDepth: number, activation: Activation) => { + const gradientMode = gradientDepth > 0; + return !gradientMode || activation === 'linear'; +}; From 4e126489a34eceacb22f946a6424575bc8c9c7bd Mon Sep 17 00:00:00 2001 From: Yannick Assogba Date: Fri, 17 Jul 2020 00:19:28 -0400 Subject: [PATCH 07/12] fix gradient of fusedMatMul_ --- tfjs-core/src/ops/fused_mat_mul.ts | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/tfjs-core/src/ops/fused_mat_mul.ts b/tfjs-core/src/ops/fused_mat_mul.ts index b2cf41505ec..2500731115e 100644 --- a/tfjs-core/src/ops/fused_mat_mul.ts +++ b/tfjs-core/src/ops/fused_mat_mul.ts @@ -139,31 +139,31 @@ function fusedMatMul_({ } const grad = (dy: Tensor3D, saved: Tensor[]) => { - const [a3D, b3D, y, bias] = saved; - const dyActivation = getFusedDyActivation(dy, y, activation); - + const [a3D, b3D, y, $bias] = saved; + // we reshape dy because the result of the forward is not + // necessarily going to be a 3d tensor due to a reshape done at the end of + // the customOp. + const dyActivation = + getFusedDyActivation(dy.reshape(y.shape), y, activation) let aDer: Tensor; let bDer: Tensor; if (!transposeA && !transposeB) { - aDer = dyActivation.matMul(b3D as Tensor3D, false, true); + aDer = dyActivation.matMul(b3D, false, true); bDer = a3D.matMul(dyActivation, true, false); - } else if (!transposeA && transposeB) { - aDer = dyActivation.matMul(b3D as Tensor3D, false, false); - bDer = dyActivation.matMul(a3D as Tensor3D, true, false); - + aDer = dyActivation.matMul(b3D, false, false); + bDer = dyActivation.matMul(a3D, true, false); } else if (transposeA && !transposeB) { aDer = b3D.matMul(dyActivation, false, true); bDer = a3D.matMul(dyActivation, false, false); - } else { aDer = b3D.matMul(dyActivation, true, true); - bDer = dyActivation.matMul(a3D as Tensor3D, true, true); + bDer = dyActivation.matMul(a3D, true, true); } if (bias != null) { - const biasDer = getFusedBiasGradient(bias, dyActivation); + const biasDer = getFusedBiasGradient($bias, dyActivation); return [aDer, bDer, biasDer]; } else { return [aDer, bDer]; @@ -195,24 +195,24 @@ function fusedMatMul_({ // inputs and thus a a different number of elements in the gradient. 
if (bias == null) { const customOp = - customGrad((a: Tensor3D, b: Tensor3D, save: GradSaveFunc) => { + customGrad((a3D: Tensor3D, b3D: Tensor3D, save: GradSaveFunc) => { const res = ENGINE.runKernelFunc( forward, inputs as {} as NamedTensorMap, null /* grad */, _FusedMatMul, attrs as {} as NamedAttrMap); - save([a, b, res]); + save([a3D, b3D, res]); return {value: res.reshape(outShape), gradFunc: grad}; }); return customOp(a3D, b3D) as T; } else { const customOpWithBias = customGrad( - (a: Tensor3D, b: Tensor3D, bias: Tensor, save: GradSaveFunc) => { + (a3D: Tensor3D, b3D: Tensor3D, $bias: Tensor, save: GradSaveFunc) => { const res = ENGINE.runKernelFunc( forward, inputs as {} as NamedTensorMap, null /* grad */, _FusedMatMul, attrs as {} as NamedAttrMap); - save([a, b, res, bias]); + save([a3D, b3D, res, $bias]); return {value: res.reshape(outShape), gradFunc: grad}; }); From 80d998e92adc5db9d017ac76c8cce15643f84cfd Mon Sep 17 00:00:00 2001 From: Yannick Assogba Date: Fri, 17 Jul 2020 00:22:11 -0400 Subject: [PATCH 08/12] save --- tfjs-core/src/ops/fused_mat_mul.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tfjs-core/src/ops/fused_mat_mul.ts b/tfjs-core/src/ops/fused_mat_mul.ts index 2500731115e..9e65e3549f8 100644 --- a/tfjs-core/src/ops/fused_mat_mul.ts +++ b/tfjs-core/src/ops/fused_mat_mul.ts @@ -144,7 +144,7 @@ function fusedMatMul_({ // necessarily going to be a 3d tensor due to a reshape done at the end of // the customOp. const dyActivation = - getFusedDyActivation(dy.reshape(y.shape), y, activation) + getFusedDyActivation(dy.reshape(y.shape), y, activation); let aDer: Tensor; let bDer: Tensor; From a5059f9895c702cc2ddf3dc37022d8f8282c981e Mon Sep 17 00:00:00 2001 From: Yannick Assogba Date: Fri, 17 Jul 2020 11:52:40 -0400 Subject: [PATCH 09/12] wasm fix --- tfjs-backend-wasm/src/kernels/FusedConv2D.ts | 27 +++++++++---------- .../src/kernels/FusedDepthwiseConv2D.ts | 27 +++++++++---------- tfjs-backend-wasm/src/kernels/_FusedMatMul.ts | 23 ++++------------ tfjs-backend-wasm/src/setup_test.ts | 3 ++- 4 files changed, 33 insertions(+), 47 deletions(-) diff --git a/tfjs-backend-wasm/src/kernels/FusedConv2D.ts b/tfjs-backend-wasm/src/kernels/FusedConv2D.ts index e22ef305a73..77e426a8f4c 100644 --- a/tfjs-backend-wasm/src/kernels/FusedConv2D.ts +++ b/tfjs-backend-wasm/src/kernels/FusedConv2D.ts @@ -15,18 +15,12 @@ * ============================================================================= */ -import {backend_util, KernelConfig, KernelFunc, NamedTensorInfoMap, TensorInfo} from '@tensorflow/tfjs-core'; +import {backend_util, FusedConv2D, FusedConv2DAttrs, FusedConv2DInputs, KernelConfig, KernelFunc, Tensor4D} from '@tensorflow/tfjs-core'; import {BackendWasm} from '../backend_wasm'; import {FusableActivation} from './types'; -interface FusedConv2DInputs extends NamedTensorInfoMap { - x: TensorInfo; - filter: TensorInfo; - bias?: TensorInfo; -} - let wasmFusedConv2d: ( xId: number, batchSize: number, inputHeight: number, inputWidth: number, filterId: number, filterHeight: number, filterWidth: number, biasId: number, @@ -66,11 +60,17 @@ function setup(backend: BackendWasm) { function fusedConv2d(args: { inputs: FusedConv2DInputs, backend: BackendWasm, - attrs: - {convInfo: backend_util.Conv2DInfo, activation: backend_util.Activation} + attrs: FusedConv2DAttrs }) { const {inputs, attrs, backend} = args; - const {convInfo, activation} = attrs; + const {x, filter, bias, preluActivationWeights} = inputs; + const {strides, pad, dilations, dataFormat, dimRoundingMode, 
activation} = + attrs; + + const convInfo = backend_util.computeConv2DInfo( + (x as Tensor4D).shape, (filter as Tensor4D).shape, strides, dilations, + pad, dimRoundingMode); + const fusedActivation = FusableActivation[activation as {} as keyof typeof FusableActivation]; if (fusedActivation == null) { @@ -79,7 +79,6 @@ function fusedConv2d(args: { `in the wasm backend.`); } - const {x, filter, bias, preluActivationWeights} = inputs; const xId = backend.dataIdMap.get(x.dataId).id; const filterId = backend.dataIdMap.get(filter.dataId).id; @@ -117,10 +116,10 @@ function fusedConv2d(args: { const inHeight = convInfo.inHeight; const inWidth = convInfo.inWidth; - if (convInfo.dataFormat !== 'channelsLast') { + if (dataFormat !== 'NHWC') { throw new Error( `wasm backend FusedConv2D does not support dataFormat:'` + - `${convInfo.dataFormat}'. Please use 'channelsLast'.`); + `${dataFormat}'. Please use 'NHWC'.`); } const out = backend.makeOutput(convInfo.outShape, 'float32'); @@ -137,7 +136,7 @@ function fusedConv2d(args: { } export const fusedConv2DConfig: KernelConfig = { - kernelName: 'FusedConv2D', + kernelName: FusedConv2D, backendName: 'wasm', setupFunc: setup, kernelFunc: fusedConv2d as {} as KernelFunc diff --git a/tfjs-backend-wasm/src/kernels/FusedDepthwiseConv2D.ts b/tfjs-backend-wasm/src/kernels/FusedDepthwiseConv2D.ts index e0b95f65c06..208d1574ade 100644 --- a/tfjs-backend-wasm/src/kernels/FusedDepthwiseConv2D.ts +++ b/tfjs-backend-wasm/src/kernels/FusedDepthwiseConv2D.ts @@ -15,18 +15,12 @@ * ============================================================================= */ -import {backend_util, KernelConfig, KernelFunc, NamedTensorInfoMap, TensorInfo} from '@tensorflow/tfjs-core'; +import {backend_util, FusedDepthwiseConv2D, FusedDepthwiseConv2DAttrs, FusedDepthwiseConv2DInputs, KernelConfig, KernelFunc, Tensor4D} from '@tensorflow/tfjs-core'; import {BackendWasm} from '../backend_wasm'; import {FusableActivation} from './types'; -interface FusedDepthwiseConv2DInputs extends NamedTensorInfoMap { - x: TensorInfo; - filter: TensorInfo; - bias?: TensorInfo; -} - let wasmFusedDepthwiseConv2d: ( xId: number, batchSize: number, inputHeight: number, inputWidth: number, filterId: number, filterHeight: number, filterWidth: number, biasId: number, @@ -67,11 +61,17 @@ function setup(backend: BackendWasm) { function fusedDepthwiseConv2d(args: { inputs: FusedDepthwiseConv2DInputs, backend: BackendWasm, - attrs: - {convInfo: backend_util.Conv2DInfo, activation: backend_util.Activation} + attrs: FusedDepthwiseConv2DAttrs }) { const {inputs, attrs, backend} = args; - const {convInfo, activation} = attrs; + const {x, filter, bias, preluActivationWeights} = inputs; + const {strides, pad, dilations, dataFormat, dimRoundingMode, activation} = + attrs; + + const convInfo = backend_util.computeConv2DInfo( + (x as Tensor4D).shape, (filter as Tensor4D).shape, strides, dilations, + pad, dimRoundingMode); + const fusedActivation = FusableActivation[activation as {} as keyof typeof FusableActivation]; if (fusedActivation == null) { @@ -80,7 +80,6 @@ function fusedDepthwiseConv2d(args: { `in the wasm backend.`); } - const {x, filter, bias, preluActivationWeights} = inputs; const xId = backend.dataIdMap.get(x.dataId).id; const filterId = backend.dataIdMap.get(filter.dataId).id; @@ -118,10 +117,10 @@ function fusedDepthwiseConv2d(args: { const inHeight = convInfo.inHeight; const inWidth = convInfo.inWidth; - if (convInfo.dataFormat !== 'channelsLast') { + if (dataFormat !== 'NHWC') { throw new Error( `wasm backend 
FusedDepthwiseConv2D does not support dataFormat:'` + - `${convInfo.dataFormat}'. Please use 'channelsLast'.`); + `${dataFormat}'. Please use 'NHWC'.`); } const out = backend.makeOutput(convInfo.outShape, 'float32'); @@ -138,7 +137,7 @@ function fusedDepthwiseConv2d(args: { } export const fusedDepthwiseConv2DConfig: KernelConfig = { - kernelName: 'FusedDepthwiseConv2D', + kernelName: FusedDepthwiseConv2D, backendName: 'wasm', setupFunc: setup, kernelFunc: fusedDepthwiseConv2d as {} as KernelFunc diff --git a/tfjs-backend-wasm/src/kernels/_FusedMatMul.ts b/tfjs-backend-wasm/src/kernels/_FusedMatMul.ts index 16349df81a9..3ae7e8ef977 100644 --- a/tfjs-backend-wasm/src/kernels/_FusedMatMul.ts +++ b/tfjs-backend-wasm/src/kernels/_FusedMatMul.ts @@ -15,25 +15,12 @@ * ============================================================================= */ -import {KernelConfig, NamedAttrMap, NamedTensorInfoMap, TensorInfo} from '@tensorflow/tfjs-core'; +import {_FusedMatMul, _FusedMatMulAttrs, _FusedMatMulInputs, KernelConfig, KernelFunc} from '@tensorflow/tfjs-core'; import {BackendWasm} from '../backend_wasm'; import {FusableActivation} from './types'; -interface FusedMatMulInputs extends NamedTensorInfoMap { - a: TensorInfo; - b: TensorInfo; - bias?: TensorInfo; - preluActivationWeights?: TensorInfo; -} - -interface FusedMatMulAttrs extends NamedAttrMap { - transposeA: boolean; - transposeB: boolean; - activation: FusableActivation; -} - let wasmFusedMatMul: ( aId: number, aShape: Uint8Array, aShapeSize: number, bId: number, bShape: Uint8Array, bShapeSize: number, transposeA: boolean, @@ -58,9 +45,9 @@ function setup(backend: BackendWasm) { } function fusedBatchMatMul(args: { - inputs: FusedMatMulInputs, + inputs: _FusedMatMulInputs, backend: BackendWasm, - attrs: FusedMatMulAttrs + attrs: _FusedMatMulAttrs }) { const {inputs, backend, attrs} = args; const {a, b, bias, preluActivationWeights} = inputs; @@ -114,8 +101,8 @@ function fusedBatchMatMul(args: { } export const fusedMatMulConfig: KernelConfig = { - kernelName: '_FusedMatMul', + kernelName: _FusedMatMul, backendName: 'wasm', setupFunc: setup, - kernelFunc: fusedBatchMatMul + kernelFunc: fusedBatchMatMul as {} as KernelFunc }; diff --git a/tfjs-backend-wasm/src/setup_test.ts b/tfjs-backend-wasm/src/setup_test.ts index c21a3382585..1f32a51601b 100644 --- a/tfjs-backend-wasm/src/setup_test.ts +++ b/tfjs-backend-wasm/src/setup_test.ts @@ -117,7 +117,8 @@ const TEST_FILTERS: TestFilter[] = [ 'basic with elu', // Only fused relu, relu6, prelu activations // supported. 'gradient', // Gradients not defined yet. - 'NCHW', // xnn pack does not support channels first. + 'backProp input x=[2,3,3,1] f=[2,2,1,1] s=1 p=0', + 'NCHW', // xnn pack does not support channels first. // Issue: https://github.com/tensorflow/tfjs/issues/3104. // Actual != expected. 
'relu bias stride 2 x=[1,8,8,16] f=[3,3,16,1] s=[2,2] d=8 p=same', From 641cf58ba9319a0538607a7b52effc4d84a2c137 Mon Sep 17 00:00:00 2001 From: Yannick Assogba Date: Fri, 17 Jul 2020 13:18:20 -0400 Subject: [PATCH 10/12] remove chaining --- tfjs-core/src/ops/fused_mat_mul.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tfjs-core/src/ops/fused_mat_mul.ts b/tfjs-core/src/ops/fused_mat_mul.ts index 9e65e3549f8..8cf4a46e28a 100644 --- a/tfjs-core/src/ops/fused_mat_mul.ts +++ b/tfjs-core/src/ops/fused_mat_mul.ts @@ -32,6 +32,7 @@ import {Activation} from './fused_types'; import {applyActivation, getFusedBiasGradient, getFusedDyActivation, shouldFuse} from './fused_util'; import {matMul as unfusedMatMul} from './mat_mul'; import {op} from './operation'; +import {reshape} from './reshape'; /** * Computes the dot product of two matrices with optional activation and bias. @@ -144,7 +145,7 @@ function fusedMatMul_({ // necessarily going to be a 3d tensor due to a reshape done at the end of // the customOp. const dyActivation = - getFusedDyActivation(dy.reshape(y.shape), y, activation); + getFusedDyActivation(reshape(dy, y.shape), y, activation); let aDer: Tensor; let bDer: Tensor; From 52648c195820814917eb7877685c7504888096a0 Mon Sep 17 00:00:00 2001 From: Yannick Assogba Date: Mon, 20 Jul 2020 15:24:29 -0400 Subject: [PATCH 11/12] code review comments --- .../src/kernels/FusedDepthwiseConv2D.ts | 2 +- tfjs-backend-wasm/src/kernels/_FusedMatMul.ts | 2 +- tfjs-backend-wasm/src/setup_test.ts | 3 +- tfjs-core/src/ops/fused_conv2d.ts | 56 ------------------- tfjs-core/src/ops/fused_depthwise_conv2d.ts | 3 +- tfjs-core/src/ops/fused_mat_mul.ts | 20 +++---- 6 files changed, 15 insertions(+), 71 deletions(-) diff --git a/tfjs-backend-wasm/src/kernels/FusedDepthwiseConv2D.ts b/tfjs-backend-wasm/src/kernels/FusedDepthwiseConv2D.ts index 208d1574ade..8231ccc521b 100644 --- a/tfjs-backend-wasm/src/kernels/FusedDepthwiseConv2D.ts +++ b/tfjs-backend-wasm/src/kernels/FusedDepthwiseConv2D.ts @@ -32,7 +32,7 @@ let wasmFusedDepthwiseConv2d: ( function setup(backend: BackendWasm) { wasmFusedDepthwiseConv2d = - backend.wasm.cwrap('FusedDepthwiseConv2D', null /* void */, [ + backend.wasm.cwrap(FusedDepthwiseConv2D, null /* void */, [ 'number', // xId 'number', // batchSize 'number', // inputHeight diff --git a/tfjs-backend-wasm/src/kernels/_FusedMatMul.ts b/tfjs-backend-wasm/src/kernels/_FusedMatMul.ts index 3ae7e8ef977..3fcb21745a7 100644 --- a/tfjs-backend-wasm/src/kernels/_FusedMatMul.ts +++ b/tfjs-backend-wasm/src/kernels/_FusedMatMul.ts @@ -28,7 +28,7 @@ let wasmFusedMatMul: ( preluActivationWeightsId: number, outId: number) => void; function setup(backend: BackendWasm) { - wasmFusedMatMul = backend.wasm.cwrap('_FusedMatMul', null /* void */, [ + wasmFusedMatMul = backend.wasm.cwrap(_FusedMatMul, null /* void */, [ 'number', // a_id 'array', // a_shape 'number', // a_shape.length diff --git a/tfjs-backend-wasm/src/setup_test.ts b/tfjs-backend-wasm/src/setup_test.ts index 1f32a51601b..507703a9298 100644 --- a/tfjs-backend-wasm/src/setup_test.ts +++ b/tfjs-backend-wasm/src/setup_test.ts @@ -117,7 +117,8 @@ const TEST_FILTERS: TestFilter[] = [ 'basic with elu', // Only fused relu, relu6, prelu activations // supported. 'gradient', // Gradients not defined yet. - 'backProp input x=[2,3,3,1] f=[2,2,1,1] s=1 p=0', + 'backProp input x=[2,3,3,1] f=[2,2,1,1] s=1 p=0', // Gradients not + // defined. 'NCHW', // xnn pack does not support channels first. 
// Issue: https://github.com/tensorflow/tfjs/issues/3104. // Actual != expected. diff --git a/tfjs-core/src/ops/fused_conv2d.ts b/tfjs-core/src/ops/fused_conv2d.ts index 00317430a20..c1e86df2c6c 100644 --- a/tfjs-core/src/ops/fused_conv2d.ts +++ b/tfjs-core/src/ops/fused_conv2d.ts @@ -36,62 +36,6 @@ import {Activation} from './fused_types'; import {applyActivation, getFusedBiasGradient, getFusedDyActivation, shouldFuse} from './fused_util'; import {op} from './operation'; -/** - * Computes a 2D convolution over the input x, optionally fused with adding a - * bias and applying an activation. - * - * ```js - * const inputDepth = 2; - * const inShape = [2, 2, 2, inputDepth]; - * const outputDepth = 2; - * const fSize = 1; - * const pad = 0; - * const strides = 1; - * - * const x = tf.tensor4d( [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - * 16], inShape); - * const w = tf.tensor4d([-1, 1, -2, 0.5], [fSize, fSize, inputDepth, - * outputDepth]); - * - * tf.fused.conv2d({ x, filter: w, strides, pad, dataFormat: 'NHWC', - * dilations: [1, 1], bias: tf.scalar(5), activation: 'relu' }).print(); - * ``` - * - * @param obj An object with the following properties: - * @param x The input tensor, of rank 4 or rank 3, of shape - * `[batch, height, width, inChannels]`. If rank 3, batch of 1 is - * assumed. - * @param filter The filter, rank 4, of shape - * `[filterHeight, filterWidth, inDepth, outDepth]`. - * @param strides The strides of the convolution: `[strideHeight, - * strideWidth]`. - * @param pad The type of padding algorithm. - * - `same` and stride 1: output will be of same size as input, - * regardless of filter size. - * - `valid` output will be smaller than input if filter is larger - * than 1x1. - * - For more info, see this guide: - * [https://www.tensorflow.org/api_guides/python/nn#Convolution]( - * https://www.tensorflow.org/api_guides/python/nn#Convolution) - * @param dataFormat An optional string from: "NHWC", "NCHW". Defaults to - * "NHWC". Specify the data format of the input and output data. With the - * default format "NHWC", the data is stored in the order of: [batch, - * height, width, channels]. Only "NHWC" is currently supported. - * @param dilations The dilation rates: `[dilationHeight, dilationWidth]` - * in which we sample input values across the height and width dimensions - * in atrous convolution. Defaults to `[1, 1]`. If `dilations` is a single - * number, then `dilationHeight == dilationWidth`. If it is greater than - * 1, then all values of `strides` must be 1. - * @param dimRoundingMode The rounding mode used when computing output - * dimensions if pad is a number. If none is provided, it will not round - * and error if the output is of fractional size. - * @param bias Tensor to be added to the result. - * @param activation Name of activation kernel (defaults to `linear`) to be - * applied - * after biasAdd. - * @param preluActivationWeights Tensor of prelu weights to be applied as part - * of a `prelu` activation, typically the same shape as `x`. - */ /** * Computes a 2D convolution over the input x, optionally fused with adding a * bias and applying an activation. 
diff --git a/tfjs-core/src/ops/fused_depthwise_conv2d.ts b/tfjs-core/src/ops/fused_depthwise_conv2d.ts index 8b6f9f49b29..5f62f1617c0 100644 --- a/tfjs-core/src/ops/fused_depthwise_conv2d.ts +++ b/tfjs-core/src/ops/fused_depthwise_conv2d.ts @@ -193,9 +193,8 @@ function fusedDepthwiseConv2d_({ if (bias != null) { const biasDer = getFusedBiasGradient($bias, dyActivation); return [xDer, filterDer, biasDer]; - } else { - return [xDer, filterDer]; } + return [xDer, filterDer]; }; const forward: ForwardFunc = (backend) => { diff --git a/tfjs-core/src/ops/fused_mat_mul.ts b/tfjs-core/src/ops/fused_mat_mul.ts index 8cf4a46e28a..7abf4e65e00 100644 --- a/tfjs-core/src/ops/fused_mat_mul.ts +++ b/tfjs-core/src/ops/fused_mat_mul.ts @@ -150,17 +150,17 @@ function fusedMatMul_({ let bDer: Tensor; if (!transposeA && !transposeB) { - aDer = dyActivation.matMul(b3D, false, true); - bDer = a3D.matMul(dyActivation, true, false); + aDer = unfusedMatMul(dyActivation, b3D, false, true); + bDer = unfusedMatMul(a3D, dyActivation, true, false); } else if (!transposeA && transposeB) { - aDer = dyActivation.matMul(b3D, false, false); - bDer = dyActivation.matMul(a3D, true, false); + aDer = unfusedMatMul(dyActivation, b3D, false, false); + bDer = unfusedMatMul(dyActivation, a3D, true, false); } else if (transposeA && !transposeB) { - aDer = b3D.matMul(dyActivation, false, true); - bDer = a3D.matMul(dyActivation, false, false); + aDer = unfusedMatMul(b3D, dyActivation, false, true); + bDer = unfusedMatMul(a3D, dyActivation, false, false); } else { - aDer = b3D.matMul(dyActivation, true, true); - bDer = dyActivation.matMul(a3D, true, true); + aDer = unfusedMatMul(b3D, dyActivation, true, true); + bDer = unfusedMatMul(dyActivation, a3D, true, true); } if (bias != null) { @@ -203,7 +203,7 @@ function fusedMatMul_({ save([a3D, b3D, res]); - return {value: res.reshape(outShape), gradFunc: grad}; + return {value: reshape(res, outShape), gradFunc: grad}; }); return customOp(a3D, b3D) as T; } else { @@ -215,7 +215,7 @@ function fusedMatMul_({ save([a3D, b3D, res, $bias]); - return {value: res.reshape(outShape), gradFunc: grad}; + return {value: reshape(res, outShape), gradFunc: grad}; }); return customOpWithBias(a3D, b3D, $bias) as T; From bb340988cb894d07b936a9bd5577f382f2a3ad24 Mon Sep 17 00:00:00 2001 From: Yannick Assogba Date: Wed, 22 Jul 2020 11:50:29 -0400 Subject: [PATCH 12/12] update error message --- tfjs-core/src/ops/fused_util.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tfjs-core/src/ops/fused_util.ts b/tfjs-core/src/ops/fused_util.ts index 206cdaa4913..a41c7a574bb 100644 --- a/tfjs-core/src/ops/fused_util.ts +++ b/tfjs-core/src/ops/fused_util.ts @@ -34,8 +34,7 @@ export function getFusedDyActivation( return dy.mul(y.step()); } throw new Error( - `Gradient for activation ${activation} has not been ` + - `implemented yet.`); + `Cannot compute gradient for fused activation ${activation}.`); } // Returns gradient for fused bias.
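// Cross-check of the gradient rules that the unfusedMatMul calls in the
// fused_mat_mul.ts hunk above encode (standard matrix-calculus identities,
// unchanged by the refactor). With y the pre-bias product and dY the
// activation-adjusted upstream gradient (dyActivation):
//
//   y = A   * B     =>  dA = dY  * B^T     dB = A^T  * dY
//   y = A   * B^T   =>  dA = dY  * B       dB = dY^T * A
//   y = A^T * B     =>  dA = B   * dY^T    dB = A    * dY
//   y = A^T * B^T   =>  dA = B^T * dY^T    dB = dY^T * A^T
//
// The first case maps onto the new code as:
//   aDer = unfusedMatMul(dyActivation, b3D, false, true);   // dY * B^T
//   bDer = unfusedMatMul(a3D, dyActivation, true, false);   // A^T * dY
// and the remaining three branches follow the same table.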
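// Context for the fused_util.ts error-message change: the visible context
// lines show getFusedDyActivation masking the upstream gradient with
// step(y) for 'relu', and throwing the (now shorter) error for activations
// whose gradient is not implemented; the 'linear' pass-through branch is
// assumed, not shown in the hunk. A minimal sketch of the relu branch only:
import {Tensor} from '../tensor';

function reluDyActivation(dy: Tensor, y: Tensor): Tensor {
  // step(y) is 1 where y > 0 and 0 elsewhere, so units that were inactive
  // in the forward pass receive no gradient through the fused activation.
  return dy.mul(y.step());
}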