From 22ba0f2126041013cfd03c44d94afa075dcc0513 Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Mon, 24 Feb 2020 15:51:22 -0500
Subject: [PATCH 01/35] Add initial Pow kernel for the wasm backend

---
 tfjs-backend-wasm/src/cc/BUILD               | 10 ++++
 tfjs-backend-wasm/src/cc/kernels/Pow.cc      | 61 ++++++++++++++++++++
 tfjs-backend-wasm/src/kernels/Pow.ts         | 20 +++++++
 tfjs-backend-wasm/src/kernels/all_kernels.ts |  1 +
 tfjs-backend-wasm/src/setup_test.ts          |  1 +
 tfjs-core/src/ops/binary_ops.ts              |  4 +-
 6 files changed, 95 insertions(+), 2 deletions(-)
 create mode 100644 tfjs-backend-wasm/src/cc/kernels/Pow.cc
 create mode 100644 tfjs-backend-wasm/src/kernels/Pow.ts
diff --git a/tfjs-backend-wasm/src/cc/BUILD b/tfjs-backend-wasm/src/cc/BUILD
index 0126adfa16e..ff6cce26bcb 100644
--- a/tfjs-backend-wasm/src/cc/BUILD
+++ b/tfjs-backend-wasm/src/cc/BUILD
@@ -182,6 +182,7 @@ tfjs_cc_library(
         ":NonMaxSuppressionV5",
         ":NotEqual",
         ":PadV2",
+        ":Pow",
         ":Prelu",
         ":Relu",
         ":Relu6",
@@ -593,6 +594,15 @@ tfjs_cc_library(
     ],
 )
 
+tfjs_cc_library(
+    name = "Pow",
+    srcs = ["kernels/Pow.cc"],
+    deps = [
+        ":backend",
+        ":util",
+    ],
+)
+
 tfjs_cc_library(
     name = "Prelu",
     srcs = ["kernels/Prelu.cc"],
diff --git a/tfjs-backend-wasm/src/cc/kernels/Pow.cc b/tfjs-backend-wasm/src/cc/kernels/Pow.cc
new file mode 100644
index 00000000000..d6080d1ca56
--- /dev/null
+++ b/tfjs-backend-wasm/src/cc/kernels/Pow.cc
@@ -0,0 +1,61 @@
+/* Copyright 2019 Google Inc. All Rights Reserved.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ===========================================================================*/
+
+#ifdef __EMSCRIPTEN__
+#include <emscripten.h>
+#endif
+
+#include <cmath>
+#include <cstddef>
+
+#include "src/cc/binary.h"
+#include "src/cc/util.h"
+
+namespace {
+template <class T>
+inline T pow(T a, T b) {
+  return pow(a, b);
+}
+}  // namespace
+
+namespace tfjs {
+namespace wasm {
+// We use C-style API to interface with Javascript.
+extern "C" {
+
+#ifdef __EMSCRIPTEN__
+EMSCRIPTEN_KEEPALIVE
+#endif
+void Pow(const size_t a_id, const size_t* a_shape_ptr, const size_t a_shape_len,
+         const size_t b_id, const size_t* b_shape_ptr, const size_t b_shape_len,
+         const DType dtype, const size_t out_id) {
+  switch (dtype) {
+    case DType::float32:
+      binary_f32(a_id, b_id, out_id, pow<float>);
+      break;
+    case DType::int32:
+      binary_i32(a_id, b_id, out_id, pow<int32_t>);
+      break;
+    case DType::boolean:
+      binary_bool(a_id, b_id, out_id, pow<bool>);
+      break;
+    default:
+      util::warn("Pow for tensor ids %d and %d failed. Unknown dtype %d", a_id,
+                 b_id, dtype);
+  }
+}
+
+}  // extern "C"
+}  // namespace wasm
+}  // namespace tfjs
diff --git a/tfjs-backend-wasm/src/kernels/Pow.ts b/tfjs-backend-wasm/src/kernels/Pow.ts
new file mode 100644
index 00000000000..fae7489c2e6
--- /dev/null
+++ b/tfjs-backend-wasm/src/kernels/Pow.ts
@@ -0,0 +1,20 @@
+/**
+ * @license
+ * Copyright 2019 Google Inc. All Rights Reserved.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ */
+
+import {registerBinaryKernel} from './binary_kernel';
+const supportsFullBroadcast = false;
+registerBinaryKernel('Pow', supportsFullBroadcast);
diff --git a/tfjs-backend-wasm/src/kernels/all_kernels.ts b/tfjs-backend-wasm/src/kernels/all_kernels.ts
index 157a4cbd8fa..be3f297fdb2 100644
--- a/tfjs-backend-wasm/src/kernels/all_kernels.ts
+++ b/tfjs-backend-wasm/src/kernels/all_kernels.ts
@@ -56,6 +56,7 @@ import './NonMaxSuppressionV3';
 import './NonMaxSuppressionV5';
 import './NotEqual';
 import './PadV2';
+import './Pow';
 import './Prelu';
 import './Relu';
 import './Relu6';
diff --git a/tfjs-backend-wasm/src/setup_test.ts b/tfjs-backend-wasm/src/setup_test.ts
index 8c4d289b7e1..212b63cea95 100644
--- a/tfjs-backend-wasm/src/setup_test.ts
+++ b/tfjs-backend-wasm/src/setup_test.ts
@@ -35,6 +35,7 @@ const TEST_FILTERS: TestFilter[] = [
     ]
   },
   {include: 'softmax'},
+  {include: 'pow'},
   {
     include: 'add ',
     excludes: [
diff --git a/tfjs-core/src/ops/binary_ops.ts b/tfjs-core/src/ops/binary_ops.ts
index e6d7e241429..3e34249e008 100644
--- a/tfjs-core/src/ops/binary_ops.ts
+++ b/tfjs-core/src/ops/binary_ops.ts
@@ -276,13 +276,13 @@ function pow_<T extends Tensor>(base: T|TensorLike, exp: Tensor|TensorLike): T {
       }
       return res.reshape($exp.shape);
     };
-    return {$base: derBase, $exp: derExp};
+    return {base: derBase, exp: derExp};
   };
   return ENGINE.runKernelFunc((backend, save) => {
     const y = backend.pow($base, $exp);
     save([$base, $exp, y]);
     return y;
-  }, {$base, $exp}, grad) as T;
+  }, {base: $base, exp: $exp}, grad, 'Pow') as T;
 }
 
 /**

From 20049d142e49ae19b75e301625afe5285e813880 Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Tue, 25 Feb 2020 08:53:29 -0500
Subject: [PATCH 02/35] Add :binary dep to Pow target; name pow kernel inputs a/b

---
 tfjs-backend-wasm/src/cc/BUILD      |  1 +
 tfjs-backend-wasm/src/index_test.ts | 13 +++++++++++--
 tfjs-core/src/ops/binary_ops.ts     |  8 ++++++--
 3 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/tfjs-backend-wasm/src/cc/BUILD b/tfjs-backend-wasm/src/cc/BUILD
index ff6cce26bcb..b32b78b2f76 100644
--- a/tfjs-backend-wasm/src/cc/BUILD
+++ b/tfjs-backend-wasm/src/cc/BUILD
@@ -599,6 +599,7 @@ tfjs_cc_library(
     srcs = ["kernels/Pow.cc"],
     deps = [
         ":backend",
+        ":binary",
         ":util",
     ],
 )
diff --git a/tfjs-backend-wasm/src/index_test.ts b/tfjs-backend-wasm/src/index_test.ts
index 370615f2cf3..3c97cf9457a 100644
--- a/tfjs-backend-wasm/src/index_test.ts
+++ b/tfjs-backend-wasm/src/index_test.ts
@@ -58,8 +58,8 @@ describeWithFlags('wasm init', BROWSER_ENVS, () => {
     }, 100);
 
     // Silences backend registration warnings.
-    spyOn(console, 'warn');
-    spyOn(console, 'log');
+    // spyOn(console, 'warn');
+    // spyOn(console, 'log');
   });
 
   afterEach(() => {
@@ -92,4 +92,13 @@ describeWithFlags('wasm init', BROWSER_ENVS, () => {
     expect(() => setWasmPath('too/late'))
         .toThrowError(/The WASM backend was already initialized. Make sure/);
   });
+
+  fit('pow', async () => {
+    const a = tf.tensor2d([1, -2, -3, 0, 7, 1], [2, 3]);
+    const b = tf.tensor2d([5, 3, 4, 5, 2, -3], [2, 3], 'int32');
+    // const expected = [1, -8, 81, 0, 49, 1];
+    const result = tf.pow(a, b);
+    const data = await result.data();
+    console.log(Array.from(data));
+  });
 });
diff --git a/tfjs-core/src/ops/binary_ops.ts b/tfjs-core/src/ops/binary_ops.ts
index 3e34249e008..52fcebbc638 100644
--- a/tfjs-core/src/ops/binary_ops.ts
+++ b/tfjs-core/src/ops/binary_ops.ts
@@ -276,13 +276,17 @@ function pow_<T extends Tensor>(base: T|TensorLike, exp: Tensor|TensorLike): T {
       }
       return res.reshape($exp.shape);
     };
-    return {base: derBase, exp: derExp};
+    return {a: derBase, b: derExp};
   };
+
+  const attrs = {};
+  const inputsToSave = [$base, $exp];
+  const outputsToSave = [true];
   return ENGINE.runKernelFunc((backend, save) => {
     const y = backend.pow($base, $exp);
     save([$base, $exp, y]);
     return y;
-  }, {base: $base, exp: $exp}, grad, 'Pow') as T;
+  }, {a: $base, b: $exp}, grad, 'Pow', attrs, inputsToSave, outputsToSave) as T;
 }
 
 /**

From d7aaf12ea12a806e067d57647f2a2cfbcfc425a5 Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Tue, 25 Feb 2020 09:27:00 -0500
Subject: [PATCH 03/35] Rename Pow.cc helper to power; match pow input dtypes

---
 tfjs-backend-wasm/src/cc/kernels/Pow.cc |  8 ++++----
 tfjs-backend-wasm/src/index_test.ts     |  1 +
 tfjs-core/src/ops/binary_ops.ts         | 12 ++++++------
 3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/tfjs-backend-wasm/src/cc/kernels/Pow.cc b/tfjs-backend-wasm/src/cc/kernels/Pow.cc
index d6080d1ca56..646fe355fa1 100644
--- a/tfjs-backend-wasm/src/cc/kernels/Pow.cc
+++ b/tfjs-backend-wasm/src/cc/kernels/Pow.cc
@@ -24,7 +24,7 @@
 
 namespace {
 template <class T>
-inline T pow(T a, T b) {
+inline T power(T a, T b) {
   return pow(a, b);
 }
 }  // namespace
@@ -42,13 +42,13 @@ void Pow(const size_t a_id, const size_t* a_shape_ptr, const size_t a_shape_len,
          const DType dtype, const size_t out_id) {
   switch (dtype) {
     case DType::float32:
-      binary_f32(a_id, b_id, out_id, pow<float>);
+      binary_f32(a_id, b_id, out_id, power<float>);
       break;
     case DType::int32:
-      binary_i32(a_id, b_id, out_id, pow<int32_t>);
+      binary_i32(a_id, b_id, out_id, power<int32_t>);
       break;
     case DType::boolean:
-      binary_bool(a_id, b_id, out_id, pow<bool>);
+      binary_bool(a_id, b_id, out_id, power<bool>);
       break;
     default:
       util::warn("Pow for tensor ids %d and %d failed. Unknown dtype %d", a_id,
diff --git a/tfjs-backend-wasm/src/index_test.ts b/tfjs-backend-wasm/src/index_test.ts
index 3c97cf9457a..b7d00105195 100644
--- a/tfjs-backend-wasm/src/index_test.ts
+++ b/tfjs-backend-wasm/src/index_test.ts
@@ -97,6 +97,7 @@ describeWithFlags('wasm init', BROWSER_ENVS, () => {
     const a = tf.tensor2d([1, -2, -3, 0, 7, 1], [2, 3]);
     const b = tf.tensor2d([5, 3, 4, 5, 2, -3], [2, 3], 'int32');
     // const expected = [1, -8, 81, 0, 49, 1];
+    // const result = tf.pow(a, b);
     const result = tf.pow(a, b);
     const data = await result.data();
     console.log(Array.from(data));
diff --git a/tfjs-core/src/ops/binary_ops.ts b/tfjs-core/src/ops/binary_ops.ts
index 52fcebbc638..ce070e4d9d1 100644
--- a/tfjs-core/src/ops/binary_ops.ts
+++ b/tfjs-core/src/ops/binary_ops.ts
@@ -20,7 +20,7 @@ import {Tensor} from '../tensor';
 import {NamedTensorMap} from '../tensor_types';
 import {makeTypesMatch} from '../tensor_util';
 import {convertToTensor} from '../tensor_util_env';
-import {TensorLike, upcastType} from '../types';
+import {TensorLike} from '../types';
 import * as util from '../util';
 import * as broadcast_util from './broadcast_util';
 import {where} from './logical_ops';
@@ -247,14 +247,14 @@ function subStrict_<T extends Tensor>(a: T|TensorLike, b: T|TensorLike): T {
  * @param exp The exponent `tf.Tensor` to pow element-wise.
  */
 /** @doc {heading: 'Operations', subheading: 'Arithmetic'} */
-function pow_<T extends Tensor>(base: T|TensorLike, exp: Tensor|TensorLike): T {
-  const $base = convertToTensor(base, 'base', 'pow');
-  const $exp = convertToTensor(exp, 'exp', 'pow');
+function pow_<T extends Tensor>(
+    base: Tensor|TensorLike, exp: Tensor|TensorLike): T {
+  let $base = convertToTensor(base, 'base', 'pow');
+  let $exp = convertToTensor(exp, 'exp', 'pow');
+  [$base, $exp] = makeTypesMatch($base, $exp);
 
   const outShape =
       broadcast_util.assertAndGetBroadcastShape($base.shape, $exp.shape);
-  base = $base.cast(upcastType($base.dtype, $exp.dtype));
-  exp = $exp.cast(upcastType($base.dtype, $exp.dtype));
   const grad = (dy: Tensor, saved: Tensor[]) => {
     const [$base, $exp, y] = saved;
     const derBase = () => {

From 2f99398f9df4588d7dd9d90cba10798b057ddab4 Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Tue, 25 Feb 2020 09:35:17 -0500
Subject: [PATCH 04/35] add pow

---
 tfjs-backend-wasm/src/index_test.ts |  4 ++--
 tfjs-backend-wasm/src/setup_test.ts | 10 +++++++++-
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/tfjs-backend-wasm/src/index_test.ts b/tfjs-backend-wasm/src/index_test.ts
index b7d00105195..ab38cfad48d 100644
--- a/tfjs-backend-wasm/src/index_test.ts
+++ b/tfjs-backend-wasm/src/index_test.ts
@@ -93,12 +93,12 @@ describeWithFlags('wasm init', BROWSER_ENVS, () => {
         .toThrowError(/The WASM backend was already initialized. Make sure/);
   });
 
-  fit('pow', async () => {
+  it('pow', async () => {
     const a = tf.tensor2d([1, -2, -3, 0, 7, 1], [2, 3]);
     const b = tf.tensor2d([5, 3, 4, 5, 2, -3], [2, 3], 'int32');
     // const expected = [1, -8, 81, 0, 49, 1];
-    // const result = tf.pow(a, b);
     const result = tf.pow(a, b);
+    // const result = tf.div(a, b);
     const data = await result.data();
     console.log(Array.from(data));
   });
diff --git a/tfjs-backend-wasm/src/setup_test.ts b/tfjs-backend-wasm/src/setup_test.ts
index 212b63cea95..4d942718f47 100644
--- a/tfjs-backend-wasm/src/setup_test.ts
+++ b/tfjs-backend-wasm/src/setup_test.ts
@@ -35,7 +35,15 @@ const TEST_FILTERS: TestFilter[] = [
     ]
   },
   {include: 'softmax'},
-  {include: 'pow'},
+  {
+    include: 'pow',
+    excludes: [
+      'gradient',  // zerosLike not defined yet.
+      'broadcasting same rank Tensors different shape',  // Broadcasting along
+                                                         // inner dims not
+                                                         // supported yet.
+    ]
+  },
   {
     include: 'add ',
     excludes: [

From f1021be9b313a728c06e7977c7ffcc7f343df637 Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Thu, 5 Mar 2020 07:23:43 -0500
Subject: [PATCH 05/35] Get FusedBatchMatMul kernel stub running in wasm backend

---
 tfjs-backend-wasm/src/cc/BUILD                |  25 ++++
 tfjs-backend-wasm/src/cc/batchMatMul_impl.cc  |  19 +++
 tfjs-backend-wasm/src/cc/batchMatMul_impl.h   |  28 ++++
 .../src/cc/kernels/FusedBatchMatMul.cc        |  35 +++++
 .../src/cc/kernels/FusedBatchMatMul.h         |  26 ++++
 tfjs-backend-wasm/src/cc/kernels/Pow.cc       |   2 +-
 tfjs-backend-wasm/src/index_test.ts           |   9 ++
 .../src/kernels/FusedBatchMatMul.ts           | 128 ++++++++++++++++++
 tfjs-core/src/ops/fused_ops.ts                |  64 +++++----
 9 files changed, 305 insertions(+), 31 deletions(-)
 create mode 100644 tfjs-backend-wasm/src/cc/batchMatMul_impl.cc
 create mode 100644 tfjs-backend-wasm/src/cc/batchMatMul_impl.h
 create mode 100644 tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.cc
 create mode 100644 tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.h
 create mode 100644 tfjs-backend-wasm/src/kernels/FusedBatchMatMul.ts

diff --git a/tfjs-backend-wasm/src/cc/BUILD b/tfjs-backend-wasm/src/cc/BUILD
index b32b78b2f76..b6a34763ce8 100644
--- a/tfjs-backend-wasm/src/cc/BUILD
+++ b/tfjs-backend-wasm/src/cc/BUILD
@@ -94,6 +94,18 @@ tfjs_cc_library(
     ],
 )
 
+tfjs_cc_library(
+    name = "batchMatMul_impl",
+    srcs = ["batchMatMul_impl.cc"],
+    hdrs = ["batchMatMul_impl.h"],
+    deps = [
+        ":backend",
+        ":prelu_impl",
+        ":transpose_impl",
+        ":util",
+    ],
+)
+
 tfjs_cc_library(
     name = "interpolate_bilinear_impl",
     srcs = ["interpolate_bilinear_impl.cc"],
@@ -155,6 +167,7 @@ tfjs_cc_library(
         ":ArgMax",
         ":AvgPool",
         ":BatchMatMul",
+        ":FusedBatchMatMul",
         ":ClipByValue",
         ":Conv2D",
         ":CropAndResize",
@@ -258,6 +271,18 @@ tfjs_cc_library(
     deps = [
         ":backend",
         ":util",
+        ":batchMatMul_impl",
+    ],
+)
+
+tfjs_cc_library(
+    name = "FusedBatchMatMul",
+    srcs = ["kernels/FusedBatchMatMul.cc"],
+    hdrs = ["kernels/FusedBatchMatMul.h"],
+    deps = [
+        ":backend",
+        ":util",
+        ":batchMatMul_impl",
     ],
 )
 
diff --git a/tfjs-backend-wasm/src/cc/batchMatMul_impl.cc b/tfjs-backend-wasm/src/cc/batchMatMul_impl.cc
new file mode 100644
index 00000000000..e7a13a0a4d6
--- /dev/null
+++ b/tfjs-backend-wasm/src/cc/batchMatMul_impl.cc
@@ -0,0 +1,19 @@
+/* Copyright 2019 Google Inc. All Rights Reserved.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ===========================================================================*/
+
+namespace tfjs {
+namespace wasm {
+void batchMatMul() {}
+}  // namespace wasm
+}  // namespace tfjs
diff --git a/tfjs-backend-wasm/src/cc/batchMatMul_impl.h b/tfjs-backend-wasm/src/cc/batchMatMul_impl.h
new file mode 100644
index 00000000000..0942ebe5690
--- /dev/null
+++ b/tfjs-backend-wasm/src/cc/batchMatMul_impl.h
@@ -0,0 +1,28 @@
+/* Copyright 2019 Google Inc. All Rights Reserved.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ===========================================================================*/
+
+#ifndef BATCHMATMUL_IMPL_H_
+#define BATCHMATMUL_IMPL_H_
+
+#include <cstddef>
+
+namespace tfjs {
+namespace wasm {
+
+void batchMatMul();
+
+}
+}  // namespace tfjs
+
+#endif  // BATCHMATMUL_IMPL_H_
diff --git a/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.cc b/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.cc
new file mode 100644
index 00000000000..b0a5154187d
--- /dev/null
+++ b/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.cc
@@ -0,0 +1,35 @@
+/* Copyright 2019 Google Inc. All Rights Reserved.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ===========================================================================*/
+
+#ifdef __EMSCRIPTEN__
+#include <emscripten.h>
+#endif
+
+#include "src/cc/kernels/FusedBatchMatMul.h"
+
+#include "src/cc/batchMatMul_impl.h"
+
+namespace tfjs {
+namespace wasm {
+
+extern "C" {
+
+#ifdef __EMSCRIPTEN__
+EMSCRIPTEN_KEEPALIVE
+#endif
+
+void FusedBatchMatMul() {}
+}
+}  // namespace wasm
+}  // namespace tfjs
diff --git a/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.h b/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.h
new file mode 100644
index 00000000000..9fb70193516
--- /dev/null
+++ b/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.h
@@ -0,0 +1,26 @@
+/* Copyright 2019 Google Inc. All Rights Reserved.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ===========================================================================*/
+
+#ifndef KERNELS_FUSEDBATCHMATMUL_H_
+#define KERNELS_FUSEDBATCHMATMUL_H_
+
+namespace tfjs {
+namespace wasm {
+extern "C" {
+void FusedBatchMatMul();
+}
+}  // namespace wasm
+}  // namespace tfjs
+
+#endif  // KERNELS_FUSEDBATCHMATMUL_H_
diff --git a/tfjs-backend-wasm/src/cc/kernels/Pow.cc b/tfjs-backend-wasm/src/cc/kernels/Pow.cc
index 646fe355fa1..b9c5bed7388 100644
--- a/tfjs-backend-wasm/src/cc/kernels/Pow.cc
+++ b/tfjs-backend-wasm/src/cc/kernels/Pow.cc
@@ -25,7 +25,7 @@
 namespace {
 template <class T>
 inline T power(T a, T b) {
-  return pow(a, b);
+  return std::pow(a, b);
 }
 }  // namespace
 
diff --git a/tfjs-backend-wasm/src/index_test.ts b/tfjs-backend-wasm/src/index_test.ts
index ab38cfad48d..baa71b8ef2f 100644
--- a/tfjs-backend-wasm/src/index_test.ts
+++ b/tfjs-backend-wasm/src/index_test.ts
@@ -102,4 +102,13 @@ describeWithFlags('wasm init', BROWSER_ENVS, () => {
     const data = await result.data();
     console.log(Array.from(data));
   });
+
+  fit('fused batch mm', async () => {
+    const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]);
+    const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]);
+
+    const c = tf.fused.matMul({a, b});
+    const data = await c.data();
+    console.log(data);  // 0, 8, -3, 20
+  });
 });
diff --git a/tfjs-backend-wasm/src/kernels/FusedBatchMatMul.ts b/tfjs-backend-wasm/src/kernels/FusedBatchMatMul.ts
new file mode 100644
index 00000000000..cceacbc0363
--- /dev/null
+++ b/tfjs-backend-wasm/src/kernels/FusedBatchMatMul.ts
@@ -0,0 +1,148 @@
+/**
+ * @license
+ * Copyright 2019 Google Inc. All Rights Reserved.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ */
+
+import {NamedAttrMap, NamedTensorInfoMap, registerKernel, TensorInfo} from '@tensorflow/tfjs-core';
+
+import {BackendWasm} from '../backend_wasm';
+
+/** Kernel inputs; `bias` and `preluActivationWeights` may be omitted. */
+interface FusedBatchMatMulInputs extends NamedTensorInfoMap {
+  a: TensorInfo;
+  b: TensorInfo;
+  bias?: TensorInfo;
+  preluActivationWeights?: TensorInfo;
+}
+
+/** Attributes the kernel reads: transpose flags and the fused activation. */
+interface FusedBatchMatMulAttrs extends NamedAttrMap {
+  transposeA: boolean;
+  transposeB: boolean;
+  activation: FusableActivation;
+}
+
+// Must match enum in batchMatMul_impl.h.
+enum FusableActivation {
+  linear = 0,
+  relu = 1,
+  relu6 = 2,
+  prelu = 3
+}
+
+// Handle to the exported wasm function; assigned in setup().
+let wasmFusedBatchMatMul: (
+    aId: number, aShape: Uint8Array, aShapeSize: number, bId: number,
+    bShape: Uint8Array, bShapeSize: number, transposeA: boolean,
+    transposeB: boolean, activation: number, biasId: number,
+    preluActivationWeightsId: number, outId: number) => void;
+
+/** Binds the `FusedBatchMatMul` wasm export through `cwrap`. */
+function setup(backend: BackendWasm) {
+  wasmFusedBatchMatMul =
+      backend.wasm.cwrap('FusedBatchMatMul', null /* void */, [
+        'number',  // a_id
+        'array',   // a_shape
+        'number',  // a_shape.length
+        'number',  // b_id
+        'array',   // b_shape
+        'number',  // b_shape.length
+        'number',  // transpose_a
+        'number',  // transpose_b
+        'number',  // activation
+        'number',  // biasId
+        'number',  // preluActivationWeightsId
+        'number'   // out_id
+      ]);
+}
+
+/**
+ * Runs the fused batch matmul on the wasm backend.
+ *
+ * Only float32 `a` and `b` are accepted. The output shape is computed from
+ * dimensions 0..2 of the input shapes, so both inputs are indexed as rank-3
+ * tensors. A missing `bias` or `preluActivationWeights` is passed to wasm as
+ * id 0.
+ *
+ * @param args The kernel `inputs`, the wasm `backend` and the kernel `attrs`.
+ * @returns The output created by `backend.makeOutput`, with shape
+ *     [batchDim, leftDim, rightDim] and the dtype of `a`.
+ * @throws Error if an input is not float32, if `bias` is not rank 1, or if
+ *     the activation has no entry in `FusableActivation`.
+ */
+function fusedBatchMatMul(args: {
+  inputs: FusedBatchMatMulInputs,
+  backend: BackendWasm,
+  attrs: FusedBatchMatMulAttrs
+}) {
+  const {inputs, backend, attrs} = args;
+  const {a, b, bias, preluActivationWeights} = inputs;
+
+  if (a.dtype !== 'float32' || b.dtype !== 'float32') {
+    throw new Error(
+        `FusedBatchMatMul for non-float32 tensors not yet supported.`);
+  }
+
+  const {transposeA, transposeB, activation} = attrs;
+  const aId = backend.dataIdMap.get(a.dataId).id;
+  const bId = backend.dataIdMap.get(b.dataId).id;
+
+  let biasId = 0;
+  if (bias != null) {
+    const biasData = backend.dataIdMap.get(bias.dataId);
+    if (biasData.shape.length !== 1) {
+      throw new Error(
+          `FusedBatchMatMul only supports rank-1 bias but got ` +
+          `rank ${biasData.shape.length}.`);
+    }
+    biasId = biasData.id;
+  }
+  const preluActivationWeightsId = preluActivationWeights == null ?
+      0 :
+      backend.dataIdMap.get(preluActivationWeights.dataId).id;
+  // Look up the numeric enum value using `activation` as the member name.
+  const fusedActivation =
+      FusableActivation[activation as {} as keyof typeof FusableActivation];
+  if (fusedActivation == null) {
+    throw new Error(
+        `${activation} activation not yet supported for FusedBatchMatMul ` +
+        `in the wasm backend.`);
+  }
+
+  const leftDim = transposeA ? a.shape[2] : a.shape[1];
+  const rightDim = transposeB ? b.shape[1] : b.shape[2];
+  const batchDim = a.shape[0];
+
+  const out = backend.makeOutput([batchDim, leftDim, rightDim], a.dtype);
+  const outId = backend.dataIdMap.get(out.dataId).id;
+
+  // Shapes are handed to wasm as the raw bytes of an Int32Array.
+  const aShapeBytes = new Uint8Array(new Int32Array(a.shape).buffer);
+  const bShapeBytes = new Uint8Array(new Int32Array(b.shape).buffer);
+
+  // Pass the numeric enum value, not the raw `activation` attribute.
+  wasmFusedBatchMatMul(
+      aId, aShapeBytes, a.shape.length, bId, bShapeBytes, b.shape.length,
+      transposeA, transposeB, fusedActivation, biasId,
+      preluActivationWeightsId, outId);
+
+  return out;
+}
+
+registerKernel({
+  kernelName: 'FusedBatchMatMul',
+  backendName: 'wasm',
+  setupFunc: setup,
+  kernelFunc: fusedBatchMatMul
+});
diff --git a/tfjs-core/src/ops/fused_ops.ts b/tfjs-core/src/ops/fused_ops.ts
index 821b35406f5..dce09ebbf42 100644
--- a/tfjs-core/src/ops/fused_ops.ts
+++ b/tfjs-core/src/ops/fused_ops.ts
@@ -186,66 +186,70 @@ function fusedMatMul_<T extends Tensor>({
 
     let biasGradient = {};
     if (bias != null) {
-      biasGradient = {$bias: () => getFusedBiasGradient($bias, dyActivation)};
+      biasGradient = {bias: () => getFusedBiasGradient($bias, dyActivation)};
     }
 
     if (!transposeA && !transposeB) {
       return Object.assign(
           {
-            $a: () => dyActivation.matMul(b3D as Tensor3D, false, true),
-            $b: () => a3D.matMul(dyActivation, true, false)
+            a: () => dyActivation.matMul(b3D as Tensor3D, false, true),
+            b: () => a3D.matMul(dyActivation, true, false)
           },
           biasGradient);
     } else if (!transposeA && transposeB) {
       return Object.assign(
           {
-            $a: () => dyActivation.matMul(b3D as Tensor3D, false, false),
-            $b: () => dyActivation.matMul(a3D as Tensor3D, true, false)
+            a: () => dyActivation.matMul(b3D as Tensor3D, false, false),
+            b: () => dyActivation.matMul(a3D as Tensor3D, true, false)
           },
           biasGradient);
     } else if (transposeA && !transposeB) {
       return Object.assign(
           {
-            $a: () => b3D.matMul(dyActivation, false, true),
-            $b: () => a3D.matMul(dyActivation, false, false)
+            a: () => b3D.matMul(dyActivation, false, true),
+            b: () => a3D.matMul(dyActivation, false, false)
           },
           biasGradient);
     } else {
       return Object.assign(
           {
-            $a: () => b3D.matMul(dyActivation, true, true),
-            $b: () => dyActivation.matMul(a3D as Tensor3D, true, true)
+            a: () => b3D.matMul(dyActivation, true, true),
+            b: () => dyActivation.matMul(a3D as Tensor3D, true, true)
           },
           biasGradient);
     }
   };
 
-  const inputs: {
-    $a: Tensor,
-    $b: Tensor,
-    $bias?: Tensor,
-    $preluActivationWeights?: Tensor
-  } = {$a: a3D, $b: b3D};
+  const inputs:
+      {a: Tensor, b: Tensor,
+       bias?: Tensor,
+       preluActivationWeights?: Tensor} = {a: a3D, b: b3D};
   if (bias != null) {
-    inputs.$bias = $bias;
+    inputs.bias = $bias;
   }
   if (preluActivationWeights != null) {
-    inputs.$preluActivationWeights = $preluActivationWeights;
+    inputs.preluActivationWeights = $preluActivationWeights;
   }
 
-  const res = ENGINE.runKernelFunc((backend, save) => {
-    const y = backend.fusedBatchMatMul({
-      a: a3D,
-      b: b3D,
-      transposeA,
-      transposeB,
-      bias: $bias,
-      activation,
-      preluActivationWeights: $preluActivationWeights
-    });
-    save([a3D, b3D, y]);
-    return y;
-  }, inputs, grad);
+  const inputsToSave = [a3D, b3D];
+  const outputsToSave = [true];
+
+  const res = ENGINE.runKernelFunc(
+      (backend, save) => {
+        const y = backend.fusedBatchMatMul({
+          a: a3D,
+          b: b3D,
+          transposeA,
+          transposeB,
+          bias: $bias,
+          activation,
+          preluActivationWeights: $preluActivationWeights
+        });
+        save([a3D, b3D, y]);
+        return y;
+      },
+      inputs, grad, 'FusedBatchMatMul', {transposeA, transposeB, activation},
+      inputsToSave, outputsToSave);
   return res.reshape(outShape) as T;
 }
 

From ad3778ab74f6d49798a76ae705a766843f2fc02a Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Thu, 5 Mar 2020 07:48:07 -0500
Subject: [PATCH 06/35] basic

---
 tfjs-backend-wasm/src/cc/batchMatMul_impl.h   |   4 +-
 .../src/cc/kernels/FusedBatchMatMul.cc        | 192 +++++++++++++++++-
 .../src/cc/kernels/FusedBatchMatMul.h         |  19 +-
 3 files changed, 206 insertions(+), 9 deletions(-)

diff --git a/tfjs-backend-wasm/src/cc/batchMatMul_impl.h b/tfjs-backend-wasm/src/cc/batchMatMul_impl.h
index 0942ebe5690..3d315f237cb 100644
--- a/tfjs-backend-wasm/src/cc/batchMatMul_impl.h
+++ b/tfjs-backend-wasm/src/cc/batchMatMul_impl.h
@@ -20,9 +20,11 @@
 namespace tfjs {
 namespace wasm {
 
+enum FusableActivation { LINEAR = 0, RELU = 1, RELU6 = 2, PRELU = 3 };
+
 void batchMatMul();
 
-}
+}  // namespace wasm
 }  // namespace tfjs
 
 #endif  // BATCHMATMUL_IMPL_H_
diff --git a/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.cc b/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.cc
index b0a5154187d..bc58f9580aa 100644
--- a/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.cc
+++ b/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.cc
@@ -16,20 +16,204 @@
 #include <emscripten.h>
 #endif
 
-#include "src/cc/kernels/FusedBatchMatMul.h"
+#include <xnnpack.h>
+#include <algorithm>
+#include <cstddef>
+#include <limits>
+#include <map>
+#include <tuple>
+#include <vector>
+
+#include "src/cc/backend.h"
+#include "src/cc/util.h"
 
 #include "src/cc/batchMatMul_impl.h"
+#include "src/cc/kernels/FusedBatchMatMul.h"
+
+const size_t kBlockSize = 48;
+
+namespace {
+// We use std::tuple as the cache key as it implements the compare operator
+// needed for std::map.
+typedef std::tuple<size_t> OperatorCacheKey;
+
+// The operator cache maps the weights id to the xnn_operator_t instantiated for
+// this set of weights.
+std::map<OperatorCacheKey, xnn_operator_t> operator_cache;
+
+void delete_xnn_operator(const size_t weights_id) {
+  xnn_operator_t fully_connected_op = operator_cache.at(weights_id);
+  xnn_delete_operator(fully_connected_op);
+  tfjs::backend::xnn_operator_count--;
+
+  operator_cache.erase(weights_id);
+}
+
+void xnn_matmul(const size_t a_id, const size_t* a_shape_ptr,
+                const size_t a_shape_len, const size_t b_id,
+                const size_t* b_shape_ptr, const size_t b_shape_len,
+                const size_t out_id) {
+  auto& a_info = tfjs::backend::get_tensor_info(a_id);
+  auto& b_info = tfjs::backend::get_tensor_info(b_id);
+  auto& out_info = tfjs::backend::get_tensor_info_out(out_id);
+
+  const float* a_buf = a_info.f32();
+  const float* b_buf = b_info.f32();
+  float* out_buf = out_info.f32_write();
+
+  xnn_operator_t fully_connected_op = nullptr;
+
+  OperatorCacheKey cache_key = {b_id};
+
+  // We assume b is the weights and cache the xnn operator on it.
+  auto operator_cache_idx = operator_cache.find(cache_key);
+  if (operator_cache_idx == operator_cache.end()) {
+    const size_t input_channels = b_shape_ptr[1];
+    const size_t output_channels = b_shape_ptr[2];
+    const size_t input_stride = input_channels;
+    const size_t output_stride = output_channels;
+    const float* bias = nullptr;
+
+    const float output_min = -std::numeric_limits<float>::infinity();
+    const float output_max = std::numeric_limits<float>::infinity();
+
+    // XNNPack expects b to already be transposed. TensorFlow.js doesn't do this
+    // automatically so we have to tell XNNPack to do the transposing.
+    const uint32_t flags = XNN_FLAG_TRANSPOSE_WEIGHTS;
+    xnn_status status = xnn_create_fully_connected_nc_f32(
+        input_channels, output_channels, input_stride, output_stride, b_buf,
+        bias, output_min, output_max, flags, &fully_connected_op);
+    if (status != xnn_status_success) {
+      tfjs::util::warn(
+          "XNN status for xnn_create_fully_connected_nc_f32 is not successful. "
+          "Got status %d. Use -c dbg to see XNN logs.",
+          status);
+      return;
+    }
+
+    operator_cache.insert({cache_key, fully_connected_op});
+
+    tfjs::backend::register_disposal_callback(b_id, *delete_xnn_operator);
+
+    tfjs::backend::xnn_operator_count++;
+  } else {
+    fully_connected_op = operator_cache_idx->second;
+  }
+
+  const size_t batch_size = a_shape_ptr[1];
+  xnn_status status =
+      xnn_setup_fully_connected_nc_f32(fully_connected_op, batch_size, a_buf,
+                                       out_buf, nullptr /* thread pool */);
+  if (status != xnn_status_success) {
+    tfjs::util::warn(
+        "XNN status for xnn_setup_fully_connected_nc_f32 is not successful. "
+        "Got status %d. Use -c dbg to see XNN logs.",
+        status);
+    return;
+  }
+
+  xnn_run_operator(fully_connected_op, nullptr /* thread pool */);
+}
+
+void slow_batch_matmul(const size_t a_id, const size_t* a_shape_ptr,
+                       const size_t a_shape_len, const size_t b_id,
+                       const size_t* b_shape_ptr, const size_t b_shape_len,
+                       const bool transpose_a, const bool transpose_b,
+                       const size_t out_id) {
+  const size_t shared_dim = transpose_a ? a_shape_ptr[1] : a_shape_ptr[2];
+  const size_t left_dim = transpose_a ? a_shape_ptr[2] : a_shape_ptr[1];
+  const size_t right_dim = transpose_b ? b_shape_ptr[1] : b_shape_ptr[2];
+  const size_t batch_dim = a_shape_ptr[0];
+
+  std::vector<size_t> a_shape(a_shape_ptr, a_shape_ptr + a_shape_len);
+  std::vector<size_t> b_shape(b_shape_ptr, b_shape_ptr + b_shape_len);
+  const std::vector<size_t> a_strides = tfjs::util::compute_strides(a_shape);
+  const std::vector<size_t> b_strides = tfjs::util::compute_strides(b_shape);
+
+  size_t a_batch = a_strides[0];
+  size_t a_outer_step, a_inner_step;
+  if (transpose_a) {
+    a_outer_step = 1;
+    a_inner_step = a_strides[1];
+  } else {
+    a_outer_step = a_strides[1];
+    a_inner_step = 1;
+  }
+  size_t b_batch = b_strides[0];
+  size_t b_outer_step, b_inner_step;
+  if (transpose_b) {
+    b_outer_step = b_strides[1];
+    b_inner_step = 1;
+  } else {
+    b_outer_step = 1;
+    b_inner_step = b_strides[1];
+  }
+
+  auto& a_info = tfjs::backend::get_tensor_info(a_id);
+  auto& b_info = tfjs::backend::get_tensor_info(b_id);
+  auto& out_info = tfjs::backend::get_tensor_info_out(out_id);
+
+  const float* a_buf = a_info.f32();
+  const float* b_buf = b_info.f32();
+  float* out_buf = out_info.f32_write();
+
+  const size_t size = left_dim * right_dim;
+
+  // Zero out the output buffer because it might have been used before.
+  std::fill(out_buf, out_buf + batch_dim * size, 0);
+
+  for (size_t b = 0; b < batch_dim; ++b) {
+    for (size_t i0 = 0; i0 < left_dim; i0 += kBlockSize) {
+      for (size_t j0 = 0; j0 < right_dim; j0 += kBlockSize) {
+        for (size_t k0 = 0; k0 < shared_dim; k0 += kBlockSize) {
+          // for when kBlockSize doesn't evenly divide the input
+          const size_t i_block = std::min(i0 + kBlockSize, left_dim);
+          const size_t j_block = std::min(j0 + kBlockSize, right_dim);
+          const size_t k_block = std::min(k0 + kBlockSize, shared_dim);
+
+          for (size_t i = i0; i < i_block; ++i) {
+            for (size_t j = j0; j < j_block; ++j) {
+              float sum = 0.0;
+
+              for (size_t k = k0; k < k_block; ++k) {
+                sum +=
+                    a_buf[b * a_batch + i * a_outer_step + k * a_inner_step] *
+                    b_buf[k * b_inner_step + j * b_outer_step + b * b_batch];
+              }
+              out_buf[b * size + (i * right_dim + j)] += sum;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+}  // namespace
 
 namespace tfjs {
 namespace wasm {
-
+// We use C-style API to interface with Javascript.
 extern "C" {
 
 #ifdef __EMSCRIPTEN__
 EMSCRIPTEN_KEEPALIVE
 #endif
-
-void FusedBatchMatMul() {}
+void FusedBatchMatMul(const size_t a_id, const size_t* a_shape_ptr,
+                      const size_t a_shape_len, const size_t b_id,
+                      const size_t* b_shape_ptr, const size_t b_shape_len,
+                      const bool transpose_a, const bool transpose_b,
+                      const FusableActivation activation, const size_t bias_id,
+                      const size_t prelu_weights_id, const size_t out_id) {
+  if (!transpose_a && !transpose_b && a_shape_ptr[0] == 1 &&
+      b_shape_ptr[0] == 1) {
+    xnn_matmul(a_id, a_shape_ptr, a_shape_len, b_id, b_shape_ptr, b_shape_len,
+               out_id);
+  } else {
+    slow_batch_matmul(a_id, a_shape_ptr, a_shape_len, b_id, b_shape_ptr,
+                      b_shape_len, transpose_a, transpose_b, out_id);
+  }
 }
+
+}  // extern "C"
 }  // namespace wasm
 }  // namespace tfjs
diff --git a/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.h b/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.h
index 9fb70193516..6af656c47b3 100644
--- a/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.h
+++ b/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.h
@@ -12,15 +12,26 @@
  * limitations under the License.
  * ===========================================================================*/
 
-#ifndef KERNELS_FUSEDBATCHMATMUL_H_
-#define KERNELS_FUSEDBATCHMATMUL_H_
+#ifndef KERNELS_BATCHMATMUL_H_
+#define KERNELS_BATCHMATMUL_H_
+
+#include <cstddef>
+
+#include "src/cc/batchMatMul_impl.h"
 
 namespace tfjs {
 namespace wasm {
 extern "C" {
-void FusedBatchMatMul();
+
+void FusedBatchMatMul(const size_t a_id, const size_t* a_shape_ptr,
+                      const size_t a_shape_len, const size_t b_id,
+                      const size_t* b_shape_ptr, const size_t b_shape_len,
+                      const bool transpose_a, const bool transpose_b,
+                      const FusableActivation activation, const size_t bias_id,
+                      const size_t prelu_weights_id, const size_t out_id);
 }
+
 }  // namespace wasm
 }  // namespace tfjs
 
-#endif  // KERNELS_FUSEDBATCHMATMUL_H_
+#endif  // KERNELS_BATCHMATMUL_H_

From 0819120eb5ea451bc9524c9f2d8ffcb03cd4f499 Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Thu, 5 Mar 2020 07:57:43 -0500
Subject: [PATCH 07/35] yup

---
 tfjs-backend-wasm/src/cc/batchMatMul_impl.cc  | 193 +++++++++++++++++-
 tfjs-backend-wasm/src/cc/batchMatMul_impl.h   |   7 +-
 .../src/cc/kernels/FusedBatchMatMul.cc        | 180 +---------------
 3 files changed, 201 insertions(+), 179 deletions(-)

diff --git a/tfjs-backend-wasm/src/cc/batchMatMul_impl.cc b/tfjs-backend-wasm/src/cc/batchMatMul_impl.cc
index e7a13a0a4d6..359669012df 100644
--- a/tfjs-backend-wasm/src/cc/batchMatMul_impl.cc
+++ b/tfjs-backend-wasm/src/cc/batchMatMul_impl.cc
@@ -12,8 +12,199 @@
  * limitations under the License.
  * ===========================================================================*/
 
+#ifdef __EMSCRIPTEN__
+#include <emscripten.h>
+#endif
+
+#include <xnnpack.h>
+#include <algorithm>
+#include <cstddef>
+#include <limits>
+#include <map>
+#include <tuple>
+#include <vector>
+
+#include "src/cc/backend.h"
+#include "src/cc/util.h"
+
+#include "src/cc/batchMatMul_impl.h"
+
+const size_t kBlockSize = 48;
+
+namespace {
+// We use std::tuple as the cache key as it implements the compare operator
+// needed for std::map.
+typedef std::tuple<size_t> OperatorCacheKey;
+
+// The operator cache maps the weights id to the xnn_operator_t instantiated for
+// this set of weights.
+std::map<OperatorCacheKey, xnn_operator_t> operator_cache;
+
+void delete_xnn_operator(const size_t weights_id) {
+  xnn_operator_t fully_connected_op = operator_cache.at(weights_id);
+  xnn_delete_operator(fully_connected_op);
+  tfjs::backend::xnn_operator_count--;
+
+  operator_cache.erase(weights_id);
+}
+
+void xnn_matmul(const size_t a_id, const size_t* a_shape_ptr,
+                const size_t a_shape_len, const size_t b_id,
+                const size_t* b_shape_ptr, const size_t b_shape_len,
+                const size_t out_id) {
+  auto& a_info = tfjs::backend::get_tensor_info(a_id);
+  auto& b_info = tfjs::backend::get_tensor_info(b_id);
+  auto& out_info = tfjs::backend::get_tensor_info_out(out_id);
+
+  const float* a_buf = a_info.f32();
+  const float* b_buf = b_info.f32();
+  float* out_buf = out_info.f32_write();
+
+  xnn_operator_t fully_connected_op = nullptr;
+
+  OperatorCacheKey cache_key = {b_id};
+
+  // We assume b is the weights and cache the xnn operator on it.
+  auto operator_cache_idx = operator_cache.find(cache_key);
+  if (operator_cache_idx == operator_cache.end()) {
+    const size_t input_channels = b_shape_ptr[1];
+    const size_t output_channels = b_shape_ptr[2];
+    const size_t input_stride = input_channels;
+    const size_t output_stride = output_channels;
+    const float* bias = nullptr;
+
+    const float output_min = -std::numeric_limits<float>::infinity();
+    const float output_max = std::numeric_limits<float>::infinity();
+
+    // XNNPack expects b to already be transposed. TensorFlow.js doesn't do this
+    // automatically so we have to tell XNNPack to do the transposing.
+    const uint32_t flags = XNN_FLAG_TRANSPOSE_WEIGHTS;
+    xnn_status status = xnn_create_fully_connected_nc_f32(
+        input_channels, output_channels, input_stride, output_stride, b_buf,
+        bias, output_min, output_max, flags, &fully_connected_op);
+    if (status != xnn_status_success) {
+      tfjs::util::warn(
+          "XNN status for xnn_create_fully_connected_nc_f32 is not successful. "
+          "Got status %d. Use -c dbg to see XNN logs.",
+          status);
+      return;
+    }
+
+    operator_cache.insert({cache_key, fully_connected_op});
+
+    tfjs::backend::register_disposal_callback(b_id, *delete_xnn_operator);
+
+    tfjs::backend::xnn_operator_count++;
+  } else {
+    fully_connected_op = operator_cache_idx->second;
+  }
+
+  const size_t batch_size = a_shape_ptr[1];
+  xnn_status status =
+      xnn_setup_fully_connected_nc_f32(fully_connected_op, batch_size, a_buf,
+                                       out_buf, nullptr /* thread pool */);
+  if (status != xnn_status_success) {
+    tfjs::util::warn(
+        "XNN status for xnn_setup_fully_connected_nc_f32 is not successful. "
+        "Got status %d. Use -c dbg to see XNN logs.",
+        status);
+    return;
+  }
+
+  xnn_run_operator(fully_connected_op, nullptr /* thread pool */);
+}
+
+void slow_batch_matmul(const size_t a_id, const size_t* a_shape_ptr,
+                       const size_t a_shape_len, const size_t b_id,
+                       const size_t* b_shape_ptr, const size_t b_shape_len,
+                       const bool transpose_a, const bool transpose_b,
+                       const size_t out_id) {
+  const size_t shared_dim = transpose_a ? a_shape_ptr[1] : a_shape_ptr[2];
+  const size_t left_dim = transpose_a ? a_shape_ptr[2] : a_shape_ptr[1];
+  const size_t right_dim = transpose_b ? b_shape_ptr[1] : b_shape_ptr[2];
+  const size_t batch_dim = a_shape_ptr[0];
+
+  std::vector<size_t> a_shape(a_shape_ptr, a_shape_ptr + a_shape_len);
+  std::vector<size_t> b_shape(b_shape_ptr, b_shape_ptr + b_shape_len);
+  const std::vector<size_t> a_strides = tfjs::util::compute_strides(a_shape);
+  const std::vector<size_t> b_strides = tfjs::util::compute_strides(b_shape);
+
+  size_t a_batch = a_strides[0];
+  size_t a_outer_step, a_inner_step;
+  if (transpose_a) {
+    a_outer_step = 1;
+    a_inner_step = a_strides[1];
+  } else {
+    a_outer_step = a_strides[1];
+    a_inner_step = 1;
+  }
+  size_t b_batch = b_strides[0];
+  size_t b_outer_step, b_inner_step;
+  if (transpose_b) {
+    b_outer_step = b_strides[1];
+    b_inner_step = 1;
+  } else {
+    b_outer_step = 1;
+    b_inner_step = b_strides[1];
+  }
+
+  auto& a_info = tfjs::backend::get_tensor_info(a_id);
+  auto& b_info = tfjs::backend::get_tensor_info(b_id);
+  auto& out_info = tfjs::backend::get_tensor_info_out(out_id);
+
+  const float* a_buf = a_info.f32();
+  const float* b_buf = b_info.f32();
+  float* out_buf = out_info.f32_write();
+
+  const size_t size = left_dim * right_dim;
+
+  // Zero out the output buffer because it might have been used before.
+  std::fill(out_buf, out_buf + batch_dim * size, 0);
+
+  for (size_t b = 0; b < batch_dim; ++b) {
+    for (size_t i0 = 0; i0 < left_dim; i0 += kBlockSize) {
+      for (size_t j0 = 0; j0 < right_dim; j0 += kBlockSize) {
+        for (size_t k0 = 0; k0 < shared_dim; k0 += kBlockSize) {
+          // for when kBlockSize doesn't evenly divide the input
+          const size_t i_block = std::min(i0 + kBlockSize, left_dim);
+          const size_t j_block = std::min(j0 + kBlockSize, right_dim);
+          const size_t k_block = std::min(k0 + kBlockSize, shared_dim);
+
+          for (size_t i = i0; i < i_block; ++i) {
+            for (size_t j = j0; j < j_block; ++j) {
+              float sum = 0.0;
+
+              for (size_t k = k0; k < k_block; ++k) {
+                sum +=
+                    a_buf[b * a_batch + i * a_outer_step + k * a_inner_step] *
+                    b_buf[k * b_inner_step + j * b_outer_step + b * b_batch];
+              }
+              out_buf[b * size + (i * right_dim + j)] += sum;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+}  // namespace
+
 namespace tfjs {
 namespace wasm {
-void batchMatMul() {}
+void batchMatMul(const size_t a_id, const size_t* a_shape_ptr,
+                 const size_t a_shape_len, const size_t b_id,
+                 const size_t* b_shape_ptr, const size_t b_shape_len,
+                 const bool transpose_a, const bool transpose_b,
+                 const FusableActivation activation, const size_t bias_id,
+                 const size_t prelu_weights_id, const size_t out_id) {
+  if (!transpose_a && !transpose_b && a_shape_ptr[0] == 1 &&
+      b_shape_ptr[0] == 1) {
+    xnn_matmul(a_id, a_shape_ptr, a_shape_len, b_id, b_shape_ptr, b_shape_len,
+               out_id);
+  } else {
+    slow_batch_matmul(a_id, a_shape_ptr, a_shape_len, b_id, b_shape_ptr,
+                      b_shape_len, transpose_a, transpose_b, out_id);
+  }
+}
 }  // namespace wasm
 }  // namespace tfjs
diff --git a/tfjs-backend-wasm/src/cc/batchMatMul_impl.h b/tfjs-backend-wasm/src/cc/batchMatMul_impl.h
index 3d315f237cb..c69c204113a 100644
--- a/tfjs-backend-wasm/src/cc/batchMatMul_impl.h
+++ b/tfjs-backend-wasm/src/cc/batchMatMul_impl.h
@@ -22,7 +22,12 @@ namespace wasm {
 
 enum FusableActivation { LINEAR = 0, RELU = 1, RELU6 = 2, PRELU = 3 };
 
-void batchMatMul();
+void batchMatMul(const size_t a_id, const size_t* a_shape_ptr,
+                 const size_t a_shape_len, const size_t b_id,
+                 const size_t* b_shape_ptr, const size_t b_shape_len,
+                 const bool transpose_a, const bool transpose_b,
+                 const FusableActivation activation, const size_t bias_id,
+                 const size_t prelu_weights_id, const size_t out_id);
 
 }  // namespace wasm
 }  // namespace tfjs
diff --git a/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.cc b/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.cc
index bc58f9580aa..fa5783df4cb 100644
--- a/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.cc
+++ b/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.cc
@@ -16,180 +16,11 @@
 #include <emscripten.h>
 #endif
 
-#include <xnnpack.h>
-#include <algorithm>
 #include <cstddef>
-#include <limits>
-#include <map>
-#include <tuple>
-#include <vector>
-
-#include "src/cc/backend.h"
-#include "src/cc/util.h"
 
 #include "src/cc/batchMatMul_impl.h"
 #include "src/cc/kernels/FusedBatchMatMul.h"
 
-const size_t kBlockSize = 48;
-
-namespace {
-// We use std::tuple as the cache key as it implements the compare operator
-// needed for std::map.
-typedef std::tuple<size_t> OperatorCacheKey;
-
-// The operator cache maps the weights id to the xnn_operator_t instantiated for
-// this set of weights.
-std::map<OperatorCacheKey, xnn_operator_t> operator_cache;
-
-void delete_xnn_operator(const size_t weights_id) {
-  xnn_operator_t fully_connected_op = operator_cache.at(weights_id);
-  xnn_delete_operator(fully_connected_op);
-  tfjs::backend::xnn_operator_count--;
-
-  operator_cache.erase(weights_id);
-}
-
-void xnn_matmul(const size_t a_id, const size_t* a_shape_ptr,
-                const size_t a_shape_len, const size_t b_id,
-                const size_t* b_shape_ptr, const size_t b_shape_len,
-                const size_t out_id) {
-  auto& a_info = tfjs::backend::get_tensor_info(a_id);
-  auto& b_info = tfjs::backend::get_tensor_info(b_id);
-  auto& out_info = tfjs::backend::get_tensor_info_out(out_id);
-
-  const float* a_buf = a_info.f32();
-  const float* b_buf = b_info.f32();
-  float* out_buf = out_info.f32_write();
-
-  xnn_operator_t fully_connected_op = nullptr;
-
-  OperatorCacheKey cache_key = {b_id};
-
-  // We assume b is the weights and cache the xnn operator on it.
-  auto operator_cache_idx = operator_cache.find(cache_key);
-  if (operator_cache_idx == operator_cache.end()) {
-    const size_t input_channels = b_shape_ptr[1];
-    const size_t output_channels = b_shape_ptr[2];
-    const size_t input_stride = input_channels;
-    const size_t output_stride = output_channels;
-    const float* bias = nullptr;
-
-    const float output_min = -std::numeric_limits<float>::infinity();
-    const float output_max = std::numeric_limits<float>::infinity();
-
-    // XNNPack expects b to already be transposed. TensorFlow.js doesn't do this
-    // automatically so we have to tell XNNPack to do the transposing.
-    const uint32_t flags = XNN_FLAG_TRANSPOSE_WEIGHTS;
-    xnn_status status = xnn_create_fully_connected_nc_f32(
-        input_channels, output_channels, input_stride, output_stride, b_buf,
-        bias, output_min, output_max, flags, &fully_connected_op);
-    if (status != xnn_status_success) {
-      tfjs::util::warn(
-          "XNN status for xnn_create_fully_connected_nc_f32 is not successful. "
-          "Got status %d. Use -c dbg to see XNN logs.",
-          status);
-      return;
-    }
-
-    operator_cache.insert({cache_key, fully_connected_op});
-
-    tfjs::backend::register_disposal_callback(b_id, *delete_xnn_operator);
-
-    tfjs::backend::xnn_operator_count++;
-  } else {
-    fully_connected_op = operator_cache_idx->second;
-  }
-
-  const size_t batch_size = a_shape_ptr[1];
-  xnn_status status =
-      xnn_setup_fully_connected_nc_f32(fully_connected_op, batch_size, a_buf,
-                                       out_buf, nullptr /* thread pool */);
-  if (status != xnn_status_success) {
-    tfjs::util::warn(
-        "XNN status for xnn_setup_fully_connected_nc_f32 is not successful. "
-        "Got status %d. Use -c dbg to see XNN logs.",
-        status);
-    return;
-  }
-
-  xnn_run_operator(fully_connected_op, nullptr /* thread pool */);
-}
-
-void slow_batch_matmul(const size_t a_id, const size_t* a_shape_ptr,
-                       const size_t a_shape_len, const size_t b_id,
-                       const size_t* b_shape_ptr, const size_t b_shape_len,
-                       const bool transpose_a, const bool transpose_b,
-                       const size_t out_id) {
-  const size_t shared_dim = transpose_a ? a_shape_ptr[1] : a_shape_ptr[2];
-  const size_t left_dim = transpose_a ? a_shape_ptr[2] : a_shape_ptr[1];
-  const size_t right_dim = transpose_b ? b_shape_ptr[1] : b_shape_ptr[2];
-  const size_t batch_dim = a_shape_ptr[0];
-
-  std::vector<size_t> a_shape(a_shape_ptr, a_shape_ptr + a_shape_len);
-  std::vector<size_t> b_shape(b_shape_ptr, b_shape_ptr + b_shape_len);
-  const std::vector<size_t> a_strides = tfjs::util::compute_strides(a_shape);
-  const std::vector<size_t> b_strides = tfjs::util::compute_strides(b_shape);
-
-  size_t a_batch = a_strides[0];
-  size_t a_outer_step, a_inner_step;
-  if (transpose_a) {
-    a_outer_step = 1;
-    a_inner_step = a_strides[1];
-  } else {
-    a_outer_step = a_strides[1];
-    a_inner_step = 1;
-  }
-  size_t b_batch = b_strides[0];
-  size_t b_outer_step, b_inner_step;
-  if (transpose_b) {
-    b_outer_step = b_strides[1];
-    b_inner_step = 1;
-  } else {
-    b_outer_step = 1;
-    b_inner_step = b_strides[1];
-  }
-
-  auto& a_info = tfjs::backend::get_tensor_info(a_id);
-  auto& b_info = tfjs::backend::get_tensor_info(b_id);
-  auto& out_info = tfjs::backend::get_tensor_info_out(out_id);
-
-  const float* a_buf = a_info.f32();
-  const float* b_buf = b_info.f32();
-  float* out_buf = out_info.f32_write();
-
-  const size_t size = left_dim * right_dim;
-
-  // Zero out the output buffer because it might have been used before.
-  std::fill(out_buf, out_buf + batch_dim * size, 0);
-
-  for (size_t b = 0; b < batch_dim; ++b) {
-    for (size_t i0 = 0; i0 < left_dim; i0 += kBlockSize) {
-      for (size_t j0 = 0; j0 < right_dim; j0 += kBlockSize) {
-        for (size_t k0 = 0; k0 < shared_dim; k0 += kBlockSize) {
-          // for when kBlockSize doesn't evenly divide the input
-          const size_t i_block = std::min(i0 + kBlockSize, left_dim);
-          const size_t j_block = std::min(j0 + kBlockSize, right_dim);
-          const size_t k_block = std::min(k0 + kBlockSize, shared_dim);
-
-          for (size_t i = i0; i < i_block; ++i) {
-            for (size_t j = j0; j < j_block; ++j) {
-              float sum = 0.0;
-
-              for (size_t k = k0; k < k_block; ++k) {
-                sum +=
-                    a_buf[b * a_batch + i * a_outer_step + k * a_inner_step] *
-                    b_buf[k * b_inner_step + j * b_outer_step + b * b_batch];
-              }
-              out_buf[b * size + (i * right_dim + j)] += sum;
-            }
-          }
-        }
-      }
-    }
-  }
-}
-}  // namespace
-
 namespace tfjs {
 namespace wasm {
 // We use C-style API to interface with Javascript.
@@ -204,14 +35,9 @@ void FusedBatchMatMul(const size_t a_id, const size_t* a_shape_ptr,
                       const bool transpose_a, const bool transpose_b,
                       const FusableActivation activation, const size_t bias_id,
                       const size_t prelu_weights_id, const size_t out_id) {
-  if (!transpose_a && !transpose_b && a_shape_ptr[0] == 1 &&
-      b_shape_ptr[0] == 1) {
-    xnn_matmul(a_id, a_shape_ptr, a_shape_len, b_id, b_shape_ptr, b_shape_len,
-               out_id);
-  } else {
-    slow_batch_matmul(a_id, a_shape_ptr, a_shape_len, b_id, b_shape_ptr,
-                      b_shape_len, transpose_a, transpose_b, out_id);
-  }
+  tfjs::wasm::batchMatMul(a_id, a_shape_ptr, a_shape_len, b_id, b_shape_ptr,
+                          b_shape_len, transpose_a, transpose_b, activation,
+                          bias_id, prelu_weights_id, out_id);
 }
 
 }  // extern "C"

From e2b152c57abc3c682a71ff2a1bffe058d8353320 Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Thu, 5 Mar 2020 08:03:13 -0500
Subject: [PATCH 08/35] create shared impl

---
 .../src/cc/kernels/BatchMatMul.cc             | 184 +-----------------
 tfjs-backend-wasm/src/index_test.ts           |   3 +-
 2 files changed, 9 insertions(+), 178 deletions(-)

diff --git a/tfjs-backend-wasm/src/cc/kernels/BatchMatMul.cc b/tfjs-backend-wasm/src/cc/kernels/BatchMatMul.cc
index e0417dbacaf..5318dbeb3aa 100644
--- a/tfjs-backend-wasm/src/cc/kernels/BatchMatMul.cc
+++ b/tfjs-backend-wasm/src/cc/kernels/BatchMatMul.cc
@@ -16,179 +16,11 @@
 #include <emscripten.h>
 #endif
 
-#include <xnnpack.h>
-#include <algorithm>
 #include <cstddef>
-#include <limits>
-#include <map>
-#include <tuple>
-#include <vector>
-
-#include "src/cc/backend.h"
-#include "src/cc/util.h"
 
+#include "src/cc/batchMatMul_impl.h"
 #include "src/cc/kernels/BatchMatMul.h"
 
-const size_t kBlockSize = 48;
-
-namespace {
-// We use std::tuple as the cache key as it implements the compare operator
-// needed for std::map.
-typedef std::tuple<size_t> OperatorCacheKey;
-
-// The operator cache maps the weights id to the xnn_operator_t instantiated for
-// this set of weights.
-std::map<OperatorCacheKey, xnn_operator_t> operator_cache;
-
-void delete_xnn_operator(const size_t weights_id) {
-  xnn_operator_t fully_connected_op = operator_cache.at(weights_id);
-  xnn_delete_operator(fully_connected_op);
-  tfjs::backend::xnn_operator_count--;
-
-  operator_cache.erase(weights_id);
-}
-
-void xnn_matmul(const size_t a_id, const size_t* a_shape_ptr,
-                const size_t a_shape_len, const size_t b_id,
-                const size_t* b_shape_ptr, const size_t b_shape_len,
-                const size_t out_id) {
-  auto& a_info = tfjs::backend::get_tensor_info(a_id);
-  auto& b_info = tfjs::backend::get_tensor_info(b_id);
-  auto& out_info = tfjs::backend::get_tensor_info_out(out_id);
-
-  const float* a_buf = a_info.f32();
-  const float* b_buf = b_info.f32();
-  float* out_buf = out_info.f32_write();
-
-  xnn_operator_t fully_connected_op = nullptr;
-
-  OperatorCacheKey cache_key = {b_id};
-
-  // We assume b is the weights and cache the xnn operator on it.
-  auto operator_cache_idx = operator_cache.find(cache_key);
-  if (operator_cache_idx == operator_cache.end()) {
-    const size_t input_channels = b_shape_ptr[1];
-    const size_t output_channels = b_shape_ptr[2];
-    const size_t input_stride = input_channels;
-    const size_t output_stride = output_channels;
-    const float* bias = nullptr;
-
-    const float output_min = -std::numeric_limits<float>::infinity();
-    const float output_max = std::numeric_limits<float>::infinity();
-
-    // XNNPack expects b to already be transposed. TensorFlow.js doesn't do this
-    // automatically so we have to tell XNNPack to do the transposing.
-    const uint32_t flags = XNN_FLAG_TRANSPOSE_WEIGHTS;
-    xnn_status status = xnn_create_fully_connected_nc_f32(
-        input_channels, output_channels, input_stride, output_stride, b_buf,
-        bias, output_min, output_max, flags, &fully_connected_op);
-    if (status != xnn_status_success) {
-      tfjs::util::warn(
-          "XNN status for xnn_create_fully_connected_nc_f32 is not successful. "
-          "Got status %d. Use -c dbg to see XNN logs.",
-          status);
-      return;
-    }
-
-    operator_cache.insert({cache_key, fully_connected_op});
-
-    tfjs::backend::register_disposal_callback(b_id, *delete_xnn_operator);
-
-    tfjs::backend::xnn_operator_count++;
-  } else {
-    fully_connected_op = operator_cache_idx->second;
-  }
-
-  const size_t batch_size = a_shape_ptr[1];
-  xnn_status status =
-      xnn_setup_fully_connected_nc_f32(fully_connected_op, batch_size, a_buf,
-                                       out_buf, nullptr /* thread pool */);
-  if (status != xnn_status_success) {
-    tfjs::util::warn(
-        "XNN status for xnn_setup_fully_connected_nc_f32 is not successful. "
-        "Got status %d. Use -c dbg to see XNN logs.",
-        status);
-    return;
-  }
-
-  xnn_run_operator(fully_connected_op, nullptr /* thread pool */);
-}
-
-void slow_batch_matmul(const size_t a_id, const size_t* a_shape_ptr,
-                       const size_t a_shape_len, const size_t b_id,
-                       const size_t* b_shape_ptr, const size_t b_shape_len,
-                       const bool transpose_a, const bool transpose_b,
-                       const size_t out_id) {
-  const size_t shared_dim = transpose_a ? a_shape_ptr[1] : a_shape_ptr[2];
-  const size_t left_dim = transpose_a ? a_shape_ptr[2] : a_shape_ptr[1];
-  const size_t right_dim = transpose_b ? b_shape_ptr[1] : b_shape_ptr[2];
-  const size_t batch_dim = a_shape_ptr[0];
-
-  std::vector<size_t> a_shape(a_shape_ptr, a_shape_ptr + a_shape_len);
-  std::vector<size_t> b_shape(b_shape_ptr, b_shape_ptr + b_shape_len);
-  const std::vector<size_t> a_strides = tfjs::util::compute_strides(a_shape);
-  const std::vector<size_t> b_strides = tfjs::util::compute_strides(b_shape);
-
-  size_t a_batch = a_strides[0];
-  size_t a_outer_step, a_inner_step;
-  if (transpose_a) {
-    a_outer_step = 1;
-    a_inner_step = a_strides[1];
-  } else {
-    a_outer_step = a_strides[1];
-    a_inner_step = 1;
-  }
-  size_t b_batch = b_strides[0];
-  size_t b_outer_step, b_inner_step;
-  if (transpose_b) {
-    b_outer_step = b_strides[1];
-    b_inner_step = 1;
-  } else {
-    b_outer_step = 1;
-    b_inner_step = b_strides[1];
-  }
-
-  auto& a_info = tfjs::backend::get_tensor_info(a_id);
-  auto& b_info = tfjs::backend::get_tensor_info(b_id);
-  auto& out_info = tfjs::backend::get_tensor_info_out(out_id);
-
-  const float* a_buf = a_info.f32();
-  const float* b_buf = b_info.f32();
-  float* out_buf = out_info.f32_write();
-
-  const size_t size = left_dim * right_dim;
-
-  // Zero out the output buffer because it might have been used before.
-  std::fill(out_buf, out_buf + batch_dim * size, 0);
-
-  for (size_t b = 0; b < batch_dim; ++b) {
-    for (size_t i0 = 0; i0 < left_dim; i0 += kBlockSize) {
-      for (size_t j0 = 0; j0 < right_dim; j0 += kBlockSize) {
-        for (size_t k0 = 0; k0 < shared_dim; k0 += kBlockSize) {
-          // for when kBlockSize doesn't evenly divide the input
-          const size_t i_block = std::min(i0 + kBlockSize, left_dim);
-          const size_t j_block = std::min(j0 + kBlockSize, right_dim);
-          const size_t k_block = std::min(k0 + kBlockSize, shared_dim);
-
-          for (size_t i = i0; i < i_block; ++i) {
-            for (size_t j = j0; j < j_block; ++j) {
-              float sum = 0.0;
-
-              for (size_t k = k0; k < k_block; ++k) {
-                sum +=
-                    a_buf[b * a_batch + i * a_outer_step + k * a_inner_step] *
-                    b_buf[k * b_inner_step + j * b_outer_step + b * b_batch];
-              }
-              out_buf[b * size + (i * right_dim + j)] += sum;
-            }
-          }
-        }
-      }
-    }
-  }
-}
-}  // namespace
-
 namespace tfjs {
 namespace wasm {
 // We use C-style API to interface with Javascript.
@@ -202,14 +34,12 @@ void BatchMatMul(const size_t a_id, const size_t* a_shape_ptr,
                  const size_t* b_shape_ptr, const size_t b_shape_len,
                  const bool transpose_a, const bool transpose_b,
                  const size_t out_id) {
-  if (!transpose_a && !transpose_b && a_shape_ptr[0] == 1 &&
-      b_shape_ptr[0] == 1) {
-    xnn_matmul(a_id, a_shape_ptr, a_shape_len, b_id, b_shape_ptr, b_shape_len,
-               out_id);
-  } else {
-    slow_batch_matmul(a_id, a_shape_ptr, a_shape_len, b_id, b_shape_ptr,
-                      b_shape_len, transpose_a, transpose_b, out_id);
-  }
+  const size_t bias_id = 0;
+  const size_t prelu_weights_id = 0;
+  const FusableActivation activation = FusableActivation::LINEAR;
+  tfjs::wasm::batchMatMul(a_id, a_shape_ptr, a_shape_len, b_id, b_shape_ptr,
+                          b_shape_len, transpose_a, transpose_b, activation,
+                          bias_id, prelu_weights_id, out_id);
 }
 
 }  // extern "C"
diff --git a/tfjs-backend-wasm/src/index_test.ts b/tfjs-backend-wasm/src/index_test.ts
index baa71b8ef2f..5b075a5fa93 100644
--- a/tfjs-backend-wasm/src/index_test.ts
+++ b/tfjs-backend-wasm/src/index_test.ts
@@ -107,7 +107,8 @@ describeWithFlags('wasm init', BROWSER_ENVS, () => {
     const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]);
     const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]);
 
-    const c = tf.fused.matMul({a, b});
+    // const c = tf.fused.matMul({a, b});
+    const c = tf.matMul(a, b);
     const data = await c.data();
     console.log(data);  // 0, 8, -3, 20
   });

From d9caed2ecc1e2d0f6db63ff7cd6b91d43f942fbb Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Thu, 5 Mar 2020 08:43:11 -0500
Subject: [PATCH 09/35] add activ

---
 tfjs-backend-wasm/src/cc/batchMatMul_impl.cc  | 34 ++++++++++++++-----
 tfjs-backend-wasm/src/index_test.ts           |  4 +--
 .../src/kernels/FusedBatchMatMul.ts           |  2 +-
 3 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/tfjs-backend-wasm/src/cc/batchMatMul_impl.cc b/tfjs-backend-wasm/src/cc/batchMatMul_impl.cc
index 359669012df..8002f52a2d6 100644
--- a/tfjs-backend-wasm/src/cc/batchMatMul_impl.cc
+++ b/tfjs-backend-wasm/src/cc/batchMatMul_impl.cc
@@ -51,7 +51,8 @@ void delete_xnn_operator(const size_t weights_id) {
 void xnn_matmul(const size_t a_id, const size_t* a_shape_ptr,
                 const size_t a_shape_len, const size_t b_id,
                 const size_t* b_shape_ptr, const size_t b_shape_len,
-                const size_t out_id) {
+                const size_t out_id, const float output_min,
+                const float output_max) {
   auto& a_info = tfjs::backend::get_tensor_info(a_id);
   auto& b_info = tfjs::backend::get_tensor_info(b_id);
   auto& out_info = tfjs::backend::get_tensor_info_out(out_id);
@@ -73,9 +74,6 @@ void xnn_matmul(const size_t a_id, const size_t* a_shape_ptr,
     const size_t output_stride = output_channels;
     const float* bias = nullptr;
 
-    const float output_min = -std::numeric_limits<float>::infinity();
-    const float output_max = std::numeric_limits<float>::infinity();
-
     // XNNPack expects b to already be transposed. TensorFlow.js doesn't do this
     // automatically so we have to tell XNNPack to do the transposing.
     const uint32_t flags = XNN_FLAG_TRANSPOSE_WEIGHTS;
@@ -118,7 +116,8 @@ void slow_batch_matmul(const size_t a_id, const size_t* a_shape_ptr,
                        const size_t a_shape_len, const size_t b_id,
                        const size_t* b_shape_ptr, const size_t b_shape_len,
                        const bool transpose_a, const bool transpose_b,
-                       const size_t out_id) {
+                       const size_t out_id, const float output_min,
+                       const float output_max) {
   const size_t shared_dim = transpose_a ? a_shape_ptr[1] : a_shape_ptr[2];
   const size_t left_dim = transpose_a ? a_shape_ptr[2] : a_shape_ptr[1];
   const size_t right_dim = transpose_b ? b_shape_ptr[1] : b_shape_ptr[2];
@@ -179,7 +178,10 @@ void slow_batch_matmul(const size_t a_id, const size_t* a_shape_ptr,
                     a_buf[b * a_batch + i * a_outer_step + k * a_inner_step] *
                     b_buf[k * b_inner_step + j * b_outer_step + b * b_batch];
               }
-              out_buf[b * size + (i * right_dim + j)] += sum;
+              size_t out_buf_index = b * size + (i * right_dim + j);
+              float current = out_buf[out_buf_index];
+              out_buf[out_buf_index] =
+                  std::max(std::min(current + sum, output_max), output_min);
             }
           }
         }
@@ -197,13 +199,29 @@ void batchMatMul(const size_t a_id, const size_t* a_shape_ptr,
                  const bool transpose_a, const bool transpose_b,
                  const FusableActivation activation, const size_t bias_id,
                  const size_t prelu_weights_id, const size_t out_id) {
+  FusableActivation clamp_method = activation;
+  if (activation == FusableActivation::PRELU) {
+    clamp_method = FusableActivation::LINEAR;
+  }
+
+  float output_min = -std::numeric_limits<float>::infinity();
+  float output_max = std::numeric_limits<float>::infinity();
+
+  if (activation == FusableActivation::RELU) {
+    output_min = 0;
+  } else if (activation == FusableActivation::RELU6) {
+    output_min = 0;
+    output_max = 6;
+  }
+
   if (!transpose_a && !transpose_b && a_shape_ptr[0] == 1 &&
       b_shape_ptr[0] == 1) {
     xnn_matmul(a_id, a_shape_ptr, a_shape_len, b_id, b_shape_ptr, b_shape_len,
-               out_id);
+               out_id, output_min, output_max);
   } else {
     slow_batch_matmul(a_id, a_shape_ptr, a_shape_len, b_id, b_shape_ptr,
-                      b_shape_len, transpose_a, transpose_b, out_id);
+                      b_shape_len, transpose_a, transpose_b, out_id, output_min,
+                      output_max);
   }
 }
 }  // namespace wasm
diff --git a/tfjs-backend-wasm/src/index_test.ts b/tfjs-backend-wasm/src/index_test.ts
index 5b075a5fa93..dc511c43ffd 100644
--- a/tfjs-backend-wasm/src/index_test.ts
+++ b/tfjs-backend-wasm/src/index_test.ts
@@ -107,8 +107,8 @@ describeWithFlags('wasm init', BROWSER_ENVS, () => {
     const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]);
     const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]);
 
-    // const c = tf.fused.matMul({a, b});
-    const c = tf.matMul(a, b);
+    const c = tf.fused.matMul({a, b, activation: 'relu'});
+    // const c = tf.matMul(a, b);
     const data = await c.data();
     console.log(data);  // 0, 8, -3, 20
   });
diff --git a/tfjs-backend-wasm/src/kernels/FusedBatchMatMul.ts b/tfjs-backend-wasm/src/kernels/FusedBatchMatMul.ts
index cceacbc0363..01b59a49fc9 100644
--- a/tfjs-backend-wasm/src/kernels/FusedBatchMatMul.ts
+++ b/tfjs-backend-wasm/src/kernels/FusedBatchMatMul.ts
@@ -114,7 +114,7 @@ function fusedBatchMatMul(args: {
 
   wasmFusedBatchMatMul(
       aId, aShapeBytes, a.shape.length, bId, bShapeBytes, b.shape.length,
-      transposeA, transposeB, activation, biasId, preluActivationWeightsId,
+      transposeA, transposeB, fusedActivation, biasId, preluActivationWeightsId,
       outId);
 
   return out;

From a97619de154761ee73a26969a4bcca84c9b8d416 Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Thu, 5 Mar 2020 09:57:45 -0500
Subject: [PATCH 10/35] add clamp to cache key

---
 tfjs-backend-wasm/src/cc/batchMatMul_impl.cc | 60 ++++++++++++++++----
 1 file changed, 50 insertions(+), 10 deletions(-)

diff --git a/tfjs-backend-wasm/src/cc/batchMatMul_impl.cc b/tfjs-backend-wasm/src/cc/batchMatMul_impl.cc
index 8002f52a2d6..7805937834a 100644
--- a/tfjs-backend-wasm/src/cc/batchMatMul_impl.cc
+++ b/tfjs-backend-wasm/src/cc/batchMatMul_impl.cc
@@ -18,10 +18,14 @@
 
 #include <xnnpack.h>
 #include <algorithm>
+#include <cmath>
 #include <cstddef>
 #include <limits>
 #include <map>
+#include <memory>
 #include <tuple>
+#include <unordered_map>
+#include <utility>
 #include <vector>
 
 #include "src/cc/backend.h"
@@ -34,25 +38,61 @@ const size_t kBlockSize = 48;
 namespace {
 // We use std::tuple as the cache key as it implements the compare operator
 // needed for std::map.
-typedef std::tuple<size_t> OperatorCacheKey;
+typedef std::tuple<size_t, size_t> OperatorCacheKey;
 
 // The operator cache maps the weights id to the xnn_operator_t instantiated for
 // this set of weights.
 std::map<OperatorCacheKey, xnn_operator_t> operator_cache;
 
-void delete_xnn_operator(const size_t weights_id) {
-  xnn_operator_t fully_connected_op = operator_cache.at(weights_id);
-  xnn_delete_operator(fully_connected_op);
-  tfjs::backend::xnn_operator_count--;
+std::unordered_map<size_t, std::vector<OperatorCacheKey>>
+    b_operator_cache_key_map;
+
+void erase_from_cache(const size_t tensor_id,
+                      std::unordered_map<size_t, std::vector<OperatorCacheKey>>&
+                          operator_cache_key_map) {
+  auto operator_cache_keys_idx = operator_cache_key_map.find(tensor_id);
+  if (operator_cache_keys_idx != operator_cache_key_map.end()) {
+    std::vector<OperatorCacheKey>& operator_cache_keys =
+        operator_cache_keys_idx->second;
+    for (auto& operator_cache_key : operator_cache_keys) {
+      auto operator_cache_key_idx = operator_cache.find(operator_cache_key);
+      if (operator_cache_key_idx != operator_cache.end()) {
+        auto& cached_op = operator_cache_key_idx->second;
+        xnn_delete_operator(cached_op);
+        tfjs::backend::xnn_operator_count--;
+
+        operator_cache.erase(operator_cache_key);
+      }
+    }
+    operator_cache_key_map.erase(tensor_id);
+  }
+}
 
-  operator_cache.erase(weights_id);
+void delete_xnn_operators(const size_t weights_id) {
+  erase_from_cache(weights_id, b_operator_cache_key_map);
+}
+
+void associate_tensor_with_key(
+    const size_t tensor_id, const OperatorCacheKey& cache_key,
+    std::unordered_map<size_t, std::vector<OperatorCacheKey>>&
+        operator_cache_key_map) {
+  auto cache_keys_idx = operator_cache_key_map.find(tensor_id);
+  if (cache_keys_idx == operator_cache_key_map.end()) {
+    std::vector<OperatorCacheKey> cache_keys = {cache_key};
+    operator_cache_key_map.emplace(tensor_id, std::move(cache_keys));
+    tfjs::backend::register_disposal_callback(tensor_id, *delete_xnn_operators);
+
+  } else {
+    auto& cache_keys = operator_cache_key_map.at(tensor_id);
+    cache_keys.emplace_back(cache_key);
+  }
 }
 
 void xnn_matmul(const size_t a_id, const size_t* a_shape_ptr,
                 const size_t a_shape_len, const size_t b_id,
                 const size_t* b_shape_ptr, const size_t b_shape_len,
                 const size_t out_id, const float output_min,
-                const float output_max) {
+                const float output_max, const size_t clamp_method) {
   auto& a_info = tfjs::backend::get_tensor_info(a_id);
   auto& b_info = tfjs::backend::get_tensor_info(b_id);
   auto& out_info = tfjs::backend::get_tensor_info_out(out_id);
@@ -63,7 +103,7 @@ void xnn_matmul(const size_t a_id, const size_t* a_shape_ptr,
 
   xnn_operator_t fully_connected_op = nullptr;
 
-  OperatorCacheKey cache_key = {b_id};
+  OperatorCacheKey cache_key = {b_id, clamp_method};
 
   // We assume b is the weights and cache the xnn operator on it.
   auto operator_cache_idx = operator_cache.find(cache_key);
@@ -90,7 +130,7 @@ void xnn_matmul(const size_t a_id, const size_t* a_shape_ptr,
 
     operator_cache.insert({cache_key, fully_connected_op});
 
-    tfjs::backend::register_disposal_callback(b_id, *delete_xnn_operator);
+    associate_tensor_with_key(b_id, cache_key, b_operator_cache_key_map);
 
     tfjs::backend::xnn_operator_count++;
   } else {
@@ -217,7 +257,7 @@ void batchMatMul(const size_t a_id, const size_t* a_shape_ptr,
   if (!transpose_a && !transpose_b && a_shape_ptr[0] == 1 &&
       b_shape_ptr[0] == 1) {
     xnn_matmul(a_id, a_shape_ptr, a_shape_len, b_id, b_shape_ptr, b_shape_len,
-               out_id, output_min, output_max);
+               out_id, output_min, output_max, clamp_method);
   } else {
     slow_batch_matmul(a_id, a_shape_ptr, a_shape_len, b_id, b_shape_ptr,
                       b_shape_len, transpose_a, transpose_b, out_id, output_min,

From 649403c65429fe55b33180faf3d31eec7b1fa317 Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Thu, 5 Mar 2020 10:03:41 -0500
Subject: [PATCH 11/35] prelu

---
 tfjs-backend-wasm/src/cc/batchMatMul_impl.cc | 7 +++++++
 tfjs-backend-wasm/src/index_test.ts          | 4 +++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/tfjs-backend-wasm/src/cc/batchMatMul_impl.cc b/tfjs-backend-wasm/src/cc/batchMatMul_impl.cc
index 7805937834a..ce64d2e61e3 100644
--- a/tfjs-backend-wasm/src/cc/batchMatMul_impl.cc
+++ b/tfjs-backend-wasm/src/cc/batchMatMul_impl.cc
@@ -29,6 +29,7 @@
 #include <vector>
 
 #include "src/cc/backend.h"
+#include "src/cc/prelu_impl.h"
 #include "src/cc/util.h"
 
 #include "src/cc/batchMatMul_impl.h"
@@ -263,6 +264,12 @@ void batchMatMul(const size_t a_id, const size_t* a_shape_ptr,
                       b_shape_len, transpose_a, transpose_b, out_id, output_min,
                       output_max);
   }
+
+  auto& out_info = backend::get_tensor_info_out(out_id);
+  float* out_buf = out_info.f32_write();
+  if (activation == FusableActivation::PRELU) {
+    prelu(out_buf, out_info.size, prelu_weights_id, out_id);
+  }
 }
 }  // namespace wasm
 }  // namespace tfjs
diff --git a/tfjs-backend-wasm/src/index_test.ts b/tfjs-backend-wasm/src/index_test.ts
index dc511c43ffd..9a8d8163c12 100644
--- a/tfjs-backend-wasm/src/index_test.ts
+++ b/tfjs-backend-wasm/src/index_test.ts
@@ -106,8 +106,10 @@ describeWithFlags('wasm init', BROWSER_ENVS, () => {
   fit('fused batch mm', async () => {
     const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]);
     const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]);
+    const alpha = tf.tensor2d([0.5, 0.5], [1, 2]);
 
-    const c = tf.fused.matMul({a, b, activation: 'relu'});
+    const c = tf.fused.matMul(
+        {a, b, activation: 'prelu', preluActivationWeights: alpha});
     // const c = tf.matMul(a, b);
     const data = await c.data();
     console.log(data);  // 0, 8, -3, 20

From 698a93c8ddac61e8607de24537159546f02184c6 Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Thu, 5 Mar 2020 10:22:30 -0500
Subject: [PATCH 12/35] xnn bias

---
 tfjs-backend-wasm/src/cc/batchMatMul_impl.cc | 39 ++++++++++++++------
 tfjs-backend-wasm/src/index_test.ts          | 16 ++++++++
 2 files changed, 44 insertions(+), 11 deletions(-)

diff --git a/tfjs-backend-wasm/src/cc/batchMatMul_impl.cc b/tfjs-backend-wasm/src/cc/batchMatMul_impl.cc
index ce64d2e61e3..dd1d97f4d73 100644
--- a/tfjs-backend-wasm/src/cc/batchMatMul_impl.cc
+++ b/tfjs-backend-wasm/src/cc/batchMatMul_impl.cc
@@ -39,7 +39,7 @@ const size_t kBlockSize = 48;
 namespace {
 // We use std::tuple as the cache key as it implements the compare operator
 // needed for std::map.
-typedef std::tuple<size_t, size_t> OperatorCacheKey;
+typedef std::tuple<size_t, size_t, size_t> OperatorCacheKey;
 
 // The operator cache maps the weights id to the xnn_operator_t instantiated for
 // this set of weights.
@@ -48,6 +48,9 @@ std::map<OperatorCacheKey, xnn_operator_t> operator_cache;
 std::unordered_map<size_t, std::vector<OperatorCacheKey>>
     b_operator_cache_key_map;
 
+std::unordered_map<size_t, std::vector<OperatorCacheKey>>
+    bias_operator_cache_key_map;
+
 void erase_from_cache(const size_t tensor_id,
                       std::unordered_map<size_t, std::vector<OperatorCacheKey>>&
                           operator_cache_key_map) {
@@ -92,8 +95,9 @@ void associate_tensor_with_key(
 void xnn_matmul(const size_t a_id, const size_t* a_shape_ptr,
                 const size_t a_shape_len, const size_t b_id,
                 const size_t* b_shape_ptr, const size_t b_shape_len,
-                const size_t out_id, const float output_min,
-                const float output_max, const size_t clamp_method) {
+                const size_t out_id, const size_t bias_id,
+                const float output_min, const float output_max,
+                const size_t clamp_method) {
   auto& a_info = tfjs::backend::get_tensor_info(a_id);
   auto& b_info = tfjs::backend::get_tensor_info(b_id);
   auto& out_info = tfjs::backend::get_tensor_info_out(out_id);
@@ -102,9 +106,14 @@ void xnn_matmul(const size_t a_id, const size_t* a_shape_ptr,
   const float* b_buf = b_info.f32();
   float* out_buf = out_info.f32_write();
 
+  const float* bias_buf = nullptr;
+  if (bias_id != 0) {
+    bias_buf = tfjs::backend::get_tensor_info_out(bias_id).f32();
+  }
+
   xnn_operator_t fully_connected_op = nullptr;
 
-  OperatorCacheKey cache_key = {b_id, clamp_method};
+  OperatorCacheKey cache_key = {b_id, bias_id, clamp_method};
 
   // We assume b is the weights and cache the xnn operator on it.
   auto operator_cache_idx = operator_cache.find(cache_key);
@@ -113,14 +122,13 @@ void xnn_matmul(const size_t a_id, const size_t* a_shape_ptr,
     const size_t output_channels = b_shape_ptr[2];
     const size_t input_stride = input_channels;
     const size_t output_stride = output_channels;
-    const float* bias = nullptr;
 
     // XNNPack expects b to already be transposed. TensorFlow.js doesn't do this
     // automatically so we have to tell XNNPack to do the transposing.
     const uint32_t flags = XNN_FLAG_TRANSPOSE_WEIGHTS;
     xnn_status status = xnn_create_fully_connected_nc_f32(
         input_channels, output_channels, input_stride, output_stride, b_buf,
-        bias, output_min, output_max, flags, &fully_connected_op);
+        bias_buf, output_min, output_max, flags, &fully_connected_op);
     if (status != xnn_status_success) {
       tfjs::util::warn(
           "XNN status for xnn_create_fully_connected_nc_f32 is not successful. "
@@ -132,6 +140,10 @@ void xnn_matmul(const size_t a_id, const size_t* a_shape_ptr,
     operator_cache.insert({cache_key, fully_connected_op});
 
     associate_tensor_with_key(b_id, cache_key, b_operator_cache_key_map);
+    if (bias_id != 0) {
+      associate_tensor_with_key(bias_id, cache_key,
+                                bias_operator_cache_key_map);
+    }
 
     tfjs::backend::xnn_operator_count++;
   } else {
@@ -157,8 +169,8 @@ void slow_batch_matmul(const size_t a_id, const size_t* a_shape_ptr,
                        const size_t a_shape_len, const size_t b_id,
                        const size_t* b_shape_ptr, const size_t b_shape_len,
                        const bool transpose_a, const bool transpose_b,
-                       const size_t out_id, const float output_min,
-                       const float output_max) {
+                       const size_t out_id, const size_t bias_id,
+                       const float output_min, const float output_max) {
   const size_t shared_dim = transpose_a ? a_shape_ptr[1] : a_shape_ptr[2];
   const size_t left_dim = transpose_a ? a_shape_ptr[2] : a_shape_ptr[1];
   const size_t right_dim = transpose_b ? b_shape_ptr[1] : b_shape_ptr[2];
@@ -196,6 +208,11 @@ void slow_batch_matmul(const size_t a_id, const size_t* a_shape_ptr,
   const float* b_buf = b_info.f32();
   float* out_buf = out_info.f32_write();
 
+  const float* bias_buf = nullptr;
+  if (bias_id != 0) {
+    bias_buf = tfjs::backend::get_tensor_info_out(bias_id).f32();
+  }
+
   const size_t size = left_dim * right_dim;
 
   // Zero out the output buffer because it might have been used before.
@@ -258,11 +275,11 @@ void batchMatMul(const size_t a_id, const size_t* a_shape_ptr,
   if (!transpose_a && !transpose_b && a_shape_ptr[0] == 1 &&
       b_shape_ptr[0] == 1) {
     xnn_matmul(a_id, a_shape_ptr, a_shape_len, b_id, b_shape_ptr, b_shape_len,
-               out_id, output_min, output_max, clamp_method);
+               out_id, bias_id, output_min, output_max, clamp_method);
   } else {
     slow_batch_matmul(a_id, a_shape_ptr, a_shape_len, b_id, b_shape_ptr,
-                      b_shape_len, transpose_a, transpose_b, out_id, output_min,
-                      output_max);
+                      b_shape_len, transpose_a, transpose_b, out_id, bias_id,
+                      output_min, output_max);
   }
 
   auto& out_info = backend::get_tensor_info_out(out_id);
diff --git a/tfjs-backend-wasm/src/index_test.ts b/tfjs-backend-wasm/src/index_test.ts
index 9a8d8163c12..894c2002c91 100644
--- a/tfjs-backend-wasm/src/index_test.ts
+++ b/tfjs-backend-wasm/src/index_test.ts
@@ -114,4 +114,20 @@ describeWithFlags('wasm init', BROWSER_ENVS, () => {
     const data = await c.data();
     console.log(data);  // 0, 8, -3, 20
   });
+
+  fit('fused batch mm with bias', async () => {
+    const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]);
+    const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]);
+    const c = tf.tensor1d([1, 1]);
+    const transposeA = false;
+    const transposeB = false;
+
+    const d = tf.fused.matMul(
+        {a, b, transposeA, transposeB, bias: c, activation: 'relu'});
+
+    expect(d.shape).toEqual([2, 2]);
+    const data = await d.data();
+    console.log(data);
+    // expectArraysClose(await d.data(), [1, 9, 0, 21]);
+  });
 });

From 1d259362dc56627feace36601ce3ffad5b0d8c88 Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Thu, 5 Mar 2020 10:50:03 -0500
Subject: [PATCH 13/35] add bias to slow matmul

---
 tfjs-backend-wasm/src/cc/batchMatMul_impl.cc | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tfjs-backend-wasm/src/cc/batchMatMul_impl.cc b/tfjs-backend-wasm/src/cc/batchMatMul_impl.cc
index dd1d97f4d73..53f8365eb15 100644
--- a/tfjs-backend-wasm/src/cc/batchMatMul_impl.cc
+++ b/tfjs-backend-wasm/src/cc/batchMatMul_impl.cc
@@ -236,10 +236,12 @@ void slow_batch_matmul(const size_t a_id, const size_t* a_shape_ptr,
                     a_buf[b * a_batch + i * a_outer_step + k * a_inner_step] *
                     b_buf[k * b_inner_step + j * b_outer_step + b * b_batch];
               }
-              size_t out_buf_index = b * size + (i * right_dim + j);
+              size_t innermost_dim = i * right_dim + j;
+              size_t out_buf_index = b * size + innermost_dim;
               float current = out_buf[out_buf_index];
-              out_buf[out_buf_index] =
-                  std::max(std::min(current + sum, output_max), output_min);
+              out_buf[out_buf_index] = std::max(
+                  std::min(current + sum + bias_buf[innermost_dim], output_max),
+                  output_min);
             }
           }
         }

From 08932125e9cd7498517baf71ea59dcccfc274f46 Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Thu, 5 Mar 2020 13:05:32 -0500
Subject: [PATCH 14/35] broadcast

---
 tfjs-backend-wasm/src/cc/batchMatMul_impl.cc | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tfjs-backend-wasm/src/cc/batchMatMul_impl.cc b/tfjs-backend-wasm/src/cc/batchMatMul_impl.cc
index 53f8365eb15..e98f5f71c29 100644
--- a/tfjs-backend-wasm/src/cc/batchMatMul_impl.cc
+++ b/tfjs-backend-wasm/src/cc/batchMatMul_impl.cc
@@ -209,8 +209,9 @@ void slow_batch_matmul(const size_t a_id, const size_t* a_shape_ptr,
   float* out_buf = out_info.f32_write();
 
   const float* bias_buf = nullptr;
+  auto& bias_info = tfjs::backend::get_tensor_info_out(bias_id);
   if (bias_id != 0) {
-    bias_buf = tfjs::backend::get_tensor_info_out(bias_id).f32();
+    bias_buf = bias_info.f32();
   }
 
   const size_t size = left_dim * right_dim;
@@ -239,8 +240,12 @@ void slow_batch_matmul(const size_t a_id, const size_t* a_shape_ptr,
               size_t innermost_dim = i * right_dim + j;
               size_t out_buf_index = b * size + innermost_dim;
               float current = out_buf[out_buf_index];
+
+              // Handles 1D broadcasting.
+              size_t bias_index = std::min(innermost_dim, bias_info.size - 1);
+
               out_buf[out_buf_index] = std::max(
-                  std::min(current + sum + bias_buf[innermost_dim], output_max),
+                  std::min(current + sum + bias_buf[bias_index], output_max),
                   output_min);
             }
           }

From 31bb2ace4648679d56b4e3345845fc030bfacfc0 Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Thu, 5 Mar 2020 13:32:17 -0500
Subject: [PATCH 15/35] localize type

---
 tfjs-backend-wasm/src/cc/BUILD                         |  3 +++
 tfjs-backend-wasm/src/cc/backend.h                     |  3 +++
 tfjs-backend-wasm/src/cc/batchMatMul_impl.h            |  2 --
 tfjs-backend-wasm/src/cc/conv2d_impl.h                 |  5 ++---
 tfjs-backend-wasm/src/cc/kernels/BatchMatMul.cc        |  1 +
 tfjs-backend-wasm/src/cc/kernels/Conv2D.cc             |  2 +-
 .../src/cc/kernels/DepthwiseConv2dNative.cc            |  1 +
 tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.cc   |  1 +
 .../src/cc/kernels/FusedDepthwiseConv2D.cc             |  1 +
 .../src/cc/kernels/FusedDepthwiseConv2D.h              |  2 +-
 tfjs-backend-wasm/src/kernels/FusedBatchMatMul.ts      | 10 ++--------
 tfjs-backend-wasm/src/kernels/FusedConv2D.ts           | 10 ++--------
 tfjs-backend-wasm/src/kernels/FusedDepthwiseConv2D.ts  |  9 +--------
 tfjs-backend-wasm/src/kernels/types.ts                 |  8 ++++++++
 14 files changed, 27 insertions(+), 31 deletions(-)

diff --git a/tfjs-backend-wasm/src/cc/BUILD b/tfjs-backend-wasm/src/cc/BUILD
index b6a34763ce8..eaa4f2d9c3c 100644
--- a/tfjs-backend-wasm/src/cc/BUILD
+++ b/tfjs-backend-wasm/src/cc/BUILD
@@ -317,6 +317,7 @@ tfjs_cc_library(
     srcs = ["kernels/Conv2D.cc"],
     hdrs = ["kernels/Conv2D.h"],
     deps = [
+        ":backend",
         ":conv2d_impl",
     ],
 )
@@ -344,6 +345,7 @@ tfjs_cc_library(
     srcs = ["kernels/DepthwiseConv2dNative.cc"],
     hdrs = ["kernels/DepthwiseConv2dNative.h"],
     deps = [
+        ":backend",
         ":conv2d_impl",
     ],
 )
@@ -416,6 +418,7 @@ tfjs_cc_library(
     srcs = ["kernels/FusedDepthwiseConv2D.cc"],
     hdrs = ["kernels/FusedDepthwiseConv2D.h"],
     deps = [
+        ":backend",
         ":conv2d_impl",
     ],
 )
diff --git a/tfjs-backend-wasm/src/cc/backend.h b/tfjs-backend-wasm/src/cc/backend.h
index f730bd120ea..581066ec8ef 100644
--- a/tfjs-backend-wasm/src/cc/backend.h
+++ b/tfjs-backend-wasm/src/cc/backend.h
@@ -27,6 +27,9 @@ enum DType {
   complex64 = 4,
 };
 
+// Must match enum in kernels/types.ts.
+enum FusableActivation { LINEAR = 0, RELU = 1, RELU6 = 2, PRELU = 3 };
+
 // Holds the memory offset and the size of a tensor.
 struct TensorInfo {
   // Pointer to the bytes where the data is allocated.
diff --git a/tfjs-backend-wasm/src/cc/batchMatMul_impl.h b/tfjs-backend-wasm/src/cc/batchMatMul_impl.h
index c69c204113a..73a16b95a65 100644
--- a/tfjs-backend-wasm/src/cc/batchMatMul_impl.h
+++ b/tfjs-backend-wasm/src/cc/batchMatMul_impl.h
@@ -20,8 +20,6 @@
 namespace tfjs {
 namespace wasm {
 
-enum FusableActivation { LINEAR = 0, RELU = 1, RELU6 = 2, PRELU = 3 };
-
 void batchMatMul(const size_t a_id, const size_t* a_shape_ptr,
                  const size_t a_shape_len, const size_t b_id,
                  const size_t* b_shape_ptr, const size_t b_shape_len,
diff --git a/tfjs-backend-wasm/src/cc/conv2d_impl.h b/tfjs-backend-wasm/src/cc/conv2d_impl.h
index 1836362b5aa..9d796f8fd94 100644
--- a/tfjs-backend-wasm/src/cc/conv2d_impl.h
+++ b/tfjs-backend-wasm/src/cc/conv2d_impl.h
@@ -17,12 +17,11 @@
 
 #include <cstddef>
 
+#include "src/cc/backend.h"
+
 namespace tfjs {
 namespace wasm {
 
-// Must match enum in FusedConv2D.ts.
-enum FusableActivation { LINEAR = 0, RELU = 1, RELU6 = 2, PRELU = 3 };
-
 void conv2d(const size_t x_id, const size_t batch_size,
             const size_t input_height, const size_t input_width,
             const size_t filter_id, const size_t filter_height,
diff --git a/tfjs-backend-wasm/src/cc/kernels/BatchMatMul.cc b/tfjs-backend-wasm/src/cc/kernels/BatchMatMul.cc
index 5318dbeb3aa..aa95cfc680e 100644
--- a/tfjs-backend-wasm/src/cc/kernels/BatchMatMul.cc
+++ b/tfjs-backend-wasm/src/cc/kernels/BatchMatMul.cc
@@ -18,6 +18,7 @@
 
 #include <cstddef>
 
+#include "src/cc/backend.h"
 #include "src/cc/batchMatMul_impl.h"
 #include "src/cc/kernels/BatchMatMul.h"
 
diff --git a/tfjs-backend-wasm/src/cc/kernels/Conv2D.cc b/tfjs-backend-wasm/src/cc/kernels/Conv2D.cc
index 552c9e93521..a3b9a973cd7 100644
--- a/tfjs-backend-wasm/src/cc/kernels/Conv2D.cc
+++ b/tfjs-backend-wasm/src/cc/kernels/Conv2D.cc
@@ -20,7 +20,7 @@
 
 #include <cstddef>
 
-#include "src/cc/conv2d_impl.h"
+#include "src/cc/backend.h"
 
 namespace tfjs {
 namespace wasm {
diff --git a/tfjs-backend-wasm/src/cc/kernels/DepthwiseConv2dNative.cc b/tfjs-backend-wasm/src/cc/kernels/DepthwiseConv2dNative.cc
index 621690197d4..a9f3cc9adb6 100644
--- a/tfjs-backend-wasm/src/cc/kernels/DepthwiseConv2dNative.cc
+++ b/tfjs-backend-wasm/src/cc/kernels/DepthwiseConv2dNative.cc
@@ -18,6 +18,7 @@
 
 #include <cstddef>
 
+#include "src/cc/backend.h"
 #include "src/cc/conv2d_impl.h"
 #include "src/cc/kernels/DepthwiseConv2dNative.h"
 
diff --git a/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.cc b/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.cc
index fa5783df4cb..31c6dbbc6d2 100644
--- a/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.cc
+++ b/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.cc
@@ -18,6 +18,7 @@
 
 #include <cstddef>
 
+#include "src/cc/backend.h"
 #include "src/cc/batchMatMul_impl.h"
 #include "src/cc/kernels/FusedBatchMatMul.h"
 
diff --git a/tfjs-backend-wasm/src/cc/kernels/FusedDepthwiseConv2D.cc b/tfjs-backend-wasm/src/cc/kernels/FusedDepthwiseConv2D.cc
index 5d8b55453a6..e796e9b8043 100644
--- a/tfjs-backend-wasm/src/cc/kernels/FusedDepthwiseConv2D.cc
+++ b/tfjs-backend-wasm/src/cc/kernels/FusedDepthwiseConv2D.cc
@@ -20,6 +20,7 @@
 
 #include "src/cc/kernels/FusedDepthwiseConv2D.h"
 
+#include "src/cc/backend.h"
 #include "src/cc/conv2d_impl.h"
 
 namespace tfjs {
diff --git a/tfjs-backend-wasm/src/cc/kernels/FusedDepthwiseConv2D.h b/tfjs-backend-wasm/src/cc/kernels/FusedDepthwiseConv2D.h
index 34bacbbe4e2..4a876d349da 100644
--- a/tfjs-backend-wasm/src/cc/kernels/FusedDepthwiseConv2D.h
+++ b/tfjs-backend-wasm/src/cc/kernels/FusedDepthwiseConv2D.h
@@ -17,7 +17,7 @@
 
 #include <cstddef>
 
-#include "src/cc/conv2d_impl.h"
+#include "src/cc/backend.h"
 
 namespace tfjs {
 
diff --git a/tfjs-backend-wasm/src/kernels/FusedBatchMatMul.ts b/tfjs-backend-wasm/src/kernels/FusedBatchMatMul.ts
index 01b59a49fc9..0b0253df2b1 100644
--- a/tfjs-backend-wasm/src/kernels/FusedBatchMatMul.ts
+++ b/tfjs-backend-wasm/src/kernels/FusedBatchMatMul.ts
@@ -19,6 +19,8 @@ import {NamedAttrMap, NamedTensorInfoMap, registerKernel, TensorInfo} from '@ten
 
 import {BackendWasm} from '../backend_wasm';
 
+import {FusableActivation} from './types';
+
 interface FusedBatchMatMulInputs extends NamedTensorInfoMap {
   a: TensorInfo;
   b: TensorInfo;
@@ -32,14 +34,6 @@ interface FusedBatchMatMulAttrs extends NamedAttrMap {
   activation: FusableActivation;
 }
 
-// Must match enum in batchMatMul_impl.h.
-enum FusableActivation {
-  linear = 0,
-  relu = 1,
-  relu6 = 2,
-  prelu = 3
-}
-
 let wasmFusedBatchMatMul: (
     aId: number, aShape: Uint8Array, aShapeSize: number, bId: number,
     bShape: Uint8Array, bShapeSize: number, transposeA: boolean,
diff --git a/tfjs-backend-wasm/src/kernels/FusedConv2D.ts b/tfjs-backend-wasm/src/kernels/FusedConv2D.ts
index cf3c70e9fe3..7fd7054c2fd 100644
--- a/tfjs-backend-wasm/src/kernels/FusedConv2D.ts
+++ b/tfjs-backend-wasm/src/kernels/FusedConv2D.ts
@@ -19,6 +19,8 @@ import {backend_util, KernelFunc, NamedTensorInfoMap, registerKernel, TensorInfo
 
 import {BackendWasm} from '../backend_wasm';
 
+import {FusableActivation} from './types';
+
 interface FusedConv2DInputs extends NamedTensorInfoMap {
   x: TensorInfo;
   filter: TensorInfo;
@@ -61,14 +63,6 @@ function setup(backend: BackendWasm) {
   ]);
 }
 
-// Must match enum in conv2d_impl.h.
-enum FusableActivation {
-  linear = 0,
-  relu = 1,
-  relu6 = 2,
-  prelu = 3
-}
-
 function fusedConv2d(args: {
   inputs: FusedConv2DInputs,
   backend: BackendWasm,
diff --git a/tfjs-backend-wasm/src/kernels/FusedDepthwiseConv2D.ts b/tfjs-backend-wasm/src/kernels/FusedDepthwiseConv2D.ts
index ba482015887..03d28d6e172 100644
--- a/tfjs-backend-wasm/src/kernels/FusedDepthwiseConv2D.ts
+++ b/tfjs-backend-wasm/src/kernels/FusedDepthwiseConv2D.ts
@@ -18,6 +18,7 @@
 import {backend_util, KernelFunc, NamedTensorInfoMap, registerKernel, TensorInfo} from '@tensorflow/tfjs-core';
 
 import {BackendWasm} from '../backend_wasm';
+import {FusableActivation} from './types';
 
 interface FusedDepthwiseConv2DInputs extends NamedTensorInfoMap {
   x: TensorInfo;
@@ -62,14 +63,6 @@ function setup(backend: BackendWasm) {
       ]);
 }
 
-// Must match enum in conv2d_impl.h.
-enum FusableActivation {
-  linear = 0,
-  relu = 1,
-  relu6 = 2,
-  prelu = 3
-}
-
 function fusedDepthwiseConv2d(args: {
   inputs: FusedDepthwiseConv2DInputs,
   backend: BackendWasm,
diff --git a/tfjs-backend-wasm/src/kernels/types.ts b/tfjs-backend-wasm/src/kernels/types.ts
index 2a778e4aa30..d13b2e0b871 100644
--- a/tfjs-backend-wasm/src/kernels/types.ts
+++ b/tfjs-backend-wasm/src/kernels/types.ts
@@ -23,3 +23,11 @@ export enum CppDType {
   string = 3,
   complex64 = 4
 }
+
+// Must match enum in cc/fusable_activations.h.
+export enum FusableActivation {
+  linear = 0,
+  relu = 1,
+  relu6 = 2,
+  prelu = 3
+}

From 0d528e45c4b8c666d97bd851b8e8edfc273446ed Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Thu, 5 Mar 2020 14:04:20 -0500
Subject: [PATCH 16/35] rename batchMatMul_impl to batch_mat_mul_impl

---
 tfjs-backend-wasm/src/cc/BUILD                         | 10 +++++-----
 .../cc/{batchMatMul_impl.cc => batch_mat_mul_impl.cc}  |  2 +-
 .../cc/{batchMatMul_impl.h => batch_mat_mul_impl.h}    |  0
 tfjs-backend-wasm/src/cc/kernels/BatchMatMul.cc        |  2 +-
 tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.cc   |  2 +-
 tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.h    |  2 --
 6 files changed, 8 insertions(+), 10 deletions(-)
 rename tfjs-backend-wasm/src/cc/{batchMatMul_impl.cc => batch_mat_mul_impl.cc} (99%)
 rename tfjs-backend-wasm/src/cc/{batchMatMul_impl.h => batch_mat_mul_impl.h} (100%)

diff --git a/tfjs-backend-wasm/src/cc/BUILD b/tfjs-backend-wasm/src/cc/BUILD
index eaa4f2d9c3c..8e6f8646597 100644
--- a/tfjs-backend-wasm/src/cc/BUILD
+++ b/tfjs-backend-wasm/src/cc/BUILD
@@ -95,9 +95,9 @@ tfjs_cc_library(
 )
 
 tfjs_cc_library(
-    name = "batchMatMul_impl",
-    srcs = ["batchMatMul_impl.cc"],
-    hdrs = ["batchMatMul_impl.h"],
+    name = "batch_mat_mul_impl",
+    srcs = ["batch_mat_mul_impl.cc"],
+    hdrs = ["batch_mat_mul_impl.h"],
     deps = [
         ":backend",
         ":prelu_impl",
@@ -271,7 +271,7 @@ tfjs_cc_library(
     deps = [
         ":backend",
         ":util",
-        ":batchMatMul_impl",
+        ":batch_mat_mul_impl",
     ],
 )
 
@@ -282,7 +282,7 @@ tfjs_cc_library(
     deps = [
         ":backend",
         ":util",
-        ":batchMatMul_impl",
+        ":batch_mat_mul_impl",
     ],
 )
 
diff --git a/tfjs-backend-wasm/src/cc/batchMatMul_impl.cc b/tfjs-backend-wasm/src/cc/batch_mat_mul_impl.cc
similarity index 99%
rename from tfjs-backend-wasm/src/cc/batchMatMul_impl.cc
rename to tfjs-backend-wasm/src/cc/batch_mat_mul_impl.cc
index e98f5f71c29..b80d60294a1 100644
--- a/tfjs-backend-wasm/src/cc/batchMatMul_impl.cc
+++ b/tfjs-backend-wasm/src/cc/batch_mat_mul_impl.cc
@@ -32,7 +32,7 @@
 #include "src/cc/prelu_impl.h"
 #include "src/cc/util.h"
 
-#include "src/cc/batchMatMul_impl.h"
+#include "src/cc/batch_mat_mul_impl.h"
 
 const size_t kBlockSize = 48;
 
diff --git a/tfjs-backend-wasm/src/cc/batchMatMul_impl.h b/tfjs-backend-wasm/src/cc/batch_mat_mul_impl.h
similarity index 100%
rename from tfjs-backend-wasm/src/cc/batchMatMul_impl.h
rename to tfjs-backend-wasm/src/cc/batch_mat_mul_impl.h
diff --git a/tfjs-backend-wasm/src/cc/kernels/BatchMatMul.cc b/tfjs-backend-wasm/src/cc/kernels/BatchMatMul.cc
index aa95cfc680e..93f1e3c0507 100644
--- a/tfjs-backend-wasm/src/cc/kernels/BatchMatMul.cc
+++ b/tfjs-backend-wasm/src/cc/kernels/BatchMatMul.cc
@@ -19,7 +19,7 @@
 #include <cstddef>
 
 #include "src/cc/backend.h"
-#include "src/cc/batchMatMul_impl.h"
+#include "src/cc/batch_mat_mul_impl.h"
 #include "src/cc/kernels/BatchMatMul.h"
 
 namespace tfjs {
diff --git a/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.cc b/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.cc
index 31c6dbbc6d2..6ac640f76df 100644
--- a/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.cc
+++ b/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.cc
@@ -19,7 +19,7 @@
 #include <cstddef>
 
 #include "src/cc/backend.h"
-#include "src/cc/batchMatMul_impl.h"
+#include "src/cc/batch_mat_mul_impl.h"
 #include "src/cc/kernels/FusedBatchMatMul.h"
 
 namespace tfjs {
diff --git a/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.h b/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.h
index 6af656c47b3..bc88ea75493 100644
--- a/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.h
+++ b/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.h
@@ -17,8 +17,6 @@
 
 #include <cstddef>
 
-#include "src/cc/batchMatMul_impl.h"
-
 namespace tfjs {
 namespace wasm {
 extern "C" {

From 01be763496971a9fb19e232bd6b9a96225bd664c Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Thu, 5 Mar 2020 14:20:53 -0500
Subject: [PATCH 17/35] add tests

---
 tfjs-backend-wasm/src/cc/BUILD                |  8 ++
 .../src/cc/kernels/FusedBatchMatMul_test.cc   | 93 +++++++++++++++++++
 .../src/cc/kernels/FusedConv2D_test.cc        |  7 +-
 .../cc/kernels/FusedDepthwiseConv2D_test.cc   |  9 +-
 4 files changed, 106 insertions(+), 11 deletions(-)
 create mode 100644 tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul_test.cc

diff --git a/tfjs-backend-wasm/src/cc/BUILD b/tfjs-backend-wasm/src/cc/BUILD
index 8e6f8646597..040fd9a51cf 100644
--- a/tfjs-backend-wasm/src/cc/BUILD
+++ b/tfjs-backend-wasm/src/cc/BUILD
@@ -294,6 +294,14 @@ tfjs_unit_test(
     ],
 )
 
+tfjs_unit_test(
+    name = "FusedBatchMatMul_test",
+    srcs = ["kernels/FusedBatchMatMul_test.cc"],
+    deps = [
+        ":FusedBatchMatMul",
+    ],
+)
+
 tfjs_cc_library(
     name = "ClipByValue",
     srcs = ["kernels/ClipByValue.cc"],
diff --git a/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul_test.cc b/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul_test.cc
new file mode 100644
index 00000000000..f912c5f0450
--- /dev/null
+++ b/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul_test.cc
@@ -0,0 +1,93 @@
+/* Copyright 2020 Google LLC. All Rights Reserved.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ===========================================================================*/
+
+#include <gtest/gtest.h>
+
+#include <cstddef>
+#include <vector>
+
+#include "src/cc/backend.h"
+#include "src/cc/kernels/FusedBatchMatMul.h"
+
+TEST(FUSEDBATCH_MATMUL, xnn_operator_lifetime) {
+  tfjs::wasm::init();
+
+  ASSERT_EQ(0, tfjs::backend::num_tensors());
+
+  size_t a0_id = 1;
+  size_t a1_id = 2;
+  size_t size = 2;
+  float a_values[2] = {1, 2};
+  std::vector<size_t> a_shape = {1, 2, 1};
+  size_t* a_shape_ptr = a_shape.data();
+
+  size_t b0_id = 3;
+  size_t b1_id = 4;
+  float b_values[2] = {1, 2};
+  std::vector<size_t> b_shape = {1, 1, 2};
+  size_t* b_shape_ptr = b_shape.data();
+
+  size_t out_id = 5;
+  float out_values[2] = {0, 0};
+
+  tfjs::wasm::register_tensor(a0_id, size, a_values);
+  tfjs::wasm::register_tensor(a1_id, size, a_values);
+  tfjs::wasm::register_tensor(b0_id, size, b_values);
+  tfjs::wasm::register_tensor(b1_id, size, b_values);
+  tfjs::wasm::register_tensor(out_id, size, out_values);
+
+  ASSERT_EQ(5, tfjs::backend::num_tensors());
+  ASSERT_EQ(0, tfjs::backend::xnn_operator_count);
+
+  // One new xnn_operator should be created for the first call to BatchMatMul.
+  tfjs::wasm::FusedBatchMatMul(
+      a0_id, a_shape_ptr, a_shape.size(), b0_id, b_shape_ptr, b_shape.size(),
+      false /* transpose_a */, false /* transpose_b */, out_id);
+  ASSERT_EQ(1, tfjs::backend::xnn_operator_count);
+
+  // No new xnn_operators should be created for the second call to BatchMatMul
+  // with the same b's.
+  tfjs::wasm::FusedBatchMatMul(
+      a0_id, a_shape_ptr, a_shape.size(), b0_id, b_shape_ptr, b_shape.size(),
+      false /* transpose_a */, false /* transpose_b */, out_id);
+  ASSERT_EQ(1, tfjs::backend::xnn_operator_count);
+
+  // One new xnn_operator should be created for another call to BatchMatMul with
+  // new b's.
+  tfjs::wasm::FusedBatchMatMul(
+      a0_id, a_shape_ptr, a_shape.size(), b1_id, b_shape_ptr, b_shape.size(),
+      false /* transpose_a */, false /* transpose_b */, out_id);
+  ASSERT_EQ(2, tfjs::backend::xnn_operator_count);
+
+  // No new xnn_operators should be created for the next call to BatchMatMul
+  // with the same b's.
+  tfjs::wasm::FusedBatchMatMul(
+      a0_id, a_shape_ptr, a_shape.size(), b1_id, b_shape_ptr, b_shape.size(),
+      false /* transpose_a */, false /* transpose_b */, out_id);
+  ASSERT_EQ(2, tfjs::backend::xnn_operator_count);
+
+  // Disposing a's should not remove xnn operators.
+  tfjs::wasm::dispose_data(a0_id);
+  tfjs::wasm::dispose_data(a1_id);
+  ASSERT_EQ(2, tfjs::backend::xnn_operator_count);
+
+  // Disposing b's should remove xnn operators.
+  tfjs::wasm::dispose_data(b0_id);
+  ASSERT_EQ(1, tfjs::backend::xnn_operator_count);
+
+  tfjs::wasm::dispose_data(b1_id);
+  ASSERT_EQ(0, tfjs::backend::xnn_operator_count);
+
+  tfjs::wasm::dispose();
+}
diff --git a/tfjs-backend-wasm/src/cc/kernels/FusedConv2D_test.cc b/tfjs-backend-wasm/src/cc/kernels/FusedConv2D_test.cc
index 6a75526979d..1b73d35d514 100644
--- a/tfjs-backend-wasm/src/cc/kernels/FusedConv2D_test.cc
+++ b/tfjs-backend-wasm/src/cc/kernels/FusedConv2D_test.cc
@@ -18,7 +18,6 @@
 #include <cstddef>
 
 #include "src/cc/backend.h"
-#include "src/cc/conv2d_impl.h"
 #include "src/cc/kernels/FusedConv2D.h"
 #include "src/cc/util.h"
 
@@ -76,8 +75,7 @@ TEST(FUSEDCONV2D, xnn_operator_lifetime) {
   const size_t input_channels = 1;
   const size_t output_channels = 1;
 
-  const tfjs::wasm::FusableActivation activation =
-      tfjs::wasm::FusableActivation::LINEAR;
+  const FusableActivation activation = FusableActivation::LINEAR;
   const size_t prelu_weights_id = 0;
 
   tfjs::wasm::FusedConv2D(
@@ -173,8 +171,7 @@ TEST(FUSEDCONV2D, xnn_operator_lifetime) {
 
   // One new XNN operator should be created for the next call to conv2d with a
   // different activation.
-  const tfjs::wasm::FusableActivation activation2 =
-      tfjs::wasm::FusableActivation::RELU6;
+  const FusableActivation activation2 = FusableActivation::RELU6;
   tfjs::wasm::FusedConv2D(
       x1_id, batch_size, input_height, input_width, weights1_id, filter_height,
       filter_width, bias1_id, pad_top1, pad_right, pad_bottom1, pad_left,
diff --git a/tfjs-backend-wasm/src/cc/kernels/FusedDepthwiseConv2D_test.cc b/tfjs-backend-wasm/src/cc/kernels/FusedDepthwiseConv2D_test.cc
index 5f362c9126b..37afa3c7b12 100644
--- a/tfjs-backend-wasm/src/cc/kernels/FusedDepthwiseConv2D_test.cc
+++ b/tfjs-backend-wasm/src/cc/kernels/FusedDepthwiseConv2D_test.cc
@@ -76,8 +76,7 @@ TEST(FUSEDDEPTHWISECONV2D, xnn_operator_lifetime) {
   const size_t input_channels = 1;
   const size_t output_channels = 1;
 
-  const tfjs::wasm::FusableActivation activation =
-      tfjs::wasm::FusableActivation::LINEAR;
+  const FusableActivation activation = FusableActivation::LINEAR;
 
   tfjs::wasm::FusedDepthwiseConv2D(
       x0_id, batch_size, input_height, input_width, weights0_id, filter_height,
@@ -89,8 +88,7 @@ TEST(FUSEDDEPTHWISECONV2D, xnn_operator_lifetime) {
 
   // One new xnn operator should be created for second call to conv2d with no
   // bias and prelu activation.
-  const tfjs::wasm::FusableActivation prelu_activation =
-      tfjs::wasm::FusableActivation::PRELU;
+  const FusableActivation prelu_activation = FusableActivation::PRELU;
 
   const size_t prelu_weights_id = 8;
   const size_t prelu_size = 8;
@@ -190,8 +188,7 @@ TEST(FUSEDDEPTHWISECONV2D, xnn_operator_lifetime) {
 
   // One new XNN operator should be created for the next call to conv2d with a
   // different activation.
-  const tfjs::wasm::FusableActivation activation2 =
-      tfjs::wasm::FusableActivation::RELU6;
+  const FusableActivation activation2 = FusableActivation::RELU6;
   tfjs::wasm::FusedDepthwiseConv2D(
       x1_id, batch_size, input_height, input_width, weights1_id, filter_height,
       filter_width, bias1_id, pad_top1, pad_right, pad_bottom1, pad_left,

From c14ffef60f2e6e96a690a2fd7815c8dad60f71b5 Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Mon, 9 Mar 2020 09:44:37 -0400
Subject: [PATCH 18/35] got test to pass

---
 .../src/cc/kernels/FusedBatchMatMul_test.cc   | 32 ++++++++++++-------
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul_test.cc b/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul_test.cc
index f912c5f0450..9086c799ba5 100644
--- a/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul_test.cc
+++ b/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul_test.cc
@@ -50,31 +50,39 @@ TEST(FUSEDBATCH_MATMUL, xnn_operator_lfietime) {
   ASSERT_EQ(5, tfjs::backend::num_tensors());
   ASSERT_EQ(0, tfjs::backend::xnn_operator_count);
 
+  const FusableActivation activation = FusableActivation::LINEAR;
+  size_t bias_id = 0;
+  size_t prelu_weights_id = 0;
+
   // One new xnn_operator should be created for the first call to BatchMatMul.
-  tfjs::wasm::FusedBatchMatMul(
-      a0_id, a_shape_ptr, a_shape.size(), b0_id, b_shape_ptr, b_shape.size(),
-      false /* transpose_a */, false /* transpose_b */, out_id);
+  tfjs::wasm::FusedBatchMatMul(a0_id, a_shape_ptr, a_shape.size(), b0_id,
+                               b_shape_ptr, b_shape.size(),
+                               false /* transpose_a */, false /* transpose_b */,
+                               activation, bias_id, prelu_weights_id, out_id);
   ASSERT_EQ(1, tfjs::backend::xnn_operator_count);
 
   // No new xnn_operators should be created for the second call to BatchMatMul
   // with the same b's.
-  tfjs::wasm::FusedBatchMatMul(
-      a0_id, a_shape_ptr, a_shape.size(), b0_id, b_shape_ptr, b_shape.size(),
-      false /* transpose_a */, false /* transpose_b */, out_id);
+  tfjs::wasm::FusedBatchMatMul(a0_id, a_shape_ptr, a_shape.size(), b0_id,
+                               b_shape_ptr, b_shape.size(),
+                               false /* transpose_a */, false /* transpose_b */,
+                               activation, bias_id, prelu_weights_id, out_id);
   ASSERT_EQ(1, tfjs::backend::xnn_operator_count);
 
   // One new xnn_operator should be created for another call to BatchMatMul with
   // new b's.
-  tfjs::wasm::FusedBatchMatMul(
-      a0_id, a_shape_ptr, a_shape.size(), b1_id, b_shape_ptr, b_shape.size(),
-      false /* transpose_a */, false /* transpose_b */, out_id);
+  tfjs::wasm::FusedBatchMatMul(a0_id, a_shape_ptr, a_shape.size(), b1_id,
+                               b_shape_ptr, b_shape.size(),
+                               false /* transpose_a */, false /* transpose_b */,
+                               activation, bias_id, prelu_weights_id, out_id);
   ASSERT_EQ(2, tfjs::backend::xnn_operator_count);
 
   // No new xnn_operators should be created for the next call to BatchMatMul
   // with the same b's.
-  tfjs::wasm::FusedBatchMatMul(
-      a0_id, a_shape_ptr, a_shape.size(), b1_id, b_shape_ptr, b_shape.size(),
-      false /* transpose_a */, false /* transpose_b */, out_id);
+  tfjs::wasm::FusedBatchMatMul(a0_id, a_shape_ptr, a_shape.size(), b1_id,
+                               b_shape_ptr, b_shape.size(),
+                               false /* transpose_a */, false /* transpose_b */,
+                               activation, bias_id, prelu_weights_id, out_id);
   ASSERT_EQ(2, tfjs::backend::xnn_operator_count);
 
   // Disposing a's should not remove xnn operators.

From 5836bc20e9bb09b2aef076f60a16dcdad1704982 Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Mon, 9 Mar 2020 10:09:47 -0400
Subject: [PATCH 19/35] add test

---
 .../src/cc/kernels/FusedBatchMatMul_test.cc   | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul_test.cc b/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul_test.cc
index 9086c799ba5..95bf8530655 100644
--- a/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul_test.cc
+++ b/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul_test.cc
@@ -54,7 +54,8 @@ TEST(FUSEDBATCH_MATMUL, xnn_operator_lfietime) {
   size_t bias_id = 0;
   size_t prelu_weights_id = 0;
 
-  // One new xnn_operator should be created for the first call to BatchMatMul.
+  // One new xnn_operator should be created for the first call to BatchMatMul
+  // with no bias.
   tfjs::wasm::FusedBatchMatMul(a0_id, a_shape_ptr, a_shape.size(), b0_id,
                                b_shape_ptr, b_shape.size(),
                                false /* transpose_a */, false /* transpose_b */,
@@ -62,15 +63,23 @@ TEST(FUSEDBATCH_MATMUL, xnn_operator_lfietime) {
   ASSERT_EQ(1, tfjs::backend::xnn_operator_count);
 
   // No new xnn_operators should be created for the second call to BatchMatMul
-  // with the same b's.
+  // with the same arguments.
   tfjs::wasm::FusedBatchMatMul(a0_id, a_shape_ptr, a_shape.size(), b0_id,
                                b_shape_ptr, b_shape.size(),
                                false /* transpose_a */, false /* transpose_b */,
                                activation, bias_id, prelu_weights_id, out_id);
   ASSERT_EQ(1, tfjs::backend::xnn_operator_count);
 
-  // One new xnn_operator should be created for another call to BatchMatMul with
-  // new b's.
+  // No new xnn_operators should be created for calling BatchMatMul
+  // with a new a.
+  tfjs::wasm::FusedBatchMatMul(a1_id, a_shape_ptr, a_shape.size(), b0_id,
+                               b_shape_ptr, b_shape.size(),
+                               false /* transpose_a */, false /* transpose_b */,
+                               activation, bias_id, prelu_weights_id, out_id);
+  ASSERT_EQ(1, tfjs::backend::xnn_operator_count);
+
+  // One new xnn_operator should be created for calling BatchMatMul
+  // with a new b.
   tfjs::wasm::FusedBatchMatMul(a0_id, a_shape_ptr, a_shape.size(), b1_id,
                                b_shape_ptr, b_shape.size(),
                                false /* transpose_a */, false /* transpose_b */,
@@ -78,7 +87,7 @@ TEST(FUSEDBATCH_MATMUL, xnn_operator_lfietime) {
   ASSERT_EQ(2, tfjs::backend::xnn_operator_count);
 
   // No new xnn_operators should be created for the next call to BatchMatMul
-  // with the same b's.
+  // with the same b.
   tfjs::wasm::FusedBatchMatMul(a0_id, a_shape_ptr, a_shape.size(), b1_id,
                                b_shape_ptr, b_shape.size(),
                                false /* transpose_a */, false /* transpose_b */,

From 6dc84aa25db104de05d07582fbae98035aafa6db Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Mon, 9 Mar 2020 10:27:10 -0400
Subject: [PATCH 20/35] erase bias operator cache on dispose; update test

---
 .../src/cc/batch_mat_mul_impl.cc              |  5 +--
 .../src/cc/kernels/FusedBatchMatMul_test.cc   | 33 ++++++++++++++-----
 2 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/tfjs-backend-wasm/src/cc/batch_mat_mul_impl.cc b/tfjs-backend-wasm/src/cc/batch_mat_mul_impl.cc
index b80d60294a1..2cdb51ad712 100644
--- a/tfjs-backend-wasm/src/cc/batch_mat_mul_impl.cc
+++ b/tfjs-backend-wasm/src/cc/batch_mat_mul_impl.cc
@@ -72,8 +72,9 @@ void erase_from_cache(const size_t tensor_id,
   }
 }
 
-void delete_xnn_operators(const size_t weights_id) {
-  erase_from_cache(weights_id, b_operator_cache_key_map);
+void delete_xnn_operators(const size_t tensor_id) {
+  erase_from_cache(tensor_id, b_operator_cache_key_map);
+  erase_from_cache(tensor_id, bias_operator_cache_key_map);
 }
 
 void associate_tensor_with_key(
diff --git a/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul_test.cc b/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul_test.cc
index 95bf8530655..ce96bf5ccfc 100644
--- a/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul_test.cc
+++ b/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul_test.cc
@@ -54,23 +54,23 @@ TEST(FUSEDBATCH_MATMUL, xnn_operator_lfietime) {
   size_t bias_id = 0;
   size_t prelu_weights_id = 0;
 
-  // One new xnn_operator should be created for the first call to BatchMatMul
-  // with no bias.
+  // One new xnn_operator should be created for the first call to
+  // FusedBatchMatMul with no bias.
   tfjs::wasm::FusedBatchMatMul(a0_id, a_shape_ptr, a_shape.size(), b0_id,
                                b_shape_ptr, b_shape.size(),
                                false /* transpose_a */, false /* transpose_b */,
                                activation, bias_id, prelu_weights_id, out_id);
   ASSERT_EQ(1, tfjs::backend::xnn_operator_count);
 
-  // No new xnn_operators should be created for the second call to BatchMatMul
-  // with the same arguments.
+  // No new xnn_operators should be created for the second call to
+  // FusedBatchMatMul with the same arguments.
   tfjs::wasm::FusedBatchMatMul(a0_id, a_shape_ptr, a_shape.size(), b0_id,
                                b_shape_ptr, b_shape.size(),
                                false /* transpose_a */, false /* transpose_b */,
                                activation, bias_id, prelu_weights_id, out_id);
   ASSERT_EQ(1, tfjs::backend::xnn_operator_count);
 
-  // No new xnn_operators should be created for calling BatchMatMul
+  // No new xnn_operators should be created for calling FusedBatchMatMul
   // with a new a.
   tfjs::wasm::FusedBatchMatMul(a1_id, a_shape_ptr, a_shape.size(), b0_id,
                                b_shape_ptr, b_shape.size(),
@@ -78,7 +78,7 @@ TEST(FUSEDBATCH_MATMUL, xnn_operator_lfietime) {
                                activation, bias_id, prelu_weights_id, out_id);
   ASSERT_EQ(1, tfjs::backend::xnn_operator_count);
 
-  // One new xnn_operator should be created for calling BatchMatMul
+  // One new xnn_operator should be created for calling FusedBatchMatMul
   // with a new b.
   tfjs::wasm::FusedBatchMatMul(a0_id, a_shape_ptr, a_shape.size(), b1_id,
                                b_shape_ptr, b_shape.size(),
@@ -86,17 +86,34 @@ TEST(FUSEDBATCH_MATMUL, xnn_operator_lfietime) {
                                activation, bias_id, prelu_weights_id, out_id);
   ASSERT_EQ(2, tfjs::backend::xnn_operator_count);
 
-  // No new xnn_operators should be created for the next call to BatchMatMul
-  // with the same b.
+  // No new xnn_operators should be created for the next call to
+  // FusedBatchMatMul with the same b.
   tfjs::wasm::FusedBatchMatMul(a0_id, a_shape_ptr, a_shape.size(), b1_id,
                                b_shape_ptr, b_shape.size(),
                                false /* transpose_a */, false /* transpose_b */,
                                activation, bias_id, prelu_weights_id, out_id);
   ASSERT_EQ(2, tfjs::backend::xnn_operator_count);
 
+  const size_t bias1_id = 6;
+  const size_t bias_size = 1;
+  float bias_values[bias_size] = {1};
+  tfjs::wasm::register_tensor(bias1_id, bias_size, bias_values);
+  // One new xnn_operator should be created for calling FusedBatchMatMul with a
+  // new bias.
+  tfjs::wasm::FusedBatchMatMul(a0_id, a_shape_ptr, a_shape.size(), b1_id,
+                               b_shape_ptr, b_shape.size(),
+                               false /* transpose_a */, false /* transpose_b */,
+                               activation, bias1_id, prelu_weights_id, out_id);
+  ASSERT_EQ(3, tfjs::backend::xnn_operator_count);
+
   // Disposing a's should not remove xnn operators.
   tfjs::wasm::dispose_data(a0_id);
   tfjs::wasm::dispose_data(a1_id);
+  ASSERT_EQ(3, tfjs::backend::xnn_operator_count);
+
+  // Disposing the second bias should remove the xnn_operator it's associated
+  // with.
+  tfjs::wasm::dispose_data(bias1_id);
   ASSERT_EQ(2, tfjs::backend::xnn_operator_count);
 
   // Disposing b's should remove xnn operators.

From 83265b1f2cea5015bd33054f2f0e633613bc59a4 Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Mon, 9 Mar 2020 10:34:02 -0400
Subject: [PATCH 21/35] add activation test

---
 .../src/cc/kernels/FusedBatchMatMul_test.cc         | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul_test.cc b/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul_test.cc
index ce96bf5ccfc..de0c41ca6b0 100644
--- a/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul_test.cc
+++ b/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul_test.cc
@@ -106,12 +106,21 @@ TEST(FUSEDBATCH_MATMUL, xnn_operator_lfietime) {
                                activation, bias1_id, prelu_weights_id, out_id);
   ASSERT_EQ(3, tfjs::backend::xnn_operator_count);
 
+  // One new xnn_operator should be created for calling FusedBatchMatMul with a
+  // different activation.
+  const FusableActivation activation2 = FusableActivation::RELU;
+  tfjs::wasm::FusedBatchMatMul(a0_id, a_shape_ptr, a_shape.size(), b1_id,
+                               b_shape_ptr, b_shape.size(),
+                               false /* transpose_a */, false /* transpose_b */,
+                               activation2, bias1_id, prelu_weights_id, out_id);
+  ASSERT_EQ(4, tfjs::backend::xnn_operator_count);
+
   // Disposing a's should not remove xnn operators.
   tfjs::wasm::dispose_data(a0_id);
   tfjs::wasm::dispose_data(a1_id);
-  ASSERT_EQ(3, tfjs::backend::xnn_operator_count);
+  ASSERT_EQ(4, tfjs::backend::xnn_operator_count);
 
-  // Disposing the second bias should remove the xnn_operator it's associated
+  // Disposing the second bias should remove the xnn_operators it's associated
   // with.
   tfjs::wasm::dispose_data(bias1_id);
   ASSERT_EQ(2, tfjs::backend::xnn_operator_count);

From 1e2830733b3198bd97648851ced181b9558a216b Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Mon, 9 Mar 2020 10:57:04 -0400
Subject: [PATCH 22/35] register FusedBatchMatMul kernel; unfocus tests

---
 tfjs-backend-wasm/src/cc/kernels/Pow.cc      | 2 +-
 tfjs-backend-wasm/src/index_test.ts          | 4 ++--
 tfjs-backend-wasm/src/kernels/all_kernels.ts | 1 +
 tfjs-backend-wasm/src/setup_test.ts          | 1 +
 4 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/tfjs-backend-wasm/src/cc/kernels/Pow.cc b/tfjs-backend-wasm/src/cc/kernels/Pow.cc
index b9c5bed7388..646fe355fa1 100644
--- a/tfjs-backend-wasm/src/cc/kernels/Pow.cc
+++ b/tfjs-backend-wasm/src/cc/kernels/Pow.cc
@@ -25,7 +25,7 @@
 namespace {
 template <class T>
 inline T power(T a, T b) {
-  return std::pow(a, b);
+  return pow(a, b);
 }
 }  // namespace
 
diff --git a/tfjs-backend-wasm/src/index_test.ts b/tfjs-backend-wasm/src/index_test.ts
index 894c2002c91..b25d937f1b3 100644
--- a/tfjs-backend-wasm/src/index_test.ts
+++ b/tfjs-backend-wasm/src/index_test.ts
@@ -103,7 +103,7 @@ describeWithFlags('wasm init', BROWSER_ENVS, () => {
     console.log(Array.from(data));
   });
 
-  fit('fused batch mm', async () => {
+  it('fused batch mm', async () => {
     const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]);
     const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]);
     const alpha = tf.tensor2d([0.5, 0.5], [1, 2]);
@@ -115,7 +115,7 @@ describeWithFlags('wasm init', BROWSER_ENVS, () => {
     console.log(data);  // 0, 8, -3, 20
   });
 
-  fit('fused batch mm with bias', async () => {
+  it('fused batch mm with bias', async () => {
     const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]);
     const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]);
     const c = tf.tensor1d([1, 1]);
diff --git a/tfjs-backend-wasm/src/kernels/all_kernels.ts b/tfjs-backend-wasm/src/kernels/all_kernels.ts
index be3f297fdb2..602eb6f91e1 100644
--- a/tfjs-backend-wasm/src/kernels/all_kernels.ts
+++ b/tfjs-backend-wasm/src/kernels/all_kernels.ts
@@ -37,6 +37,7 @@ import './FloorDiv';
 import './FusedBatchNorm';
 import './FusedConv2D';
 import './FusedDepthwiseConv2D';
+import './FusedBatchMatMul';
 import './Gather';
 import './GatherNd';
 import './Greater';
diff --git a/tfjs-backend-wasm/src/setup_test.ts b/tfjs-backend-wasm/src/setup_test.ts
index 4d942718f47..10493424f3e 100644
--- a/tfjs-backend-wasm/src/setup_test.ts
+++ b/tfjs-backend-wasm/src/setup_test.ts
@@ -44,6 +44,7 @@ const TEST_FILTERS: TestFilter[] = [
                                                          // supported yet.
     ]
   },
+  // {include: 'fused matmul'},
   {
     include: 'add ',
     excludes: [

From 572bc152a9788b84386a7c8e43f943ba7422c58f Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Mon, 9 Mar 2020 12:06:06 -0400
Subject: [PATCH 23/35] enable fused matmul tests for wasm

---
 tfjs-backend-wasm/src/setup_test.ts | 27 +++++++++++++-----------
 tfjs-core/src/ops/fused_test.ts     | 32 ++++++++++++++---------------
 2 files changed, 31 insertions(+), 28 deletions(-)

diff --git a/tfjs-backend-wasm/src/setup_test.ts b/tfjs-backend-wasm/src/setup_test.ts
index 10493424f3e..01d7c8bbc81 100644
--- a/tfjs-backend-wasm/src/setup_test.ts
+++ b/tfjs-backend-wasm/src/setup_test.ts
@@ -44,7 +44,6 @@ const TEST_FILTERS: TestFilter[] = [
                                                          // supported yet.
     ]
   },
-  // {include: 'fused matmul'},
   {
     include: 'add ',
     excludes: [
@@ -63,11 +62,12 @@ const TEST_FILTERS: TestFilter[] = [
   {
     include: 'relu',
     excludes: [
-      'derivative',         // Not yet implemented.
-      'gradient',           // Not yet implemented.
-      'valueAndGradients',  // Not yet implemented.
-      'fused matmul',       // Not yet implemented.
-      'broadcasted bias',   // Not yet implemented.
+      'derivative',               // Not yet implemented.
+      'gradient',                 // Not yet implemented.
+      'valueAndGradients',        // Not yet implemented.
+      'broadcasted bias',         // Not yet implemented.
+      'fused A x B with 2d bias'  // Fused matMul with 2D bias not yet
+                                  // supported.
     ]
   },
   {
@@ -89,12 +89,15 @@ const TEST_FILTERS: TestFilter[] = [
   {
     include: 'matmul ',
     excludes: [
-      'valueAndGradients',       // Gradients not defined yet
-      'gradient',                // Gradients not defined yet
-      'fused matmul',            // Fused kernels aren't ready yet
-      'zero in its shape',       // Zero in shapes aren't supported yet
-      'matmul followed by mul',  // mul not supported yet
-      'upcasts',                 // Upcasting not supported yet.
+      'valueAndGradients',         // Gradients not defined yet
+      'gradient',                  // Gradients not defined yet
+      'zero in its shape',         // Zero in shapes aren't supported yet
+      'matmul followed by mul',    // mul not supported yet
+      'upcasts',                   // Upcasting not supported yet.
+      'fused A x B with elu',      // Fused matMul with elu activation not yet
+                                   // supported.
+      'fused A x B with 2d bias',  // Fused matMul with 2D bias not yet
+                                   // supported.
     ]
   },
   {
diff --git a/tfjs-core/src/ops/fused_test.ts b/tfjs-core/src/ops/fused_test.ts
index e8c6f9c91dc..bdffb90d6f2 100644
--- a/tfjs-core/src/ops/fused_test.ts
+++ b/tfjs-core/src/ops/fused_test.ts
@@ -20,7 +20,7 @@ import {ALL_ENVS, describeWithFlags} from '../jasmine_util';
 import {expectArraysClose} from '../test_util';
 
 describeWithFlags('fused matmul', ALL_ENVS, () => {
-  it('A x B', async () => {
+  it('fused A x B', async () => {
     const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]);
     const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]);
 
@@ -30,7 +30,7 @@ describeWithFlags('fused matmul', ALL_ENVS, () => {
     expectArraysClose(await c.data(), [0, 8, -3, 20]);
   });
 
-  it('A x B with relu', async () => {
+  it('fused A x B with relu', async () => {
     const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]);
     const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]);
     const transposeA = false;
@@ -43,7 +43,7 @@ describeWithFlags('fused matmul', ALL_ENVS, () => {
     expectArraysClose(await c.data(), [0, 8, 0, 20]);
   });
 
-  it('A x B with elu', async () => {
+  it('fused A x B with elu', async () => {
     const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]);
     const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]);
     const transposeA = false;
@@ -56,7 +56,7 @@ describeWithFlags('fused matmul', ALL_ENVS, () => {
     expectArraysClose(await c.data(), [0, 8, -0.9502, 20]);
   });
 
-  it('A x B with relu6', async () => {
+  it('fused A x B with relu6', async () => {
     const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]);
     const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]);
     const transposeA = false;
@@ -69,7 +69,7 @@ describeWithFlags('fused matmul', ALL_ENVS, () => {
     expectArraysClose(await c.data(), [0, 6, 0, 6]);
   });
 
-  it('A x B with prelu', async () => {
+  it('fused A x B with prelu', async () => {
     const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]);
     const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]);
     const alpha = tf.tensor2d([0.5, 0.5], [1, 2]);
@@ -90,7 +90,7 @@ describeWithFlags('fused matmul', ALL_ENVS, () => {
     expectArraysClose(await c.data(), [0, 8, -1.5, 20]);
   });
 
-  it('A x B with relu transpose', async () => {
+  it('fused A x B with relu transpose', async () => {
     const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]);
     const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [2, 3]);
     const transposeA = false;
@@ -103,7 +103,7 @@ describeWithFlags('fused matmul', ALL_ENVS, () => {
     expectArraysClose(await c.data(), [0, 9, 0, 24]);
   });
 
-  it('A x B with relu and bias', async () => {
+  it('fused A x B with 2d bias and relu', async () => {
     const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]);
     const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]);
     const c = tf.tensor2d([1, 1, 1, 1], [2, 2]);
@@ -117,7 +117,7 @@ describeWithFlags('fused matmul', ALL_ENVS, () => {
     expectArraysClose(await d.data(), [1, 9, 0, 21]);
   });
 
-  it('A x B with relu and broadcasted bias', async () => {
+  it('fused A x B with relu and broadcasted bias', async () => {
     const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]);
     const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]);
     const c = tf.tensor1d([1, 1]);
@@ -132,7 +132,7 @@ describeWithFlags('fused matmul', ALL_ENVS, () => {
     expectArraysClose(await d.data(), [1, 9, 0, 21]);
   });
 
-  it('A x B with elu and broadcasted bias', async () => {
+  it('fused A x B with elu and broadcasted bias', async () => {
     const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]);
     const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]);
     const c = tf.tensor1d([1, 1]);
@@ -147,7 +147,7 @@ describeWithFlags('fused matmul', ALL_ENVS, () => {
     expectArraysClose(await d.data(), [1, 9, -0.8647, 21]);
   });
 
-  it('A x B with relu and broadcasted bias different rank', async () => {
+  it('fused A x B with relu and broadcasted bias different rank', async () => {
     const a = tf.tensor3d([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [2, 2, 3]);
     const b = tf.tensor3d([0, 1, -3, 2, 2, 1, 0, 1, -3, 2, 2, 1], [2, 3, 2]);
     const c = tf.tensor2d([1, 2], [1, 2]);
@@ -162,7 +162,7 @@ describeWithFlags('fused matmul', ALL_ENVS, () => {
     expectArraysClose(await d.data(), [2, 6, 0, 18, 0, 30, 0, 42]);
   });
 
-  it('A x B with bias only', async () => {
+  it('fused A x B with 2d bias only', async () => {
     const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]);
     const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]);
     const c = tf.tensor2d([1, 1, 1, 1], [2, 2]);
@@ -176,7 +176,7 @@ describeWithFlags('fused matmul', ALL_ENVS, () => {
     expectArraysClose(await d.data(), [1, 9, -2, 21]);
   });
 
-  it('A x B with relu gradient', async () => {
+  it('fused A x B with relu gradient', async () => {
     const a = tf.tensor2d([1, 2, 3, 10, 20, -30], [2, 3]);
     const b = tf.tensor2d([2, 3, 4, -1, 2, 3], [3, 2]);
     const dy = tf.tensor2d([1, 10, 20, 30], [2, 2]);
@@ -224,7 +224,7 @@ describeWithFlags('fused matmul', ALL_ENVS, () => {
     expect(fusedDb.shape).toEqual(b.shape);
   });
 
-  it('A x B with relu bias gradient', async () => {
+  it('fused A x B with relu bias gradient', async () => {
     const a = tf.tensor2d([1, 2, 3, 10, 20, -30], [2, 3]);
     const b = tf.tensor2d([2, 3, 4, -1, 2, 3], [3, 2]);
     const c = tf.tensor2d([1, 1, 1, 1], [2, 2]);
@@ -252,7 +252,7 @@ describeWithFlags('fused matmul', ALL_ENVS, () => {
     expectArraysClose(await dc.array(), await fusedDc.array());
   });
 
-  it('A x B with relu bias gradient transpose', async () => {
+  it('fused A x B with relu bias gradient transpose', async () => {
     const a = tf.tensor2d([1, 2, 3, 10, 20, -30], [3, 2]);
     const b = tf.tensor2d([2, 3, 4, -1, 2, 3], [3, 2]);
     const c = tf.tensor2d([1, 1, 1, 1], [2, 2]);
@@ -280,7 +280,7 @@ describeWithFlags('fused matmul', ALL_ENVS, () => {
     expectArraysClose(await dc.array(), await fusedDc.array());
   });
 
-  it('A x B with relu and broadcasted bias gradient', async () => {
+  it('fused A x B with relu and broadcasted bias gradient', async () => {
     const a = tf.tensor2d([1, 2, 3, 10, 20, -30], [2, 3]);
     const b = tf.tensor2d([2, 3, 4, -1, 2, 3], [3, 2]);
     const c = tf.tensor2d([[1]]);
@@ -1021,7 +1021,7 @@ describeWithFlags('fused conv2d', ALL_ENVS, () => {
     expectArraysClose(await dbiasFused.array(), await dbias.array());
   });
 
-  it('fused matmul with relu6', async () => {
+  it('fused matmul with relu6 and gradients', async () => {
     const a = tf.tensor2d([1, 2, 3, 10, 20, -30], [2, 3]);
     const b = tf.tensor2d([2, 3, 4, -1, 2, 3], [3, 2]);
     const dy = tf.tensor2d([1, 10, 20, 30], [2, 2]);

From a82b6101079ffb422393ab31008433f2cf30b207 Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Mon, 9 Mar 2020 12:12:40 -0400
Subject: [PATCH 24/35] fix setup test

---
 tfjs-backend-wasm/src/cc/BUILD      |  1 -
 tfjs-backend-wasm/src/index_test.ts | 42 ++---------------------------
 2 files changed, 2 insertions(+), 41 deletions(-)

diff --git a/tfjs-backend-wasm/src/cc/BUILD b/tfjs-backend-wasm/src/cc/BUILD
index 040fd9a51cf..2e5afd7d6b1 100644
--- a/tfjs-backend-wasm/src/cc/BUILD
+++ b/tfjs-backend-wasm/src/cc/BUILD
@@ -281,7 +281,6 @@ tfjs_cc_library(
     hdrs = ["kernels/FusedBatchMatMul.h"],
     deps = [
         ":backend",
-        ":util",
         ":batch_mat_mul_impl",
     ],
 )
diff --git a/tfjs-backend-wasm/src/index_test.ts b/tfjs-backend-wasm/src/index_test.ts
index b25d937f1b3..370615f2cf3 100644
--- a/tfjs-backend-wasm/src/index_test.ts
+++ b/tfjs-backend-wasm/src/index_test.ts
@@ -58,8 +58,8 @@ describeWithFlags('wasm init', BROWSER_ENVS, () => {
     }, 100);
 
     // Silences backend registration warnings.
-    // spyOn(console, 'warn');
-    // spyOn(console, 'log');
+    spyOn(console, 'warn');
+    spyOn(console, 'log');
   });
 
   afterEach(() => {
@@ -92,42 +92,4 @@ describeWithFlags('wasm init', BROWSER_ENVS, () => {
     expect(() => setWasmPath('too/late'))
         .toThrowError(/The WASM backend was already initialized. Make sure/);
   });
-
-  it('pow', async () => {
-    const a = tf.tensor2d([1, -2, -3, 0, 7, 1], [2, 3]);
-    const b = tf.tensor2d([5, 3, 4, 5, 2, -3], [2, 3], 'int32');
-    // const expected = [1, -8, 81, 0, 49, 1];
-    const result = tf.pow(a, b);
-    // const result = tf.div(a, b);
-    const data = await result.data();
-    console.log(Array.from(data));
-  });
-
-  it('fused batch mm', async () => {
-    const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]);
-    const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]);
-    const alpha = tf.tensor2d([0.5, 0.5], [1, 2]);
-
-    const c = tf.fused.matMul(
-        {a, b, activation: 'prelu', preluActivationWeights: alpha});
-    // const c = tf.matMul(a, b);
-    const data = await c.data();
-    console.log(data);  // 0, 8, -3, 20
-  });
-
-  it('fused batch mm with bias', async () => {
-    const a = tf.tensor2d([1, 2, 3, 4, 5, 6], [2, 3]);
-    const b = tf.tensor2d([0, 1, -3, 2, 2, 1], [3, 2]);
-    const c = tf.tensor1d([1, 1]);
-    const transposeA = false;
-    const transposeB = false;
-
-    const d = tf.fused.matMul(
-        {a, b, transposeA, transposeB, bias: c, activation: 'relu'});
-
-    expect(d.shape).toEqual([2, 2]);
-    const data = await d.data();
-    console.log(data);
-    // expectArraysClose(await d.data(), [1, 9, 0, 21]);
-  });
 });

From ee53a7269daf82cab2a6ea86e6dc7ff6b8295a97 Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Mon, 9 Mar 2020 12:57:01 -0400
Subject: [PATCH 25/35] lint

---
 tfjs-backend-wasm/src/cc/BUILD | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tfjs-backend-wasm/src/cc/BUILD b/tfjs-backend-wasm/src/cc/BUILD
index 2e5afd7d6b1..82c69b5f141 100644
--- a/tfjs-backend-wasm/src/cc/BUILD
+++ b/tfjs-backend-wasm/src/cc/BUILD
@@ -167,7 +167,6 @@ tfjs_cc_library(
         ":ArgMax",
         ":AvgPool",
         ":BatchMatMul",
-        ":FusedBatchMatMul",
         ":ClipByValue",
         ":Conv2D",
         ":CropAndResize",
@@ -175,6 +174,7 @@ tfjs_cc_library(
         ":Div",
         ":Exp",
         ":FloorDiv",
+        ":FusedBatchMatMul",
         ":FusedBatchNorm",
         ":FusedConv2D",
         ":FusedDepthwiseConv2D",
@@ -270,8 +270,8 @@ tfjs_cc_library(
     hdrs = ["kernels/BatchMatMul.h"],
     deps = [
         ":backend",
-        ":util",
         ":batch_mat_mul_impl",
+        ":util",
     ],
 )
 

From 72a5bbf2b498c9347754fe5a7f9decd92069b8e4 Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Mon, 9 Mar 2020 12:57:31 -0400
Subject: [PATCH 26/35] lint

---
 tfjs-backend-wasm/src/cc/batch_mat_mul_impl.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tfjs-backend-wasm/src/cc/batch_mat_mul_impl.h b/tfjs-backend-wasm/src/cc/batch_mat_mul_impl.h
index 73a16b95a65..b022d7badf5 100644
--- a/tfjs-backend-wasm/src/cc/batch_mat_mul_impl.h
+++ b/tfjs-backend-wasm/src/cc/batch_mat_mul_impl.h
@@ -12,8 +12,8 @@
  * limitations under the License.
  * ===========================================================================*/
 
-#ifndef BATCHMATMUL_IMPL_H_
-#define BATCHMATMUL_IMPL_H_
+#ifndef BATCH_MAT_MUL_IMPL_H_
+#define BATCH_MAT_MUL_IMPL_H_
 
 #include <cstddef>
 
@@ -30,4 +30,4 @@ void batchMatMul(const size_t a_id, const size_t* a_shape_ptr,
 }  // namespace wasm
 }  // namespace tfjs
 
-#endif  // BATCHMATMUL_IMPL_H_
+#endif  // BATCH_MAT_MUL_IMPL_H_

From fa3c9a797f181e513c4b749886f3365a63e81a51 Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Mon, 9 Mar 2020 12:57:59 -0400
Subject: [PATCH 27/35] lint

---
 tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.h b/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.h
index bc88ea75493..2a0dcaf3594 100644
--- a/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.h
+++ b/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.h
@@ -12,8 +12,8 @@
  * limitations under the License.
  * ===========================================================================*/
 
-#ifndef KERNELS_BATCHMATMUL_H_
-#define KERNELS_BATCHMATMUL_H_
+#ifndef KERNELS_FUSEDBATCHMATMUL_H_
+#define KERNELS_FUSEDBATCHMATMUL_H_
 
 #include <cstddef>
 
@@ -32,4 +32,4 @@ void FusedBatchMatMul(const size_t a_id, const size_t* a_shape_ptr,
 }  // namespace wasm
 }  // namespace tfjs
 
-#endif  // KERNELS_BATCHMATMUL_H_
+#endif  // KERNELS_FUSEDBATCHMATMUL_H_

From 0038b72f1bf1c6dc025998a92304e192760772de Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Mon, 9 Mar 2020 16:19:17 -0400
Subject: [PATCH 28/35] rename

---
 tfjs-backend-wasm/src/cc/BUILD                | 14 ++--
 .../src/cc/batch_mat_mul_impl.cc              |  2 +-
 tfjs-backend-wasm/src/cc/batch_mat_mul_impl.h |  2 +-
 .../src/cc/kernels/BatchMatMul.cc             |  2 +-
 .../src/cc/kernels/BatchMatMul.h              |  2 +-
 .../src/cc/kernels/BatchMatMul_test.cc        |  2 +-
 .../{FusedBatchMatMul.cc => _FusedMatMul.cc}  | 14 ++--
 .../{FusedBatchMatMul.h => _FusedMatMul.h}    | 18 ++---
 ...tchMatMul_test.cc => _FusedMatMul_test.cc} | 74 +++++++++----------
 .../{FusedBatchMatMul.ts => _FusedMatMul.ts}  | 51 +++++++------
 tfjs-backend-wasm/src/kernels/all_kernels.ts  |  2 +-
 tfjs-core/src/ops/fused_ops.ts                |  2 +-
 12 files changed, 92 insertions(+), 93 deletions(-)
 rename tfjs-backend-wasm/src/cc/kernels/{FusedBatchMatMul.cc => _FusedMatMul.cc} (72%)
 rename tfjs-backend-wasm/src/cc/kernels/{FusedBatchMatMul.h => _FusedMatMul.h} (59%)
 rename tfjs-backend-wasm/src/cc/kernels/{FusedBatchMatMul_test.cc => _FusedMatMul_test.cc} (55%)
 rename tfjs-backend-wasm/src/kernels/{FusedBatchMatMul.ts => _FusedMatMul.ts} (75%)

diff --git a/tfjs-backend-wasm/src/cc/BUILD b/tfjs-backend-wasm/src/cc/BUILD
index 82c69b5f141..eca4b8f81ce 100644
--- a/tfjs-backend-wasm/src/cc/BUILD
+++ b/tfjs-backend-wasm/src/cc/BUILD
@@ -161,6 +161,7 @@ tfjs_cc_library(
 tfjs_cc_library(
     name = "all_kernels",
     deps = [
+        ":_FusedMatMul",
         ":Abs",
         ":Add",
         ":AddN",
@@ -174,7 +175,6 @@ tfjs_cc_library(
         ":Div",
         ":Exp",
         ":FloorDiv",
-        ":FusedBatchMatMul",
         ":FusedBatchNorm",
         ":FusedConv2D",
         ":FusedDepthwiseConv2D",
@@ -276,9 +276,9 @@ tfjs_cc_library(
 )
 
 tfjs_cc_library(
-    name = "FusedBatchMatMul",
-    srcs = ["kernels/FusedBatchMatMul.cc"],
-    hdrs = ["kernels/FusedBatchMatMul.h"],
+    name = "_FusedMatMul",
+    srcs = ["kernels/_FusedMatMul.cc"],
+    hdrs = ["kernels/_FusedMatMul.h"],
     deps = [
         ":backend",
         ":batch_mat_mul_impl",
@@ -294,10 +294,10 @@ tfjs_unit_test(
 )
 
 tfjs_unit_test(
-    name = "FusedBatchMatMul_test",
-    srcs = ["kernels/FusedBatchMatMul_test.cc"],
+    name = "_FusedMatMul_test",
+    srcs = ["kernels/_FusedMatMul_test.cc"],
     deps = [
-        ":FusedBatchMatMul",
+        ":_FusedMatMul",
     ],
 )
 
diff --git a/tfjs-backend-wasm/src/cc/batch_mat_mul_impl.cc b/tfjs-backend-wasm/src/cc/batch_mat_mul_impl.cc
index 2cdb51ad712..29759a17d0c 100644
--- a/tfjs-backend-wasm/src/cc/batch_mat_mul_impl.cc
+++ b/tfjs-backend-wasm/src/cc/batch_mat_mul_impl.cc
@@ -1,4 +1,4 @@
-/* Copyright 2019 Google Inc. All Rights Reserved.
+/* Copyright 2020 Google LLC. All Rights Reserved.
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
diff --git a/tfjs-backend-wasm/src/cc/batch_mat_mul_impl.h b/tfjs-backend-wasm/src/cc/batch_mat_mul_impl.h
index b022d7badf5..8df58ada339 100644
--- a/tfjs-backend-wasm/src/cc/batch_mat_mul_impl.h
+++ b/tfjs-backend-wasm/src/cc/batch_mat_mul_impl.h
@@ -1,4 +1,4 @@
-/* Copyright 2019 Google Inc. All Rights Reserved.
+/* Copyright 2020 Google LLC. All Rights Reserved.
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
diff --git a/tfjs-backend-wasm/src/cc/kernels/BatchMatMul.cc b/tfjs-backend-wasm/src/cc/kernels/BatchMatMul.cc
index 93f1e3c0507..9aa757a1e3c 100644
--- a/tfjs-backend-wasm/src/cc/kernels/BatchMatMul.cc
+++ b/tfjs-backend-wasm/src/cc/kernels/BatchMatMul.cc
@@ -1,4 +1,4 @@
-/* Copyright 2019 Google Inc. All Rights Reserved.
+/* Copyright 2020 Google LLC. All Rights Reserved.
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
diff --git a/tfjs-backend-wasm/src/cc/kernels/BatchMatMul.h b/tfjs-backend-wasm/src/cc/kernels/BatchMatMul.h
index 0b9570f3839..bc5b5728c4b 100644
--- a/tfjs-backend-wasm/src/cc/kernels/BatchMatMul.h
+++ b/tfjs-backend-wasm/src/cc/kernels/BatchMatMul.h
@@ -1,4 +1,4 @@
-/* Copyright 2019 Google Inc. All Rights Reserved.
+/* Copyright 2020 Google LLC. All Rights Reserved.
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
diff --git a/tfjs-backend-wasm/src/cc/kernels/BatchMatMul_test.cc b/tfjs-backend-wasm/src/cc/kernels/BatchMatMul_test.cc
index a5bb986dd3e..1ac04d3d4a9 100644
--- a/tfjs-backend-wasm/src/cc/kernels/BatchMatMul_test.cc
+++ b/tfjs-backend-wasm/src/cc/kernels/BatchMatMul_test.cc
@@ -1,4 +1,4 @@
-/* Copyright 2019 Google Inc. All Rights Reserved.
+/* Copyright 2020 Google LLC. All Rights Reserved.
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
diff --git a/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.cc b/tfjs-backend-wasm/src/cc/kernels/_FusedMatMul.cc
similarity index 72%
rename from tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.cc
rename to tfjs-backend-wasm/src/cc/kernels/_FusedMatMul.cc
index 6ac640f76df..1e3b43368f5 100644
--- a/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.cc
+++ b/tfjs-backend-wasm/src/cc/kernels/_FusedMatMul.cc
@@ -20,7 +20,7 @@
 
 #include "src/cc/backend.h"
 #include "src/cc/batch_mat_mul_impl.h"
-#include "src/cc/kernels/FusedBatchMatMul.h"
+#include "src/cc/kernels/_FusedMatMul.h"
 
 namespace tfjs {
 namespace wasm {
@@ -30,12 +30,12 @@ extern "C" {
 #ifdef __EMSCRIPTEN__
 EMSCRIPTEN_KEEPALIVE
 #endif
-void FusedBatchMatMul(const size_t a_id, const size_t* a_shape_ptr,
-                      const size_t a_shape_len, const size_t b_id,
-                      const size_t* b_shape_ptr, const size_t b_shape_len,
-                      const bool transpose_a, const bool transpose_b,
-                      const FusableActivation activation, const size_t bias_id,
-                      const size_t prelu_weights_id, const size_t out_id) {
+void _FusedMatMul(const size_t a_id, const size_t* a_shape_ptr,
+                  const size_t a_shape_len, const size_t b_id,
+                  const size_t* b_shape_ptr, const size_t b_shape_len,
+                  const bool transpose_a, const bool transpose_b,
+                  const FusableActivation activation, const size_t bias_id,
+                  const size_t prelu_weights_id, const size_t out_id) {
   tfjs::wasm::batchMatMul(a_id, a_shape_ptr, a_shape_len, b_id, b_shape_ptr,
                           b_shape_len, transpose_a, transpose_b, activation,
                           bias_id, prelu_weights_id, out_id);
diff --git a/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.h b/tfjs-backend-wasm/src/cc/kernels/_FusedMatMul.h
similarity index 59%
rename from tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.h
rename to tfjs-backend-wasm/src/cc/kernels/_FusedMatMul.h
index 2a0dcaf3594..f300781ff3a 100644
--- a/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul.h
+++ b/tfjs-backend-wasm/src/cc/kernels/_FusedMatMul.h
@@ -12,8 +12,8 @@
  * limitations under the License.
  * ===========================================================================*/
 
-#ifndef KERNELS_FUSEDBATCHMATMUL_H_
-#define KERNELS_FUSEDBATCHMATMUL_H_
+#ifndef KERNELS_FUSEDMATMUL_H_
+#define KERNELS_FUSEDMATMUL_H_
 
 #include <cstddef>
 
@@ -21,15 +21,15 @@ namespace tfjs {
 namespace wasm {
 extern "C" {
 
-void FusedBatchMatMul(const size_t a_id, const size_t* a_shape_ptr,
-                      const size_t a_shape_len, const size_t b_id,
-                      const size_t* b_shape_ptr, const size_t b_shape_len,
-                      const bool transpose_a, const bool transpose_b,
-                      const FusableActivation activation, const size_t bias_id,
-                      const size_t prelu_weights_id, const size_t out_id);
+void _FusedMatMul(const size_t a_id, const size_t* a_shape_ptr,
+                  const size_t a_shape_len, const size_t b_id,
+                  const size_t* b_shape_ptr, const size_t b_shape_len,
+                  const bool transpose_a, const bool transpose_b,
+                  const FusableActivation activation, const size_t bias_id,
+                  const size_t prelu_weights_id, const size_t out_id);
 }
 
 }  // namespace wasm
 }  // namespace tfjs
 
-#endif  // KERNELS_FUSEDBATCHMATMUL_H_
+#endif  // KERNELS_FUSEDMATMUL_H_
diff --git a/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul_test.cc b/tfjs-backend-wasm/src/cc/kernels/_FusedMatMul_test.cc
similarity index 55%
rename from tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul_test.cc
rename to tfjs-backend-wasm/src/cc/kernels/_FusedMatMul_test.cc
index de0c41ca6b0..96ec6f90181 100644
--- a/tfjs-backend-wasm/src/cc/kernels/FusedBatchMatMul_test.cc
+++ b/tfjs-backend-wasm/src/cc/kernels/_FusedMatMul_test.cc
@@ -18,9 +18,9 @@
 #include <vector>
 
 #include "src/cc/backend.h"
-#include "src/cc/kernels/FusedBatchMatMul.h"
+#include "src/cc/kernels/_FusedMatMul.h"
 
-TEST(FUSEDBATCH_MATMUL, xnn_operator_lfietime) {
+TEST(_FUSED_MATMUL, xnn_operator_lfietime) {
   tfjs::wasm::init();
 
   ASSERT_EQ(0, tfjs::backend::num_tensors());
@@ -55,64 +55,64 @@ TEST(FUSEDBATCH_MATMUL, xnn_operator_lfietime) {
   size_t prelu_weights_id = 0;
 
   // One new xnn_operator should be created for the first call to
-  // FusedBatchMatMul with no bias.
-  tfjs::wasm::FusedBatchMatMul(a0_id, a_shape_ptr, a_shape.size(), b0_id,
-                               b_shape_ptr, b_shape.size(),
-                               false /* transpose_a */, false /* transpose_b */,
-                               activation, bias_id, prelu_weights_id, out_id);
+  // _FusedMatMul with no bias.
+  tfjs::wasm::_FusedMatMul(a0_id, a_shape_ptr, a_shape.size(), b0_id,
+                           b_shape_ptr, b_shape.size(), false /* transpose_a */,
+                           false /* transpose_b */, activation, bias_id,
+                           prelu_weights_id, out_id);
   ASSERT_EQ(1, tfjs::backend::xnn_operator_count);
 
   // No new xnn_operators should be created for the second call to
-  // FusedBatchMatMul with the same arguments.
-  tfjs::wasm::FusedBatchMatMul(a0_id, a_shape_ptr, a_shape.size(), b0_id,
-                               b_shape_ptr, b_shape.size(),
-                               false /* transpose_a */, false /* transpose_b */,
-                               activation, bias_id, prelu_weights_id, out_id);
+  // _FusedMatMul with the same arguments.
+  tfjs::wasm::_FusedMatMul(a0_id, a_shape_ptr, a_shape.size(), b0_id,
+                           b_shape_ptr, b_shape.size(), false /* transpose_a */,
+                           false /* transpose_b */, activation, bias_id,
+                           prelu_weights_id, out_id);
   ASSERT_EQ(1, tfjs::backend::xnn_operator_count);
 
-  // No new xnn_operators should be created for calling FusedBatchMatMul
+  // No new xnn_operators should be created for calling _FusedMatMul
   // with a new a.
-  tfjs::wasm::FusedBatchMatMul(a1_id, a_shape_ptr, a_shape.size(), b0_id,
-                               b_shape_ptr, b_shape.size(),
-                               false /* transpose_a */, false /* transpose_b */,
-                               activation, bias_id, prelu_weights_id, out_id);
+  tfjs::wasm::_FusedMatMul(a1_id, a_shape_ptr, a_shape.size(), b0_id,
+                           b_shape_ptr, b_shape.size(), false /* transpose_a */,
+                           false /* transpose_b */, activation, bias_id,
+                           prelu_weights_id, out_id);
   ASSERT_EQ(1, tfjs::backend::xnn_operator_count);
 
-  // One new xnn_operator should be created for calling FusedBatchMatMul
+  // One new xnn_operator should be created for calling _FusedMatMul
   // with a new b.
-  tfjs::wasm::FusedBatchMatMul(a0_id, a_shape_ptr, a_shape.size(), b1_id,
-                               b_shape_ptr, b_shape.size(),
-                               false /* transpose_a */, false /* transpose_b */,
-                               activation, bias_id, prelu_weights_id, out_id);
+  tfjs::wasm::_FusedMatMul(a0_id, a_shape_ptr, a_shape.size(), b1_id,
+                           b_shape_ptr, b_shape.size(), false /* transpose_a */,
+                           false /* transpose_b */, activation, bias_id,
+                           prelu_weights_id, out_id);
   ASSERT_EQ(2, tfjs::backend::xnn_operator_count);
 
   // No new xnn_operators should be created for the next call to
-  // FusedBatchMatMul with the same b.
-  tfjs::wasm::FusedBatchMatMul(a0_id, a_shape_ptr, a_shape.size(), b1_id,
-                               b_shape_ptr, b_shape.size(),
-                               false /* transpose_a */, false /* transpose_b */,
-                               activation, bias_id, prelu_weights_id, out_id);
+  // _FusedMatMul with the same b.
+  tfjs::wasm::_FusedMatMul(a0_id, a_shape_ptr, a_shape.size(), b1_id,
+                           b_shape_ptr, b_shape.size(), false /* transpose_a */,
+                           false /* transpose_b */, activation, bias_id,
+                           prelu_weights_id, out_id);
   ASSERT_EQ(2, tfjs::backend::xnn_operator_count);
 
   const size_t bias1_id = 6;
   const size_t bias_size = 1;
   float bias_values[bias_size] = {1};
   tfjs::wasm::register_tensor(bias1_id, bias_size, bias_values);
-  // One new xnn_operator should be created for calling FusedBatchMatMul with a
+  // One new xnn_operator should be created for calling _FusedMatMul with a
   // new bias.
-  tfjs::wasm::FusedBatchMatMul(a0_id, a_shape_ptr, a_shape.size(), b1_id,
-                               b_shape_ptr, b_shape.size(),
-                               false /* transpose_a */, false /* transpose_b */,
-                               activation, bias1_id, prelu_weights_id, out_id);
+  tfjs::wasm::_FusedMatMul(a0_id, a_shape_ptr, a_shape.size(), b1_id,
+                           b_shape_ptr, b_shape.size(), false /* transpose_a */,
+                           false /* transpose_b */, activation, bias1_id,
+                           prelu_weights_id, out_id);
   ASSERT_EQ(3, tfjs::backend::xnn_operator_count);
 
-  // One new xnn_operator should be created for calling FusedBatchMatMul with a
+  // One new xnn_operator should be created for calling _FusedMatMul with a
   // different activation.
   const FusableActivation activation2 = FusableActivation::RELU;
-  tfjs::wasm::FusedBatchMatMul(a0_id, a_shape_ptr, a_shape.size(), b1_id,
-                               b_shape_ptr, b_shape.size(),
-                               false /* transpose_a */, false /* transpose_b */,
-                               activation2, bias1_id, prelu_weights_id, out_id);
+  tfjs::wasm::_FusedMatMul(a0_id, a_shape_ptr, a_shape.size(), b1_id,
+                           b_shape_ptr, b_shape.size(), false /* transpose_a */,
+                           false /* transpose_b */, activation2, bias1_id,
+                           prelu_weights_id, out_id);
   ASSERT_EQ(4, tfjs::backend::xnn_operator_count);
 
   // Disposing a's should not remove xnn operators.
diff --git a/tfjs-backend-wasm/src/kernels/FusedBatchMatMul.ts b/tfjs-backend-wasm/src/kernels/_FusedMatMul.ts
similarity index 75%
rename from tfjs-backend-wasm/src/kernels/FusedBatchMatMul.ts
rename to tfjs-backend-wasm/src/kernels/_FusedMatMul.ts
index 0b0253df2b1..51d4111c3af 100644
--- a/tfjs-backend-wasm/src/kernels/FusedBatchMatMul.ts
+++ b/tfjs-backend-wasm/src/kernels/_FusedMatMul.ts
@@ -21,54 +21,53 @@ import {BackendWasm} from '../backend_wasm';
 
 import {FusableActivation} from './types';
 
-interface FusedBatchMatMulInputs extends NamedTensorInfoMap {
+interface FusedMatMulInputs extends NamedTensorInfoMap {
   a: TensorInfo;
   b: TensorInfo;
   bias?: TensorInfo;
   preluActivationWeights?: TensorInfo;
 }
 
-interface FusedBatchMatMulAttrs extends NamedAttrMap {
+interface FusedMatMulAttrs extends NamedAttrMap {
   transposeA: boolean;
   transposeB: boolean;
   activation: FusableActivation;
 }
 
-let wasmFusedBatchMatMul: (
+let wasmFusedMatMul: (
     aId: number, aShape: Uint8Array, aShapeSize: number, bId: number,
     bShape: Uint8Array, bShapeSize: number, transposeA: boolean,
     transposeB: boolean, activation: number, biasId: number,
     preluActivationWeightsId: number, outId: number) => void;
 
 function setup(backend: BackendWasm) {
-  wasmFusedBatchMatMul =
-      backend.wasm.cwrap('FusedBatchMatMul', null /* void */, [
-        'number',  // a_id
-        'array',   // a_shape
-        'number',  // a_shape.length
-        'number',  // b_id
-        'array',   // b_shape
-        'number',  // b_shape.length
-        'number',  // transpose_a
-        'number',  // transpose_b
-        'number',  // activation
-        'number',  // biasId
-        'number',  // preluActivationWeightsId
-        'number'   // out_id
-      ]);
+  wasmFusedMatMul = backend.wasm.cwrap('_FusedMatMul', null /* void */, [
+    'number',  // a_id
+    'array',   // a_shape
+    'number',  // a_shape.length
+    'number',  // b_id
+    'array',   // b_shape
+    'number',  // b_shape.length
+    'number',  // transpose_a
+    'number',  // transpose_b
+    'number',  // activation
+    'number',  // biasId
+    'number',  // preluActivationWeightsId
+    'number'   // out_id
+  ]);
 }
 
-function fusedBatchMatMul(args: {
-  inputs: FusedBatchMatMulInputs,
+function fusedMatMul(args: {
+  inputs: FusedMatMulInputs,
   backend: BackendWasm,
-  attrs: FusedBatchMatMulAttrs
+  attrs: FusedMatMulAttrs
 }) {
   const {inputs, backend, attrs} = args;
   const {a, b, bias, preluActivationWeights} = inputs;
 
   if (a.dtype !== 'float32' || b.dtype !== 'float32') {
     throw new Error(
-        `FusedBatchMatMul for non non-float32 tensors not yet supported.`);
+        `_FusedMatMul for non non-float32 tensors not yet supported.`);
   }
 
   const {transposeA, transposeB, activation} = attrs;
@@ -80,7 +79,7 @@ function fusedBatchMatMul(args: {
     const biasData = backend.dataIdMap.get(bias.dataId);
     if (biasData.shape.length !== 1) {
       throw new Error(
-          `FusedBatchMatMul only supports rank-1 bias but got ` +
+          `_FusedMatMul only supports rank-1 bias but got ` +
           `rank ${biasData.shape.length}.`);
     }
     biasId = biasData.id;
@@ -106,7 +105,7 @@ function fusedBatchMatMul(args: {
   const aShapeBytes = new Uint8Array(new Int32Array(a.shape).buffer);
   const bShapeBytes = new Uint8Array(new Int32Array(b.shape).buffer);
 
-  wasmFusedBatchMatMul(
+  wasmFusedMatMul(
       aId, aShapeBytes, a.shape.length, bId, bShapeBytes, b.shape.length,
       transposeA, transposeB, fusedActivation, biasId, preluActivationWeightsId,
       outId);
@@ -115,8 +114,8 @@ function fusedBatchMatMul(args: {
 }
 
 registerKernel({
-  kernelName: 'FusedBatchMatMul',
+  kernelName: '_FusedMatMul',
   backendName: 'wasm',
   setupFunc: setup,
-  kernelFunc: fusedBatchMatMul
+  kernelFunc: fusedMatMul
 });
diff --git a/tfjs-backend-wasm/src/kernels/all_kernels.ts b/tfjs-backend-wasm/src/kernels/all_kernels.ts
index 602eb6f91e1..56cec9e79f0 100644
--- a/tfjs-backend-wasm/src/kernels/all_kernels.ts
+++ b/tfjs-backend-wasm/src/kernels/all_kernels.ts
@@ -18,6 +18,7 @@
 // We explicitly import the modular kernels so they get registered in the
 // global registry when we compile the library. A modular build would replace
 // the contents of this file and import only the kernels that are needed.
+import './_FusedMatMul';
 import './Abs';
 import './Add';
 import './AddN';
@@ -37,7 +38,6 @@ import './FloorDiv';
 import './FusedBatchNorm';
 import './FusedConv2D';
 import './FusedDepthwiseConv2D';
-import './FusedBatchMatMul';
 import './Gather';
 import './GatherNd';
 import './Greater';
diff --git a/tfjs-core/src/ops/fused_ops.ts b/tfjs-core/src/ops/fused_ops.ts
index dce09ebbf42..0977cbae2d4 100644
--- a/tfjs-core/src/ops/fused_ops.ts
+++ b/tfjs-core/src/ops/fused_ops.ts
@@ -248,7 +248,7 @@ function fusedMatMul_<T extends Tensor>({
         save([a3D, b3D, y]);
         return y;
       },
-      inputs, grad, 'FusedBatchMatMul', {transposeA, transposeB, activation},
+      inputs, grad, '_FusedMatMul', {transposeA, transposeB, activation},
       inputsToSave, outputsToSave);
   return res.reshape(outShape) as T;
 }

From 241efac1dbfdae9eb08b3165ef05a9f0d516643f Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Mon, 9 Mar 2020 16:46:01 -0400
Subject: [PATCH 29/35] rename tfjs::wasm::batchMatMul to fused_batch_mat_mul

---
 tfjs-backend-wasm/src/cc/batch_mat_mul_impl.cc   | 13 +++++++------
 tfjs-backend-wasm/src/cc/batch_mat_mul_impl.h    | 13 +++++++------
 tfjs-backend-wasm/src/cc/kernels/BatchMatMul.cc  |  6 +++---
 tfjs-backend-wasm/src/cc/kernels/_FusedMatMul.cc |  6 +++---
 4 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/tfjs-backend-wasm/src/cc/batch_mat_mul_impl.cc b/tfjs-backend-wasm/src/cc/batch_mat_mul_impl.cc
index 29759a17d0c..2475b50f81e 100644
--- a/tfjs-backend-wasm/src/cc/batch_mat_mul_impl.cc
+++ b/tfjs-backend-wasm/src/cc/batch_mat_mul_impl.cc
@@ -259,12 +259,13 @@ void slow_batch_matmul(const size_t a_id, const size_t* a_shape_ptr,
 
 namespace tfjs {
 namespace wasm {
-void batchMatMul(const size_t a_id, const size_t* a_shape_ptr,
-                 const size_t a_shape_len, const size_t b_id,
-                 const size_t* b_shape_ptr, const size_t b_shape_len,
-                 const bool transpose_a, const bool transpose_b,
-                 const FusableActivation activation, const size_t bias_id,
-                 const size_t prelu_weights_id, const size_t out_id) {
+void fused_batch_mat_mul(const size_t a_id, const size_t* a_shape_ptr,
+                         const size_t a_shape_len, const size_t b_id,
+                         const size_t* b_shape_ptr, const size_t b_shape_len,
+                         const bool transpose_a, const bool transpose_b,
+                         const FusableActivation activation,
+                         const size_t bias_id, const size_t prelu_weights_id,
+                         const size_t out_id) {
   FusableActivation clamp_method = activation;
   if (activation == FusableActivation::PRELU) {
     clamp_method = FusableActivation::LINEAR;
diff --git a/tfjs-backend-wasm/src/cc/batch_mat_mul_impl.h b/tfjs-backend-wasm/src/cc/batch_mat_mul_impl.h
index 8df58ada339..476eeb5d088 100644
--- a/tfjs-backend-wasm/src/cc/batch_mat_mul_impl.h
+++ b/tfjs-backend-wasm/src/cc/batch_mat_mul_impl.h
@@ -20,12 +20,13 @@
 namespace tfjs {
 namespace wasm {
 
-void batchMatMul(const size_t a_id, const size_t* a_shape_ptr,
-                 const size_t a_shape_len, const size_t b_id,
-                 const size_t* b_shape_ptr, const size_t b_shape_len,
-                 const bool transpose_a, const bool transpose_b,
-                 const FusableActivation activation, const size_t bias_id,
-                 const size_t prelu_weights_id, const size_t out_id);
+void fused_batch_mat_mul(const size_t a_id, const size_t* a_shape_ptr,
+                         const size_t a_shape_len, const size_t b_id,
+                         const size_t* b_shape_ptr, const size_t b_shape_len,
+                         const bool transpose_a, const bool transpose_b,
+                         const FusableActivation activation,
+                         const size_t bias_id, const size_t prelu_weights_id,
+                         const size_t out_id);
 
 }  // namespace wasm
 }  // namespace tfjs
diff --git a/tfjs-backend-wasm/src/cc/kernels/BatchMatMul.cc b/tfjs-backend-wasm/src/cc/kernels/BatchMatMul.cc
index 9aa757a1e3c..32e05d3e54b 100644
--- a/tfjs-backend-wasm/src/cc/kernels/BatchMatMul.cc
+++ b/tfjs-backend-wasm/src/cc/kernels/BatchMatMul.cc
@@ -38,9 +38,9 @@ void BatchMatMul(const size_t a_id, const size_t* a_shape_ptr,
   const size_t bias_id = 0;
   const size_t prelu_weights_id = 0;
   const FusableActivation activation = FusableActivation::LINEAR;
-  tfjs::wasm::batchMatMul(a_id, a_shape_ptr, a_shape_len, b_id, b_shape_ptr,
-                          b_shape_len, transpose_a, transpose_b, activation,
-                          bias_id, prelu_weights_id, out_id);
+  tfjs::wasm::fused_batch_mat_mul(
+      a_id, a_shape_ptr, a_shape_len, b_id, b_shape_ptr, b_shape_len,
+      transpose_a, transpose_b, activation, bias_id, prelu_weights_id, out_id);
 }
 
 }  // extern "C"
diff --git a/tfjs-backend-wasm/src/cc/kernels/_FusedMatMul.cc b/tfjs-backend-wasm/src/cc/kernels/_FusedMatMul.cc
index 1e3b43368f5..5eef8ff02c1 100644
--- a/tfjs-backend-wasm/src/cc/kernels/_FusedMatMul.cc
+++ b/tfjs-backend-wasm/src/cc/kernels/_FusedMatMul.cc
@@ -36,9 +36,9 @@ void _FusedMatMul(const size_t a_id, const size_t* a_shape_ptr,
                   const bool transpose_a, const bool transpose_b,
                   const FusableActivation activation, const size_t bias_id,
                   const size_t prelu_weights_id, const size_t out_id) {
-  tfjs::wasm::batchMatMul(a_id, a_shape_ptr, a_shape_len, b_id, b_shape_ptr,
-                          b_shape_len, transpose_a, transpose_b, activation,
-                          bias_id, prelu_weights_id, out_id);
+  tfjs::wasm::fused_batch_mat_mul(
+      a_id, a_shape_ptr, a_shape_len, b_id, b_shape_ptr, b_shape_len,
+      transpose_a, transpose_b, activation, bias_id, prelu_weights_id, out_id);
 }
 
 }  // extern "C"

From 98ae17fc1f84d4a95ea7b85f89110eda363fd35b Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Mon, 9 Mar 2020 16:49:56 -0400
Subject: [PATCH 30/35] rm emscripten.h include from batch_mat_mul_impl.cc

---
 tfjs-backend-wasm/src/cc/batch_mat_mul_impl.cc | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tfjs-backend-wasm/src/cc/batch_mat_mul_impl.cc b/tfjs-backend-wasm/src/cc/batch_mat_mul_impl.cc
index 2475b50f81e..3cf5b15e979 100644
--- a/tfjs-backend-wasm/src/cc/batch_mat_mul_impl.cc
+++ b/tfjs-backend-wasm/src/cc/batch_mat_mul_impl.cc
@@ -12,10 +12,6 @@
  * limitations under the License.
  * ===========================================================================*/
 
-#ifdef __EMSCRIPTEN__
-#include <emscripten.h>
-#endif
-
 #include <xnnpack.h>
 #include <algorithm>
 #include <cmath>

From 416fdf69e3748e10b4511cc13a5b7056e8512ea9 Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Mon, 9 Mar 2020 17:12:17 -0400
Subject: [PATCH 31/35] use inline

---
 tfjs-backend-wasm/src/cc/kernels/Pow.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tfjs-backend-wasm/src/cc/kernels/Pow.cc b/tfjs-backend-wasm/src/cc/kernels/Pow.cc
index 646fe355fa1..ccb57c116aa 100644
--- a/tfjs-backend-wasm/src/cc/kernels/Pow.cc
+++ b/tfjs-backend-wasm/src/cc/kernels/Pow.cc
@@ -42,7 +42,7 @@ void Pow(const size_t a_id, const size_t* a_shape_ptr, const size_t a_shape_len,
          const DType dtype, const size_t out_id) {
   switch (dtype) {
     case DType::float32:
-      binary_f32(a_id, b_id, out_id, power<float>);
+      binary_f32(a_id, b_id, out_id, pow<float>);
       break;
     case DType::int32:
       binary_i32(a_id, b_id, out_id, power<int32_t>);

From 5b2c51d94497f3b7d96b0be249b19da3f0d62c01 Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Mon, 9 Mar 2020 17:13:55 -0400
Subject: [PATCH 32/35] lint

---
 tfjs-backend-wasm/src/cc/kernels/_FusedMatMul.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tfjs-backend-wasm/src/cc/kernels/_FusedMatMul.h b/tfjs-backend-wasm/src/cc/kernels/_FusedMatMul.h
index f300781ff3a..98501aabd55 100644
--- a/tfjs-backend-wasm/src/cc/kernels/_FusedMatMul.h
+++ b/tfjs-backend-wasm/src/cc/kernels/_FusedMatMul.h
@@ -12,8 +12,8 @@
  * limitations under the License.
  * ===========================================================================*/
 
-#ifndef KERNELS_FUSEDMATMUL_H_
-#define KERNELS_FUSEDMATMUL_H_
+#ifndef KERNELS__FUSEDMATMUL_H_
+#define KERNELS__FUSEDMATMUL_H_
 
 #include <cstddef>
 
@@ -32,4 +32,4 @@ void _FusedMatMul(const size_t a_id, const size_t* a_shape_ptr,
 }  // namespace wasm
 }  // namespace tfjs
 
-#endif  // KERNELS_FUSEDMATMUL_H_
+#endif  // KERNELS__FUSEDMATMUL_H_

From 30e1278bdfce36f00e701e3d0072db6016dfa220 Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Mon, 9 Mar 2020 17:14:55 -0400
Subject: [PATCH 33/35] lint

---
 tfjs-backend-wasm/src/cc/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tfjs-backend-wasm/src/cc/BUILD b/tfjs-backend-wasm/src/cc/BUILD
index eca4b8f81ce..e3951df8944 100644
--- a/tfjs-backend-wasm/src/cc/BUILD
+++ b/tfjs-backend-wasm/src/cc/BUILD
@@ -161,7 +161,6 @@ tfjs_cc_library(
 tfjs_cc_library(
     name = "all_kernels",
     deps = [
-        ":_FusedMatMul",
         ":Abs",
         ":Add",
         ":AddN",
@@ -206,6 +205,7 @@ tfjs_cc_library(
         ":Sub",
         ":Tile",
         ":Transpose",
+        ":_FusedMatMul",
     ],
 )
 

From a030effc97489856fa7ff8a7a6af32430214d519 Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Tue, 10 Mar 2020 07:56:14 -0400
Subject: [PATCH 34/35] hbn: restore power<float> for float32 Pow (reverts PATCH 31/35)

---
 tfjs-backend-wasm/src/cc/kernels/Pow.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tfjs-backend-wasm/src/cc/kernels/Pow.cc b/tfjs-backend-wasm/src/cc/kernels/Pow.cc
index ccb57c116aa..646fe355fa1 100644
--- a/tfjs-backend-wasm/src/cc/kernels/Pow.cc
+++ b/tfjs-backend-wasm/src/cc/kernels/Pow.cc
@@ -42,7 +42,7 @@ void Pow(const size_t a_id, const size_t* a_shape_ptr, const size_t a_shape_len,
          const DType dtype, const size_t out_id) {
   switch (dtype) {
     case DType::float32:
-      binary_f32(a_id, b_id, out_id, pow<float>);
+      binary_f32(a_id, b_id, out_id, power<float>);
       break;
     case DType::int32:
       binary_i32(a_id, b_id, out_id, power<int32_t>);

From 7c899d93fe0f458d2c0df2633545bc8cc76acbf0 Mon Sep 17 00:00:00 2001
From: Ann Yuan <annyuan@gmail.com>
Date: Tue, 10 Mar 2020 08:08:09 -0400
Subject: [PATCH 35/35] rename kernel func

---
 tfjs-backend-wasm/src/kernels/_FusedMatMul.ts | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tfjs-backend-wasm/src/kernels/_FusedMatMul.ts b/tfjs-backend-wasm/src/kernels/_FusedMatMul.ts
index 51d4111c3af..8d5c4130751 100644
--- a/tfjs-backend-wasm/src/kernels/_FusedMatMul.ts
+++ b/tfjs-backend-wasm/src/kernels/_FusedMatMul.ts
@@ -57,7 +57,7 @@ function setup(backend: BackendWasm) {
   ]);
 }
 
-function fusedMatMul(args: {
+function fusedBatchMatMul(args: {
   inputs: FusedMatMulInputs,
   backend: BackendWasm,
   attrs: FusedMatMulAttrs
@@ -117,5 +117,5 @@ registerKernel({
   kernelName: '_FusedMatMul',
   backendName: 'wasm',
   setupFunc: setup,
-  kernelFunc: fusedMatMul
+  kernelFunc: fusedBatchMatMul
 });