diff --git a/.vscode/settings.json b/.vscode/settings.json
index ba34977dc31..a45d2ae7b26 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -127,6 +127,7 @@
     "queue": "cpp",
     "stack": "cpp",
     "*.ipp": "cpp",
-    "forward_list": "cpp"
+    "forward_list": "cpp",
+    "hash_map": "cpp"
   }
 }
diff --git a/tfjs-backend-wasm/scripts/cpplint.js b/tfjs-backend-wasm/scripts/cpplint.js
index 947dcf88e00..1a2cb633880 100755
--- a/tfjs-backend-wasm/scripts/cpplint.js
+++ b/tfjs-backend-wasm/scripts/cpplint.js
@@ -29,4 +29,4 @@ console.log(result);
 const cwd = process.cwd() + '/' + CC_FILEPATH;
 const filenameArgument = result.join(' ');

-exec(`python tools/cpplint.py --root ${cwd} ${filenameArgument}`);
+exec(`python2 tools/cpplint.py --root ${cwd} ${filenameArgument}`);
diff --git a/tfjs-backend-wasm/src/backend_wasm.ts b/tfjs-backend-wasm/src/backend_wasm.ts
index 3b1b71677e6..ed660469b84 100644
--- a/tfjs-backend-wasm/src/backend_wasm.ts
+++ b/tfjs-backend-wasm/src/backend_wasm.ts
@@ -124,16 +124,18 @@ export class BackendWasm extends KernelBackend {
     return {dataId, shape, dtype};
   }

-  typedArrayFromHeap(offset: number, dtype: DataType, size: number):
+  typedArrayFromHeap({shape, dtype, dataId}: TensorInfo):
       backend_util.TypedArray {
     const buffer = this.wasm.HEAPU8.buffer;
+    const {memoryOffset} = this.dataIdMap.get(dataId);
+    const size = util.sizeFromShape(shape);
     switch (dtype) {
       case 'float32':
-        return new Float32Array(buffer, offset, size);
+        return new Float32Array(buffer, memoryOffset, size);
       case 'int32':
-        return new Int32Array(buffer, offset, size);
+        return new Int32Array(buffer, memoryOffset, size);
       case 'bool':
-        return new Uint8Array(buffer, offset, size);
+        return new Uint8Array(buffer, memoryOffset, size);
       default:
         throw new Error(`Uknown dtype ${dtype}`);
     }
diff --git a/tfjs-backend-wasm/src/cc/BUILD b/tfjs-backend-wasm/src/cc/BUILD
index b4d1fce0d94..bee5ce2b546 100644
--- a/tfjs-backend-wasm/src/cc/BUILD
+++ b/tfjs-backend-wasm/src/cc/BUILD
@@ -69,6 +69,7 @@ tfjs_cc_library(
         ":Min",
         ":Sigmoid",
         ":Sub",
+        ":Transpose",
     ]
 )

@@ -185,9 +186,19 @@ tfjs_cc_library(
     ],
 )

+tfjs_cc_library(
+    name = "Transpose",
+    srcs = ["kernels/Transpose.cc"],
+    deps = [
+        ":backend",
+        ":util",
+    ],
+)
+
 tfjs_cc_library(
     name = "util",
-    srcs = ["util.h"],
+    hdrs = ["util.h"],
+    srcs = ["util.cc"],
 )

 test_suite(
@@ -195,10 +206,11 @@ test_suite(
 )

 tfjs_unit_test(
-    name = "backend_test",
-    srcs = ["backend_test.cc"],
+    name = "backend_tests",
+    srcs = glob(["*_test.cc"]),
     deps = [
         ":backend",
+        ":util",
         ":Prelu",
     ]
 )
diff --git a/tfjs-backend-wasm/src/cc/backend.cc b/tfjs-backend-wasm/src/cc/backend.cc
index ddaa10cab6b..390e96bc0df 100644
--- a/tfjs-backend-wasm/src/cc/backend.cc
+++ b/tfjs-backend-wasm/src/cc/backend.cc
@@ -37,7 +37,7 @@ std::unordered_map<int, std::vector<tfjs::backend::DisposeCallback>>
 namespace tfjs {
 namespace backend {

-TensorInfo get_tensor_info(int tensor_id) { return data.at(tensor_id); }
+TensorInfo &get_tensor_info(int tensor_id) { return data.at(tensor_id); }

 int xnn_operator_count = 0;
diff --git a/tfjs-backend-wasm/src/cc/backend.h b/tfjs-backend-wasm/src/cc/backend.h
index 096739520f0..391c552865a 100644
--- a/tfjs-backend-wasm/src/cc/backend.h
+++ b/tfjs-backend-wasm/src/cc/backend.h
@@ -45,7 +45,7 @@ namespace tfjs {
 namespace backend {
 // Returns the tensor information object associated with a given tensor_id
 // bucket.
-TensorInfo get_tensor_info(int tensor_id);
+TensorInfo &get_tensor_info(int tensor_id);

 // Registers a function callback to be called when a tensor with a given ID is
 // disposed.
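One consequence of `get_tensor_info` now returning `TensorInfo &` is worth spelling out: kernels read the shape, dtype, and buffer pointers out of the returned object on every call, and returning by value would copy the backing `std::vector` each time. A minimal sketch of the pattern, using a simplified, hypothetical stand-in for the real `TensorInfo` struct declared in backend.h:

```cpp
#include <cstdio>
#include <unordered_map>
#include <vector>

// Hypothetical, stripped-down stand-in for tfjs's TensorInfo; the real struct
// also carries a dtype tag and a union of typed buffer pointers.
struct TensorInfo {
  std::vector<int> shape;
  float* f32;
};

std::unordered_map<int, TensorInfo> data;

// Returning a reference avoids copying the shape vector on every lookup and
// lets callers observe or mutate the stored entry in place.
TensorInfo& get_tensor_info(int tensor_id) { return data.at(tensor_id); }

int main() {
  data[1] = TensorInfo{{2, 2}, nullptr};
  TensorInfo& info = get_tensor_info(1);
  info.shape.push_back(3);  // Mutates the stored entry, not a copy.
  std::printf("rank: %zu\n", data.at(1).shape.size());  // Prints "rank: 3".
  return 0;
}
```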
diff --git a/tfjs-backend-wasm/src/cc/kernels/BatchMatMul.cc b/tfjs-backend-wasm/src/cc/kernels/BatchMatMul.cc
index 09076dd3bce..d387bbc21d7 100644
--- a/tfjs-backend-wasm/src/cc/kernels/BatchMatMul.cc
+++ b/tfjs-backend-wasm/src/cc/kernels/BatchMatMul.cc
@@ -51,7 +51,7 @@ void BatchMatMul(int a_id, int b_id, int shared_dim, int left_dim,
   // Zero out the output buffer because it might have been used before.
   std::fill(out_buf, out_buf + batch_dim * size, 0);

-  for (int b = 0; b < batch_dim; b++) {
+  for (int b = 0; b < batch_dim; ++b) {
     for (int i0 = 0; i0 < left_dim; i0 += kBlockSize) {
       for (int j0 = 0; j0 < right_dim; j0 += kBlockSize) {
         for (int k0 = 0; k0 < shared_dim; k0 += kBlockSize) {
@@ -60,11 +60,11 @@
           int j_block = std::min(j0 + kBlockSize, right_dim);
           int k_block = std::min(k0 + kBlockSize, shared_dim);

-          for (int i = i0; i < i_block; i++) {
-            for (int j = j0; j < j_block; j++) {
+          for (int i = i0; i < i_block; ++i) {
+            for (int j = j0; j < j_block; ++j) {
               float sum = 0.0;

-              for (int k = k0; k < k_block; k++) {
+              for (int k = k0; k < k_block; ++k) {
                 sum +=
                     a_buf[b * a_batch + i * a_outer_step + k * a_inner_step] *
                     b_buf[k * b_inner_step + j * b_outer_step + b * b_batch];
diff --git a/tfjs-backend-wasm/src/cc/kernels/Transpose.cc b/tfjs-backend-wasm/src/cc/kernels/Transpose.cc
new file mode 100644
index 00000000000..83a67cfca77
--- /dev/null
+++ b/tfjs-backend-wasm/src/cc/kernels/Transpose.cc
@@ -0,0 +1,305 @@
+/* Copyright 2019 Google Inc. All Rights Reserved.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ===========================================================================*/
+
+#ifdef __EMSCRIPTEN__
+#include <emscripten.h>
+#endif
+
+#include "src/cc/backend.h"
+#include "src/cc/util.h"
+
+namespace {
+
+// Optimized 2D transpose that uses direct pointer arithmetic instead of
+// bracket indexing.
+template <typename T>
+void transpose_2d(const T* x_data, const std::vector<int>& x_shape,
+                  T* out_data) {
+  const int d0 = x_shape[0];
+  const int d1 = x_shape[1];
+  const T* input = x_data;
+  for (int i = 0; i < d0; ++i) {
+    T* output = out_data + i;
+    for (int j = 0; j < d1; ++j) {
+      *output = *input;
+      output += d0;
+      ++input;
+    }
+  }
+}
+
+// Optimized 3D transpose. Reference:
+// https://github.com/tensorflow/tensorflow/blob/87388b7b6040bbf0baa67e4ef1ddc3e930ff6edd/tensorflow/lite/kernels/internal/optimized/optimized_ops.h#L7248
+template <typename T>
+void transpose_3d(const T* x_data, const std::vector<int>& x_shape,
+                  const std::vector<int>& perm, T* out_data) {
+  int s1, s2, s3;
+  s1 = x_shape[0];
+  s2 = x_shape[1];
+  s3 = x_shape[2];
+
+  int p1, p2, p3;
+  if (perm[0] == 2) {
+    p1 = 1;
+  } else if (perm[1] == 2) {
+    p2 = 1;
+  } else {
+    p3 = 1;
+  }
+
+  if (perm[0] == 1) {
+    p1 = s3;
+  } else if (perm[1] == 1) {
+    p2 = s3;
+  } else {
+    p3 = s3;
+  }
+
+  if (perm[0] == 0) {
+    p1 = s2 * s3;
+  } else if (perm[1] == 0) {
+    p2 = s2 * s3;
+  } else {
+    p3 = s2 * s3;
+  }
+
+  int out_shape[3];
+  out_shape[0] = x_shape[perm[0]];
+  out_shape[1] = x_shape[perm[1]];
+  out_shape[2] = x_shape[perm[2]];
+  const int out_stride1 = out_shape[1] * out_shape[2];
+  const int out_stride2 = out_shape[2];
+
+  for (int i1 = 0; i1 < out_shape[0]; ++i1) {
+    for (int i2 = 0; i2 < out_shape[1]; ++i2) {
+      for (int i3 = 0; i3 < out_shape[2]; ++i3) {
+        const int i = tfjs::util::offset(i1, i2, i3, 0, p1, p2, p3);
+        const int o = tfjs::util::offset(i1, i2, i3, out_stride1, out_stride2);
+        out_data[o] = x_data[i];
+      }
+    }
+  }
+}
+
+// Optimized 4D transpose. For reference see `transpose_3d`.
+template <typename T>
+void transpose_4d(const T* x_data, const std::vector<int>& x_shape,
+                  const std::vector<int>& perm, T* out_data) {
+  int s1, s2, s3, s4;
+  s1 = x_shape[0];
+  s2 = x_shape[1];
+  s3 = x_shape[2];
+  s4 = x_shape[3];
+
+  int p1, p2, p3, p4;
+  if (perm[0] == 3) {
+    p1 = 1;
+  } else if (perm[1] == 3) {
+    p2 = 1;
+  } else if (perm[2] == 3) {
+    p3 = 1;
+  } else {
+    p4 = 1;
+  }
+
+  if (perm[0] == 2) {
+    p1 = s4;
+  } else if (perm[1] == 2) {
+    p2 = s4;
+  } else if (perm[2] == 2) {
+    p3 = s4;
+  } else {
+    p4 = s4;
+  }
+
+  if (perm[0] == 1) {
+    p1 = s3 * s4;
+  } else if (perm[1] == 1) {
+    p2 = s3 * s4;
+  } else if (perm[2] == 1) {
+    p3 = s3 * s4;
+  } else {
+    p4 = s3 * s4;
+  }
+
+  if (perm[0] == 0) {
+    p1 = s2 * s3 * s4;
+  } else if (perm[1] == 0) {
+    p2 = s2 * s3 * s4;
+  } else if (perm[2] == 0) {
+    p3 = s2 * s3 * s4;
+  } else {
+    p4 = s2 * s3 * s4;
+  }
+
+  int out_shape[4];
+  out_shape[0] = x_shape[perm[0]];
+  out_shape[1] = x_shape[perm[1]];
+  out_shape[2] = x_shape[perm[2]];
+  out_shape[3] = x_shape[perm[3]];
+  const int out_stride1 = out_shape[1] * out_shape[2] * out_shape[3];
+  const int out_stride2 = out_shape[2] * out_shape[3];
+  const int out_stride3 = out_shape[3];
+
+  for (int i1 = 0; i1 < out_shape[0]; ++i1) {
+    for (int i2 = 0; i2 < out_shape[1]; ++i2) {
+      for (int i3 = 0; i3 < out_shape[2]; ++i3) {
+        for (int i4 = 0; i4 < out_shape[3]; ++i4) {
+          const int i = tfjs::util::offset(i1, i2, i3, i4, 0, p1, p2, p3, p4);
+          const int o = tfjs::util::offset(i1, i2, i3, i4, out_stride1,
+                                           out_stride2, out_stride3);
+          out_data[o] = x_data[i];
+        }
+      }
+    }
+  }
+}
+
+// Generic transpose implementation for n-dim tensors.
+template <typename T>
+void slow_transpose_nd(const T* x_data, const std::vector<int>& x_shape,
+                       const std::vector<int>& perm, T* out_data) {
+  const int size = tfjs::util::size_from_shape(x_shape);
+  const auto x_strides = tfjs::util::compute_strides(x_shape);
+  std::vector<int> out_shape(x_shape.size());
+  for (int i = 0; i < x_shape.size(); ++i) {
+    out_shape[i] = x_shape[perm[i]];
+  }
+  const auto out_strides = tfjs::util::compute_strides(out_shape);
+
+  for (int i = 0; i < size; ++i) {
+    const auto loc = tfjs::util::offset_to_loc(i, x_strides);
+
+    // Permute location.
+    std::vector<int> new_loc(loc.size());
+    for (int i = 0; i < loc.size(); ++i) {
+      new_loc[i] = loc[perm[i]];
+    }
+
+    const int new_i = tfjs::util::loc_to_offset(new_loc, out_strides);
+    out_data[new_i] = x_data[i];
+  }
+}
+
+// `flatten` finds the dimensions that can be flattened, shrinks the given
+// shape and perm parameter to reflect only the non-flattened dimensions, and
+// returns the total size of the non-flattened dimensions.
+//
+// E.g., given shape [2, 3, 4, 5] and perm [0, 1, 3, 2], this method flattens
+// the first two dimensions and returns a new shape [4, 5], a new perm [1, 0],
+// and 4*5=20 as the total size of the non-flattened dims. Reference:
+// https://github.com/tensorflow/tensorflow/blob/1f404fcaad58bf61a107d4fa7c4f6004168a50fa/tensorflow/lite/kernels/internal/transpose_utils.h#L42
+int flatten(const std::vector<int>& x_shape, const std::vector<int>& perm,
+            std::vector<int>* new_x_shape_ptr, std::vector<int>* new_perm_ptr) {
+  auto& new_input_shape = *new_x_shape_ptr;
+  auto& new_perm = *new_perm_ptr;
+
+  // Calculate the total size of the non-flattened dimensions.
+  int num_dims_to_skip = 0;
+  int rank = perm.size();
+  int flat_size = tfjs::util::size_from_shape(x_shape);
+  for (int i = 0; i < rank; ++i) {
+    if (perm[i] == i) {
+      flat_size /= x_shape[i];
+      ++num_dims_to_skip;
+    } else {
+      break;
+    }
+  }
+  // Shrink the shapes and re-calculate the perm parameter.
+  const int new_rank = rank - num_dims_to_skip;
+  new_perm.resize(new_rank);
+  new_input_shape.resize(new_rank);
+
+  for (int i = num_dims_to_skip; i < rank; ++i) {
+    new_input_shape[i - num_dims_to_skip] = x_shape[i];
+    new_perm[i - num_dims_to_skip] = perm[i];
+  }
+  for (int i = 0; i < new_rank; ++i) {
+    int min_val_idx = -1;
+    for (int j = 0; j < new_rank; ++j) {
+      if (new_perm[j] >= i &&
+          (min_val_idx == -1 || new_perm[min_val_idx] > new_perm[j])) {
+        min_val_idx = j;
+      }
+    }
+    new_perm[min_val_idx] = i;
+  }
+  return flat_size;
+}
+
+template <typename T>
+void transpose_impl(const T* x_data, const std::vector<int>& x_shape,
+                    const std::vector<int>& perm, T* out_data) {
+  if (x_shape.size() == 2) {
+    transpose_2d(x_data, x_shape, out_data);
+  } else if (x_shape.size() == 3) {
+    transpose_3d(x_data, x_shape, perm, out_data);
+  } else if (x_shape.size() == 4) {
+    transpose_4d(x_data, x_shape, perm, out_data);
+  } else {
+    slow_transpose_nd(x_data, x_shape, perm, out_data);
+  }
+}
+
+template <typename T>
+void transpose(const T* x_data, const std::vector<int>& x_shape,
+               const std::vector<int>& perm, T* out_data) {
+  std::vector<int> new_x_shape;
+  std::vector<int> new_perm;
+  // Try to reduce the rank of the transpose by flattening any outer-most
+  // dimensions.
+  const int non_flatten_size = flatten(x_shape, perm, &new_x_shape, &new_perm);
+  const int total_size = tfjs::util::size_from_shape(x_shape);
+  for (int offset = 0; offset < total_size; offset += non_flatten_size) {
+    transpose_impl(x_data + offset, new_x_shape, new_perm, out_data + offset);
+  }
+}
+
+}  // namespace
+
+namespace tfjs {
+namespace wasm {
+// We use the C-style API to interface with JavaScript.
+extern "C" {
+
+#ifdef __EMSCRIPTEN__
+EMSCRIPTEN_KEEPALIVE
+#endif
+void Transpose(int x_id, int* x_shape_ptr, int x_shape_length, int out_id,
+               int* perm_ptr, int perm_length) {
+  auto x_shape = std::vector<int>(x_shape_ptr, x_shape_ptr + x_shape_length);
+  auto perm = std::vector<int>(perm_ptr, perm_ptr + perm_length);
+  const TensorInfo x_info = backend::get_tensor_info(x_id);
+  const TensorInfo out_info = backend::get_tensor_info(out_id);
+
+  switch (x_info.dtype) {
+    case DType::float32:
+      transpose(x_info.buf.f32, x_shape, perm, out_info.buf.f32);
+      break;
+    case DType::int32:
+      transpose(x_info.buf.i32, x_shape, perm, out_info.buf.i32);
+      break;
+    case DType::boolean:
+      transpose(x_info.buf.b, x_shape, perm, out_info.buf.b);
+      break;
+    default:
+      util::warn("Transpose for tensor id %d failed. Unknown dtype %d", x_id,
+                 x_info.dtype);
+  }
+}
+
+}  // extern "C"
+}  // namespace wasm
+}  // namespace tfjs
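The `flatten` step above is the workhorse optimization: leading dimensions that the permutation leaves untouched are peeled off into the batch loop in `transpose`, so the per-slice kernel runs at the lowest possible rank. A standalone sketch of that behavior for the shapes in the comment (this reimplementation renumbers by subtracting the skip count, which is equivalent to the min-search loop in the diff because the surviving perm values are exactly `skip..rank-1`):

```cpp
#include <cassert>
#include <vector>

int size_from_shape(const std::vector<int>& shape) {
  int size = 1;
  for (int d : shape) size *= d;
  return size;
}

// Skip leading dims where perm[i] == i (they can be batched over), then
// renumber the remaining perm entries into a dense 0..new_rank-1 range.
int flatten(const std::vector<int>& shape, const std::vector<int>& perm,
            std::vector<int>* new_shape, std::vector<int>* new_perm) {
  int skip = 0;
  int flat_size = size_from_shape(shape);
  while (skip < static_cast<int>(perm.size()) && perm[skip] == skip) {
    flat_size /= shape[skip];
    ++skip;
  }
  for (int i = skip; i < static_cast<int>(perm.size()); ++i) {
    new_shape->push_back(shape[i]);
    new_perm->push_back(perm[i] - skip);  // Dense renumbering.
  }
  return flat_size;
}

int main() {
  // The example from the comment: shape [2, 3, 4, 5] with perm [0, 1, 3, 2]
  // flattens the first two dims, leaving [4, 5] slices transposed by [1, 0],
  // each holding 4 * 5 = 20 elements.
  std::vector<int> new_shape, new_perm;
  int flat = flatten({2, 3, 4, 5}, {0, 1, 3, 2}, &new_shape, &new_perm);
  assert(flat == 20);
  assert(new_shape == (std::vector<int>{4, 5}));
  assert(new_perm == (std::vector<int>{1, 0}));
  return 0;
}
```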
+extern "C" { + +#ifdef __EMSCRIPTEN__ +EMSCRIPTEN_KEEPALIVE +#endif +void Transpose(int x_id, int* x_shape_ptr, int x_shape_length, int out_id, + int* perm_ptr, int perm_length) { + auto x_shape = std::vector(x_shape_ptr, x_shape_ptr + x_shape_length); + auto perm = std::vector(perm_ptr, perm_ptr + perm_length); + const TensorInfo x_info = backend::get_tensor_info(x_id); + const TensorInfo out_info = backend::get_tensor_info(out_id); + + switch (x_info.dtype) { + case DType::float32: + transpose(x_info.buf.f32, x_shape, perm, out_info.buf.f32); + break; + case DType::int32: + transpose(x_info.buf.i32, x_shape, perm, out_info.buf.i32); + break; + case DType::boolean: + transpose(x_info.buf.b, x_shape, perm, out_info.buf.b); + break; + default: + util::warn("Transpose for tensor id %d failed. Unknown dtype %d", x_id, + x_info.dtype); + } +} + +} // extern "C" +} // namespace wasm +} // namespace tfjs diff --git a/tfjs-backend-wasm/src/cc/util.cc b/tfjs-backend-wasm/src/cc/util.cc new file mode 100644 index 00000000000..a91dffe0df2 --- /dev/null +++ b/tfjs-backend-wasm/src/cc/util.cc @@ -0,0 +1,35 @@ +/* Copyright 2019 Google Inc. All Rights Reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ===========================================================================*/ + +#include + +#include "src/cc/util.h" + +namespace tfjs { +namespace util { + +std::vector compute_strides(const std::vector shape) { + int rank = shape.size(); + std::vector strides(rank - 1); + // Last dimension has implicit stride of 1, thus having D-1 (instead of D) + // strides. + strides[rank - 2] = shape[rank - 1]; + for (int i = rank - 3; i >= 0; --i) { + strides[i] = strides[i + 1] * shape[i + 1]; + } + return strides; +} + +} // namespace util +} // namespace tfjs diff --git a/tfjs-backend-wasm/src/cc/util.h b/tfjs-backend-wasm/src/cc/util.h index 5a7200b23ee..97802b8c6af 100644 --- a/tfjs-backend-wasm/src/cc/util.h +++ b/tfjs-backend-wasm/src/cc/util.h @@ -15,7 +15,6 @@ #ifndef UTIL_H_ #define UTIL_H_ -#include #include #include #include @@ -83,6 +82,65 @@ inline int size_from_shape(const std::vector& shape) { return prod; } +// Returns the indices of an n-dim tensor given the flat offset and its strides. +inline std::vector offset_to_loc(int index, + const std::vector& strides) { + int rank = strides.size() + 1; + std::vector loc(rank); + if (rank == 0) { + return loc; + } else if (rank == 1) { + loc[0] = index; + return loc; + } + for (int i = 0; i < rank - 1; ++i) { + int stride = strides[i]; + loc[i] = index / stride; + index -= loc[i] * stride; + } + loc[rank - 1] = index; + return loc; +} + +// Returns the flat offset of an n-dim tensor given the indices and strides. 
diff --git a/tfjs-backend-wasm/src/cc/util_test.cc b/tfjs-backend-wasm/src/cc/util_test.cc
new file mode 100644
index 00000000000..3b3e585a131
--- /dev/null
+++ b/tfjs-backend-wasm/src/cc/util_test.cc
@@ -0,0 +1,146 @@
+/* Copyright 2019 Google Inc. All Rights Reserved.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ===========================================================================*/
+
+#include "src/cc/util.h"
+
+#include <gtest/gtest.h>
+
+#include <array>
+#include <vector>
+
+namespace tfjs {
+namespace util {
+namespace {
+
+TEST(util, offset_2d) {
+  std::array<int, 2> coord = {0, 0};
+  std::array<int, 1> stride = {0};
+  EXPECT_EQ(0, offset(coord[0], coord[1], stride[0]));
+
+  coord = {2, 3};
+  stride = {5};
+  EXPECT_EQ(13, offset(coord[0], coord[1], stride[0]));
+}
+
+TEST(util, offset_3d) {
+  std::array<int, 3> coord = {0, 0, 0};
+  std::array<int, 2> stride = {0, 0};
+  EXPECT_EQ(0, offset(coord[0], coord[1], coord[2], stride[0], stride[1]));
+
+  coord = {3, 5, 7};
+  stride = {4, 3};
+  EXPECT_EQ(34, offset(coord[0], coord[1], coord[2], stride[0], stride[1]));
+}
+
+TEST(util, offset_4d) {
+  std::array<int, 4> coord = {0, 0, 0, 0};
+  std::array<int, 3> stride = {0, 0, 0};
+  EXPECT_EQ(0, offset(coord[0], coord[1], coord[2], coord[3], stride[0],
+                      stride[1], stride[2]));
+
+  coord = {1, 2, 3, 4};
+  stride = {5, 7, 9};
+  EXPECT_EQ(50, offset(coord[0], coord[1], coord[2], coord[3], stride[0],
+                       stride[1], stride[2]));
+}
+
+TEST(util, offset_5d) {
+  std::array<int, 5> coord = {0, 0, 0, 0, 0};
+  std::array<int, 4> stride = {0, 0, 0, 0};
+  EXPECT_EQ(0, offset(coord[0], coord[1], coord[2], coord[3], coord[4],
+                      stride[0], stride[1], stride[2], stride[3]));
+
+  coord = {1, 2, 3, 4, 5};
+  stride = {5, 7, 9, 11};
+  EXPECT_EQ(95, offset(coord[0], coord[1], coord[2], coord[3], coord[4],
+                       stride[0], stride[1], stride[2], stride[3]));
+}
+
+TEST(util, size_from_shape) {
+  std::vector<int> shape = {};
+  EXPECT_EQ(1, size_from_shape(shape));
+
+  shape = {3};
+  EXPECT_EQ(3, size_from_shape(shape));
+
+  shape = {3, 4};
+  EXPECT_EQ(12, size_from_shape(shape));
+
+  shape = {1, 3, 5};
+  EXPECT_EQ(15, size_from_shape(shape));
+
+  shape = {2, 3, 4};
+  EXPECT_EQ(24, size_from_shape(shape));
+
+  shape = {2, 3, 4, 5};
+  EXPECT_EQ(120, size_from_shape(shape));
+}
+
+TEST(util, loc_to_offset) {
+  std::vector<int> loc = {};
+  std::vector<int> strides = {};
+  EXPECT_EQ(0, loc_to_offset(loc, strides));
+
+  loc = {5};
+  strides = {};
+  EXPECT_EQ(5, loc_to_offset(loc, strides));
+
+  loc = {3, 5};
+  strides = {7};
+  EXPECT_EQ(26, loc_to_offset(loc, strides));
+
+  loc = {6, 0, 3};
+  strides = {8, 4};
+  EXPECT_EQ(51, loc_to_offset(loc, strides));
+
+  loc = {8, 0, 1, 1};
+  strides = {8, 4, 2};
+  EXPECT_EQ(67, loc_to_offset(loc, strides));
+}
+
+TEST(util, offset_to_loc) {
+  int offset = 5;
+  std::vector<int> strides = {};
+  EXPECT_EQ(std::vector<int>({5}), offset_to_loc(offset, strides));
+
+  offset = 26;
+  strides = {7};
+  EXPECT_EQ(std::vector<int>({3, 5}), offset_to_loc(offset, strides));
+
+  offset = 51;
+  strides = {8, 4};  // shape is [7, 2, 4]
+  EXPECT_EQ(std::vector<int>({6, 0, 3}), offset_to_loc(offset, strides));
+
+  offset = 67;
+  strides = {8, 4, 2};  // shape is [9, 2, 2, 2]
+  EXPECT_EQ(std::vector<int>({8, 0, 1, 1}), offset_to_loc(offset, strides));
+}
+
+TEST(util, compute_strides) {
+  std::vector<int> shape = {5, 7};
+  EXPECT_EQ(std::vector<int>({7}), compute_strides(shape));
+
+  shape = {3, 5, 7};
+  EXPECT_EQ(std::vector<int>({35, 7}), compute_strides(shape));
+
+  shape = {3, 5, 7, 9};
+  EXPECT_EQ(std::vector<int>({315, 63, 9}), compute_strides(shape));
+
+  shape = {2, 3, 5, 7, 9};
+  EXPECT_EQ(std::vector<int>({945, 315, 63, 9}), compute_strides(shape));
+}
+
+}  // namespace
+}  // namespace util
+}  // namespace tfjs
diff --git a/tfjs-backend-wasm/src/kernels/Cast.ts b/tfjs-backend-wasm/src/kernels/Cast.ts
index 798375ff37c..1592bb9f42a 100644
--- a/tfjs-backend-wasm/src/kernels/Cast.ts
+++ b/tfjs-backend-wasm/src/kernels/Cast.ts
@@ -15,7 +15,7 @@
  * =============================================================================
  */

-import {DataType, NamedAttrMap, NamedTensorInfoMap, registerKernel, util} from '@tensorflow/tfjs-core';
+import {DataType, NamedAttrMap, NamedTensorInfoMap, registerKernel} from '@tensorflow/tfjs-core';
 import {TensorInfo} from '@tensorflow/tfjs-core';

 import {BackendWasm} from '../backend_wasm';
@@ -32,11 +32,8 @@ function cast(
     args: {inputs: CastInputs, attrs: CastAttrs, backend: BackendWasm}) {
   const {inputs: {x}, attrs: {dtype}, backend} = args;
   const out = backend.makeOutput(x.shape, dtype);
-  const {memoryOffset: inOffset} = backend.dataIdMap.get(x.dataId);
-  const {memoryOffset: outOffset} = backend.dataIdMap.get(out.dataId);
-  const size = util.sizeFromShape(x.shape);
-  const inVals = backend.typedArrayFromHeap(inOffset, x.dtype, size);
-  const outVals = backend.typedArrayFromHeap(outOffset, dtype, size);
+  const inVals = backend.typedArrayFromHeap(x);
+  const outVals = backend.typedArrayFromHeap(out);
   outVals.set(inVals);
   return out;
 }
diff --git a/tfjs-backend-wasm/src/kernels/Concat.ts b/tfjs-backend-wasm/src/kernels/Concat.ts
new file mode 100644
index 00000000000..5cf36b35593
--- /dev/null
+++ b/tfjs-backend-wasm/src/kernels/Concat.ts
@@ -0,0 +1,58 @@
+/**
+ * @license
+ * Copyright 2019 Google Inc. All Rights Reserved.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * =============================================================================
+ */
+
+import {backend_util, KernelFunc, NamedAttrMap, registerKernel, TensorInfo, util} from '@tensorflow/tfjs-core';
+
+import {BackendWasm} from '../backend_wasm';
+
+interface ConcatAttrs extends NamedAttrMap {
+  axis: number;
+}
+
+function concat(
+    args: {inputs: TensorInfo[], backend: BackendWasm, attrs: ConcatAttrs}) {
+  const {inputs, backend, attrs: {axis}} = args;
+  const outShape = backend_util.computeOutShape(inputs.map(t => t.shape), axis);
+  const out = backend.makeOutput(outShape, inputs[0].dtype);
+
+  const batchDim = util.sizeFromShape(inputs[0].shape.slice(0, axis));
+  let sumInnerDims = 0;
+  const innerDims = inputs.map(input => {
+    const innerDim = util.sizeFromShape(input.shape.slice(axis));
+    sumInnerDims += innerDim;
+    return innerDim;
+  });
+  const inVals = inputs.map(input => backend.typedArrayFromHeap(input));
+  const outVals = backend.typedArrayFromHeap(out);
+  for (let b = 0; b < batchDim; b++) {
+    let outOffset = b * sumInnerDims;
+    for (let i = 0; i < inVals.length; i++) {
+      const innerDim = innerDims[i];
+      const inOffset = b * innerDim;
+      const vals = inVals[i].subarray(inOffset, inOffset + innerDim);
+      outVals.set(vals, outOffset);
+      outOffset += innerDim;
+    }
+  }
+  return out;
+}
+
+registerKernel({
+  kernelName: 'Concat',
+  backendName: 'wasm',
+  kernelFunc: concat as {} as KernelFunc,
+});
+ * ============================================================================= + */ + +import {NamedAttrMap, NamedTensorInfoMap, registerKernel, TensorInfo} from '@tensorflow/tfjs-core'; + +import {BackendWasm} from '../backend_wasm'; + +interface TransposeInputs extends NamedTensorInfoMap { + x: TensorInfo; +} + +interface TransposeAttrs extends NamedAttrMap { + perm: number[]; +} + +let wasmTranspose: ( + xId: number, xShape: Uint8Array, xShapeLength: number, outId: number, + perm: Uint8Array, permLength: number) => void; + +function setup(backend: BackendWasm) { + wasmTranspose = backend.wasm.cwrap('Transpose', null /* void */, [ + 'number', // xId + 'array', // x.shape + 'number', // x.shape.length + 'number', // outId + 'array', // perm + 'number', // perm.length + ]); +} + +function transpose( + args: + {inputs: TransposeInputs, backend: BackendWasm, attrs: TransposeAttrs}): + TensorInfo { + const {inputs, backend, attrs} = args; + // Reduce any dimensions with size one. Lower-rank transpose kernel performs + // better due to simpler memory access pattern. + const [reducedShape, perm] = removeOneSizeDims(inputs.x.shape, attrs.perm); + const x = { + dataId: inputs.x.dataId, + shape: reducedShape, + dtype: inputs.x.dtype + }; + let permIsNoOp = true; + for (let i = 0; i < perm.length; i++) { + if (perm[i] !== i) { + permIsNoOp = false; + } + } + const outShape = computeOutShape(inputs.x.shape, attrs.perm); + if (permIsNoOp) { + return {dataId: x.dataId, shape: outShape, dtype: x.dtype}; + } + const out = backend.makeOutput(outShape, x.dtype); + const xId = backend.dataIdMap.get(x.dataId).id; + const outId = backend.dataIdMap.get(out.dataId).id; + const permBytes = new Uint8Array(new Int32Array(perm).buffer); + const xShapeBytes = new Uint8Array(new Int32Array(x.shape).buffer); + wasmTranspose( + xId, xShapeBytes, x.shape.length, outId, permBytes, perm.length); + return out; +} + +function computeOutShape(inShape: number[], perm: number[]): number[] { + const outShape = new Array(inShape.length); + for (let i = 0; i < outShape.length; i++) { + outShape[i] = inShape[perm[i]]; + } + return outShape; +} + +function removeOneSizeDims( + shape: number[], perm: number[]): [number[], number[]] { + const newShape: number[] = []; + const newPerm: number[] = []; + for (let i = 0; i < shape.length; ++i) { + if (shape[i] !== 1) { + newShape.push(shape[i]); + } + if (shape[perm[i]] !== 1) { + newPerm.push(perm[i]); + } + } + for (let i = 0; i < newPerm.length; ++i) { + let minValIdx = -1; + for (let j = 0; j < newPerm.length; ++j) { + if (newPerm[j] >= i && + (minValIdx === -1 || newPerm[minValIdx] > newPerm[j])) { + minValIdx = j; + } + } + newPerm[minValIdx] = i; + } + return [newShape, newPerm]; +} + +registerKernel({ + kernelName: 'Transpose', + backendName: 'wasm', + kernelFunc: transpose, + setupFunc: setup, +}); diff --git a/tfjs-backend-wasm/src/kernels/all_kernels.ts b/tfjs-backend-wasm/src/kernels/all_kernels.ts index 639277977db..0125b92195b 100644 --- a/tfjs-backend-wasm/src/kernels/all_kernels.ts +++ b/tfjs-backend-wasm/src/kernels/all_kernels.ts @@ -23,6 +23,7 @@ import './Add'; import './BatchMatMul'; import './FusedBatchNorm'; import './Cast'; +import './Concat'; import './Div'; import './Mul'; import './Min'; @@ -31,4 +32,6 @@ import './Prelu'; import './Reshape'; import './Sigmoid'; import './Slice'; +import './Square'; import './Sub'; +import './Transpose'; diff --git a/tfjs-backend-wasm/src/setup_test.ts b/tfjs-backend-wasm/src/setup_test.ts index 31f6d6383c5..0ac21676735 
diff --git a/tfjs-backend-wasm/src/kernels/all_kernels.ts b/tfjs-backend-wasm/src/kernels/all_kernels.ts
index 639277977db..0125b92195b 100644
--- a/tfjs-backend-wasm/src/kernels/all_kernels.ts
+++ b/tfjs-backend-wasm/src/kernels/all_kernels.ts
@@ -23,6 +23,7 @@ import './Add';
 import './BatchMatMul';
 import './FusedBatchNorm';
 import './Cast';
+import './Concat';
 import './Div';
 import './Mul';
 import './Min';
@@ -31,4 +32,6 @@ import './Prelu';
 import './Reshape';
 import './Sigmoid';
 import './Slice';
+import './Square';
 import './Sub';
+import './Transpose';
diff --git a/tfjs-backend-wasm/src/setup_test.ts b/tfjs-backend-wasm/src/setup_test.ts
index 31f6d6383c5..0ac21676735 100644
--- a/tfjs-backend-wasm/src/setup_test.ts
+++ b/tfjs-backend-wasm/src/setup_test.ts
@@ -118,7 +118,9 @@ const TEST_FILTERS: TestFilter[] = [
       'gradient'  // Gradient is missing.
     ]
   },
-  {include: 'slice '}, {include: 'square '}, {
+  {include: 'slice '},
+  {include: 'square '},
+  {
     startsWith: 'min ',
     excludes: [
       'derivative: 1D tensor with max or min value',  // Clip not yet
@@ -137,7 +139,18 @@ const TEST_FILTERS: TestFilter[] = [
       '2D, axis=0'  // Permuted axes requires transpose, which is not yet
                     // implemented.
     ]
-  }
+  },
+  {
+    include: 'concat',
+    excludes: [
+      'complex',  // Complex numbers not supported yet
+      'gradient'  // Split is not yet implemented
+    ]
+  },
+  {
+    include: 'transpose',
+    excludes: ['oneHot']  // oneHot not yet implemented.
+  },
 ];

 const customInclude = (testName: string) => {
diff --git a/tfjs-core/src/jasmine_util.ts b/tfjs-core/src/jasmine_util.ts
index 43b20260805..4a84d8b44bb 100644
--- a/tfjs-core/src/jasmine_util.ts
+++ b/tfjs-core/src/jasmine_util.ts
@@ -197,7 +197,7 @@ export function describeWithFlags(
     env().setFlags(testEnv.flags);
     if (envSatisfiesConstraints(env(), testEnv, constraints)) {
       const testName =
-          name + ' ' + testEnv.name + ' ' + JSON.stringify(testEnv.flags);
+          name + ' ' + testEnv.name + ' ' + JSON.stringify(testEnv.flags || {});
       executeTests(testName, tests, testEnv);
     }
   });
diff --git a/tfjs-core/src/ops/concat_split.ts b/tfjs-core/src/ops/concat_split.ts
index a8651041f0b..92909ed19eb 100644
--- a/tfjs-core/src/ops/concat_split.ts
+++ b/tfjs-core/src/ops/concat_split.ts
@@ -191,8 +191,10 @@ function concat_<T extends Tensor>(tensors: Array<T|TensorLike>, axis = 0): T {
     return derTensors.map(t => () => t) as {};
   };
   const inputs = $tensors as {};
+  const attr = {axis};
   return ENGINE.runKernelFunc(
-      backend => backend.concat($tensors, axis) as T, inputs, der);
+      backend => backend.concat($tensors, axis) as T, inputs, der, 'Concat',
+      attr);
 }

 /**
diff --git a/tfjs-core/src/ops/transpose.ts b/tfjs-core/src/ops/transpose.ts
index 601dcefcca3..0f9a178ea89 100644
--- a/tfjs-core/src/ops/transpose.ts
+++ b/tfjs-core/src/ops/transpose.ts
@@ -64,10 +64,11 @@ function transpose_<T extends Tensor>(x: T|TensorLike, perm?: number[]): T {

   const der = (dy: T) => {
     const undoPerm = axis_util.getUndoAxesPermutation(perm);
-    return {$x: () => dy.transpose(undoPerm)};
+    return {x: () => dy.transpose(undoPerm)};
   };
+  const attrs = {perm};
   return ENGINE.runKernelFunc(
-      backend => backend.transpose($x, perm), {$x}, der);
+      backend => backend.transpose($x, perm), {x: $x}, der, 'Transpose', attrs);
 }

 export const transpose = op({transpose_});
diff --git a/tfjs-core/src/ops/transpose_test.ts b/tfjs-core/src/ops/transpose_test.ts
index fcbf2c0b171..8571210a0f0 100644
--- a/tfjs-core/src/ops/transpose_test.ts
+++ b/tfjs-core/src/ops/transpose_test.ts
@@ -64,6 +64,14 @@ describeWithFlags('transpose', ALL_ENVS, () => {
     expectArraysClose(await t2.data(), [1, 3, 11, 33, 2, 4, 22, 44]);
   });

+  it('2D, shape has ones', async () => {
+    const t = tf.tensor2d([1, 2, 3, 4], [1, 4]);
+    const t2 = tf.transpose(t, [1, 0]);
+
+    expect(t2.shape).toEqual([4, 1]);
+    expectArraysClose(await t2.data(), [1, 2, 3, 4]);
+  });
+
   it('3D [r, c, d] => [d, r, c]', async () => {
     const t = tf.tensor3d([1, 11, 2, 22, 3, 33, 4, 44], [2, 2, 2]);
     const t2 = tf.transpose(t, [2, 0, 1]);
@@ -80,6 +88,33 @@ describeWithFlags('transpose', ALL_ENVS, () => {
     expectArraysClose(await t2.data(), [1, 3, 2, 4, 11, 33, 22, 44]);
   });

+  it('3D [r, c, d] => [d, r, c], shape has ones', async () => {
+    const perm = [2, 0, 1];
+
+    const t = tf.tensor3d([1, 2, 3, 4], [2, 1, 2]);
+    const tt = tf.transpose(t, perm);
+    expect(tt.shape).toEqual([2, 2, 1]);
+    expectArraysClose(await tt.data(), [1, 3, 2, 4]);
+
+    const t2 = tf.tensor3d([1, 2, 3, 4], [2, 2, 1]);
+    const tt2 = tf.transpose(t2, perm);
+    expect(tt2.shape).toEqual([1, 2, 2]);
+    expectArraysClose(await tt2.data(), [1, 2, 3, 4]);
+
+    const t3 = tf.tensor3d([1, 2, 3, 4], [1, 2, 2]);
+    const tt3 = tf.transpose(t3, perm);
+    expect(tt3.shape).toEqual([2, 1, 2]);
+    expectArraysClose(await tt3.data(), [1, 3, 2, 4]);
+  });
+
+  it('3D [r, c, d] => [r, d, c]', async () => {
+    const perm = [0, 2, 1];
+    const t = tf.tensor3d([1, 2, 3, 4, 5, 6, 7, 8], [2, 2, 2]);
+    const tt = tf.transpose(t, perm);
+    expect(tt.shape).toEqual([2, 2, 2]);
+    expectArraysClose(await tt.data(), [1, 3, 2, 4, 5, 7, 6, 8]);
+  });
+
   it('5D [r, c, d, e, f] => [r, c, d, f, e]', async () => {
     const t = tf.tensor5d(
         new Array(32).fill(0).map((x, i) => i + 1), [2, 2, 2, 2, 2]);
@@ -92,6 +127,50 @@ describeWithFlags('transpose', ALL_ENVS, () => {
     ]);
   });

+  it('4D [r, c, d, e] => [c, r, d, e]', async () => {
+    const t =
+        tf.tensor4d(new Array(16).fill(0).map((x, i) => i + 1), [2, 2, 2, 2]);
+    const t2 = tf.transpose(t, [1, 0, 2, 3]);
+
+    expect(t2.shape).toEqual([2, 2, 2, 2]);
+    expectArraysClose(
+        await t2.data(),
+        [1, 2, 3, 4, 9, 10, 11, 12, 5, 6, 7, 8, 13, 14, 15, 16]);
+  });
+
+  it('4D [r, c, d, e] => [c, r, e, d]', async () => {
+    const t =
+        tf.tensor4d(new Array(16).fill(0).map((x, i) => i + 1), [2, 2, 2, 2]);
+    const t2 = tf.transpose(t, [1, 0, 3, 2]);
+
+    expect(t2.shape).toEqual([2, 2, 2, 2]);
+    expectArraysClose(
+        await t2.data(),
+        [1, 3, 2, 4, 9, 11, 10, 12, 5, 7, 6, 8, 13, 15, 14, 16]);
+  });
+
+  it('4D [r, c, d, e] => [e, r, c, d]', async () => {
+    const t =
+        tf.tensor4d(new Array(16).fill(0).map((x, i) => i + 1), [2, 2, 2, 2]);
+    const t2 = tf.transpose(t, [3, 0, 1, 2]);
+
+    expect(t2.shape).toEqual([2, 2, 2, 2]);
+    expectArraysClose(
+        await t2.data(),
+        [1, 3, 5, 7, 9, 11, 13, 15, 2, 4, 6, 8, 10, 12, 14, 16]);
+  });
+
+  it('4D [r, c, d, e] => [d, c, e, r]', async () => {
+    const t =
+        tf.tensor4d(new Array(16).fill(0).map((x, i) => i + 1), [2, 2, 2, 2]);
+    const t2 = tf.transpose(t, [2, 1, 3, 0]);
+
+    expect(t2.shape).toEqual([2, 2, 2, 2]);
+    expectArraysClose(
+        await t2.data(),
+        [1, 9, 2, 10, 5, 13, 6, 14, 3, 11, 4, 12, 7, 15, 8, 16]);
+  });
+
   it('5D [r, c, d, e, f] => [c, r, d, e, f]', async () => {
     const t = tf.tensor5d(
         new Array(32).fill(0).map((x, i) => i + 1), [2, 2, 2, 2, 2]);
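Finally, the gradient wiring in transpose.ts hinges on one identity: `getUndoAxesPermutation` yields the inverse permutation, i.e. `undoPerm[perm[i]] == i`, so transposing `dy` by it undoes the forward transpose. A quick standalone check of that identity (the function name is tfjs's; this one-liner is an assumed equivalent, not its source):

```cpp
#include <cassert>
#include <vector>

// Inverse of a permutation: if y = transpose(x, perm), then
// transpose(y, undo) == x, because undo[perm[i]] = i.
std::vector<int> undo_axes_permutation(const std::vector<int>& perm) {
  std::vector<int> undo(perm.size());
  for (size_t i = 0; i < perm.size(); ++i) {
    undo[perm[i]] = static_cast<int>(i);
  }
  return undo;
}

int main() {
  // The [r, c, d] => [d, r, c] tests use perm [2, 0, 1]; its inverse is
  // [1, 2, 0], which is exactly what der() applies to dy in transpose.ts.
  const std::vector<int> perm = {2, 0, 1};
  assert(undo_axes_permutation(perm) == (std::vector<int>{1, 2, 0}));
  return 0;
}
```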