tensorflow · drpngx · Jun 26, 2017 · Jun 5, 2017 · Jun 14, 2017 · Jun 17, 2017
diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt
@@ -7,6 +7,7 @@ tensorflow/core/kernels/transpose_functor_cpu.cc
 tensorflow/core/kernels/training_op_helpers.cc
 tensorflow/core/kernels/training_ops.cc
 tensorflow/core/kernels/topk_op.cc
+tensorflow/core/kernels/tile_functor_cpu.cc
 tensorflow/core/kernels/tile_ops.cc
 tensorflow/core/kernels/tile_ops_cpu_impl_1.cc
 tensorflow/core/kernels/tile_ops_cpu_impl_2.cc

diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
@@ -722,6 +722,12 @@ tf_kernel_library(
 
 tf_kernel_library(
     name = "tile_ops",
+    srcs = ["tile_functor_cpu.cc"],
+    hdrs = ["tile_functor.h"],
+    gpu_srcs = [
+        "tile_functor.h",
+        "tile_functor_gpu.cu.cc",
+    ],
     prefix = "tile_ops",
     deps = ARRAY_DEPS,
 )
@@ -4137,6 +4143,7 @@ filegroup(
         "spacetobatch_functor.h",
         "spacetodepth_op.h",
         "tensor_array.h",
+        "tile_functor.h",
         "tile_ops_cpu_impl.h",
         "tile_ops_impl.h",
         "training_op_helpers.h",
@@ -4270,6 +4277,7 @@ filegroup(
         "summary_op.cc",
         "tensor_array.cc",
         "tensor_array_ops.cc",
+        "tile_functor_cpu.cc",
         "tile_ops.cc",
         "tile_ops_cpu_impl_1.cc",
         "tile_ops_cpu_impl_2.cc",

diff --git a/tensorflow/core/kernels/tile_functor.h b/tensorflow/core/kernels/tile_functor.h
@@ -0,0 +1,115 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_KERNELS_TILE_FUNCTOR_H_
+#define TENSORFLOW_KERNELS_TILE_FUNCTOR_H_
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+namespace internal {
+
+// Returns the row-major strides of 'shape', expressed in elements:
+//   strides[i] = prod(shape.dim_size[(i+1):])
+// so the innermost dimension has stride 1. The result has one entry per
+// dimension of 'shape'. All arithmetic is carried out in 'Index'.
+template <typename Index>
+gtl::InlinedVector<Index, 8> ComputeStride(const TensorShape& shape) {
+  const int rank = shape.dims();
+  gtl::InlinedVector<Index, 8> result(rank);
+  // Walk from the innermost dimension outwards, accumulating the product of
+  // the dimension sizes seen so far.
+  Index running = 1;
+  for (int dim = rank; dim-- > 0;) {
+    result[dim] = running;
+    running *= static_cast<Index>(shape.dim_size(dim));
+  }
+  return result;
+}
+
+
+// Device-specific naive implementation for tile.
+//
+// Only declared here; the definitions live in the per-device source files
+// (tile_functor_cpu.cc and tile_functor_gpu.cu.cc). functor::Tile falls back
+// to this for input ranks that have no TileUsingEigen case.
+//
+//   d:   device to run on.
+//   out: destination tensor; the implementations read its shape to derive the
+//        output strides, so it is expected to be shaped before the call.
+//   in:  source tensor to be tiled.
+template <typename Device, typename T>
+void TileSimple(const Device& d, Tensor* out, const Tensor& in);
+
+// Tiles a rank-NDIM tensor 'in' into 'out' with Eigen's broadcast expression.
+//
+//   d:               device on which the expression is evaluated.
+//   out:             destination, viewed as a rank-NDIM tensor of T.
+//   in:              source, viewed as a rank-NDIM tensor of T.
+//   broadcast_array: per-dimension broadcast factors; the first NDIM entries
+//                    are read.
+template <typename Device, typename T, int NDIM>
+void TileUsingEigen(const Device& d, Tensor* out, const Tensor& in,
+                    const gtl::ArraySlice<int32>& broadcast_array) {
+  auto input = in.tensor<T, NDIM>();
+  auto output = out->tensor<T, NDIM>();
+
+  // Eigen's broadcast() wants a fixed-size array, so copy the factors over.
+  Eigen::array<int32, NDIM> multiples;
+  for (int dim = 0; dim < NDIM; ++dim) {
+    multiples[dim] = broadcast_array[dim];
+  }
+
+  if (!Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
+    output.device(d) = input.broadcast(multiples);
+    return;
+  }
+  // On GPU, use 32bit indexing to speed up the computations.
+  To32Bit(output).device(d) = To32Bit(input).broadcast(multiples);
+}
+
+// Rank-0 overload of TileUsingEigen. There is nothing to broadcast for a
+// scalar, so the broadcast factors are ignored and the single input value is
+// copied to the output on device 'd'.
+template <typename Device, typename T>
+void TileUsingEigen(const Device& d, Tensor* out, const Tensor& in,
+                    const gtl::ArraySlice<int32>&) {
+  auto scalar_in = in.tensor<T, 0>();
+  auto scalar_out = out->tensor<T, 0>();
+  scalar_out.device(d) = scalar_in;
+}
+
+}  // end namespace internal
+
+namespace functor {
+
+// Functor that tiles 'in' into 'out' on device 'd'.
+//
+// Dispatch is on the rank of 'in': ranks 0 through 7 go to the matching
+// internal::TileUsingEigen instantiation, which consumes 'broadcast_array'.
+// Every other rank falls back to internal::TileSimple, which is called
+// without 'broadcast_array'.
+template <typename Device, typename T>
+struct Tile {
+  void operator()(const Device& d, Tensor* out, const Tensor& in,
+                  const gtl::ArraySlice<int32> broadcast_array) const {
+    const int rank = in.dims();
+    switch (rank) {
+      case 0:
+        internal::TileUsingEigen<Device, T>(d, out, in, broadcast_array);
+        return;
+      case 1:
+        internal::TileUsingEigen<Device, T, 1>(d, out, in, broadcast_array);
+        return;
+      case 2:
+        internal::TileUsingEigen<Device, T, 2>(d, out, in, broadcast_array);
+        return;
+      case 3:
+        internal::TileUsingEigen<Device, T, 3>(d, out, in, broadcast_array);
+        return;
+      case 4:
+        internal::TileUsingEigen<Device, T, 4>(d, out, in, broadcast_array);
+        return;
+      case 5:
+        internal::TileUsingEigen<Device, T, 5>(d, out, in, broadcast_array);
+        return;
+      case 6:
+        internal::TileUsingEigen<Device, T, 6>(d, out, in, broadcast_array);
+        return;
+      case 7:
+        internal::TileUsingEigen<Device, T, 7>(d, out, in, broadcast_array);
+        return;
+      default:
+        break;
+    }
+    // No fixed-rank Eigen case for this rank: use the device-specific
+    // generic implementation.
+    internal::TileSimple<Device, T>(d, out, in);
+  }
+};
+
+}  // end namespace functor
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_KERNELS_TILE_FUNCTOR_H_
diff --git a/tensorflow/core/kernels/tile_functor_cpu.cc b/tensorflow/core/kernels/tile_functor_cpu.cc
@@ -0,0 +1,85 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/tile_functor.h"
+
+namespace tensorflow {
+
+namespace internal {
+
+// Host definition of TileSimple: fills every element of 'out' by mapping its
+// flat index back to the flat index of the corresponding element of 'in'.
+//
+// For each dimension, the output coordinate is taken modulo the input
+// dimension size to obtain the input coordinate. The device argument 'd' is
+// not used by this implementation.
+template <typename Device, typename T>
+void TileSimple(const Device& d, Tensor* out, const Tensor& in) {
+  const int rank = in.dims();
+  const int64 total = out->NumElements();
+  const auto src_strides = ComputeStride<int64>(in.shape());
+  const auto dst_strides = ComputeStride<int64>(out->shape());
+  const T* src = in.flat<T>().data();
+  T* dst = out->flat<T>().data();
+
+  for (int64 dst_idx = 0; dst_idx < total; ++dst_idx) {
+    int64 src_idx = 0;
+    // Part of the flat output index not yet decomposed into coordinates.
+    int64 remaining = dst_idx;
+    for (int dim = 0; dim < rank; ++dim) {
+      const int64 dst_stride = dst_strides[dim];
+      const int64 dst_coord = remaining / dst_stride;
+      src_idx += (dst_coord % in.dim_size(dim)) * src_strides[dim];
+      remaining %= dst_stride;
+    }
+    dst[dst_idx] = src[src_idx];
+  }
+}
+
+}  // end namespace internal
+
+namespace functor {
+
+using CPUDevice = Eigen::ThreadPoolDevice;
+
+// Explicitly instantiate the Tile functor on CPU for each element type below.
+#define INSTANTIATE_CPU_TILE(T) template struct Tile<CPUDevice, T>;
+
+// Floating point and complex types.
+TF_CALL_half(INSTANTIATE_CPU_TILE);
+TF_CALL_float(INSTANTIATE_CPU_TILE);
+TF_CALL_double(INSTANTIATE_CPU_TILE);
+TF_CALL_complex64(INSTANTIATE_CPU_TILE);
+TF_CALL_complex128(INSTANTIATE_CPU_TILE);
+// Integer types.
+TF_CALL_uint8(INSTANTIATE_CPU_TILE);
+TF_CALL_int16(INSTANTIATE_CPU_TILE);
+TF_CALL_int32(INSTANTIATE_CPU_TILE);
+TF_CALL_int64(INSTANTIATE_CPU_TILE);
+// Remaining types.
+TF_CALL_bool(INSTANTIATE_CPU_TILE);
+TF_CALL_string(INSTANTIATE_CPU_TILE);
+
+#undef INSTANTIATE_CPU_TILE
+
+#ifdef TENSORFLOW_USE_SYCL
+using SYCLDevice = Eigen::SyclDevice;
+
+// Explicitly instantiate the Tile functor on SYCL for each element type
+// below; only compiled when TENSORFLOW_USE_SYCL is defined.
+#define INSTANTIATE_SYCL_TILE(T) template struct Tile<SYCLDevice, T>;
+
+// Floating point types.
+TF_CALL_float(INSTANTIATE_SYCL_TILE);
+TF_CALL_double(INSTANTIATE_SYCL_TILE);
+// Integer types.
+TF_CALL_uint8(INSTANTIATE_SYCL_TILE);
+TF_CALL_int16(INSTANTIATE_SYCL_TILE);
+TF_CALL_int32(INSTANTIATE_SYCL_TILE);
+TF_CALL_int64(INSTANTIATE_SYCL_TILE);
+// Remaining types.
+TF_CALL_bool(INSTANTIATE_SYCL_TILE);
+
+#undef INSTANTIATE_SYCL_TILE
+#endif // TENSORFLOW_USE_SYCL
+
+}  // end namespace functor
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/tile_functor_gpu.cu.cc b/tensorflow/core/kernels/tile_functor_gpu.cu.cc
@@ -0,0 +1,101 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/kernels/tile_functor.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+#include "tensorflow/core/framework/register_types.h"
+
+namespace tensorflow {
+namespace internal {
+
+// CUDA kernel that writes dst[o] = src[i] for every output index o handed out
+// by CUDA_1D_KERNEL_LOOP, where i is the input index whose per-dimension
+// coordinates are the output coordinates taken modulo the input dimension
+// sizes.
+//
+//   nthreads: upper bound passed to CUDA_1D_KERNEL_LOOP.
+//   src, dst: input and output element buffers.
+//   buf:      3 * ndims int32 values laid out as
+//             [input strides | output strides | input dimension sizes].
+//   ndims:    number of dimensions described by 'buf'.
+template <typename T>
+__global__ void TileKernel(int nthreads, const T* src, const int32* buf,
+                           const int32 ndims, T* dst) {
+  const int32* src_strides = buf;
+  const int32* dst_strides = src_strides + ndims;
+  const int32* src_dim_sizes = dst_strides + ndims;
+  CUDA_1D_KERNEL_LOOP(dst_idx, nthreads) {
+    int32 src_idx = 0;
+    // Part of the flat output index not yet decomposed into coordinates.
+    int32 remaining = dst_idx;
+    for (int dim = 0; dim < ndims; ++dim) {
+      const int32 dst_stride = dst_strides[dim];
+      const int32 dst_coord = remaining / dst_stride;
+      src_idx += (dst_coord % src_dim_sizes[dim]) * src_strides[dim];
+      remaining %= dst_stride;
+    }
+    dst[dst_idx] = ldg(src + src_idx);
+  }
+}
+
+// GPU definition of TileSimple: tiles 'in' into 'out' by launching TileKernel
+// over every element of 'out'.
+//
+// CHECK-fails unless both tensors have fewer than kint32max elements, because
+// the kernel does its index arithmetic in int32. The input strides, output
+// strides and input dimension sizes are packed into a single int32 buffer
+// that is copied to the device before the launch.
+//
+//   d:   GPU device; used for the allocation, the copy and the stream.
+//   out: destination tensor; its shape determines the output strides.
+//   in:  source tensor to be tiled.
+template <typename Device, typename T>
+void TileSimple(const Device& d, Tensor* out, const Tensor& in) {
+  // Ensures we can use 32-bit index.
+  const int64 in_nelem = in.NumElements();
+  CHECK_LT(in_nelem, kint32max) << "Tensor too large to tile on GPU";
+  const int64 out_nelem = out->NumElements();
+  CHECK_LT(out_nelem, kint32max) << "Tensor too large to tile on GPU";
+  // Pack strides and input dimension sizes into one buffer, laid out as
+  // [in_strides | out_strides | in_dim_sizes], each section ndims long.
+  const int32 ndims = in.dims();
+  gtl::InlinedVector<int32, 24> host_buf(ndims * 3);
+  gtl::InlinedVector<int32, 8> in_strides = ComputeStride<int32>(in.shape());
+  gtl::InlinedVector<int32, 8> out_strides = ComputeStride<int32>(out->shape());
+  for (int i = 0; i < ndims; ++i) {
+    host_buf[i] = in_strides[i];
+    host_buf[ndims + i] = out_strides[i];
+    host_buf[ndims * 2 + i] = in.dim_size(i);
+  }
+  // Copies the input strides, output strides and input dimension sizes to the
+  // device. host_buf holds int32 elements, so the byte count must be based on
+  // sizeof(int32); using a wider type would read past the end of host_buf.
+  auto num_bytes = sizeof(int32) * host_buf.size();
+  auto dev_buf = d.allocate(num_bytes);
+  // NOTE: host_buf is not allocated by CudaHostAllocator, and
+  // therefore we are doing a sync copy effectively.
+  d.memcpyHostToDevice(dev_buf, host_buf.data(), num_bytes);
+  // Launch kernel to q[...] = p[...].
+  const T* p = in.flat<T>().data();
+  T* q = out->flat<T>().data();
+  CudaLaunchConfig cfg = GetCudaLaunchConfig(out_nelem, d);
+  TileKernel<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
+      cfg.virtual_thread_count, p, reinterpret_cast<const int32*>(dev_buf),
+      ndims, q);
+  // Safe to deallocate immediately after the kernel launch.
+  d.deallocate(dev_buf);
+}
+
+}  // end namespace internal
+
+namespace functor {
+
+using GPUDevice = Eigen::GpuDevice;
+
+// Explicitly instantiate the Tile functor on GPU for each element type below.
+#define INSTANTIATE_GPU_TILE(T) template struct Tile<GPUDevice, T>;
+
+// Floating point and complex types.
+TF_CALL_half(INSTANTIATE_GPU_TILE);
+TF_CALL_float(INSTANTIATE_GPU_TILE);
+TF_CALL_double(INSTANTIATE_GPU_TILE);
+TF_CALL_complex64(INSTANTIATE_GPU_TILE);
+TF_CALL_complex128(INSTANTIATE_GPU_TILE);
+// Integer types.
+TF_CALL_int16(INSTANTIATE_GPU_TILE);
+TF_CALL_int32(INSTANTIATE_GPU_TILE);
+TF_CALL_int64(INSTANTIATE_GPU_TILE);
+
+#undef INSTANTIATE_GPU_TILE
+
+}  // end namespace functor
+}  // namespace tensorflow
+#endif  // GOOGLE_CUDA