Arbitrary dim for slice #11140

Merged: 33 commits, Nov 6, 2017

Changes from all commits (33 commits):
35f6a8f
finish refactor
yanchen036 Jun 27, 2017
7219b0d
delete impl files
yanchen036 Jun 27, 2017
d9a4827
compile success
yanchen036 Jun 27, 2017
fed6f55
modify slice interface used in strided_slice
yanchen036 Jun 28, 2017
44bf2f6
delete impl files
yanchen036 Jun 28, 2017
113a9b2
fix undefined error of helper function
yanchen036 Jun 28, 2017
23fd81d
finish refactor
yanchen036 Jun 27, 2017
c57744a
delete impl files
yanchen036 Jun 27, 2017
d9f8662
compile success
yanchen036 Jun 27, 2017
3fc4cb9
modify slice interface used in strided_slice
yanchen036 Jun 28, 2017
d299d1c
delete impl files
yanchen036 Jun 28, 2017
dfb0810
fix undefined error of helper function
yanchen036 Jun 28, 2017
b53900f
type unsupported
yanchen036 Jun 29, 2017
099ceba
Merge branch 'arbitrary_dim_for_slice' of https://github.com/yanchen0…
yanchen036 Jun 29, 2017
d7678d0
remove changes in py file
yanchen036 Jun 29, 2017
ebf6e3e
move SliceSimple function into header
yanchen036 Jun 30, 2017
5ebca51
fix compiling problem
yanchen036 Jun 30, 2017
f928df6
add python test
yanchen036 Jun 30, 2017
ff02620
add python test
yanchen036 Jul 4, 2017
8fe173c
change type
yanchen036 Jul 5, 2017
201280d
Merge branch 'master' into arbitrary_dim_for_slice
yanchen036 Jul 12, 2017
2f5f0f5
compile each dim of slice seperately
yanchen036 Jul 17, 2017
294a354
add files in tensorflow/contrib/makefile/tf_op_files.txt
yanchen036 Jul 17, 2017
a5ed7f6
add some const
yanchen036 Jul 20, 2017
72cca14
add benchmark
yanchen036 Jul 31, 2017
efdcd76
capitalize and punctuate comment
yanchen036 Aug 24, 2017
9591af7
Merge remote-tracking branch 'upstream/master' into arbitrary_dim_for…
yanchen036 Sep 5, 2017
0272f63
Merge remote-tracking branch 'origin/arbitrary_dim_for_slice' into ar…
yanchen036 Sep 5, 2017
bb155d3
uncollapse the for loop by Duff's device
yanchen036 Sep 5, 2017
d393eb4
Merge remote-tracking branch 'upstream/master' into arbitrary_dim_for…
yanchen036 Sep 28, 2017
18dcbb6
Merge remote-tracking branch 'upstream/master' into arbitrary_dim_for…
yanchen036 Oct 9, 2017
3259994
Merge remote-tracking branch 'upstream/master' into arbitrary_dim_for…
yanchen036 Nov 2, 2017
23f08f5
remove ">>>>>>>>>>"
yanchen036 Nov 2, 2017
119 changes: 42 additions & 77 deletions tensorflow/core/kernels/slice_op.cc
@@ -190,42 +190,26 @@ class SliceOp : public OpKernel {
}
return;
}
#define HANDLE_DIM(NDIM) \
if (input_dims == NDIM) { \
HandleCase<NDIM>(context, begin, size, result); \
return; \
#define HANDLE_DIM(NDIM) \
if (input_dims == NDIM) { \
functor::Slice<Device, T, NDIM>()( \
context->eigen_device<Device>(), result, input, begin, size); \
return; \
}

HANDLE_DIM(1);
HANDLE_DIM(2);
HANDLE_DIM(3);
HANDLE_DIM(4);
HANDLE_DIM(5);
HANDLE_DIM(6);
HANDLE_DIM(7);
Contributor:

Why not 7?

Contributor Author:

Case 7 is handled at line 208; it covers all the cases where dim >= 7.
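
For illustration, a minimal sketch of this dispatch pattern (hypothetical names, not the TensorFlow sources): ranks 1 through 6 each get their own template instantiation, and anything higher falls through to a single rank-agnostic call, mirroring the hunk above.

// Illustrative sketch only: compile-time specialization for small ranks,
// one shared fallback for everything else.
#include <cstdio>

template <int NDIM>
void SliceForRank(int runtime_rank) {
  std::printf("sliced with NDIM=%d (runtime rank %d)\n", NDIM, runtime_rank);
}

void DispatchSlice(int input_dims) {
#define HANDLE_DIM(NDIM)            \
  if (input_dims == NDIM) {         \
    SliceForRank<NDIM>(input_dims); \
    return;                         \
  }
  HANDLE_DIM(1);
  HANDLE_DIM(2);
  HANDLE_DIM(3);
  HANDLE_DIM(4);
  HANDLE_DIM(5);
  HANDLE_DIM(6);
#undef HANDLE_DIM
  // Everything with input_dims >= 7 takes the same path.
  SliceForRank<7>(input_dims);
}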


#undef HANDLE_DIM

OP_REQUIRES(context, false, errors::Unimplemented(
"SliceOp : Unhandled input dimensions"));
// Handle the cases in which dim >= 7.
functor::Slice<Device, T, 7>()(
context->eigen_device<Device>(), result, input, begin, size);
}
}

private:
template <int NDIM>
void HandleCase(OpKernelContext* context, const gtl::ArraySlice<int64>& begin,
const gtl::ArraySlice<int64>& size, Tensor* result) {
Eigen::DSizes<Eigen::DenseIndex, NDIM> indices;
Eigen::DSizes<Eigen::DenseIndex, NDIM> sizes;
for (int i = 0; i < NDIM; ++i) {
indices[i] = begin[i];
sizes[i] = size[i];
}

functor::Slice<Device, T, NDIM>()(
context->eigen_device<Device>(), result->tensor<T, NDIM>(),
context->input(0).tensor<T, NDIM>(), indices, sizes);
}
};

#ifdef INTEL_MKL
@@ -264,24 +248,13 @@ class MklSliceOp : public OpKernel {
}
return;
}
#define HANDLE_DIM(NDIM) \
if (input_dims == NDIM) { \
HandleCase<NDIM>(context, begin, size, result); \
return; \
}

HANDLE_DIM(1);
HANDLE_DIM(2);
HANDLE_DIM(3);
HANDLE_DIM(4);
HANDLE_DIM(5);
HANDLE_DIM(6);
HANDLE_DIM(7);

#undef HANDLE_DIM

OP_REQUIRES(context, false, errors::Unimplemented(
"SliceOp : Unhandled input dimensions"));
// Special case for handling 4-D tensor slice.
if (input_dims == 4) {
HandleCase4D(context, begin, size, result);
} else {
functor::Slice<Device, T, input_dims>()(
context->eigen_device<Device>(), result, input, begin, size);
}
}
}

@@ -328,8 +301,7 @@ class MklSliceOp : public OpKernel {
return false;
}

template <int NDIM>
void HandleCase(OpKernelContext* context,
void HandleCase4D(OpKernelContext* context,
const gtl::ArraySlice<int64>& begin,
const gtl::ArraySlice<int64>& size, Tensor* result) {
int slice_dim = -1;
@@ -338,8 +310,7 @@ class MklSliceOp : public OpKernel {
// differs from the input tensor in only 1 out of 4 dimensions.
// This case arises in the context of Slice of 4-D tensor in NHWC or NCHW
// format over channel dimension.
if (NDIM == 4 &&
DoesSliceShapeDifferInOnly1D(in_shape, begin, size, &slice_dim)) {
if (DoesSliceShapeDifferInOnly1D(in_shape, begin, size, &slice_dim)) {
size_t in_strides[4] = { (size_t) in_shape.dim_size(1) *
in_shape.dim_size(2) *
in_shape.dim_size(3),
@@ -403,30 +374,22 @@ class MklSliceOp : public OpKernel {
// slice_dim is not 1 or 3, then we fallback to Eigen implementation.
}

Eigen::DSizes<Eigen::DenseIndex, NDIM> indices;
Eigen::DSizes<Eigen::DenseIndex, NDIM> sizes;
for (int i = 0; i < NDIM; ++i) {
indices[i] = begin[i];
sizes[i] = size[i];
}

functor::Slice<Device, T, NDIM>()(
context->eigen_device<Device>(), result->tensor<T, NDIM>(),
context->input(0).tensor<T, NDIM>(), indices, sizes);
functor::Slice<Device, T, 4>()(
context->eigen_device<Device>(), result, input, begin, size);
}
};
#endif

// Forward declarations of the functor specializations declared in the
// sharded source files.
namespace functor {
#define DECLARE_CPU_SPEC(T, NDIM) \
template <> \
void Slice<CPUDevice, T, NDIM>::operator()( \
const CPUDevice& d, typename TTypes<T, NDIM>::Tensor output, \
typename TTypes<T, NDIM>::ConstTensor input, \
const Eigen::DSizes<Eigen::DenseIndex, NDIM>& indices, \
const Eigen::DSizes<Eigen::DenseIndex, NDIM>& sizes); \
#define DECLARE_CPU_SPEC(T, NDIM) \
template <> \
void Slice<CPUDevice, T, NDIM>::operator()( \
const CPUDevice& d, Tensor* output, \
const Tensor& input, \
const gtl::ArraySlice<int64>& slice_indices, \
const gtl::ArraySlice<int64>& slice_sizes); \
extern template struct Slice<CPUDevice, T, NDIM>;

#define DECLARE_FOR_N(T) \
@@ -476,13 +439,14 @@ REGISTER_SLICE(bfloat16);
#if GOOGLE_CUDA
// Forward declarations of the functor specializations for GPU.
namespace functor {
#define DECLARE_GPU_SPEC(T, NDIM) \
template <> \
void Slice<GPUDevice, T, NDIM>::operator()( \
const GPUDevice& d, typename TTypes<T, NDIM>::Tensor output, \
typename TTypes<T, NDIM>::ConstTensor input, \
const Eigen::DSizes<Eigen::DenseIndex, NDIM>& indices, \
const Eigen::DSizes<Eigen::DenseIndex, NDIM>& sizes); \
#define DECLARE_GPU_SPEC(T, NDIM) \
template <> \
void Slice<GPUDevice, T, NDIM>::operator()( \
const GPUDevice& d, \
Tensor* output, \
const Tensor& input, \
const gtl::ArraySlice<int64>& slice_indices, \
const gtl::ArraySlice<int64>& slice_sizes); \
extern template struct Slice<GPUDevice, T, NDIM>;

#define DECLARE_FOR_N(T) \
@@ -536,13 +500,14 @@ REGISTER_KERNEL_BUILDER(Name("Slice")
#ifdef TENSORFLOW_USE_SYCL
// Forward declarations of the functor specializations for SYCL.
namespace functor {
#define DECLARE_SYCL_SPEC(T, NDIM) \
template <> \
void Slice<SYCLDevice, T, NDIM>::operator()( \
const SYCLDevice& d, typename TTypes<T, NDIM>::Tensor output,\
typename TTypes<T, NDIM>::ConstTensor input, \
const Eigen::DSizes<Eigen::DenseIndex, NDIM>& indices, \
const Eigen::DSizes<Eigen::DenseIndex, NDIM>& sizes); \
#define DECLARE_SYCL_SPEC(T, NDIM) \
template <> \
void Slice<SYCLDevice, T, NDIM>::operator()( \
const SYCLDevice& d, \
Tensor* output, \
const Tensor& input, \
const gtl::ArraySlice<int64>& slice_indices, \
const gtl::ArraySlice<int64>& slice_sizes); \
extern template struct Slice<SYCLDevice, T, NDIM>;

#define DECLARE_FOR_N(T) \
109 changes: 91 additions & 18 deletions tensorflow/core/kernels/slice_op.h
@@ -19,31 +19,104 @@ limitations under the License.
// Functor definition for SliceOp, must be compilable by nvcc.

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/kernels/ops_util.h"

namespace tensorflow {
namespace functor {

namespace internal {

template <typename Device, typename T>
void SliceSimple(const Device& d, Tensor* out, const Tensor& in,
const gtl::ArraySlice<int64>& slice_indices);
template <typename Device, typename T>
void SliceSimpleGpu(const Device& d, Tensor* out, const Tensor& in,
const gtl::ArraySlice<int64>& slice_indices);

template <typename Device, typename T>
void SliceSimple(const Device& d, Tensor* out, const Tensor& in,
const gtl::ArraySlice<int64>& slice_indices) {
const int ndims = in.dims();
const int64 nelem = out->NumElements();
const gtl::InlinedVector<int64, 8> in_strides = ComputeStride<int64>(in.shape());
const gtl::InlinedVector<int64, 8> out_strides = ComputeStride<int64>(out->shape());
const T* p = in.flat<T>().data();
T* q = out->flat<T>().data();

std::vector<int64> i_idx(nelem, 0);
std::vector<int64> t(nelem, 0);

for (int64 o_idx = 0; o_idx < nelem; ++o_idx) {
t[o_idx] = o_idx;
}
for (int i = 0; i < ndims; ++i) {
int64 n = (nelem + 7) / 8;
int64 o_idx = 0;
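// Duff's device: the switch jumps into the middle of the 8-way unrolled
// do/while loop, so the unrolling also works when nelem is not a multiple
// of 8; each CALC_INPUT_IDX expansion advances exactly one output element.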
switch (nelem % 8) {
#define CALC_INPUT_IDX \
i_idx[o_idx] += (t[o_idx] / out_strides[i] + slice_indices[i]) * in_strides[i]; \
t[o_idx] %= out_strides[i]; \
++o_idx;
case 0: do { CALC_INPUT_IDX;
case 7: CALC_INPUT_IDX;
case 6: CALC_INPUT_IDX;
case 5: CALC_INPUT_IDX;
case 4: CALC_INPUT_IDX;
case 3: CALC_INPUT_IDX;
case 2: CALC_INPUT_IDX;
case 1: CALC_INPUT_IDX;
#undef CALC_INPUT_IDX
} while (--n > 0);
}
}
for (int64 o_idx = 0; o_idx < nelem; ++o_idx) {
q[o_idx] = p[i_idx[o_idx]];
}
}
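
To make the stride arithmetic above concrete: for each output element, its flat index is decomposed digit-by-digit with the output strides, each coordinate is offset by the slice start, and the result is recomposed with the input strides. A small self-contained example (illustrative only, plain C++ with a single loop instead of the unrolled one) slicing a row-major 4x5 matrix at begin=(1,2) with size=(2,3):

// Illustrative only: the same output-index -> input-index mapping as
// SliceSimple above, on a 4x5 matrix filled with 0..19.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const std::vector<int64_t> in_shape = {4, 5};
  const std::vector<int64_t> begin = {1, 2};
  const std::vector<int64_t> in_strides = {5, 1};   // row-major strides of 4x5
  const std::vector<int64_t> out_strides = {3, 1};  // row-major strides of 2x3

  std::vector<int64_t> in(in_shape[0] * in_shape[1]);
  for (int64_t i = 0; i < static_cast<int64_t>(in.size()); ++i) in[i] = i;

  std::vector<int64_t> out(2 * 3);
  for (int64_t o_idx = 0; o_idx < static_cast<int64_t>(out.size()); ++o_idx) {
    int64_t t = o_idx;
    int64_t i_idx = 0;
    for (size_t d = 0; d < begin.size(); ++d) {
      i_idx += (t / out_strides[d] + begin[d]) * in_strides[d];
      t %= out_strides[d];
    }
    out[o_idx] = in[i_idx];
  }
  for (int64_t v : out) std::printf("%lld ", static_cast<long long>(v));
  std::printf("\n");  // prints: 7 8 9 12 13 14
  return 0;
}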

template <typename Device, typename T, int NDIMS>
void SliceUsingEigen(const Device& d, Tensor* out, const Tensor& in,
const gtl::ArraySlice<int64>& slice_indices,
const gtl::ArraySlice<int64>& slice_sizes) {
auto input = in.tensor<T, NDIMS>();
auto output = out->tensor<T, NDIMS>();
Eigen::DSizes<int, NDIMS> indices;
for (int i = 0; i < NDIMS; ++i) {
indices[i] = slice_indices[i];
}
Eigen::DSizes<int, NDIMS> sizes;
for (int i = 0; i < NDIMS; ++i) {
sizes[i] = slice_sizes[i];
}
const bool use_64bit = input.size() > Eigen::NumTraits<int>::highest();
if (!use_64bit &&
Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
To32Bit(output).device(d) = To32Bit(input).slice(indices, sizes);
} else {
output.device(d) = input.slice(indices, sizes);
}
}
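
For reference, the Eigen call this helper wraps can be exercised on its own; a minimal sketch, assuming the unsupported Eigen Tensor module is available (shapes and values here are arbitrary):

// Minimal standalone use of Eigen's tensor slice, as wrapped by SliceUsingEigen.
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<float, 2> in(4, 5);
  in.setRandom();
  Eigen::DSizes<Eigen::DenseIndex, 2> indices(1, 2);  // slice start per dim
  Eigen::DSizes<Eigen::DenseIndex, 2> sizes(2, 3);    // slice extent per dim
  Eigen::Tensor<float, 2> out = in.slice(indices, sizes);
  std::cout << out << std::endl;  // 2x3 block of `in` starting at (1, 2)
  return 0;
}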

} // namespace internal

namespace functor {

// Template parameter NDIM is not necessary here. The aim of keeping it
// is to compile struct Slice separately for each dim, which minimizes the compile time.
template <typename Device, typename T, int NDIM>
struct Slice {
void operator()(const Device& d, typename TTypes<T, NDIMS>::Tensor output,
typename TTypes<T, NDIMS>::ConstTensor input,
const Eigen::DSizes<Eigen::DenseIndex, NDIMS>& slice_indices,
const Eigen::DSizes<Eigen::DenseIndex, NDIMS>& slice_sizes) {
bool use_64bit = (input.size() > Eigen::NumTraits<int>::highest());
if (!use_64bit &&
Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
Eigen::DSizes<int, NDIMS> indices;
for (int i = 0; i < NDIMS; ++i) {
indices[i] = slice_indices[i];
}
Eigen::DSizes<int, NDIMS> sizes;
for (int i = 0; i < NDIMS; ++i) {
sizes[i] = slice_sizes[i];
}
To32Bit(output).device(d) = To32Bit(input).slice(indices, sizes);
void operator()(const Device& d, Tensor* out, const Tensor& in,
const gtl::ArraySlice<int64>& slice_indices,
const gtl::ArraySlice<int64>& slice_sizes) {
if (in.dims() == NDIM) {
internal::SliceUsingEigen<Device, T, NDIM>(d, out, in, slice_indices, slice_sizes);
} else {
output.device(d) = input.slice(slice_indices, slice_sizes);
if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
internal::SliceSimpleGpu<Device, T>(d, out, in, slice_indices);
} else {
internal::SliceSimple<Device, T>(d, out, in, slice_indices);
}
}
}
};
56 changes: 56 additions & 0 deletions tensorflow/core/kernels/slice_op_gpu.cu.cc
@@ -21,9 +21,65 @@ limitations under the License.

#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"

namespace tensorflow {
namespace internal {

template <typename T>
__global__ void SliceKernel(int nthreads, const T* src, const int32* buf,
const int32 ndims, T* dst) {
const int32* in_strides = buf;
const int32* out_strides = buf + ndims;
const int32* slice_indices = buf + ndims * 2;
CUDA_1D_KERNEL_LOOP(o_idx, nthreads) {
int32 i_idx = 0;
int32 t = o_idx;
for (int i = 0; i < ndims; ++i) {
i_idx += (t / out_strides[i] + slice_indices[i]) * in_strides[i];
t %= out_strides[i];
}
dst[o_idx] = ldg(src + i_idx);
}
}
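
CUDA_1D_KERNEL_LOOP and ldg above come from TensorFlow's cuda_kernel_helper.h (included in this file). For reference, a self-contained sketch of the same per-element mapping written as a plain grid-stride loop (illustrative only: float elements, hypothetical kernel name, no __ldg caching):

// Illustrative only: the index mapping of SliceKernel without TF helpers.
// buf holds [in_strides | out_strides | slice begin], ndims entries each.
__global__ void SliceKernelSketch(int nthreads, const float* src,
                                  const int* buf, int ndims, float* dst) {
  const int* in_strides = buf;
  const int* out_strides = buf + ndims;
  const int* slice_indices = buf + 2 * ndims;
  for (int o_idx = blockIdx.x * blockDim.x + threadIdx.x; o_idx < nthreads;
       o_idx += blockDim.x * gridDim.x) {
    int i_idx = 0;
    int t = o_idx;
    for (int i = 0; i < ndims; ++i) {
      i_idx += (t / out_strides[i] + slice_indices[i]) * in_strides[i];
      t %= out_strides[i];
    }
    dst[o_idx] = src[i_idx];
  }
}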

template <typename Device, typename T>
void SliceSimpleGpu(const Device& d, Tensor* out, const Tensor& in,
const gtl::ArraySlice<int64>& slice_indices) {
// Ensures we can use 32-bit index.
const int64 in_nelem = in.NumElements();
CHECK_LT(in_nelem, kint32max) << "Tensor too large to slice on GPU";
const int64 out_nelem = out->NumElements();
CHECK_LT(out_nelem, kint32max) << "Tensor too large to slice on GPU";
// Pack strides and slice indices into one buffer.
const int32 ndims = in.dims();
gtl::InlinedVector<int32, 24> host_buf(ndims * 3);
gtl::InlinedVector<int32, 8> in_strides = ComputeStride<int32>(in.shape());
gtl::InlinedVector<int32, 8> out_strides = ComputeStride<int32>(out->shape());
for (int i = 0; i < ndims; ++i) {
host_buf[i] = in_strides[i];
host_buf[ndims + i] = out_strides[i];
host_buf[ndims * 2 + i] = slice_indices[i];
}
auto num_bytes = sizeof(int64) * host_buf.size();
auto dev_buf = d.allocate(num_bytes);
// NOTE: host_buf is not allocated by CudaHostAllocator, and
// therefore we are doing a sync copy effectively.
d.memcpyHostToDevice(dev_buf, host_buf.data(), num_bytes);
// Launch kernel to q[...] = p[...].
const T* p = in.flat<T>().data();
T* q = out->flat<T>().data();
CudaLaunchConfig cfg = GetCudaLaunchConfig(out_nelem, d);
SliceKernel<<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
cfg.virtual_thread_count, p, reinterpret_cast<const int32*>(dev_buf),
ndims, q);
// Safe to deallocate immediately after the kernel launch.
d.deallocate(dev_buf);
}

} // namespace internal

typedef Eigen::GpuDevice GPUDevice;
