Branch 161598138 #11439
Merged · 21 commits · Jul 12, 2017

Commits
ea3d9ab
Removed the _tensor_summary_v2 op.
tensorflower-gardener Jul 11, 2017
8281e23
tfprof further clean ups
tensorflower-gardener Jul 11, 2017
9e89636
[XLA:CPU] Support for CPU outfeed and an xfeed (infeed/outfeed) test.
tensorflower-gardener Jul 11, 2017
ba45775
[TF:XLA] Fix copy-and-paste bug in CHECK statement.
hawkinsp Jul 11, 2017
18a5510
Update toolchain configuration artifacts to work with latest version …
tensorflower-gardener Jul 11, 2017
b1f9e2c
Add an axis parameter to tf.gather. Fixes GitHub issue #11223.
rryan Jul 11, 2017
ad814f9
Move the tf.contrib.data README to the programmers' guide.
mrry Jul 11, 2017
0478774
Slight changes and improvements of style of comment to tf.contrib.lay…
tensorflower-gardener Jul 11, 2017
dbe029d
Update ops-related pbtxt files.
tensorflower-gardener Jul 11, 2017
e6df395
Go: Update generated wrapper functions for TensorFlow ops.
tensorflower-gardener Jul 11, 2017
082ede6
Changed title (first-level header).
tensorflower-gardener Jul 11, 2017
c1b6f48
Switch FlatMap and FlatSet to use a non-identity hasher for pointers.
Jul 11, 2017
c332254
Remove HashStr and HashStringPiece (a one-off).
Jul 11, 2017
75b936e
Speed up HeapSimulator's UniqueOperandSourceBuffers.
Jul 11, 2017
1de8cce
fix #11372, #11396
tensorflower-gardener Jul 11, 2017
a80035d
Make ScheduledEmbeddingTrainingHelper more readable and consistent wi…
adarob Jul 11, 2017
ad7fb4d
TFTS: Increase test timeouts to avoid ASan failures
allenlavoie Jul 11, 2017
cbe1ef0
Add missing deprecation warnings.
MarkDaoust Jul 11, 2017
8e82134
Use C API to implement Operation._input_types
tensorflower-gardener Jul 11, 2017
14f7d7f
Fix lint errors in ops.py and ops_test.py
tensorflower-gardener Jul 11, 2017
65e02b2
Merge commit for internal changes
Jul 11, 2017
62 changes: 50 additions & 12 deletions tensorflow/compiler/tf2xla/kernels/gather_op.cc
@@ -29,6 +29,7 @@
class GatherOp : public XlaOpKernel {

void Compile(XlaOpKernelContext* ctx) override {
const TensorShape params_shape = ctx->InputShape(0);
const auto params_dims = params_shape.dims();
const TensorShape indices_shape = ctx->InputShape(1);
OP_REQUIRES(
ctx, TensorShapeUtils::IsVectorOrHigher(params_shape),
@@ -38,20 +39,51 @@ class GatherOp : public XlaOpKernel {
OP_REQUIRES(ctx, index_type == DT_INT32 || index_type == DT_INT64,
errors::InvalidArgument("index must be int32 or int64"));

// GatherV2 added an axis argument. We support both Gather and GatherV2 in
// this kernel by defaulting axis to 0 if there are 2 inputs.
int64 axis = 0;
if (ctx->num_inputs() == 3) {
const TensorShape axis_shape = ctx->InputShape(2);
OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(axis_shape),
errors::InvalidArgument("axis must be scalar"));
DataType axis_type = input_type(2);
OP_REQUIRES(ctx, axis_type == DT_INT32 || axis_type == DT_INT64,
errors::InvalidArgument("axis must be int32 or int64"));

xla::Literal literal;
OP_REQUIRES_OK(ctx, ctx->ConstantInput(2, &literal));
int64 axis_input = axis_type == DT_INT32 ? literal.Get<int32>({})
: literal.Get<int64>({});
axis = axis_input < 0 ? axis_input + params_dims : axis_input;
OP_REQUIRES(ctx, 0 <= axis && axis < params_dims,
errors::InvalidArgument("Expected axis in the range [",
-params_dims, ", ", params_dims,
"), but got ", axis_input));
}

// Check that we have enough index space.
const int64 limit = index_type == DT_INT32
? std::numeric_limits<int32>::max()
: std::numeric_limits<int64>::max();
OP_REQUIRES(ctx, params_shape.dim_size(axis) <= limit,
errors::InvalidArgument(
"params.shape[", axis, "] too large for ",
DataTypeString(index_type),
" indexing: ", params_shape.dim_size(axis), " > ", limit));

// The result shape is params.shape[0:axis] + indices.shape +
// params.shape[axis + 1:].
TensorShape result_shape;
int64 outer_size = 1;
int64 inner_size = 1;
for (int i = 0; i < axis; i++) {
result_shape.AddDim(params_shape.dim_size(i));
outer_size *= params_shape.dim_size(i);
}
result_shape.AppendShape(indices_shape);
for (int i = axis + 1; i < params_dims; i++) {
result_shape.AddDim(params_shape.dim_size(i));
inner_size *= params_shape.dim_size(i);
}

XlaContext& tc = XlaContext::Get(ctx);
@@ -67,10 +99,12 @@ class GatherOp : public XlaOpKernel {
args.push_back(tc.GetOrCreateRuntimeContextParameter());
args.push_back(b.ConstantLiteral(
*xla::Literal::CreateR0<int64>(indices_shape.num_elements())));
args.push_back(
b.ConstantLiteral(*xla::Literal::CreateR0<int64>(outer_size)));
args.push_back(b.ConstantLiteral(
*xla::Literal::CreateR0<int64>(params_shape.dim_size(axis))));
args.push_back(
b.ConstantLiteral(*xla::Literal::CreateR0<int64>(inner_size)));
args.push_back(ctx->Input(0));
args.push_back(ctx->Input(1));

@@ -97,6 +131,10 @@
REGISTER_XLA_OP(Name("Gather")
.TypeConstraint("Tparams", DT_FLOAT)
.Device(DEVICE_CPU_XLA_JIT),
GatherOp);
REGISTER_XLA_OP(Name("GatherV2")
.TypeConstraint("Tparams", DT_FLOAT)
.Device(DEVICE_CPU_XLA_JIT),
GatherOp);

} // namespace
} // namespace tensorflow
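For intuition, here is a minimal standalone sketch of the shape arithmetic above, under assumed shapes (params.shape = [2, 3, 5], indices.shape = [4], axis = 1); the kernel computes the same quantities with TensorShape:

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const std::vector<int64_t> params_shape = {2, 3, 5};
  const std::vector<int64_t> indices_shape = {4};
  const int64_t axis = 1;
  const int64_t params_dims = static_cast<int64_t>(params_shape.size());

  // result shape = params.shape[0:axis] + indices.shape + params.shape[axis+1:]
  std::vector<int64_t> result_shape;
  int64_t outer_size = 1;  // product of the dims before `axis`
  int64_t inner_size = 1;  // product of the dims after `axis`
  for (int64_t i = 0; i < axis; ++i) {
    result_shape.push_back(params_shape[i]);
    outer_size *= params_shape[i];
  }
  result_shape.insert(result_shape.end(), indices_shape.begin(),
                      indices_shape.end());
  for (int64_t i = axis + 1; i < params_dims; ++i) {
    result_shape.push_back(params_shape[i]);
    inner_size *= params_shape[i];
  }

  // Prints: result_shape = [ 2 4 5 ], outer_size = 2, inner_size = 5
  std::cout << "result_shape = [";
  for (int64_t d : result_shape) std::cout << ' ' << d;
  std::cout << " ], outer_size = " << outer_size
            << ", inner_size = " << inner_size << std::endl;
  return 0;
}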
21 changes: 12 additions & 9 deletions tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int32.cc
@@ -26,28 +26,31 @@ namespace tensorflow {

EIGEN_STRONG_INLINE void gather_float_int32_xla_impl(float* out, void** data) {
// data is managed by the JIT code so msan can't tell it's initialized.
TF_ANNOTATE_MEMORY_IS_INITIALIZED(data, 7 * sizeof(void*));

int64 indices_size = *static_cast<int64*>(data[1]);
int64 params_x = *static_cast<int64*>(data[2]);
int64 params_y = *static_cast<int64*>(data[3]);
int64 params_z = *static_cast<int64*>(data[4]);

float* in = static_cast<float*>(data[5]);

int32* indices = static_cast<int32*>(data[6]);
Eigen::DSizes<Eigen::DenseIndex, 3> in_eig_sizes;
in_eig_sizes[0] = params_x;
in_eig_sizes[1] = params_y;
in_eig_sizes[2] = params_z;
tensorflow::TTypes<float, 3>::ConstTensor in_eig(in, in_eig_sizes);

Eigen::DSizes<Eigen::DenseIndex, 1> indices_eig_sizes;
indices_eig_sizes[0] = indices_size;
tensorflow::TTypes<int32>::ConstFlat indices_eig(indices, indices_eig_sizes);

Eigen::DSizes<Eigen::DenseIndex, 3> out_eig_sizes;
out_eig_sizes[0] = params_x;
out_eig_sizes[1] = indices_size;
out_eig_sizes[2] = params_z;
tensorflow::TTypes<float, 3>::Tensor out_eig(out, out_eig_sizes);

tensorflow::functor::GatherFunctorCPU<float, int32> f;
const int64 bad_i = f(in_eig, indices_eig, out_eig);
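Conceptually, the rewritten kernel gathers along the middle axis of a [params_x, params_y, params_z] = [outer, gathered, inner] view of params. A hedged reference loop with the same semantics (illustration only; the kernel itself dispatches through Eigen via GatherFunctorCPU, and the int64 variant below is identical up to the index type):

#include <cstdint>

// out has shape [params_x, indices_size, params_z].
void gather_reference(const float* in, const int32_t* indices, float* out,
                      int64_t params_x, int64_t params_y, int64_t params_z,
                      int64_t indices_size) {
  for (int64_t x = 0; x < params_x; ++x) {
    for (int64_t i = 0; i < indices_size; ++i) {
      const int32_t idx = indices[i];  // assumed to lie in [0, params_y)
      for (int64_t z = 0; z < params_z; ++z) {
        out[(x * indices_size + i) * params_z + z] =
            in[(x * params_y + idx) * params_z + z];
      }
    }
  }
}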
21 changes: 12 additions & 9 deletions tensorflow/compiler/tf2xla/kernels/gather_op_kernel_float_int64.cc
@@ -26,28 +26,31 @@ namespace tensorflow {

EIGEN_STRONG_INLINE void gather_float_int64_xla_impl(float* out, void** data) {
// data is managed by the JIT code so msan can't tell it's initialized.
TF_ANNOTATE_MEMORY_IS_INITIALIZED(data, 7 * sizeof(void*));

int64 indices_size = *static_cast<int64*>(data[1]);
int64 params_x = *static_cast<int64*>(data[2]);
int64 params_y = *static_cast<int64*>(data[3]);
int64 params_z = *static_cast<int64*>(data[4]);

float* in = static_cast<float*>(data[5]);

int64* indices = static_cast<int64*>(data[6]);
Eigen::DSizes<Eigen::DenseIndex, 3> in_eig_sizes;
in_eig_sizes[0] = params_x;
in_eig_sizes[1] = params_y;
in_eig_sizes[2] = params_z;
tensorflow::TTypes<float, 3>::ConstTensor in_eig(in, in_eig_sizes);

Eigen::DSizes<Eigen::DenseIndex, 1> indices_eig_sizes;
indices_eig_sizes[0] = indices_size;
tensorflow::TTypes<int64>::ConstFlat indices_eig(indices, indices_eig_sizes);

Eigen::DSizes<Eigen::DenseIndex, 3> out_eig_sizes;
out_eig_sizes[0] = params_x;
out_eig_sizes[1] = indices_size;
out_eig_sizes[2] = params_z;
tensorflow::TTypes<float, 3>::Tensor out_eig(out, out_eig_sizes);

tensorflow::functor::GatherFunctorCPU<float, int64> f;
const int64 bad_i = f(in_eig, indices_eig, out_eig);
2 changes: 1 addition & 1 deletion tensorflow/compiler/tf2xla/kernels/while_op.cc
@@ -180,7 +180,7 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) {
} else {
CHECK(!body.xla_input_shapes.empty());
body_input_shape = body.xla_input_shapes[0];
CHECK(!cond.xla_input_shapes.empty());
cond_input_shape = cond.xla_input_shapes[0];
}

12 changes: 12 additions & 0 deletions tensorflow/compiler/xla/literal_util.cc
@@ -631,6 +631,18 @@ string Literal::ToString() const {
return literal;
}

/* static */ std::unique_ptr<Literal> Literal::MakeTupleOwned(
std::vector<std::unique_ptr<Literal>> elements) {
auto literal = MakeUnique<Literal>();
std::vector<Shape> shape;
for (auto& tuple_element : elements) {
shape.push_back(tuple_element->shape());
literal->add_tuple_literals()->Swap(tuple_element.get());
}
*literal->mutable_shape() = ShapeUtil::MakeTupleShape(shape);
return literal;
}

const void* Literal::InternalData() const {
return const_cast<const void*>(
const_cast<Literal*>(this)->MutableInternalData());
10 changes: 10 additions & 0 deletions tensorflow/compiler/xla/literal_util.h
@@ -481,6 +481,16 @@ class Literal {
static std::unique_ptr<Literal> MakeTuple(
tensorflow::gtl::ArraySlice<const Literal*> elements);

// As above, but intended to be invoked with move semantics; i.e.
//
// std::vector<std::unique_ptr<Literal>> elements = ...;
// auto result = Literal::MakeTupleOwned(std::move(elements));
//
// This would have been declared as an overload, but there is ambiguity
// in invocation between the above signature and this one.
static std::unique_ptr<Literal> MakeTupleOwned(
std::vector<std::unique_ptr<Literal>> elements);

// Validates that the data payload of the literal matches the literal shape;
// if it does not, an appropriate status is returned.
tensorflow::Status ValidateLiteral() const;
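A short usage sketch for the new factory (an assumption-laden illustration: it relies on Literal::CreateR0 returning std::unique_ptr<Literal>, consistent with its use in gather_op.cc above):

std::vector<std::unique_ptr<xla::Literal>> elements;
elements.push_back(xla::Literal::CreateR0<xla::int64>(42));
elements.push_back(xla::Literal::CreateR0<float>(2.5f));
// Ownership of each element literal moves into the tuple literal.
auto tuple = xla::Literal::MakeTupleOwned(std::move(elements));
// tuple->shape() should now be the tuple shape (s64[], f32[]).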
8 changes: 4 additions & 4 deletions tensorflow/compiler/xla/service/cpu/BUILD
@@ -326,11 +326,11 @@
cc_library(
name = "cpu_runtime",
srcs = [
"cpu_runtime.cc",
"infeed_manager.cc",
"xfeed_manager.cc",
],
hdrs = [
"cpu_runtime.h",
"infeed_manager.h",
"xfeed_manager.h",
],
copts = runtime_copts(),
deps = [
@@ -416,9 +416,9 @@ cc_test(
)

cc_test(
name = "infeed_manager_test",
name = "xfeed_manager_test",
size = "small",
srcs = ["infeed_manager_test.cc"],
srcs = ["xfeed_manager_test.cc"],
deps = [
":cpu_runtime",
"//tensorflow/core:lib",
35 changes: 27 additions & 8 deletions tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
@@ -24,8 +24,8 @@ namespace xla {
namespace cpu {
namespace runtime {

XfeedManager* GetXfeedManager() {
static XfeedManager* manager = new XfeedManager;
return manager;
}

@@ -35,17 +35,36 @@ InfeedManager* GetInfeedManager() {

void* __xla_cpu_runtime_AcquireInfeedBufferForDequeue(
xla::int32 buffer_length) {
VLOG(2) << "AcquireInfeedBufferForDequeue";
xla::cpu::runtime::XfeedManager* xfeed = xla::cpu::runtime::GetXfeedManager();
// Wait until there's a buffer to dequeue.
xla::cpu::runtime::XfeedBuffer* buffer =
xfeed->infeed()->BlockingDequeueBuffer();
CHECK_EQ(buffer->length(), buffer_length);
return buffer->data();
}

void __xla_cpu_runtime_ReleaseInfeedBufferAfterDequeue(xla::int32 buffer_length,
void* buffer_ptr) {
VLOG(2) << "ReleaseInfeedBufferAfterDequeue";
xla::cpu::runtime::XfeedManager* xfeed = xla::cpu::runtime::GetXfeedManager();
xfeed->infeed()->ReleaseCurrentBuffer(buffer_length, buffer_ptr);
}

void* __xla_cpu_runtime_AcquireOutfeedBufferForPopulation(
xla::int32 buffer_length) {
VLOG(2) << "AcquireOutfeedBufferForPopulation";
xla::cpu::runtime::XfeedManager* xfeed = xla::cpu::runtime::GetXfeedManager();
// Wait until there's a buffer to dequeue.
xla::cpu::runtime::XfeedBuffer* buffer =
xfeed->outfeed()->BlockingDequeueBuffer();
CHECK_EQ(buffer->length(), buffer_length);
return buffer->data();
}

void __xla_cpu_runtime_ReleaseOutfeedBufferAfterPopulation(
xla::int32 buffer_length, void* buffer_ptr) {
VLOG(2) << "ReleaseOutfeedBufferAfterPopulation";
xla::cpu::runtime::XfeedManager* xfeed = xla::cpu::runtime::GetXfeedManager();
xfeed->outfeed()->ReleaseCurrentBuffer(buffer_length, buffer_ptr);
}
25 changes: 23 additions & 2 deletions tensorflow/compiler/xla/service/cpu/cpu_runtime.h
@@ -26,7 +26,7 @@ limitations under the License.
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_H_

#include "tensorflow/compiler/xla/service/cpu/infeed_manager.h"
#include "tensorflow/compiler/xla/service/cpu/xfeed_manager.h"
#include "tensorflow/compiler/xla/types.h"

namespace xla {
@@ -54,9 +54,13 @@
constexpr char kAcquireInfeedBufferForDequeueSymbolName[] =
"__xla_cpu_runtime_AcquireInfeedBufferForDequeue";
constexpr char kReleaseInfeedBufferAfterDequeueSymbolName[] =
"__xla_cpu_runtime_ReleaseInfeedBufferAfterDequeue";
constexpr char kAcquireOutfeedBufferForPopulationSymbolName[] =
"__xla_cpu_runtime_AcquireOutfeedBufferForPopulation";
constexpr char kReleaseOutfeedBufferAfterPopulationSymbolName[] =
"__xla_cpu_runtime_ReleaseOutfeedBufferAfterPopulation";

// Returns the xfeed manager used by the CPU runtime.
XfeedManager* GetXfeedManager();

} // namespace runtime
} // namespace cpu
@@ -86,6 +90,23 @@ extern void* __xla_cpu_runtime_AcquireInfeedBufferForDequeue(
// that can be returned out of order.
extern void __xla_cpu_runtime_ReleaseInfeedBufferAfterDequeue(
xla::int32 buffer_length, void* buffer_ptr);

// Blocks until the next outfeed buffer is available to be populated, then
// returns it.
extern void* __xla_cpu_runtime_AcquireOutfeedBufferForPopulation(
xla::int32 buffer_length);

// Relinquishes the outfeed buffer after it has been populated.
// buffer_ptr must have been previously returned by
// __xla_cpu_runtime_AcquireOutfeedBufferForPopulation.
// Once this call completes, buffer_ptr may no longer be accessed.
// buffer_length must match the length passed to the call to
// __xla_cpu_runtime_AcquireOutfeedBufferForPopulation that returned
// buffer_ptr. This function must be called before the next buffer is
// acquired, i.e., there may only be one outstanding outfeed buffer in
// use by the runtime.
extern void __xla_cpu_runtime_ReleaseOutfeedBufferAfterPopulation(
xla::int32 buffer_length, void* buffer_ptr);
}

#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_H_
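To make the acquire/release contract concrete, a hedged sketch of one outfeed transfer from the compiled side (the length and float payload are illustrative assumptions; in practice the JIT-compiled code emits these calls):

void PopulateOneOutfeedBuffer() {
  // Must match the length of the buffer the consumer enqueued on the outfeed.
  const xla::int32 length = static_cast<xla::int32>(4 * sizeof(float));
  void* buffer = __xla_cpu_runtime_AcquireOutfeedBufferForPopulation(length);
  float* out = static_cast<float*>(buffer);
  for (int i = 0; i < 4; ++i) out[i] = static_cast<float>(i);
  // `buffer` may not be touched after this call, and no second buffer may be
  // acquired while this one is outstanding.
  __xla_cpu_runtime_ReleaseOutfeedBufferAfterPopulation(length, buffer);
}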