Branch 178689056 #15286

Merged
merged 114 commits on Dec 12, 2017
Commits
e81b739
Enable more import_graph_def tests with C API.
skye Dec 7, 2017
b5c8cd6
Fix control flow test to not use session after it's gone out of scope.
skye Dec 7, 2017
1e54177
Increase test size of tensorflow/python/kernel_tests:dynamic_partitio…
tensorflower-gardener Dec 7, 2017
1fe793d
Adds support for loading model directly from a Flatbuffer object.
tensorflower-gardener Dec 7, 2017
b1c3222
Use sparse loss to avoid the warning being thrown by tf.nn.softmax_cr…
MarkDaoust Dec 7, 2017
cdc36a3
tfdbg: cosmetic fix to MonitoredSession.__del__ AttributeError
caisq Dec 7, 2017
2a41c99
Optimize control dependencies driven by constants
benoitsteiner Dec 7, 2017
e35da0b
Add logging to explicitly report the op types of nodes that fail shap…
tensorflower-gardener Dec 7, 2017
9620b2d
Fix the issue with shared saver on GPU.
isaprykin Dec 7, 2017
a6d01af
Graph._create_op_from_tf_operation should update _names_in_use.
skye Dec 7, 2017
507dc8c
Use size=large, shard_count=2 for tensorflow/python/kernel_tests:tran…
tensorflower-gardener Dec 7, 2017
ee1bd60
Add `discriminator_and_aux_fn` to `InfoGANModel`.
tensorflower-gardener Dec 7, 2017
f282544
WhileContext.AddBackpropIndexedSlicesAccumulator shouldn't create inv…
skye Dec 7, 2017
25adb9c
Add "apple CROSSTOOL" to common problems
MarkDaoust Dec 7, 2017
7772360
Raise exception on bad while loop shapes sooner.
skye Dec 7, 2017
488f091
Remove unused BUILD dependencies
tensorflower-gardener Dec 7, 2017
89804a9
Eliminate redundant control dependencies by computing the transitive …
tensorflower-gardener Dec 7, 2017
b02eae0
[tf.data] Add usage notes to the `Dataset.from_generator()` documenta…
mrry Dec 7, 2017
0509f07
Make BaseSession.make_callable work with the C API enabled.
skye Dec 7, 2017
2ea1141
Simplify and fix some bugs in constant folding of neutral/absorbing e…
tensorflower-gardener Dec 7, 2017
daab44f
Allow partial handling of some cyclic graphs, at least to
tensorflower-gardener Dec 7, 2017
2d4c29c
Added __str__ to ClusterSpec. This will improve the logs users are at…
ispirmustafa Dec 7, 2017
f37380b
Add tfe.py_func, a tf.py_func-like construct that wraps a Python func…
akshayka Dec 7, 2017
51fa3f7
Added unknown rank support to dense to sparse conversion within cate…
ispirmustafa Dec 7, 2017
6bb91a0
Make import_graph_def import functions with C API enabled.
skye Dec 7, 2017
7b0458a
BUGFIX: Ensure that rejected states don't propagate NaN. Make float…
langmore Dec 7, 2017
029109b
[XLA] Make hlo_test_base support running and comparing an hlo module.
tensorflower-gardener Dec 7, 2017
6e04085
Change TraceListener::BlockHostUntilDoneComplete to pass Status* rath…
tensorflower-gardener Dec 8, 2017
6c4af62
Make SQLite random IDs unique across tables
jart Dec 8, 2017
7ebe79f
Fix nullptr dereferencing bug in dependency optimizer in VLOG statement.
tensorflower-gardener Dec 8, 2017
fa9ebbc
Don't enforce shape invariance when creating backprop while loop.
skye Dec 8, 2017
affd993
[XLA:CPU] Rename cpu/layout_assignment to cpu/cpu_layout_assignment
Dec 8, 2017
1667d4d
Parallelizing the creation of asynchronous function calls in `tf.data…
jsimsa Dec 8, 2017
0e9cc7f
[XLA] Implement Conditional in XLA service, client ComputationBuilder…
tensorflower-gardener Dec 8, 2017
327fe05
boosted_trees: to name variables so that they can be distinguished in…
tensorflower-gardener Dec 8, 2017
3edc4ee
Add ":grpc_debug_test_server" to the deps of ":debug_py".
chihuahua Dec 8, 2017
0903098
Don't optimize away control triggers
benoitsteiner Dec 8, 2017
233aff8
Create a new Var-like object, LegacyVar, which allows access to its m…
ebrevdo Dec 8, 2017
1c4f65b
Fix a bug in string to number conversion code.
gunan Dec 8, 2017
516f97c
Add a maximum_iterations argument to tf.while_loop.
ebrevdo Dec 8, 2017
43d2411
[XLA] Fix an issue in ParseShapeString which made the hlo parser fail …
tensorflower-gardener Dec 8, 2017
6605938
[XLA:GPU] Support atomic operations on small data types.
tensorflower-gardener Dec 8, 2017
a1c2e20
Introduce an experimental API to pass sharding information from tenso…
tensorflower-gardener Dec 8, 2017
2139d20
Finishing the migration of the RNN / shared fully-connected layer blo…
tensorflower-gardener Dec 8, 2017
ad1834d
Add a "file_pattern" argument to the @{} reference replacer.
MarkDaoust Dec 8, 2017
8b86edc
Return an error when shape inference fails to converge
benoitsteiner Dec 8, 2017
f186c48
Relax the requirements on --input_arrays and --input_shapes.
tensorflower-gardener Dec 8, 2017
29e7222
Misc improvements required for some internal model:
tensorflower-gardener Dec 8, 2017
3a0dd45
Removes invariant that bitcasts must have the same shape byte size as…
tensorflower-gardener Dec 8, 2017
ec0f204
Fix tf.while_loop with maximum_iterations != None and single loop_var.
ebrevdo Dec 8, 2017
bb4ada7
[XLA:GPU] Remove the comment that says b/34969189 blocking TruncateNo…
tensorflower-gardener Dec 8, 2017
d8425f5
Add n=1 special case to the DeserializeSparse op.
mrry Dec 8, 2017
f6bf99e
Add unittest for `tf.contrib.distributions.MixtureSameFamily` `batch_…
jvdillon Dec 8, 2017
213d4be
[XLA:GPU] Support conditional in GPU backend.
tensorflower-gardener Dec 8, 2017
1afc614
BUGFIX: Call convert_to_tensor on input in fill_triangular. Also ch…
langmore Dec 8, 2017
dc04e89
Adding support for new TensorFlow operators. Also adding a transforma…
tensorflower-gardener Dec 8, 2017
2f16f3a
Add bfloat16 support to the CPU backend.
broune Dec 8, 2017
d94d6c4
Add DataFormatVecPermute op.
Dec 8, 2017
e8fcbf6
Always instantiate default attribute values when building a grappler …
benoitsteiner Dec 8, 2017
b8368b7
Extend neutral element optimization to handle
tensorflower-gardener Dec 8, 2017
76b22ba
Share constant creation code.
Dec 8, 2017
b1c7d17
Add a test to simulate the environment where GPU binary running on no…
aaroey Dec 8, 2017
28807c5
Add Operation._remove_all_control_inputs and use in ControlFlowContext.
skye Dec 8, 2017
54c2584
Add a "Snapshot" kernel that always makes a copy of its input. This s…
tensorflower-gardener Dec 8, 2017
e70078e
Add default values of attributes that might have been stripped by
benoitsteiner Dec 9, 2017
ddfd625
Adjust verbosity of per-node logging in graph_properties.
yacoder Dec 9, 2017
a0c2121
Make profiler memory profiling work with tf.while_loop
tensorflower-gardener Dec 9, 2017
b1c64c6
[tf.data] Move tests from python/kernel_tests to python/data/kernel_t…
mrry Dec 9, 2017
7478053
In MovingAverageOptimizer, delegate compute_gradients() to the wrappe…
tensorflower-gardener Dec 9, 2017
c697d96
Preserve symbolic shape information as much as possible during shape …
benoitsteiner Dec 9, 2017
922fbb5
Check that Rendezvous is not null.
tensorflower-gardener Dec 9, 2017
d85eec0
Always create a Rendezvous in RemoteCallOp.
mrry Dec 9, 2017
3457d12
Clear existing layouts before running the layout assignment.
hyouklee Dec 9, 2017
4a19c34
Support non-const input sizes for Conv2DBackpropInput.
Dec 9, 2017
dda6d1b
[XLA] Hlo parser: support reporting error messages with locations poi…
tensorflower-gardener Dec 9, 2017
fefa1c2
Set Operation._id_value before adding to control flow context.
skye Dec 9, 2017
3764127
Replace StreamExecutorInterface::BlockHostUntilDone with BlockHostUnt…
tensorflower-gardener Dec 9, 2017
bbfe1d3
Add IsDataTypeComplex helper function. In numerous places, we needles…
tensorflower-gardener Dec 10, 2017
373b425
Add a library for the cwise ops headers and common source.
tensorflower-gardener Dec 10, 2017
22e0870
Make control_flow_op_py_test "medium" to avoid ASAN timeouts.
skye Dec 11, 2017
309f7e2
Fix mismatched argument comments to match parameter names
tensorflower-gardener Dec 11, 2017
218caf9
Cleanup: Remove unused declarations and unnecessary conversions
tensorflower-gardener Dec 11, 2017
6c33765
Add `grpc_enabled` optional argument to various Python test rules.
mrry Dec 11, 2017
4080299
[XLA] Improve dot strength reductions to support transposes of the ri…
blakehechtman Dec 11, 2017
0683cdb
Enriching some C64 test coverage.
tensorflower-gardener Dec 11, 2017
d423542
Remove using-directives
tensorflower-gardener Dec 11, 2017
865bef3
Use the Snapshot kernel to force a copy of global step instead of the…
tensorflower-gardener Dec 11, 2017
f022085
[XLA] Add window reversal to the parser.
blakehechtman Dec 11, 2017
bcdefca
Fix-ups for XLA docs.
eliben Dec 11, 2017
f783790
Make sure we serialize the creation and deletion of clusters from pyt…
benoitsteiner Dec 11, 2017
0117c7a
Use DataFormatVecPermute instead of Gather, which is very slow.
Dec 11, 2017
1c46590
Fix incorrect parameter order in recall_at_precision.
tensorflower-gardener Dec 11, 2017
c4ef1b9
Add hvx/nnapi supports for Tensorflow Lite benchmark
zhixianyan Dec 11, 2017
db3ab6c
Adds remaining tests to _shared_embedding_column.
tensorflower-gardener Dec 11, 2017
bc87158
Optimized specializations for 3-channel depthwise with multiplier 2 a…
tensorflower-gardener Dec 11, 2017
b4c447a
Be more conservative when optimizing full reductions
benoitsteiner Dec 11, 2017
916b0d5
Rename ABSL Macros
tensorflower-gardener Dec 11, 2017
dae1f7a
Add ReverseDFSFrom variant that works with const Node*.
tensorflower-gardener Dec 11, 2017
c290c47
Always inline functions when creating an item.
benoitsteiner Dec 11, 2017
0afb4bb
Support all unary ops.
Dec 11, 2017
c381794
Support different threading modes in GPU device.
zheng-xq Dec 11, 2017
a4b68b6
Enable optimizations of operations with neutral/absorbing elements by…
tensorflower-gardener Dec 11, 2017
80ac330
[XLA:CPU] Error on unsupported dot instructions
Dec 11, 2017
db198b8
Tune SQLite
jart Dec 11, 2017
dd77f38
[XLA] Move BatchDot unrolling from TF2XLA bridge to AlgebraicSimplifi…
tensorflower-gardener Dec 11, 2017
037f036
Mark a FunctionDef's signature as stateful when it contains a statefu…
mrry Dec 11, 2017
f86016e
Fix definition of tflite_smartreply
jart Dec 11, 2017
4d1d1a1
Fix for variable naming when executing eagerly
allenlavoie Dec 11, 2017
1b4c609
[tf.data] Use a more efficient dispatch mechanism for functions in da…
mrry Dec 11, 2017
ab08664
[XLA:CPU] Minor refactor to the CPU layout assignment code
Dec 11, 2017
da9ef31
Fix the remote call graph construction so that the created _Send ops …
tensorflower-gardener Dec 12, 2017
634515e
Strength reduce division by a constant to multiplication by the recip…
tensorflower-gardener Dec 12, 2017
ee09e0f
Merge commit for internal changes
Dec 12, 2017
e5246fb
Satisfy buildifier
Dec 12, 2017
1 change: 1 addition & 0 deletions tensorflow/BUILD
@@ -602,6 +602,7 @@ filegroup(
"//tensorflow/java/src/main/native:all_files",
"//tensorflow/python:all_files",
"//tensorflow/python/data:all_files",
"//tensorflow/python/data/kernel_tests:all_files",
"//tensorflow/python/data/ops:all_files",
"//tensorflow/python/data/util:all_files",
"//tensorflow/python/debug:all_files",
5 changes: 5 additions & 0 deletions tensorflow/c/c_api_function.cc
@@ -226,6 +226,11 @@ Status FillFunctionBody(
}
node_def->add_input(strings::StrCat("^", normalized));
}

// A function is stateful if any of its nodes are stateful.
if (node->op_def().is_stateful()) {
fdef->mutable_signature()->set_is_stateful(true);
}
}
return Status::OK();
}
45 changes: 45 additions & 0 deletions tensorflow/c/c_api_function_test.cc
@@ -1482,6 +1482,51 @@ TEST_F(CApiFunctionTest, GetOpDef) {
EXPECT_EQ(op_def.name(), func_name_);
EXPECT_EQ(op_def.input_arg_size(), 1);
EXPECT_EQ(op_def.output_arg_size(), 1);
EXPECT_FALSE(op_def.is_stateful());

TF_DeleteBuffer(buffer);
}

void DefineStatefulFunction(const char* name, TF_Function** func) {
std::unique_ptr<TF_Graph, decltype(&TF_DeleteGraph)> func_graph(
TF_NewGraph(), TF_DeleteGraph);
std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> s(TF_NewStatus(),
TF_DeleteStatus);

TF_Tensor* tensor_shape = Int32Tensor({37, 1});
TF_Operation* shape = Const(tensor_shape, func_graph.get(), s.get(), "shape");
TF_Operation* random =
RandomUniform(shape, TF_FLOAT, func_graph.get(), s.get());

TF_Output inputs[] = {};
TF_Output outputs[] = {{random, 0}};
*func = TF_GraphToFunction(func_graph.get(), name, /*append_hash=*/0, -1,
/*opers=*/nullptr, 0, inputs, 1, outputs,
/*output_names=*/nullptr,
/*opts=*/nullptr, "", s.get());
ASSERT_EQ(TF_OK, TF_GetCode(s.get())) << TF_Message(s.get());
ASSERT_NE(*func, nullptr);
TF_DeleteTensor(tensor_shape);
}

TEST_F(CApiFunctionTest, StatefulOpDef) {
DefineStatefulFunction(func_name_, &func_);
TF_GraphCopyFunction(host_graph_, func_, nullptr, s_);
ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);

// Test we can retrieve function OpDef from graph
TF_Buffer* buffer = TF_NewBuffer();
TF_GraphGetOpDef(host_graph_, func_name_, buffer, s_);
ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);

// Sanity check returned OpDef
string data(static_cast<const char*>(buffer->data), buffer->length);
OpDef op_def;
op_def.ParseFromString(data);
EXPECT_EQ(op_def.name(), func_name_);
EXPECT_EQ(op_def.input_arg_size(), 0);
EXPECT_EQ(op_def.output_arg_size(), 1);
EXPECT_TRUE(op_def.is_stateful());

TF_DeleteBuffer(buffer);
}
9 changes: 9 additions & 0 deletions tensorflow/c/c_test_util.cc
@@ -193,6 +193,15 @@ TF_Operation* LessThan(TF_Output l, TF_Output r, TF_Graph* graph,
return TF_FinishOperation(desc, s);
}

TF_Operation* RandomUniform(TF_Operation* shape, TF_DataType dtype,
TF_Graph* graph, TF_Status* s) {
TF_OperationDescription* desc =
TF_NewOperation(graph, "RandomUniform", "random_uniform");
TF_AddInput(desc, {shape, 0});
TF_SetAttrType(desc, "dtype", dtype);
return TF_FinishOperation(desc, s);
}

void Split3Helper(TF_Operation* input, TF_Graph* graph, TF_Status* s,
const char* name, TF_Operation** op) {
TF_Operation* zero = ScalarConst(
3 changes: 3 additions & 0 deletions tensorflow/c/c_test_util.h
@@ -74,6 +74,9 @@ TF_Operation* Neg(TF_Operation* n, TF_Graph* graph, TF_Status* s,

TF_Operation* LessThan(TF_Output l, TF_Output r, TF_Graph* graph, TF_Status* s);

TF_Operation* RandomUniform(TF_Operation* shape, TF_DataType dtype,
TF_Graph* graph, TF_Status* s);

// Split `input` along the first dimension into 3 tensors
TF_Operation* Split3(TF_Operation* input, TF_Graph* graph, TF_Status* s,
const char* name = "split3");
12 changes: 12 additions & 0 deletions tensorflow/c/python_api.cc
@@ -87,4 +87,16 @@ void UpdateEdge(TF_Graph* graph, TF_Output new_src, TF_Input dst,
}
}

void RemoveAllControlInputs(TF_Graph* graph, TF_Operation* op) {
mutex_lock l(graph->mu);
std::vector<const Edge*> control_edges;
for (const Edge* edge : op->node.in_edges()) {
if (!edge->IsControlEdge()) continue;
control_edges.push_back(edge);
}
for (const Edge* edge : control_edges) {
graph->graph.RemoveControlEdge(edge);
}
}

} // namespace tensorflow
2 changes: 2 additions & 0 deletions tensorflow/c/python_api.h
@@ -35,6 +35,8 @@ void SetRequestedDevice(TF_Graph* graph, TF_Operation* op, const char* device);
void UpdateEdge(TF_Graph* graph, TF_Output new_src, TF_Input dst,
TF_Status* status);

void RemoveAllControlInputs(TF_Graph* graph, TF_Operation* op);

} // namespace tensorflow

#endif // THIRD_PARTY_TENSORFLOW_C_PYTHON_API_H_
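
For illustration, a minimal usage sketch (not part of this PR's changes; the StripControlDeps wrapper name is hypothetical): RemoveAllControlInputs is the C++ backing for the Operation._remove_all_control_inputs method added in commit 28807c5, and a caller that already holds the usual C API handles could invoke it directly.

#include "tensorflow/c/python_api.h"

// Hypothetical helper: drops every incoming control edge ("^src") from `op`
// while leaving its data inputs untouched. `graph` and `op` are assumed to
// come from the usual C API calls (TF_NewGraph, TF_FinishOperation, ...).
void StripControlDeps(TF_Graph* graph, TF_Operation* op) {
  tensorflow::RemoveAllControlInputs(graph, op);
}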
100 changes: 32 additions & 68 deletions tensorflow/compiler/tf2xla/lib/batch_dot.cc
@@ -27,7 +27,6 @@ namespace tensorflow {

// The current implementation simply unrolls the computation along the batch
// dimension.
// TODO(andydavis): add batching support to XLA's Dot operator.
xla::StatusOr<xla::ComputationDataHandle> BatchDot(
xla::ComputationBuilder* builder, xla::ComputationDataHandle x,
xla::ComputationDataHandle y, bool transpose_x, bool transpose_y) {
@@ -52,26 +51,20 @@ xla::StatusOr<xla::ComputationDataHandle> BatchDot(

// The batch dimensions must be equal and the matrix dimensions must be
// valid.
std::vector<int64> dimensions;
int64 batch_count = 1;
std::vector<int64> batch_dimension_numbers;
for (int i = 0; i < ndims - 2; ++i) {
int64 x_size = x_shape->dimensions(i);
int64 y_size = y_shape->dimensions(i);
if (x_size != y_size) {
if (x_shape->dimensions(i) != y_shape->dimensions(i)) {
return errors::InvalidArgument(
"Dimension ", i, " of inputs to BatchedDot must be equal: ",
xla::ShapeUtil::HumanString(*x_shape), " vs ",
xla::ShapeUtil::HumanString(*y_shape));
}
dimensions.push_back(x_size);
batch_count *= x_size;
batch_dimension_numbers.push_back(i);
}

int x_inner_dim = transpose_x ? (ndims - 2) : (ndims - 1);
int y_inner_dim = transpose_y ? (ndims - 1) : (ndims - 2);
int64 x_inner_dim_size = x_shape->dimensions(x_inner_dim);
int64 y_inner_dim_size = y_shape->dimensions(y_inner_dim);
if (x_inner_dim_size != y_inner_dim_size) {
if (x_shape->dimensions(x_inner_dim) != y_shape->dimensions(y_inner_dim)) {
return errors::InvalidArgument(
"Dimensions ", x_inner_dim, " and ", y_inner_dim,
" of arguments to BatchedDot must be equal: ",
@@ -80,75 +73,46 @@ xla::StatusOr<xla::ComputationDataHandle> BatchDot(
" transpose: ", transpose_y);
}

// If there are no batch dimensions, use a regular Dot. This case exists
// to improve the readability of the emitted graphs.
if (dimensions.empty()) {
auto lhs = transpose_x ? builder->Transpose(x, {1, 0}) : x;
auto rhs = transpose_y ? builder->Transpose(y, {1, 0}) : y;
return builder->Dot(lhs, rhs);
// Check for zero lhs/rhs dim size.
if (xla::ShapeUtil::HasZeroElements(*x_shape) ||
xla::ShapeUtil::HasZeroElements(*y_shape)) {
std::vector<int64> dimensions(batch_dimension_numbers.size());
for (int i = 0; i < batch_dimension_numbers.size(); ++i) {
dimensions[i] = x_shape->dimensions(batch_dimension_numbers[i]);
}
int x_outer_dim = transpose_x ? (ndims - 1) : (ndims - 2);
int y_outer_dim = transpose_y ? (ndims - 2) : (ndims - 1);
dimensions.push_back(x_shape->dimensions(x_outer_dim));
dimensions.push_back(y_shape->dimensions(y_outer_dim));
return builder->Broadcast(
builder->ConstantLiteral(xla::Literal::Zero(x_shape->element_type())),
dimensions);
}

int x_outer_dim = transpose_x ? (ndims - 1) : (ndims - 2);
int y_outer_dim = transpose_y ? (ndims - 2) : (ndims - 1);
dimensions.push_back(x_shape->dimensions(x_outer_dim));
dimensions.push_back(y_shape->dimensions(y_outer_dim));

if (x_shape->element_type() == xla::C64 && transpose_x) {
x = builder->Conj(x);
}
if (y_shape->element_type() == xla::C64 && transpose_y) {
y = builder->Conj(y);
}

// Reshape input tensors into 3D tensors by flattening the batch
// dimensions. This makes it easier to unroll the batch dimension.
auto x_flat =
builder->Reshape(x, {batch_count, x_shape->dimensions(ndims - 2),
x_shape->dimensions(ndims - 1)});
auto y_flat =
builder->Reshape(y, {batch_count, y_shape->dimensions(ndims - 2),
y_shape->dimensions(ndims - 1)});

// Slice batches into individual matrices and multiply them.
std::vector<xla::ComputationDataHandle> out_slices;
for (int64 i = 0; i < batch_count; ++i) {
// Slice off individual matrices and reshape to 2D tensors.
auto x_slice = builder->Slice(
x_flat, {i, 0, 0},
{i + 1, x_shape->dimensions(ndims - 2), x_shape->dimensions(ndims - 1)},
{1, 1, 1});
x_slice = builder->Reshape(x_slice, {x_shape->dimensions(ndims - 2),
x_shape->dimensions(ndims - 1)});
auto y_slice = builder->Slice(
y_flat, {i, 0, 0},
{i + 1, y_shape->dimensions(ndims - 2), y_shape->dimensions(ndims - 1)},
{1, 1, 1});
y_slice = builder->Reshape(y_slice, {y_shape->dimensions(ndims - 2),
y_shape->dimensions(ndims - 1)});

// Transpose if needed.
auto lhs = transpose_x ? builder->Transpose(x_slice, {1, 0}) : x_slice;
auto rhs = transpose_y ? builder->Transpose(y_slice, {1, 0}) : y_slice;

// Multiply matrices and add an outer singleton dimension to the output
// so we can concatenate along the flattened batch dimension later.
auto out = builder->Dot(lhs, rhs);
out = builder->Reshape(out,
{1, dimensions[ndims - 2], dimensions[ndims - 1]});
out_slices.push_back(out);
// If there are no batch dimensions, use a regular Dot.
// TODO(b/69062148) Remove this code when Dot emitters can be passed
// dimensions to transpose directly (i.e. without requiring a Transpose HLO).
if (batch_dimension_numbers.empty()) {
auto lhs = transpose_x ? builder->Transpose(x, {1, 0}) : x;
auto rhs = transpose_y ? builder->Transpose(y, {1, 0}) : y;
return builder->Dot(lhs, rhs);
}

// Concatenate output slices and reshape to original number of dimensions.
xla::ComputationDataHandle data;
if (out_slices.empty()) {
// It is illegal to pass an empty list to ConcatInDim.
// The batch count is empty, so both inputs must have zero elements.
// Arbitrarily use the left input as the argument to Reshape().
data = x;
} else {
data = builder->ConcatInDim(out_slices, 0);
xla::DotDimensionNumbers dot_dnums;
dot_dnums.add_lhs_contracting_dimensions(x_inner_dim);
dot_dnums.add_rhs_contracting_dimensions(y_inner_dim);
for (auto batch_dimension_number : batch_dimension_numbers) {
dot_dnums.add_lhs_batch_dimensions(batch_dimension_number);
dot_dnums.add_rhs_batch_dimensions(batch_dimension_number);
}
return builder->Reshape(data, dimensions);
return builder->DotGeneral(x, y, dot_dnums);
}

} // namespace tensorflow
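
To make the rewrite above concrete: assuming 3-D inputs x of shape [B, M, K] and y of shape [B, K, N] with no transposes, the DotDimensionNumbers that BatchDot now builds reduce to the following minimal sketch (illustrative values, not code from this PR).

// Contract x's last dimension against y's middle dimension, and treat
// dimension 0 of both operands as a shared batch dimension.
xla::DotDimensionNumbers dot_dnums;
dot_dnums.add_lhs_contracting_dimensions(2);  // K in x
dot_dnums.add_rhs_contracting_dimensions(1);  // K in y
dot_dnums.add_lhs_batch_dimensions(0);        // B in x
dot_dnums.add_rhs_batch_dimensions(0);        // B in y
// builder->DotGeneral(x, y, dot_dnums) then yields a [B, M, N] result in a
// single HLO instruction, instead of B sliced Dot calls.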
55 changes: 47 additions & 8 deletions tensorflow/compiler/tf2xla/sharding_util.cc
@@ -14,34 +14,59 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/tf2xla/sharding_util.h"

#include "tensorflow/core/framework/node_def.pb.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/util/device_name_utils.h"

namespace tensorflow {
namespace {
const char kDeviceSuffixReplicatedCore[] = "REPLICATED_CORE";
const char kShardingAttribute[] = "_XlaSharding";
} // namespace

static const char DEVICE_SUFFIX_REPLICATED_CORE[] = "REPLICATED_CORE";
namespace {
xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
GetShardingFromNodeDef(const NodeDef& node_def) {
if (!HasNodeAttr(node_def, kShardingAttribute)) {
return tensorflow::gtl::optional<xla::OpSharding>();
}
string value;
xla::OpSharding sharding;
TF_RETURN_IF_ERROR(GetNodeAttr(node_def, kShardingAttribute, &value));
if (!sharding.ParseFromString(value)) {
return xla::InvalidArgument(
"Experimental _XlaSharding attribute was not a valid encoded "
"xla::OpSharding proto.");
}
return tensorflow::gtl::optional<xla::OpSharding>(sharding);
}

static Status CoreOutOfRangeError(int core, int num_cores_per_replica) {
Status CoreOutOfRangeError(int core, int num_cores_per_replica) {
return errors::InvalidArgument(
"Invalid replicated core id: ", core,
"; num_cores_per_replica=", num_cores_per_replica);
}
} // namespace

xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
ParseShardingFromDevice(const string& device_name, int num_cores_per_replica) {
ParseShardingFromDevice(
const string& device_name, int num_cores_per_replica,
tensorflow::gtl::optional<xla::OpSharding> explicit_sharding) {
if (device_name.empty()) {
return tensorflow::gtl::optional<xla::OpSharding>();
}

DeviceNameUtils::ParsedName parsed_device;
if (!DeviceNameUtils::ParseFullName(device_name, &parsed_device)) {
return errors::InvalidArgument("Malformed assigned device '", device_name,
"'");
}
if (!parsed_device.has_type ||
!StringPiece(parsed_device.type)
.ends_with(DEVICE_SUFFIX_REPLICATED_CORE)) {

if (explicit_sharding.has_value()) {
return explicit_sharding;
} else if (!parsed_device.has_type || !parsed_device.has_id ||
!StringPiece(parsed_device.type)
.contains(kDeviceSuffixReplicatedCore)) {
return tensorflow::gtl::optional<xla::OpSharding>();
} else {
const int core = parsed_device.id;
@@ -53,20 +78,34 @@ ParseShardingFromDevice(const string& device_name, int num_cores_per_replica) {
}
}

xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
ParseShardingFromDevice(const NodeDef& node_def, int num_cores_per_replica) {
const string& device_name = node_def.device();
TF_ASSIGN_OR_RETURN(tensorflow::gtl::optional<xla::OpSharding> sharding,
GetShardingFromNodeDef(node_def));
return ParseShardingFromDevice(device_name, num_cores_per_replica, sharding);
}

xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
ParseShardingFromDevice(const Node& node, int num_cores_per_replica) {
string device_name = node.assigned_device_name();
if (device_name.empty()) {
device_name = node.requested_device();
}
return ParseShardingFromDevice(device_name, num_cores_per_replica);
TF_ASSIGN_OR_RETURN(tensorflow::gtl::optional<xla::OpSharding> sharding,
GetShardingFromNodeDef(node.def()));
return ParseShardingFromDevice(device_name, num_cores_per_replica, sharding);
}

void SetShardingDeviceAssignmentFromNode(const Node& src, Node* dst) {
string device_name = src.assigned_device_name();
if (device_name.empty()) {
device_name = src.requested_device();
}
dst->set_assigned_device_name(device_name);
if (const AttrValue* attr = src.attrs().Find(kShardingAttribute)) {
dst->AddAttr(kShardingAttribute, *attr);
}
}

} // namespace tensorflow
13 changes: 10 additions & 3 deletions tensorflow/compiler/tf2xla/sharding_util.h
@@ -29,14 +29,21 @@ namespace tensorflow {
// - if the device name is invalid.
// - the core is parsed and is out of the range [0, num_cores_per_replica).
//
// Otherwise, returns either a non-value or a sharding set as per
// xla:ShardingBuilder::AssignDevice.
// Otherwise, returns either:
// - explicit_sharding if explicit_sharding.has_value()
// - a non-value if there is no assigned core or
// - a sharding set as per xla::ShardingBuilder::AssignDevice.
xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
ParseShardingFromDevice(const string& device_name, int num_cores_per_replica);
ParseShardingFromDevice(const string& device_name, int num_cores_per_replica,
tensorflow::gtl::optional<xla::OpSharding>
explicit_sharding = tensorflow::gtl::nullopt);

xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
ParseShardingFromDevice(const Node& node, int num_cores_per_replica);

xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
ParseShardingFromDevice(const NodeDef& node_def, int num_cores_per_replica);

void SetShardingDeviceAssignmentFromNode(const Node& src, Node* dst);

} // namespace tensorflow
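
Taken together with the header above, the device-string path can be sketched as follows (illustrative, not from this PR; the device name is assumed to parse via DeviceNameUtils::ParseFullName). A device whose type contains REPLICATED_CORE yields a single-core sharding, while an explicit_sharding argument, when provided, takes precedence over the device string.

// Core 2 of an 8-core replica: on success this returns an optional
// xla::OpSharding equivalent to xla::ShardingBuilder::AssignDevice(2);
// a core id outside [0, 8) returns an InvalidArgument error instead.
auto sharding_or = tensorflow::ParseShardingFromDevice(
    "/device:TPU_REPLICATED_CORE:2", /*num_cores_per_replica=*/8);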