Branch 195061425 #19025

Merged
merged 85 commits on May 2, 2018
Changes from all commits
85 commits
a5a51ad
Adding a depthwise convolution kernel op (with label 'cudnn_grouped_c…
tensorflower-gardener Apr 30, 2018
aa2405e
Fixes to tape gradient for providing outputs and having multiple targ…
alextp Apr 30, 2018
1872f29
Clarify return type for defun as zero or more `tf.Tensor`s.
tomhennigan Apr 30, 2018
a3ae05d
Remove manifest_merger that is being removed from Bazel 0.13.0.
tensorflower-gardener Apr 30, 2018
09e529f
Prepare nodes that will be allocated using ScopedAllocator.
dubey Apr 30, 2018
9f2728b
Switch install get_started link
MarkDaoust Apr 30, 2018
d6da4aa
Add snippet illustrating discretized logistic mixture for WaveNet.
dustinvtran Apr 30, 2018
113cc4d
Add --keep_going flag to bazel query in pip_smoke_test to bypass baze…
yifeif Apr 30, 2018
bdaa70c
Miscellaneous code clean-up
tensorflower-gardener Apr 30, 2018
c3e9ca7
Fix bugs in AssignOp:
tensorflower-gardener Apr 30, 2018
8d5e87b
Use the default rewriter config instead of a custom one
benoitsteiner Apr 30, 2018
9d79acc
[TF:XLA] Bump open source llvm revision to r331173
Apr 30, 2018
8609ef4
When a mirrored variable is fetched in cross-tower mode, fetch its pr…
isaprykin Apr 30, 2018
6e9d8ab
Fix typos in tf.GradientTape documentation.
tomhennigan Apr 30, 2018
5cdcb47
Fix device assignment in xla/service/service.cc to build the assignme…
hyouklee Apr 30, 2018
1986f00
Cleanup handling of non-Tensor valued event_ndims in Bijector.
jvdillon Apr 30, 2018
57b7c7b
Fix a bug in profiler.
shashishekhar Apr 30, 2018
b7cb5fa
Extend SDCAOptimizer functionality to prune negative indices (the def…
tensorflower-gardener Apr 30, 2018
0ed712e
[tf.data] Adding support for `tf.SparseTensor` into `tf.contrib.data.…
jsimsa Apr 30, 2018
4041ae0
Push down const inputs into the function of specialized functions.
tensorflower-gardener Apr 30, 2018
fac11a7
Removing an obsolete TODO
petrosmol Apr 30, 2018
c0f0720
[TF:XLA] Fix some unexpected memory leak in hlo_graph_dumper_test.
dimvar Apr 30, 2018
ab02bce
Do not cast int64 to int32 in keras embedding lookups.
alextp Apr 30, 2018
d2a4227
Add XLA logo to its documentation page
eliben Apr 30, 2018
b8197b2
Implement unary chain hoisting optimization for Concat, Split, and Sp…
tensorflower-gardener Apr 30, 2018
9c961e8
Enhancements to GRAPHVIZ_DOT output:
tensorflower-gardener Apr 30, 2018
286d61b
Do not allocate memory for literal as it will be allocated later.
tensorflower-gardener Apr 30, 2018
30fcdec
Improve error message for pip_smoke_test.
yifeif Apr 30, 2018
1834361
[XLA] Change the TF2XLA bridge to perform F16 reduction using F32 dat…
bixia1 Apr 30, 2018
7141ed5
Add MultiNodeDataset and MultiNodeIterator which are intended to work…
Apr 30, 2018
1ff23a3
Small fix to prevent a crash if the delegate has not implemented Free…
tensorflower-gardener Apr 30, 2018
64bb1de
Faster reduce_logsoftmax (specially in eager) and bugfixes in broadca…
alextp May 1, 2018
b7978d4
Internal cleanup.
tensorflower-gardener May 1, 2018
c89a1d9
[tf.data] Adding an experimental `group_by_reducer` transformation wh…
jsimsa May 1, 2018
45bafe9
[XLA] Redesign: migrate tensorflow/compiler/tf2xla, tensorflow/compil…
tensorflower-gardener May 1, 2018
c8a0f6e
Go: Update generated wrapper functions for TensorFlow ops.
tensorflower-gardener May 1, 2018
4072142
Remove proto header import from core/framework/tracking_allocator.h
yifeif May 1, 2018
85d30bf
Internal change.
tensorflower-gardener May 1, 2018
88821b0
[XLA] Redesign: dump HloSnapshot at the point where it used to dump t…
tensorflower-gardener May 1, 2018
79ccb99
Move LinearOperatorKronecker and LinearOperatorBlockDiag to core.
tensorflower-gardener May 1, 2018
37191e9
Update ops-related pbtxt files.
tensorflower-gardener May 1, 2018
95b3643
[XLA:CPU] Open source some tests.
May 1, 2018
a4343eb
Protocol buffer classes now list their fields in dir(cls)
tensorflower-gardener May 1, 2018
10337c9
Preventing RemoveTrivialBinary from removing broadcasts.
tensorflower-gardener May 1, 2018
a82e0e7
Fix crash in HloGraphDumper where it crashes on tuple shaped constants
tensorflower-gardener May 1, 2018
bb82203
Automated g4 rollback of changelist 194917415
gunan May 1, 2018
9477a96
eager: Update sample notebooks with API changes in the last few relea…
asimshankar May 1, 2018
07c5885
Boosted trees: support indicator column.
tensorflower-gardener May 1, 2018
8e918c3
Improve shape inference for tf.contrib.signal.frame.
rryan May 1, 2018
5c18dc6
Simplified shape inference.
benoitsteiner May 1, 2018
59677dc
Add device_util.resolve method which merges with current device as well.
May 1, 2018
87ebe11
Implements matrix multiply-accumulate for linear no-offset (aka symme…
tensorflower-gardener May 1, 2018
ee236bd
Add a pointer from Device to its owning DeviceMgr.
hawkinsp May 1, 2018
57207f2
Add utility to auto shard a dataset pipeline in the appropriate place…
guptapriya May 1, 2018
1a50cd4
Open source infeed test
May 1, 2018
9149558
Collective Ops Part 5
tensorflower-gardener May 1, 2018
3b7f22f
Relax the stringent memory allocator constraints in AssignOp if a Gra…
tensorflower-gardener May 1, 2018
594f970
Update schema.
shashishekhar May 1, 2018
75c1896
Update community/swift
MarkDaoust May 1, 2018
7cbbd35
Enable checkpointless eval and predict for tf.estimator.
tensorflower-gardener May 1, 2018
46bf1e8
Make tower-local variables non-trainable even with the default
tensorflower-gardener May 1, 2018
325d0ef
Merge changes from github.
May 1, 2018
fdcdf75
Fix bug in peak buffer accounting in buffer assignment.
meheffernan May 1, 2018
415ea73
Making ids unique in nn.embedding_lookup_sparse. This helps to reduce…
tensorflower-gardener May 1, 2018
707b0c9
Minor JNI performance improvement.
tensorflower-gardener May 1, 2018
6f10fb5
Fixed some outdated comments
benoitsteiner May 1, 2018
33978e8
Go: Update generated wrapper functions for TensorFlow ops.
tensorflower-gardener May 1, 2018
b25e6fe
Implementation of the fully-connected TFLite Op using the symmetric q…
tensorflower-gardener May 1, 2018
210abeb
[TF:XLA] Separate on-host and on-device shape and layout in HloModule.
tensorflower-gardener May 1, 2018
fb8f040
Allow `warm_start_from` argument to be a SavedModel path.
tensorflower-gardener May 1, 2018
f5dbc1e
Check for overflow in shape calculation.
tensorflower-gardener May 1, 2018
1aa7aaa
Adds logistic_regression_head.
tensorflower-gardener May 1, 2018
2b54774
Avoid making a copy of the graph needlessly
benoitsteiner May 1, 2018
833803d
Fix wrongly ordered lines
May 1, 2018
448b2ca
Sharding for tensorflow/contrib/timeseries/python/timeseries/state_sp…
allenlavoie May 1, 2018
f453b62
test fix
ispirmustafa May 2, 2018
62356ad
Re-apply CL 194140820, which reverts #18251 (convolution change).
May 2, 2018
92939c5
Internal change.
tensorflower-gardener May 2, 2018
c8ae9e8
Internal change
tensorflower-gardener May 2, 2018
69b2c63
[XLA:CPU] Re-use the same llvm::GlobalVariable for identical literals
May 2, 2018
c0f1080
Make the CRF work when sequence_lengths are int32.
May 2, 2018
b50f632
Minor refactor: establish some operator naming conventions and apply …
May 2, 2018
7715b7b
Add missing colocated element in test in buffer_assignment_test.
meheffernan May 2, 2018
5e1448f
BUGFIX: Convert inputs and list of gradients into tuple if they are n…
tensorflower-gardener May 2, 2018
f6ce3e2
Merge commit for internal changes
caisq May 2, 2018
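One commit above ("Add snippet illustrating discretized logistic mixture for WaveNet.", dustinvtran, Apr 30) references a snippet that is not shown in this diff view. As a rough standalone sketch of what such a distribution computes — assuming PixelCNN++-style binning and ignoring the edge-bin corrections a production implementation would include; all names here are illustrative, not from the commit:

```python
import numpy as np

def discretized_logistic_mixture_logpmf(x, locs, scales, weights,
                                        bin_size=1 / 255.):
    """Log-probability of x under a mixture of discretized logistics.

    Each component integrates a logistic density over the bin
    [x - bin_size/2, x + bin_size/2], as used for quantized audio/image
    values. Edge bins are not special-cased in this sketch.
    """
    def sigmoid(z):
        return 1.0 / (1.0 + np.exp(-z))

    # CDF difference across the bin, per mixture component.
    cdf_plus = sigmoid((x + bin_size / 2 - locs) / scales)
    cdf_minus = sigmoid((x - bin_size / 2 - locs) / scales)
    probs = np.maximum(cdf_plus - cdf_minus, 1e-12)
    return np.log(np.sum(weights * probs))

# Example: 3-component mixture evaluated at one sample value.
print(discretized_logistic_mixture_logpmf(
    0.2,
    locs=np.array([-0.5, 0.0, 0.4]),
    scales=np.array([0.1, 0.2, 0.1]),
    weights=np.array([0.2, 0.5, 0.3])))
```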
65 changes: 27 additions & 38 deletions tensorflow/c/eager/tape.h
@@ -380,49 +380,39 @@ Status InitialGradients(const VSpace<Gradient, BackwardFunction>& vspace,
gtl::ArraySlice<Gradient*> output_gradients,
const TensorTape& tensor_tape,
const OpTape<BackwardFunction>& op_tape,
const gtl::FlatMap<int64, int64>& tensor_usage_counts,
gtl::FlatMap<int64, std::vector<Gradient*>>* result) {
for (int i = 0; i < target_tensor_ids.size(); ++i) {
const int64 id = target_tensor_ids[i];
if (tensor_usage_counts.find(id) != tensor_usage_counts.end()) {
if (!output_gradients.empty() && output_gradients[i] != nullptr) {
// TODO(apassos) figure out how to print debugging information here.
return errors::InvalidArgument(
"A gradient was provided for a tensor which is used as part of the "
"computation.");
}
} else {
if (output_gradients.empty() || output_gradients[i] == nullptr) {
auto tensor_it = tensor_tape.find(id);
if (tensor_it != tensor_tape.end() && tensor_it->second != -1) {
auto op_it = op_tape.find(tensor_it->second);
if (op_it == op_tape.end()) {
return errors::Internal(
"Internal state of the gradient tape is invalid: "
"failed to find operation producing a tensor");
}
bool found = false;
for (int j = 0; j < op_it->second.output_tensor_info.size(); ++j) {
if (op_it->second.output_tensor_info[j].id == id) {
found = true;
(*result)[id].push_back(
vspace.Ones(op_it->second.output_tensor_info[j].shape,
op_it->second.output_tensor_info[j].dtype));
break;
}
}
if (!found) {
return errors::Internal(
"Internal state of the gradient tape is invalid: "
"none of operations outputs match expected tensor");
if (output_gradients.empty() || output_gradients[i] == nullptr) {
auto tensor_it = tensor_tape.find(id);
if (tensor_it != tensor_tape.end() && tensor_it->second != -1) {
auto op_it = op_tape.find(tensor_it->second);
if (op_it == op_tape.end()) {
return errors::Internal(
"Internal state of the gradient tape is invalid: "
"failed to find operation producing a tensor");
}
bool found = false;
for (int j = 0; j < op_it->second.output_tensor_info.size(); ++j) {
if (op_it->second.output_tensor_info[j].id == id) {
found = true;
(*result)[id].push_back(
vspace.Ones(op_it->second.output_tensor_info[j].shape,
op_it->second.output_tensor_info[j].dtype));
break;
}
} else {
// No record of the target tensor found on the tape, so no gradient
// needs to be computed from it. Do nothing.
}
if (!found) {
return errors::Internal(
"Internal state of the gradient tape is invalid: "
"none of operations outputs match expected tensor");
}
} else {
(*result)[id].push_back(output_gradients[i]);
// No record of the target tensor found on the tape, so no gradient
// needs to be computed from it. Do nothing.
}
} else {
(*result)[id].push_back(output_gradients[i]);
}
}
return Status::OK();
@@ -451,8 +441,7 @@ Status GradientTape<Gradient, BackwardFunction>::ComputeGradient(
InitialStack(state.op_tape, state.op_missing_tensor);
gtl::FlatMap<int64, std::vector<Gradient*>> gradients;
Status s = InitialGradients(vspace, target_tensor_ids, output_gradients,
tensor_tape_, state.op_tape,
state.tensor_usage_counts, &gradients);
tensor_tape_, state.op_tape, &gradients);
auto cleanup = [this, &state]() {
if (!persistent_) {
// Release all backprop functions
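The net effect of the tape.h hunk above is easier to see outside the C++ template machinery: after this change, InitialGradients seeds each target either with the caller-provided output gradient or with a ones tensor matching the op's recorded output, and it no longer consults tensor_usage_counts. A minimal Python sketch of that control flow — tensor_tape, op_tape, and ones_like are stand-ins for the C++ structures, not real APIs:

```python
def initial_gradients(target_ids, output_gradients, tensor_tape, op_tape,
                      ones_like):
    # tensor_tape: tensor id -> producing op id (or -1).
    # op_tape: op id -> record with .output_tensor_info (id, shape, dtype).
    result = {}
    for i, tid in enumerate(target_ids):
        if not output_gradients or output_gradients[i] is None:
            op_id = tensor_tape.get(tid, -1)
            if op_id != -1:
                op = op_tape.get(op_id)
                if op is None:
                    raise RuntimeError(
                        "invalid tape: failed to find producing operation")
                for info in op.output_tensor_info:
                    if info.id == tid:
                        # Seed with ones shaped like the recorded output.
                        result.setdefault(tid, []).append(
                            ones_like(info.shape, info.dtype))
                        break
                else:
                    raise RuntimeError(
                        "invalid tape: no op output matches target tensor")
            # else: tensor never recorded on the tape; nothing to seed.
        else:
            # Caller supplied an explicit output gradient; use it directly.
            result.setdefault(tid, []).append(output_gradients[i])
    return result
```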
12 changes: 6 additions & 6 deletions tensorflow/compiler/aot/compile.cc
@@ -44,7 +44,7 @@ namespace {

// Compiles the XLA computation into executable code.
Status CompileXla(xla::CompileOnlyClient* client,
const xla::Computation& computation,
const xla::XlaComputation& computation,
const xla::cpu::CpuAotCompilationOptions& aot_opts,
CompileResult* compile_result) {
// Retrieves arg and result layouts from the computation.
@@ -62,7 +62,7 @@ Status CompileXla(xla::CompileOnlyClient* client,
for (int i = 0; i < pshape->parameters_size(); ++i) {
arg_layouts.push_back(pshape->mutable_parameters(i));
}
xla::CompileOnlyClient::AotComputationInstance instance;
xla::CompileOnlyClient::AotXlaComputationInstance instance;
instance.computation = &computation;
instance.argument_layouts = std::move(arg_layouts);
instance.result_layout = &pshape->result();
@@ -93,14 +93,14 @@ Status CompileGraph(const GraphDef& graph_def, const tf2xla::Config& config,
xla::CompileOnlyClient* client =
xla::ClientLibrary::GetOrCreateCompileOnlyClient(cpu_platform)
.ValueOrDie();
xla::Computation computation;
xla::XlaComputation computation;
TF_RETURN_IF_ERROR(
ConvertGraphDefToXla(graph_def, config, client, &computation));
if (!flags.out_session_module.empty()) {
TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::SessionModule> module,
TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::HloSnapshot> module,
computation.Snapshot());
// Serialize the SessionModule deterministically so that all the outputs of
// a tf_library genrule are deterministic.
// Serialize the HloSnapshot deterministically so that all the outputs of a
// tf_library genrule are deterministic.
string proto;
TF_RET_CHECK(SerializeToStringDeterministic(*module, &proto));
TF_RETURN_IF_ERROR(
14 changes: 8 additions & 6 deletions tensorflow/compiler/aot/tests/tfcompile_test.cc
@@ -525,14 +525,16 @@ TEST(TFCompileTest, HloProfiling) {
auto header = HasSubstr("Execution profile for");
auto total_cycles_profile_line = HasSubstr("[total]");
auto dot_profile_line = HasSubstr(
"%dot = f32[2,2]{1,0} dot(f32[2,2]{1,0} %arg0, f32[2,2]{1,0} %arg1)");
"%dot.0.2 = f32[2,2]{1,0} dot(f32[2,2]{1,0} %arg0.0.0, f32[2,2]{1,0} "
"%arg1.0.1)");
auto add_profile_line = HasSubstr(
"%add = f32[2,2]{1,0} add(f32[2,2]{1,0} %arg0, f32[2,2]{1,0} %arg1)");
"%add.0.5 = f32[2,2]{1,0} add(f32[2,2]{1,0} %arg0.0.0, f32[2,2]{1,0} "
"%arg1.0.1)");
auto tuple_profile_line = HasSubstr(
"%tuple.2 = (f32[2,2]{1,0}, f32[2,2]{1,0}) tuple(f32[2,2]{1,0} %dot, "
"f32[2,2]{1,0} %add)");
auto arg0_profile_line = HasSubstr("%arg0 = f32[2,2]{1,0} parameter(0)");
auto arg1_profile_line = HasSubstr("%arg1 = f32[2,2]{1,0} parameter(1)");
"%tuple.0.8 = (f32[2,2]{1,0}, f32[2,2]{1,0}) tuple(f32[2,2]{1,0} "
"%dot.0.2, f32[2,2]{1,0} %add.0.5)");
auto arg0_profile_line = HasSubstr("%arg0.0.0 = f32[2,2]{1,0} parameter(0)");
auto arg1_profile_line = HasSubstr("%arg1.0.1 = f32[2,2]{1,0} parameter(1)");

hlo_profile_lines.erase(hlo_profile_lines.begin() + 7,
hlo_profile_lines.end());
64 changes: 64 additions & 0 deletions tensorflow/compiler/tests/reduce_ops_test.py
@@ -19,6 +19,7 @@
from __future__ import print_function

import functools
import itertools
import numpy as np

from tensorflow.compiler.tests.xla_test import XLATestCase
@@ -155,5 +156,68 @@ def testReduceAny(self):
self._testReduction(math_ops.reduce_any, np.any, np.bool, self.BOOL_DATA)


class ReduceOpPrecisionTest(XLATestCase):

def _testReduceSum(self,
expected_result,
dtype,
test_inputs,
rtol=1e-3,
atol=1e-4):
"""Tests reduce sum on a list of input arrays.

For each array in test_inputs, check that performing reduce sum on the array
produces a value that is close to the expected result.

Args:
expected_result: the expected result.
dtype: the data type of the reduce sum operation.
test_inputs: a list of input arrays for the reduce sum operation.
rtol: the relative error.
atol: the absolute error.
"""

for test_input in test_inputs:
with self.test_session() as sess:
with self.test_scope():
a = array_ops.placeholder(dtype)
index = array_ops.placeholder(dtypes.int32)
out = math_ops.reduce_sum(a, index)
result = sess.run(out, {
a: np.array(test_input, dtype=dtype),
index: [0]
})
# Compare the results using float32 type.
self.assertAllClose(
np.float32(result),
np.float32(expected_result),
rtol=rtol,
atol=atol)

def testReduceSumF16(self):
"""Tests the reduce sum of float16 doesn't lose too much precision."""

if np.float16 not in self.all_types:
return

f16_max = np.finfo(np.float16).max
self._testReduceSum(
f16_max, np.float16,
itertools.permutations([f16_max, f16_max, f16_max * (-1.0)], 3))

def testReduceSumBF16(self):
"""Tests the reduce sum of bfloat16 doesn't lose too much precision."""

if dtypes.bfloat16.as_numpy_dtype not in self.all_types:
return

bf16_max = np.float32(dtypes.bfloat16.max)
f32_max = dtypes.float32.max
value = min(bf16_max, f32_max - bf16_max)
self._testReduceSum(
dtypes.bfloat16.as_numpy_dtype(value), dtypes.bfloat16.as_numpy_dtype,
itertools.permutations([bf16_max, value, bf16_max * (-1.0)], 3))


if __name__ == '__main__':
googletest.main()
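The precision issue these new tests guard against can be reproduced with plain NumPy, independent of XLA. A minimal sketch: summing [f16_max, f16_max, -f16_max] left to right in float16 overflows to inf on the first addition, even though the exact result (f16_max) is representable, while a float32 accumulator recovers it.

```python
import numpy as np

f16_max = np.float16(np.finfo(np.float16).max)  # 65504.0
values = np.array([f16_max, f16_max, -f16_max], dtype=np.float16)

naive = np.float16(0)
for v in values:
    naive = np.float16(naive + v)  # float16 accumulator: overflows to inf

wide = np.float16(values.astype(np.float32).sum())  # float32 accumulator

print(naive)  # inf: f16_max + f16_max exceeds the float16 range
print(wide)   # 65504.0 == f16_max
```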
7 changes: 3 additions & 4 deletions tensorflow/compiler/tf2xla/BUILD
@@ -81,7 +81,7 @@ cc_library(
"//tensorflow/compiler/tf2xla/kernels:xla_cpu_only_ops",
"//tensorflow/compiler/tf2xla/kernels:xla_ops",
"//tensorflow/compiler/xla/client",
"//tensorflow/compiler/xla/client:computation",
"//tensorflow/compiler/xla/client/xla_client:xla_computation",
"//tensorflow/core:core_cpu",
"//tensorflow/core:core_cpu_internal",
"//tensorflow/core:framework",
@@ -168,9 +168,9 @@ cc_library(
"//tensorflow/compiler/xla:types",
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:client_library",
"//tensorflow/compiler/xla/client:computation",
"//tensorflow/compiler/xla/client:computation_builder",
"//tensorflow/compiler/xla/client:local_client",
"//tensorflow/compiler/xla/client/xla_client:xla_builder",
"//tensorflow/compiler/xla/client/xla_client:xla_computation",
"//tensorflow/core:core_cpu",
"//tensorflow/core:core_cpu_internal",
"//tensorflow/core:framework",
@@ -215,7 +215,6 @@ cc_library(
visibility = ["//visibility:public"],
deps = [
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla/client:computation_builder",
"//tensorflow/compiler/xla/client:sharding_builder",
"//tensorflow/core:core_cpu",
"//tensorflow/core:core_cpu_internal",
7 changes: 5 additions & 2 deletions tensorflow/compiler/tf2xla/graph_compiler.cc
@@ -51,6 +51,7 @@ Status PrepareArguments(XlaOpKernelContext* ctx, Graph* graph,
const std::vector<const XlaExpression*>& expressions,
std::vector<XlaCompiler::Argument>* args) {
auto builder = ctx->builder();
auto client = ctx->compiler()->client();
std::vector<bool> compile_time_constant_flags(expressions.size());

TF_RETURN_IF_ERROR(
@@ -72,8 +73,10 @@ Status PrepareArguments(XlaOpKernelContext* ctx, Graph* graph,
arg.kind = XlaCompiler::Argument::kConstant;
TF_RET_CHECK(expressions[i]->resource() == nullptr)
<< "Input with resource is not yet implemented.";
TF_ASSIGN_OR_RETURN(auto constant_graph, builder->BuildConstantSubGraph(
expressions[i]->handle()));
TF_ASSIGN_OR_RETURN(auto literal,
builder->ComputeConstant(expressions[i]->handle()));
client->ComputeConstant(constant_graph));
TF_RETURN_IF_ERROR(
LiteralToHostTensor(*literal, arg.type, &arg.constant_value));
} else {
@@ -212,7 +215,7 @@ Status GraphCompiler::CompileFunctionalNode(Node* n,

TF_RET_CHECK(arguments.size() == expressions.size());

std::vector<xla::ComputationDataHandle> handles;
std::vector<xla::XlaOp> handles;
for (int64 i = 0; i < expressions.size(); ++i) {
if (arguments[i].kind == XlaCompiler::Argument::kConstant) {
continue;
8 changes: 4 additions & 4 deletions tensorflow/compiler/tf2xla/kernels/BUILD
@@ -114,8 +114,8 @@ tf_kernel_library(
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:client_library",
"//tensorflow/compiler/xla/client:computation_builder",
"//tensorflow/compiler/xla/client/lib:arithmetic",
"//tensorflow/compiler/xla/client/xla_client:xla_builder",
"//tensorflow/core:framework",
"//tensorflow/core:image_ops_op_lib",
"//tensorflow/core:lib",
@@ -151,7 +151,7 @@ tf_kernel_library(
"//tensorflow/compiler/tf2xla:xla_compiler",
"//tensorflow/compiler/tf2xla/ops:xla_ops",
"//tensorflow/compiler/xla:literal_util",
"//tensorflow/compiler/xla/client:computation_builder",
"//tensorflow/compiler/xla/client/xla_client:xla_builder",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core:protos_all_cc",
@@ -167,7 +167,7 @@ tf_kernel_library(
"//tensorflow/compiler/tf2xla:xla_compiler",
"//tensorflow/compiler/tf2xla/ops:xla_ops",
"//tensorflow/compiler/xla:literal_util",
"//tensorflow/compiler/xla/client:computation_builder",
"//tensorflow/compiler/xla/client/xla_client:xla_builder",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core:protos_all_cc",
@@ -203,8 +203,8 @@ tf_kernel_library(
"//tensorflow/compiler/tf2xla:xla_compiler",
"//tensorflow/compiler/xla:literal_util",
"//tensorflow/compiler/xla/client:client_library",
"//tensorflow/compiler/xla/client:computation_builder",
"//tensorflow/compiler/xla/client/lib:arithmetic",
"//tensorflow/compiler/xla/client/xla_client:xla_builder",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core/kernels:argmax_op",
2 changes: 1 addition & 1 deletion tensorflow/compiler/tf2xla/kernels/aggregate_ops.cc
@@ -29,7 +29,7 @@ class AddNOp : public XlaOpKernel {
OP_REQUIRES(ctx, ctx->num_inputs() >= 1,
errors::InvalidArgument("AddN requires at least one argument"));

xla::ComputationDataHandle sum = ctx->Input(0);
xla::XlaOp sum = ctx->Input(0);
for (int i = 1; i < ctx->num_inputs(); ++i) {
sum = ctx->builder()->Add(sum, ctx->Input(i));
}
18 changes: 9 additions & 9 deletions tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
@@ -48,9 +48,9 @@ class FusedBatchNormOp : public XlaOpKernel {
OP_REQUIRES_OK(ctx,
DataTypeToPrimitiveType(ctx->input_type(1), &scale_type));

xla::ComputationBuilder* builder = ctx->builder();
xla::XlaBuilder* builder = ctx->builder();

xla::ComputationDataHandle input = ctx->Input(0);
xla::XlaOp input = ctx->Input(0);
TensorShape input_shape = ctx->InputShape(0);

int feature_index =
@@ -62,7 +62,7 @@ class FusedBatchNormOp : public XlaOpKernel {
input = builder->ConvertElementType(input, scale_type);

if (is_training_) {
xla::ComputationDataHandle output = builder->BatchNormTraining(
xla::XlaOp output = builder->BatchNormTraining(
input, ctx->Input(1), ctx->Input(2), epsilon_, feature_index);

// In training mode, outputs the normalized value as well as the
@@ -79,7 +79,7 @@ class FusedBatchNormOp : public XlaOpKernel {
ctx->SetOutput(3, builder->GetTupleElement(output, 1));
ctx->SetOutput(4, builder->GetTupleElement(output, 2));
} else {
xla::ComputationDataHandle output = builder->BatchNormInference(
xla::XlaOp output = builder->BatchNormInference(
input, ctx->Input(1), ctx->Input(2), ctx->Input(3), ctx->Input(4),
epsilon_, feature_index);
ctx->SetOutput(0, builder->ConvertElementType(output, input_type));
@@ -118,7 +118,7 @@ class FusedBatchNormGradOp : public XlaOpKernel {
}

void Compile(XlaOpKernelContext* ctx) override {
xla::ComputationBuilder* const b = ctx->builder();
xla::XlaBuilder* const b = ctx->builder();
DataType input_dtype = ctx->input_type(0);
DataType scale_dtype = ctx->input_type(2);

@@ -137,11 +137,11 @@ class FusedBatchNormGradOp : public XlaOpKernel {
const int feature_index =
GetTensorFeatureDimIndex(input_dims, data_format_);

xla::ComputationDataHandle x_backprop;
xla::ComputationDataHandle scale_backprop;
xla::ComputationDataHandle offset_backprop;
xla::XlaOp x_backprop;
xla::XlaOp scale_backprop;
xla::XlaOp offset_backprop;
if (is_training_) {
xla::ComputationDataHandle output =
xla::XlaOp output =
b->BatchNormGrad(activations, scale, mean, var, grad_backprop,
epsilon_, feature_index);

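For context on the tuple this kernel unpacks: XLA's BatchNormTraining yields (normalized output, batch mean, batch variance), which the training branch above routes to the kernel's outputs via GetTupleElement. A NumPy sketch of that contract — a simplified illustration under assumed helper names, not XLA's implementation:

```python
import numpy as np

def batch_norm_training(x, scale, offset, epsilon=1e-3, feature_index=-1):
    """NumPy sketch of BatchNormTraining's (output, mean, var) tuple.

    x: activations; scale/offset: per-feature parameters along
    feature_index. Variance is the biased (batch) variance.
    """
    f = feature_index % x.ndim
    reduce_axes = tuple(i for i in range(x.ndim) if i != f)
    mean = x.mean(axis=reduce_axes)
    var = x.var(axis=reduce_axes)
    # Broadcast per-feature statistics back over the reduced axes.
    shape = [1] * x.ndim
    shape[f] = x.shape[f]
    norm = (x - mean.reshape(shape)) / np.sqrt(var.reshape(shape) + epsilon)
    out = norm * scale.reshape(shape) + offset.reshape(shape)
    return out, mean, var

# Loosely mirrors the three tuple elements consumed in the training branch.
x = np.random.randn(8, 4).astype(np.float32)
out, mean, var = batch_norm_training(
    x, np.ones(4, np.float32), np.zeros(4, np.float32))
```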