Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Branch 195443326 #19090

Merged
merged 34 commits into from
May 5, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit. Hold shift + click to select a range.
9b43bd6
Documentation for tf.contrib.eager.py_func
akshayka May 3, 2018
86d3435
Fix a typo.
May 3, 2018
518dfea
[XLA:CPU] Remove dead function + DCHECK, NFC
May 3, 2018
a4a9e37
Optimize idempotent ops, e.g., Snapshot(Snapshot(x)) => Snapshot(x)
tensorflower-gardener May 3, 2018
05425f2
[TF:XLA] clean up interface to xla::VerifyHloModule
nickdesaulniers May 3, 2018
316e0ba
Add separate get_read and get_updated helpers that work on code excep…
tensorflower-gardener May 3, 2018
28d43e5
Add tflite listed models with accuracy and performance numbers.
zhixianyan May 3, 2018
4f4b15c
Fix bug that disabled loop invariant node motion optimizer. Disable i…
tensorflower-gardener May 3, 2018
f25dd60
Use tuple instead of list to reduce the chance of it being picked by …
May 3, 2018
549d63a
Do not hoist nodes that modify frame info.
tensorflower-gardener May 3, 2018
200f4a2
Fix oom_test so that it doesn't try to allocate a giant host buffer when
May 3, 2018
04d5adb
Fix bugs in LogicalBuffer::ToString and BufferValue::ToProto: these f…
fdxmw May 3, 2018
c9a9280
Adjust worker shutdown hooks for TPUs
rjpower May 3, 2018
4a74a50
Fix flaky test time-outs for dnn_test and rnn_test.
tensorflower-gardener May 3, 2018
213a98d
[XLA] Redesign: deprecate ComputationBuilder.
tensorflower-gardener May 4, 2018
fc7b593
Clear the stat cache of the target when renaming the file.
rxsang May 4, 2018
fa7b5a9
[XLA] Make LocalShapedBuffer::FromLiteral fallible by passing StatusO…
cdleary May 4, 2018
0abbff6
[XLA] Redesign: cleanup client_library_test_base.
tensorflower-gardener May 4, 2018
8ec11ae
Add the MultiWorkerMirroredStrategy
May 4, 2018
da0dcb2
Internal change.
tensorflower-gardener May 4, 2018
0bb55f0
Automated g4 rollback of changelist 194829761
hyouklee May 4, 2018
1284047
* Don't copy on-host and on-device shapes locally.
tensorflower-gardener May 4, 2018
73a1908
Prefer non-nested GradientTape.gradient call when only one source is …
tomhennigan May 4, 2018
c183c56
Fixing some linter errors in TF documentation (Github > GitHub, the t…
tensorflower-gardener May 4, 2018
7a7bbc3
Do not crash on ROOT outfeed operations.
tensorflower-gardener May 4, 2018
34bb664
Fix HloSharding::GetSubSharding to return correct array shardings
tensorflower-gardener May 4, 2018
2d6170f
[XLA] Remove template keyword on non-template methods.
d0k May 4, 2018
3db0e54
Change RecvTensor RPC implementation to use DeviceContext::CopyDevice…
hawkinsp May 4, 2018
a5f44b3
Implement neg op
alanchiao May 4, 2018
47f1bd9
TFTS: Make it easier to swap in different autoregressive models.
allenlavoie May 4, 2018
e32c42a
Improve broadcast add implementation.
tensorflower-gardener May 4, 2018
09d0e30
Internal clean up: change scanf to use int64_t instead of int64
tensorflower-gardener May 4, 2018
01a70dc
Add operations before Identity operations should be quantized.
May 4, 2018
9a48796
Merge commit for internal changes
caisq May 4, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
25 changes: 25 additions & 0 deletions tensorflow/compiler/jit/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,31 @@ tf_cc_test(
],
)

# C++ test target for xla_launch_util; srcs contains microbenchmarks for
# internal::ExtractSubShapedBuffer (see xla_launch_util_test.cc).
tf_cc_test(
name = "xla_launch_util_test",
size = "small",
srcs = ["xla_launch_util_test.cc"],
deps = [
":common",
":xla_compilation_cache",
":xla_launch_util",
":xla_tensor",
"//tensorflow/compiler/tf2xla:common",
"//tensorflow/compiler/tf2xla:xla_compiler",
"//tensorflow/compiler/xla:statusor",
"//tensorflow/compiler/xla/client:client_library",
"//tensorflow/compiler/xla/client:local_client",
"//tensorflow/core:core_cpu_internal",
"//tensorflow/core:framework",
"//tensorflow/core:gpu_runtime",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"//tensorflow/core:protos_all_cc",
"//tensorflow/core:test",
"//tensorflow/core/kernels:variable_ops",
],
)

# This target can be used by XLA device plugins to prevent circular dependencies, and provides access to all of the required headers for building a device library.
cc_header_only_library(
name = "xla_jit_headers_lib",
Expand Down
22 changes: 13 additions & 9 deletions tensorflow/compiler/jit/xla_launch_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -77,16 +77,16 @@ Status XlaAllocator::Deallocate(int device_ordinal, se::DeviceMemoryBase* mem) {
return Status::OK();
}

namespace {
namespace internal {
// Return the 'index''th subtree of the given ShapedBuffer as a
// ScopedShapedBuffer. The returned ScopedShapedBuffer takes ownership of the
// subtree, and sets the input's buffer pointers to nullptr for the subtree.
ScopedShapedBuffer ExtractSubShapedBuffer(
ShapedBuffer* shaped_buffer, int index,
xla::DeviceMemoryAllocator* allocator) {
xla::Shape on_host_shape = xla::ShapeUtil::GetTupleElementShape(
const xla::Shape& on_host_shape = xla::ShapeUtil::GetTupleElementShape(
shaped_buffer->on_host_shape(), index);
xla::Shape on_device_shape = xla::ShapeUtil::GetTupleElementShape(
const xla::Shape& on_device_shape = xla::ShapeUtil::GetTupleElementShape(
shaped_buffer->on_device_shape(), index);

ShapedBuffer sub_shaped_buffer(on_host_shape, on_device_shape,
Expand All @@ -98,14 +98,18 @@ ScopedShapedBuffer ExtractSubShapedBuffer(
sub_shape_tree.CopySubtreeFrom(shape_tree,
/*source_base_index=*/{index},
/*target_base_index=*/{});
for (auto& index_to_buffer : shape_tree) {
if (!index_to_buffer.first.empty() && index_to_buffer.first[0] == index) {
index_to_buffer.second = se::DeviceMemoryBase(nullptr, 0);
}
}
shape_tree.ForEachMutableElement(
[index](const xla::ShapeIndex& shape_index,
tensorflow::se::DeviceMemoryBase* data) {
// shape_index is empty for the root node. Ignore that.
if (!shape_index.empty() && shape_index[0] == index) {
*data = tensorflow::se::DeviceMemoryBase(nullptr, 0);
}
});
return ScopedShapedBuffer(std::move(sub_shaped_buffer), allocator);
}
} // namespace
} // namespace internal
using internal::ExtractSubShapedBuffer;

XlaComputationLaunchContext::XlaComputationLaunchContext(
int64 num_resource_args, xla::LocalClient* client,
Expand Down
11 changes: 11 additions & 0 deletions tensorflow/compiler/jit/xla_launch_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,17 @@ class XlaTensorBuffer : public TensorBuffer {
Allocator* allocator_;
};

// Exposed in this header file for microbenchmarking purposes, but this is an
// internal implementation detail.
namespace internal {
// Return the 'index''th subtree of the given ShapedBuffer as a
// ScopedShapedBuffer. The returned ScopedShapedBuffer takes ownership of the
// subtree, and sets the input's buffer pointers to nullptr for the subtree.
xla::ScopedShapedBuffer ExtractSubShapedBuffer(
xla::ShapedBuffer* shaped_buffer, int index,
xla::DeviceMemoryAllocator* allocator);
} // namespace internal

} // namespace tensorflow

#endif
64 changes: 64 additions & 0 deletions tensorflow/compiler/jit/xla_launch_util_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Contains microbenchmarks for performance critical functions in
// xla_launch_util.cc.

#include "tensorflow/compiler/jit/xla_launch_util.h"
#include "tensorflow/compiler/tf2xla/shape_util.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/test_benchmark.h"

// Test ExtractSubBuffer with different depths (depth of ShapeTree) and fan-outs
// (cardinality of each non-leaf node's children).
void BM_ExtractSubBuffer(int iters, int depth, int fan_out) {
tensorflow::testing::StopTiming();
xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {32, 64, 128});
for (int i = 0; i < depth; ++i) {
std::vector<xla::Shape> shapes(fan_out, shape);
shape = xla::ShapeUtil::MakeTupleShape(shapes);
}
xla::ShapedBuffer shaped_buffer(shape, shape, /*platform=*/nullptr,
/*device_ordinal=*/0);
tensorflow::testing::StartTiming();
for (int i = 0; i < iters; ++i) {
// Extract a buffer from approximately the middle of the first level of the
// tree.
tensorflow::internal::ExtractSubShapedBuffer(&shaped_buffer,
/*index=*/fan_out / 2,
/*allocator=*/nullptr)
.release();
}
}

// Register the benchmark over (depth, fan_out) argument pairs: depth 1 with
// fan-outs from 4 to 512, and depth 2 with fan-outs from 4 to 128.
BENCHMARK(BM_ExtractSubBuffer)
->ArgPair(1, 4)
->ArgPair(1, 8)
->ArgPair(1, 32)
->ArgPair(1, 64)
->ArgPair(1, 128)
->ArgPair(1, 256)
->ArgPair(1, 512)
->ArgPair(2, 4)
->ArgPair(2, 8)
->ArgPair(2, 32)
->ArgPair(2, 64)
->ArgPair(2, 128);

// Entry point: initializes googletest, runs the registered benchmarks, then
// runs the gtest tests (none are defined in this file, so RUN_ALL_TESTS
// only reports the exit status).
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
tensorflow::testing::RunBenchmarks();
return RUN_ALL_TESTS();
}
29 changes: 22 additions & 7 deletions tensorflow/compiler/tests/oom_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import errors
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.platform import googletest


Expand All @@ -42,20 +44,33 @@ def testOutputOutOfMemory(self):
"""

def test_loop():
size = 2e8
size = int(2e8)
while True:
with self.test_session():
# Force the compiled code to not be constant by feeding in an addend.
p = array_ops.placeholder(dtypes.float32, shape=[])
# Force the compiled code to not be constant by feeding in a
# parameter.
p = array_ops.placeholder(dtypes.float32, shape=[2, 1, 1])
with self.test_scope():
# Create a large R1 tensor.
c = array_ops.zeros([size, 1]) + p
# Create a computation that produces a large R1 tensor as an
# intermediate result. Reduce it down so that if this file was
# compiled without --config=cuda, we don't force a D2H copy of a
# large tensor and potentially OOM the host.
#
# This is a bit tricky because XLA:GPU doesn't currently support RNG
# ops. Here we rely on the fact that XLA doesn't do algebraic
# simplifications on conv(<ones>, <filter>).
c = math_ops.reduce_sum(
nn_ops.convolution(
array_ops.ones([1, size, 1]),
p,
padding='SAME',
data_format='NWC'))

c.eval(feed_dict={p: 1.0})
c.eval(feed_dict={p: [[[1.0]], [[2.0]]]})
size *= 2

self.assertRaises(errors.ResourceExhaustedError, test_loop)


if __name__ == "__main__":
if __name__ == '__main__':
googletest.main()
2 changes: 2 additions & 0 deletions tensorflow/compiler/xla/client/computation.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ namespace xla {
// Wraps a ComputationHandle protobuf with a lifetime. Computation is
// movable and not copyable to capture the same kind of unique
// ownership that std::unique_ptr represents.
//
// TODO(b/74197823): Deprecated. Use XlaComputation instead.
class Computation {
public:
// Creates a null Computation.
Expand Down
2 changes: 2 additions & 0 deletions tensorflow/compiler/xla/client/computation_builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ namespace xla {
// deferred from being handled until Build() is called.
//
// Thread-compatible.
//
// TODO(b/74197823): Deprecated. Use XlaBuilder instead.
class ComputationBuilder {
public:
// client: client in which to build the computation.
Expand Down
5 changes: 1 addition & 4 deletions tensorflow/compiler/xla/client/lib/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@ cc_library(
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:types",
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:computation",
"//tensorflow/compiler/xla/client:computation_builder",
"//tensorflow/compiler/xla/client/xla_client:xla_builder",
"//tensorflow/compiler/xla/client/xla_client:xla_computation",
"//tensorflow/core:lib",
Expand All @@ -43,9 +41,8 @@ cc_library(
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client",
"//tensorflow/compiler/xla/client:computation",
"//tensorflow/compiler/xla/client:computation_builder",
"//tensorflow/compiler/xla/client:global_data",
"//tensorflow/compiler/xla/client/xla_client:xla_builder",
"//tensorflow/compiler/xla/client/xla_client:xla_computation",
"//tensorflow/compiler/xla/tests:test_utils",
"//tensorflow/core:lib",
Expand Down
90 changes: 2 additions & 88 deletions tensorflow/compiler/xla/client/lib/arithmetic.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ limitations under the License.

#include <string>

#include "tensorflow/compiler/xla/client/computation_builder.h"
#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/types.h"
Expand All @@ -27,28 +28,6 @@ limitations under the License.
namespace xla {
namespace {

using InstructionGenerator =
ComputationDataHandle (*)(ComputationBuilder*, const ComputationDataHandle&,
const ComputationDataHandle&);

Computation CreateScalarComputation(const string& name, PrimitiveType type,
ComputationBuilder* builder,
InstructionGenerator generator) {
std::unique_ptr<ComputationBuilder> b;
if (type == PRED) {
b = builder->CreateSubBuilder(name);
} else {
b = builder->CreateSubBuilder(
tensorflow::strings::StrCat(name, "_", PrimitiveType_Name(type)));
}

const Shape scalar = ShapeUtil::MakeShape(type, {});
auto lhs = b->Parameter(0, scalar, "lhs");
auto rhs = b->Parameter(1, scalar, "rhs");
generator(b.get(), lhs, rhs);
return b->BuildAndNoteError();
}

using XlaOpGenerator = XlaOp (*)(XlaBuilder*, const XlaOp&, const XlaOp&);

XlaComputation CreateScalarComputation(const string& name, PrimitiveType type,
Expand All @@ -71,71 +50,6 @@ XlaComputation CreateScalarComputation(const string& name, PrimitiveType type,

} // namespace

Computation CreateScalarAddComputation(PrimitiveType type,
ComputationBuilder* builder) {
return CreateScalarComputation(
"add", type, builder,
[](ComputationBuilder* b, const ComputationDataHandle& lhs,
const ComputationDataHandle& rhs) { return b->Add(lhs, rhs); });
}

Computation CreateScalarMultiplyComputation(PrimitiveType type,
ComputationBuilder* builder) {
return CreateScalarComputation(
"mul", type, builder,
[](ComputationBuilder* b, const ComputationDataHandle& lhs,
const ComputationDataHandle& rhs) { return b->Mul(lhs, rhs); });
}

Computation CreateScalarGeComputation(PrimitiveType type,
ComputationBuilder* builder) {
return CreateScalarComputation(
"ge", type, builder,
[](ComputationBuilder* b, const ComputationDataHandle& lhs,
const ComputationDataHandle& rhs) { return b->Ge(lhs, rhs); });
}

Computation CreateScalarMaxComputation(PrimitiveType type,
ComputationBuilder* builder) {
return CreateScalarComputation(
"max", type, builder,
[](ComputationBuilder* b, const ComputationDataHandle& lhs,
const ComputationDataHandle& rhs) { return b->Max(lhs, rhs); });
}

Computation CreateScalarMinComputation(PrimitiveType type,
ComputationBuilder* builder) {
return CreateScalarComputation(
"min", type, builder,
[](ComputationBuilder* b, const ComputationDataHandle& lhs,
const ComputationDataHandle& rhs) { return b->Min(lhs, rhs); });
}

Computation CreateScalarAndComputation(ComputationBuilder* builder) {
return CreateScalarComputation(
"and", PRED, builder,
[](ComputationBuilder* b, const ComputationDataHandle& lhs,
const ComputationDataHandle& rhs) { return b->And(lhs, rhs); });
}

Computation CreateScalarOrComputation(ComputationBuilder* builder) {
return CreateScalarComputation(
"or", PRED, builder,
[](ComputationBuilder* b, const ComputationDataHandle& lhs,
const ComputationDataHandle& rhs) { return b->Or(lhs, rhs); });
}

StatusOr<ComputationDataHandle> Any(const ComputationDataHandle& predicates,
ComputationBuilder* builder) {
auto f = builder->ConstantR0<bool>(false);
Computation logical_or = CreateScalarOrComputation(builder);
TF_ASSIGN_OR_RETURN(std::unique_ptr<Shape> predicates_shape,
builder->GetShape(predicates));
std::vector<int64> all_dimensions(ShapeUtil::Rank(*predicates_shape));
std::iota(all_dimensions.begin(), all_dimensions.end(), 0);
return builder->Reduce(predicates, f, logical_or, all_dimensions);
}

XlaComputation CreateScalarAddComputation(PrimitiveType type,
XlaBuilder* builder) {
return CreateScalarComputation(
Expand Down