[PJRT] Enable layering_check in the Bazel BUILD.
PiperOrigin-RevId: 446190811
hawkinsp authored and tensorflower-gardener committed May 3, 2022
1 parent bedee44 commit f679cb4
Showing 4 changed files with 156 additions and 68 deletions.
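
For context: layering_check is a Bazel feature (supported by Clang-based C++ toolchains) that verifies every header a target #includes is provided by one of its direct deps, rather than picked up transitively. This package previously opted out via features = ["-layering_check"]; deleting that opt-out re-enables the check, which is why the hunks below add so many explicit deps. A minimal sketch of the toggle, with a hypothetical target name, assuming layering_check is enabled globally (for example in the toolchain, or via --features=layering_check on the command line):

# BUILD sketch (hypothetical "uses_span" target; not part of this commit).
package(
    default_visibility = ["//tensorflow:internal"],
    # Before this commit, features = ["-layering_check"] disabled the check
    # for the whole package. Removing that line restores the global default.
    licenses = ["notice"],
)

cc_library(
    name = "uses_span",
    srcs = ["uses_span.cc"],
    deps = [
        # Required under layering_check because uses_span.cc directly
        # includes "absl/types/span.h", even if that header is already
        # reachable through some other dependency.
        "@com_google_absl//absl/types:span",
    ],
)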
40 changes: 32 additions & 8 deletions tensorflow/compiler/xla/pjrt/BUILD
@@ -6,9 +6,6 @@ load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")

package(
default_visibility = ["//tensorflow:internal"],
features = [
"-layering_check",
],
licenses = ["notice"],
)

@@ -117,10 +114,15 @@ tf_cc_test(
srcs = ["cpu_device_test.cc"],
deps = [
":cpu_device",
":pjrt_client",
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:test",
"//tensorflow/compiler/xla/client:executable_build_options",
"//tensorflow/compiler/xla/client:xla_builder",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/core:test_main",
"//tensorflow/core/platform:random",
"@com_google_absl//absl/types:span",
],
)

@@ -213,6 +215,7 @@ cc_library(
":metrics",
":mlir_to_hlo",
":pjrt_client",
":pjrt_future",
":tracked_device_buffer",
":transpose",
":utils",
@@ -275,6 +278,7 @@ cc_library(
":local_device_state",
":pjrt_stream_executor_client",
":tracked_device_buffer",
":utils",
"//tensorflow/compiler/xla:shape_util",
"//tensorflow/compiler/xla:status",
"//tensorflow/compiler/xla:statusor",
@@ -356,19 +360,33 @@ cc_library(
"//tensorflow/stream_executor:device_memory",
"//tensorflow/stream_executor:tf_allocator_adapter",
] + if_cuda([
":nccl_id_store",
"@local_config_cuda//cuda:cuda_headers",
"//tensorflow/stream_executor/cuda:cuda_activation_header",
]) + if_rocm([
"@local_config_rocm//rocm:rocm_headers",
]) + if_nccl([":nccl_plugin"]),
]),
)

# We actually wish we could write if_cuda(if_nccl(...)) in :gpu_device,
# but Bazel does not allow nested selects. We can work around the problem using
# an intermediate library.
# an intermediate library that has the conditional NCCL pieces that is only
# itself included as a dependency if CUDA is enabled.
cc_library(
name = "nccl_plugin",
defines = if_cuda(["NCCL_ENABLED=1"]),
deps = if_cuda(["@local_config_nccl//:nccl"]),
name = "nccl_id_store",
srcs = ["nccl_id_store.cc"],
hdrs = ["nccl_id_store.h"],
defines = if_nccl(["NCCL_ENABLED=1"]),
deps = [
"@com_google_absl//absl/base:core_headers",
"@com_google_absl//absl/container:flat_hash_map",
"@com_google_absl//absl/synchronization",
"//tensorflow/compiler/xla/pjrt/distributed:client",
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla:statusor",
"//tensorflow/compiler/xla/service:global_device_id",
"//tensorflow/compiler/xla/service/gpu:gpu_executable_run_options",
] + if_nccl(["@local_config_nccl//:nccl"]),
)
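
The comment above this target explains the constraint; as a generic, hedged sketch of the same workaround (hypothetical names, not part of this commit), flattening the two conditions through an intermediate library looks like this:

# if_cuda and if_nccl are the select() macros loaded at the top of this
# BUILD file. select() values cannot nest, so if_cuda(if_nccl([...])) is
# rejected; an intermediate target carries the inner condition instead.
cc_library(
    name = "nccl_bits",  # hypothetical: holds all NCCL-conditional pieces
    defines = if_nccl(["NCCL_ENABLED=1"]),
    deps = if_nccl(["@local_config_nccl//:nccl"]),
)

cc_library(
    name = "gpu_thing",  # hypothetical: pulls in :nccl_bits only under CUDA
    deps = if_cuda([":nccl_bits"]),
)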

tf_cc_test(
@@ -392,6 +410,7 @@ tf_cc_test(
"//tensorflow/core:lib",
"//tensorflow/core:test_main",
"//tensorflow/core/platform:random",
"@com_google_absl//absl/types:span",
],
)

@@ -413,6 +432,7 @@ cc_library(
"@llvm-project//mlir:IR",
"@llvm-project//mlir:Parser",
"@llvm-project//mlir:Pass",
"@llvm-project//mlir:Transforms",
],
)

@@ -437,9 +457,11 @@ cc_library(
deps = [
"//tensorflow/compiler/xla:cpu_function_runtime",
"//tensorflow/compiler/xla:shape_util",
"//tensorflow/compiler/xla:util",
"//tensorflow/core/platform:platform_port",
"@com_google_absl//absl/base",
"@com_google_absl//absl/container:inlined_vector",
"@com_google_absl//absl/synchronization",
"@com_google_absl//absl/types:span",
"@tf_runtime//:hostcontext",
],
@@ -552,8 +574,10 @@ tf_cc_test(
"//tensorflow/compiler/xla:util",
"//tensorflow/core:test_main",
"//tensorflow/core/platform:env",
"//tensorflow/core/platform:test_benchmark",
"//tensorflow/core/protobuf:error_codes_proto_impl_cc",
"//third_party/eigen3",
"@com_google_absl//absl/container:inlined_vector",
"@com_google_absl//absl/numeric:int128",
],
)
63 changes: 3 additions & 60 deletions tensorflow/compiler/xla/pjrt/gpu_device.cc
@@ -27,16 +27,14 @@ limitations under the License.
#ifdef GOOGLE_CUDA
#include "third_party/gpus/cuda/include/cuda.h"
#include "third_party/gpus/cuda/include/cuda_runtime_api.h"
#include "tensorflow/compiler/xla/pjrt/nccl_id_store.h"
#include "tensorflow/stream_executor/cuda/cuda_activation.h"
#endif // GOOGLE_CUDA

#ifdef TENSORFLOW_USE_ROCM
#include "rocm/rocm_config.h"
#endif // TENSORFLOW_USE_ROCM

#ifdef NCCL_ENABLED
#include "third_party/nccl/nccl.h"
#endif // NCCL_ENABLED
#include "tensorflow/compiler/xla/client/client_library.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h"
#include "tensorflow/compiler/xla/service/platform_util.h"
@@ -368,63 +366,6 @@ std::unique_ptr<tensorflow::BFCAllocator> GetGpuHostAllocator(
/*name=*/"xla_gpu_host_bfc", opts);
}

// A table mapping NcclCliqueKeys to ncclUniqueId values encoded as strings.
// In a distributed setup the table of NCCL IDs is kept on the master node
// (node 0). The node of the first participating device will create the unique
// id.
class NcclIdStore {
public:
NcclIdStore(int node_id, std::shared_ptr<DistributedRuntimeClient> client,
absl::flat_hash_map<GlobalDeviceId, int> device_to_node)
: node_id_(node_id),
client_(std::move(client)),
device_to_node_(std::move(device_to_node)) {}

StatusOr<std::string> GetNcclUniqueId(const gpu::NcclCliqueKey& key);

private:
const int node_id_;
const std::shared_ptr<DistributedRuntimeClient> client_;
const absl::flat_hash_map<GlobalDeviceId, int> device_to_node_;

absl::Mutex mu_;
absl::flat_hash_map<gpu::NcclCliqueKey, std::string> cache_
ABSL_GUARDED_BY(mu_);
};

StatusOr<std::string> NcclIdStore::GetNcclUniqueId(
const gpu::NcclCliqueKey& key) {
// The caller must ensure that threads calling this method concurrently have
// unique keys, otherwise the global key-value store may hold the wrong value.
{
absl::MutexLock lock(&mu_);
auto it = cache_.find(key);
if (it != cache_.end()) {
return it->second;
}
}
std::string id_string;
int primary_node_id = device_to_node_.at(key.devices()[0]);
if (node_id_ == primary_node_id) {
#ifdef NCCL_ENABLED
ncclUniqueId id;
ncclResult_t r = ncclGetUniqueId(&id);
TF_RET_CHECK(r == ncclSuccess);
id_string = std::string(id.internal, NCCL_UNIQUE_ID_BYTES);
TF_RETURN_IF_ERROR(client_->KeyValueSet(key.ToString(), id_string));
#else
return FailedPrecondition("NCCL support was not built into XLA binary.");
#endif
} else {
TF_ASSIGN_OR_RETURN(id_string, client_->BlockingKeyValueGet(
key.ToString(), absl::Minutes(5)));
}
absl::MutexLock lock(&mu_);
auto result = cache_.emplace(key, std::move(id_string));
TF_RET_CHECK(result.second) << "Unique ID already in cache.";
return result.first->second;
}

std::vector<std::unique_ptr<PjRtStreamExecutorDevice>> BuildLocalDevices(
std::vector<std::unique_ptr<LocalDeviceState>> local_device_states) {
std::vector<std::unique_ptr<PjRtStreamExecutorDevice>> devices;
@@ -493,12 +434,14 @@ Status BuildDistributedDevices(
}
gpu_executable_run_options->set_gpu_global_device_ids(
std::move(gpu_device_ids));
#ifdef GOOGLE_CUDA
auto nccl_id_store = std::make_shared<NcclIdStore>(
node_id, distributed_client, device_to_node);
gpu_executable_run_options->set_nccl_unique_id_callback(
[nccl_id_store](const gpu::NcclCliqueKey& key) {
return nccl_id_store->GetNcclUniqueId(key);
});
#endif // GOOGLE_CUDA
return Status::OK();
}

62 changes: 62 additions & 0 deletions tensorflow/compiler/xla/pjrt/nccl_id_store.cc
@@ -0,0 +1,62 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/pjrt/nccl_id_store.h"

#include <string>
#include <utility>

#ifdef NCCL_ENABLED
#include "third_party/nccl/nccl.h"
#endif // NCCL_ENABLED

#include "tensorflow/compiler/xla/util.h"

namespace xla {

StatusOr<std::string> NcclIdStore::GetNcclUniqueId(
const gpu::NcclCliqueKey& key) {
// The caller must ensure that threads calling this method concurrently have
// unique keys, otherwise the global key-value store may hold the wrong value.
{
absl::MutexLock lock(&mu_);
auto it = cache_.find(key);
if (it != cache_.end()) {
return it->second;
}
}
std::string id_string;
int primary_node_id = device_to_node_.at(key.devices()[0]);
if (node_id_ == primary_node_id) {
#ifdef NCCL_ENABLED
ncclUniqueId id;
ncclResult_t r = ncclGetUniqueId(&id);
TF_RET_CHECK(r == ncclSuccess);
id_string = std::string(id.internal, NCCL_UNIQUE_ID_BYTES);
TF_RETURN_IF_ERROR(client_->KeyValueSet(key.ToString(), id_string));
#else
return FailedPrecondition("NCCL support was not built into XLA binary.");
#endif
} else {
TF_ASSIGN_OR_RETURN(id_string, client_->BlockingKeyValueGet(
key.ToString(), absl::Minutes(5)));
}
absl::MutexLock lock(&mu_);
auto result = cache_.emplace(key, std::move(id_string));
TF_RET_CHECK(result.second) << "Unique ID already in cache.";
return result.first->second;
}

} // namespace xla
59 changes: 59 additions & 0 deletions tensorflow/compiler/xla/pjrt/nccl_id_store.h
@@ -0,0 +1,59 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_PJRT_NCCL_ID_STORE_H_
#define TENSORFLOW_COMPILER_XLA_PJRT_NCCL_ID_STORE_H_

#include <memory>
#include <utility>
#include <string>

#include "absl/base/attributes.h"
#include "absl/container/flat_hash_map.h"
#include "absl/synchronization/mutex.h"
#include "tensorflow/compiler/xla/pjrt/distributed/client.h"
#include "tensorflow/compiler/xla/service/global_device_id.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h"
#include "tensorflow/compiler/xla/statusor.h"

namespace xla {

// A table mapping NcclCliqueKeys to ncclUniqueId values encoded as strings.
// In a distributed setup the table of NCCL IDs is kept on the master node
// (node 0). The node of the first participating device will create the unique
// id.
class NcclIdStore {
public:
NcclIdStore(int node_id, std::shared_ptr<DistributedRuntimeClient> client,
absl::flat_hash_map<GlobalDeviceId, int> device_to_node)
: node_id_(node_id),
client_(std::move(client)),
device_to_node_(std::move(device_to_node)) {}

StatusOr<std::string> GetNcclUniqueId(const gpu::NcclCliqueKey& key);

private:
const int node_id_;
const std::shared_ptr<DistributedRuntimeClient> client_;
const absl::flat_hash_map<GlobalDeviceId, int> device_to_node_;

absl::Mutex mu_;
absl::flat_hash_map<gpu::NcclCliqueKey, std::string> cache_
ABSL_GUARDED_BY(mu_);
};

} // namespace xla

#endif // TENSORFLOW_COMPILER_XLA_PJRT_NCCL_ID_STORE_H_
