Merge branch 'master' into multistream-streammerge
buptzyb committed Apr 12, 2024
2 parents efe56d7 + 8479d5f commit 5aabb58
Showing 1,487 changed files with 43,669 additions and 21,660 deletions.
29 changes: 17 additions & 12 deletions .bazelrc
@@ -253,7 +253,7 @@ build:cuda_clang_official --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-12.3"
build:cuda_clang_official --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc"
build:cuda_clang_official --action_env=CLANG_CUDA_COMPILER_PATH="/usr/lib/llvm-17/bin/clang"
build:cuda_clang_official --action_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
build:cuda_clang_official --crosstool_top="@sigbuild-r2.16-clang_config_cuda//crosstool:toolchain"
build:cuda_clang_official --crosstool_top="@sigbuild-r2.17-clang_config_cuda//crosstool:toolchain"

# Build with nvcc for CUDA and clang for host
build:nvcc_clang --config=cuda
@@ -293,6 +293,11 @@ build:rocm --define=using_rocm_hipcc=true
build:rocm --define=tensorflow_mkldnn_contraction_kernel=0
build:rocm --repo_env TF_NEED_ROCM=1

build:sycl --crosstool_top=@local_config_sycl//crosstool:toolchain
build:sycl --define=using_sycl=true
build:sycl --define=tensorflow_mkldnn_contraction_kernel=0
build:sycl --repo_env TF_NEED_SYCL=1

# Options to disable default on features
build:noaws --define=no_aws_support=true
build:nogcp --define=no_gcp_support=true
@@ -497,12 +502,12 @@ build:rbe_linux --host_linkopt=-lm

build:rbe_linux_cpu --config=rbe_linux
# Linux cpu and cuda builds share the same toolchain now.
build:rbe_linux_cpu --host_crosstool_top="@sigbuild-r2.16-clang_config_cuda//crosstool:toolchain"
build:rbe_linux_cpu --crosstool_top="@sigbuild-r2.16-clang_config_cuda//crosstool:toolchain"
build:rbe_linux_cpu --extra_toolchains="@sigbuild-r2.16-clang_config_cuda//crosstool:toolchain-linux-x86_64"
build:rbe_linux_cpu --extra_execution_platforms="@sigbuild-r2.16-clang_config_platform//:platform"
build:rbe_linux_cpu --host_platform="@sigbuild-r2.16-clang_config_platform//:platform"
build:rbe_linux_cpu --platforms="@sigbuild-r2.16-clang_config_platform//:platform"
build:rbe_linux_cpu --host_crosstool_top="@sigbuild-r2.17-clang_config_cuda//crosstool:toolchain"
build:rbe_linux_cpu --crosstool_top="@sigbuild-r2.17-clang_config_cuda//crosstool:toolchain"
build:rbe_linux_cpu --extra_toolchains="@sigbuild-r2.17-clang_config_cuda//crosstool:toolchain-linux-x86_64"
build:rbe_linux_cpu --extra_execution_platforms="@sigbuild-r2.17-clang_config_platform//:platform"
build:rbe_linux_cpu --host_platform="@sigbuild-r2.17-clang_config_platform//:platform"
build:rbe_linux_cpu --platforms="@sigbuild-r2.17-clang_config_platform//:platform"
# This is needed for all Clang17 builds but must not be present in GCC builds.
build:rbe_linux_cpu --copt=-Wno-error=unused-command-line-argument
# This was added in clang-16 by https://reviews.llvm.org/D133574.
@@ -511,7 +516,7 @@ build:rbe_linux_cpu --copt=-Wno-error=unused-command-line-argument
# See https://github.com/protocolbuffers/upb/blob/9effcbcb27f0a665f9f345030188c0b291e32482/upb/upb.c#L183.
build:rbe_linux_cpu --copt=-Wno-gnu-offsetof-extensions
# Python config is the same across all containers because the binary is the same
build:rbe_linux_cpu --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.16-clang_config_python"
build:rbe_linux_cpu --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.17-clang_config_python"
build:rbe_linux_cpu --python_path="/usr/bin/python3"
# These you may need to change for your own GCP project.
common:rbe_linux_cpu --remote_instance_name=projects/tensorflow-testing/instances/default_instance
@@ -532,9 +537,9 @@ build:rbe_linux_cuda --config=cuda_clang_official
build:rbe_linux_cuda --config=rbe_linux_cpu
# For Remote build execution -- GPU configuration
build:rbe_linux_cuda --repo_env=REMOTE_GPU_TESTING=1
build:rbe_linux_cuda --repo_env=TF_CUDA_CONFIG_REPO="@sigbuild-r2.16-clang_config_cuda"
build:rbe_linux_cuda --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.16-clang_config_tensorrt"
build:rbe_linux_cuda --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.16-clang_config_nccl"
build:rbe_linux_cuda --repo_env=TF_CUDA_CONFIG_REPO="@sigbuild-r2.17-clang_config_cuda"
build:rbe_linux_cuda --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.17-clang_config_tensorrt"
build:rbe_linux_cuda --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.17-clang_config_nccl"
test:rbe_linux_cuda --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"

build:rbe_linux_cuda_nvcc --config=rbe_linux_cuda
@@ -639,7 +644,7 @@ test:release_linux_base --test_summary=short

# Use the Clang toolchain to compile
build:release_cpu_linux --config=release_linux_base
build:release_cpu_linux --crosstool_top="@sigbuild-r2.16-clang_config_cuda//crosstool:toolchain"
build:release_cpu_linux --crosstool_top="@sigbuild-r2.17-clang_config_cuda//crosstool:toolchain"

build:release_gpu_linux --config=release_cpu_linux
# Set up compilation CUDA version and paths and use the CUDA Clang toolchain.
12 changes: 12 additions & 0 deletions .github/workflows/update-rbe.yml
@@ -117,6 +117,18 @@ jobs:
map sigbuild-r2.16-clang-python3.10 2.16-python3.10
map sigbuild-r2.16-clang-python3.11 2.16-python3.11
map sigbuild-r2.16-clang-python3.12 2.16-python3.12
# TF 2.17
map sigbuild-r2.17 2.17-python3.11
map sigbuild-r2.17-python3.9 2.17-python3.9
map sigbuild-r2.17-python3.10 2.17-python3.10
map sigbuild-r2.17-python3.11 2.17-python3.11
map sigbuild-r2.17-python3.12 2.17-python3.12
# TF 2.17 + Clang (containers are the same, but env vars in configs.bzl are different)
map sigbuild-r2.17-clang 2.17-python3.11
map sigbuild-r2.17-clang-python3.9 2.17-python3.9
map sigbuild-r2.17-clang-python3.10 2.17-python3.10
map sigbuild-r2.17-clang-python3.11 2.17-python3.11
map sigbuild-r2.17-clang-python3.12 2.17-python3.12
- name: Create Pull Request with changes
uses: peter-evans/create-pull-request@2b011faafdcbc9ceb11414d64d0573f37c774b04 # v4.2.3
with:
16 changes: 16 additions & 0 deletions RELEASE.md
@@ -59,6 +59,15 @@
built with support for a given CPU target. This can be useful for skipping
target-specific tests if a target is not supported.

* `tf.data`
    * Support `tf.data.experimental.distributed_save`. `distributed_save` uses
      the tf.data service
      (https://www.tensorflow.org/api_docs/python/tf/data/experimental/service)
      to write distributed dataset snapshots. The call is non-blocking and
      returns without waiting for the snapshot to finish. Passing `wait=True`
      to `tf.data.Dataset.load` allows the snapshots to be read while they are
      still being written; see the sketch below.
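
      For illustration, a minimal write-side sketch. The dispatcher address and
      snapshot path are hypothetical, and the call assumes `distributed_save`
      takes the dataset, a snapshot directory, and the tf.data service
      dispatcher address, as described above:

      ```python
      import tensorflow as tf

      # Hypothetical values for illustration only.
      dispatcher_address = "grpc://localhost:5000"
      snapshot_path = "/tmp/range_snapshot"

      dataset = tf.data.Dataset.range(100)

      # Kicks off a distributed snapshot via the tf.data service and returns
      # immediately, without waiting for the snapshot to finish.
      tf.data.experimental.distributed_save(
          dataset, snapshot_path, dispatcher_address)
      ```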

### Bug Fixes and Other Changes

* <SIMILAR TO ABOVE SECTION, BUT FOR OTHER IMPORTANT CHANGES / BUG FIXES>
@@ -79,6 +88,13 @@
`experimental_default_delegate_latest_features` to enable all default
delegate features.

* `tf.data`
    * Add a `wait` argument to `tf.data.Dataset.load`. If `True`, snapshots
      written with `distributed_save` are read while they are still being
      written, and snapshots written with regular `save` are waited on until
      they finish. The default is `False` for backward compatibility. Users of
      `distributed_save` should set it to `True`; see the sketch below.
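
      A minimal read-side sketch, assuming the snapshot at the hypothetical
      path below was (or is still being) written by `distributed_save`:

      ```python
      import tensorflow as tf

      # Hypothetical path, matching the one passed to `distributed_save`.
      snapshot_path = "/tmp/range_snapshot"

      # With wait=True, elements can be read while the snapshot is still
      # being written; the default (wait=False) keeps the old behavior.
      loaded = tf.data.Dataset.load(snapshot_path, wait=True)

      for element in loaded.take(3):
          print(element)
      ```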

## Thanks to our Contributors

This release contains contributions from many people at Google, as well as:
9 changes: 9 additions & 0 deletions WORKSPACE
@@ -2,6 +2,8 @@

workspace(name = "org_tensorflow")

# buildifier: disable=load-on-top

# We must initialize hermetic python first.
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")

@@ -14,13 +16,20 @@ http_archive(
],
)

http_archive(
name = "rules_java",
sha256 = "c73336802d0b4882e40770666ad055212df4ea62cfa6edf9cb0f9d29828a0934",
url = "https://github.com/bazelbuild/rules_java/releases/download/5.3.5/rules_java-5.3.5.tar.gz",
)

http_archive(
name = "rules_python",
sha256 = "9d04041ac92a0985e344235f5d946f71ac543f1b1565f2cdbc9a2aaee8adf55b",
strip_prefix = "rules_python-0.26.0",
url = "https://github.com/bazelbuild/rules_python/releases/download/0.26.0/rules_python-0.26.0.tar.gz",
)

# buildifier: disable=same-origin-load
load("@rules_python//python:repositories.bzl", "py_repositories")

py_repositories()
3 changes: 3 additions & 0 deletions ci/official/requirements_updater/WORKSPACE
@@ -2,6 +2,8 @@

workspace(name = "requirements_updater")

# buildifier: disable=load-on-top

load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")

http_archive(
@@ -20,6 +22,7 @@ http_archive(
url = "https://github.com/bazelbuild/rules_python/releases/download/0.26.0/rules_python-0.26.0.tar.gz",
)

# buildifier: disable=same-origin-load
load("@rules_python//python:repositories.bzl", "py_repositories")

py_repositories()
3 changes: 3 additions & 0 deletions ci/official/wheel_test/WORKSPACE
@@ -2,6 +2,8 @@

workspace(name = "wheel_test")

# buildifier: disable=load-on-top

load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")

http_archive(
@@ -20,6 +22,7 @@ http_archive(
url = "https://github.com/bazelbuild/rules_python/releases/download/0.26.0/rules_python-0.26.0.tar.gz",
)

# buildifier: disable=same-origin-load
load("@rules_python//python:repositories.bzl", "py_repositories")

py_repositories()
4 changes: 3 additions & 1 deletion tensorflow/BUILD
@@ -1068,6 +1068,7 @@ package_group(
"//third_party/py/keras/...",
"//third_party/py/tf_keras/...",
"//third_party/yggdrasil_decision_forests/...",
"//waymo/accelerator/...",
"//waymo/ml/cn/...",
"//waymo/ml/models/...",
],
@@ -1116,9 +1117,10 @@ bzl_library(
"@local_config_cuda//cuda:build_defs_bzl",
"@local_config_rocm//rocm:build_defs_bzl",
"@local_config_tensorrt//:build_defs_bzl",
"@local_tsl//tsl:tsl_bzl",
"@local_tsl//tsl/platform/default:cuda_build_defs_bzl",
"@local_xla//xla/tsl:tsl_bzl",
"@local_xla//xla/tsl/mkl:build_defs_bzl",
"@rules_java//java:rules",
],
)

2 changes: 1 addition & 1 deletion tensorflow/c/eager/BUILD
@@ -913,8 +913,8 @@ tf_cuda_library(
"@com_google_absl//absl/memory",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/time",
"@local_tsl//tsl/distributed_runtime/coordination:coordination_service_agent",
"@local_xla//xla/tsl/c:tsl_status_internal",
"@local_xla//xla/tsl/distributed_runtime/coordination:coordination_service_agent",
],
alwayslink = 1,
)
2 changes: 1 addition & 1 deletion tensorflow/c/eager/c_api_experimental.cc
@@ -31,6 +31,7 @@ limitations under the License.
#include "tensorflow/c/tf_status.h"
#include "tensorflow/c/tf_status_helper.h"
#include "xla/tsl/c/tsl_status_internal.h"
#include "xla/tsl/distributed_runtime/coordination/coordination_service_agent.h"
#include "tensorflow/core/common_runtime/composite_device.h"
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/common_runtime/eager/eager_operation.h"
@@ -44,7 +45,6 @@ limitations under the License.
#include "tensorflow/core/platform/errors.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/strcat.h"
#include "tsl/distributed_runtime/coordination/coordination_service_agent.h"
#include "tsl/framework/cancellation.h"

using tensorflow::string;
1 change: 0 additions & 1 deletion tensorflow/c/experimental/gradients/nn_grad.cc
@@ -26,7 +26,6 @@ limitations under the License.

using std::vector;
using tensorflow::ops::BiasAddGrad;
using tensorflow::ops::Mul;
using tensorflow::ops::ReluGrad;

namespace tensorflow {
2 changes: 1 addition & 1 deletion tensorflow/c/experimental/next_pluggable_device/BUILD
@@ -31,11 +31,11 @@ cc_library(
"@com_google_absl//absl/strings",
"@com_google_absl//absl/time",
"@com_google_absl//absl/types:span",
"@local_tsl//tsl/distributed_runtime/coordination:coordination_service_agent",
"@local_xla//xla/pjrt:pjrt_c_api_client",
"@local_xla//xla/pjrt:pjrt_client",
"@local_xla//xla/pjrt/c:pjrt_c_api_hdrs",
"@local_xla//xla/pjrt/c:pjrt_c_api_helpers",
"@local_xla//xla/tsl/distributed_runtime/coordination:coordination_service_agent",
],
)

2 changes: 1 addition & 1 deletion tensorflow/c/experimental/next_pluggable_device/c_api.cc
@@ -42,6 +42,7 @@ limitations under the License.
#include "xla/pjrt/c/pjrt_c_api_helpers.h"
#include "xla/pjrt/pjrt_c_api_client.h"
#include "xla/pjrt/pjrt_client.h"
#include "xla/tsl/distributed_runtime/coordination/coordination_service_agent.h"
#include "tensorflow/core/common_runtime/next_pluggable_device/plugin_resource.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/resource_handle.h"
@@ -51,7 +52,6 @@ limitations under the License.
#include "tensorflow/core/platform/refcount.h"
#include "tensorflow/core/platform/status.h"
#include "tensorflow/core/tfrt/common/pjrt_util.h"
#include "tsl/distributed_runtime/coordination/coordination_service_agent.h"

TF_Device* TF_GetDevice(TF_OpKernelContext* ctx) {
auto* cc_ctx = reinterpret_cast<tensorflow::OpKernelContext*>(ctx);
2 changes: 2 additions & 0 deletions tensorflow/c/experimental/ops/gen/cpp/renderers/BUILD
@@ -36,10 +36,12 @@ tf_cc_tests(
),
deps = [
":renderers",
"//tensorflow/c/experimental/ops/gen/common",
"//tensorflow/core:core_cpu",
"//tensorflow/core:framework",
"//tensorflow/core:test",
"//tensorflow/core:test_main",
"//tensorflow/core:testlib",
"//tensorflow/core/platform:types",
],
)
2 changes: 2 additions & 0 deletions tensorflow/c/experimental/stream_executor/BUILD
@@ -45,6 +45,8 @@ cc_library(
"//tensorflow/core/common_runtime/device:device_utils",
"//tensorflow/core/platform:strcat",
"@com_google_absl//absl/functional:any_invocable",
"@com_google_absl//absl/status",
"@com_google_absl//absl/strings:str_format",
"@local_tsl//tsl/platform:status",
"@local_xla//xla/stream_executor",
"@local_xla//xla/stream_executor:platform",
24 changes: 16 additions & 8 deletions tensorflow/c/experimental/stream_executor/stream_executor.cc
@@ -21,15 +21,20 @@ limitations under the License.
// device.
#include "tensorflow/c/experimental/stream_executor/stream_executor.h"

#include <memory>
#include <string>
#include <utility>

#include "absl/functional/any_invocable.h"
#include "absl/status/status.h"
#include "absl/strings/str_format.h"
#include "tensorflow/c/c_api_macros.h"
#include "tensorflow/c/c_api_macros_internal.h"
#include "tensorflow/c/experimental/stream_executor/stream_executor_internal.h"
#include "tensorflow/c/tf_status_helper.h"
#include "xla/stream_executor/executor_cache.h"
#include "xla/stream_executor/host_memory_allocation.h"
#include "xla/stream_executor/memory_allocation.h"
#include "xla/stream_executor/platform.h"
#include "xla/stream_executor/platform_manager.h"
#include "xla/stream_executor/stream.h"
@@ -215,9 +220,7 @@ class CStreamExecutor : public internal::StreamExecutorInterface {
platform_fns_->destroy_device(platform_, &device_);
}

absl::Status Init(int device_ordinal) override {
return ::tensorflow::OkStatus();
}
absl::Status Init() override { return absl::OkStatus(); }

DeviceMemoryBase Allocate(uint64 size, int64_t memory_space) override {
SP_DeviceMemoryBase mem = {SP_DEVICE_MEMORY_BASE_STRUCT_SIZE};
@@ -237,8 +240,14 @@ stream_executor_->deallocate(&device_, &device_memory_base);
stream_executor_->deallocate(&device_, &device_memory_base);
}

void* HostMemoryAllocate(uint64 size) override {
return stream_executor_->host_memory_allocate(&device_, size);
absl::StatusOr<std::unique_ptr<MemoryAllocation>> HostMemoryAllocate(
uint64 size) override {
auto* buffer = stream_executor_->host_memory_allocate(&device_, size);
if (buffer == nullptr && size > 0) {
return absl::InternalError(
absl::StrFormat("Failed to allocate HostMemory of size %d", size));
}
return std::make_unique<HostMemoryAllocation>(buffer, size, this);
}

void HostMemoryDeallocate(void* mem) override {
@@ -655,11 +664,10 @@ absl::StatusOr<std::unique_ptr<StreamExecutor>> CPlatform::GetUncachedExecutor(
c_status.get());
TF_RETURN_IF_ERROR(StatusFromTF_Status(c_status.get()));

auto executor = absl::make_unique<CStreamExecutor>(
auto executor = std::make_unique<CStreamExecutor>(
std::move(device), &device_fns_, &stream_executor_, &platform_,
&platform_fns_, &timer_fns_, name_, visible_device_count);
auto result = absl::make_unique<StreamExecutor>(this, std::move(executor),
config.ordinal);
auto result = std::make_unique<StreamExecutor>(this, std::move(executor));
return result;
}

tensorflow/c/experimental/stream_executor/stream_executor_test.cc
@@ -185,7 +185,6 @@ TEST_F(StreamExecutorTest, Allocate) {
ASSERT_NE(mem.opaque(), nullptr);
ASSERT_EQ(mem.size(), 2 * sizeof(int));
executor->Deallocate(&mem);
ASSERT_EQ(mem.opaque(), nullptr);
}

TEST_F(StreamExecutorTest, HostMemoryAllocate) {
@@ -530,7 +529,7 @@ TEST_F(StreamExecutorTest, SyncMemcpyDeviceToDevice) {
int dst_data = 0;
DeviceMemoryBase device_dst(&dst_data, size);
DeviceMemoryBase device_src(&src_data, size);
ASSERT_TRUE(executor->SynchronousMemcpy(&device_dst, device_src, size));
TF_ASSERT_OK(executor->SynchronousMemcpy(&device_dst, device_src, size));
ASSERT_EQ(dst_data, 18);
}

5 changes: 3 additions & 2 deletions tensorflow/c/kernels_experimental.cc
@@ -16,6 +16,7 @@ limitations under the License.
#include "tensorflow/c/kernels_experimental.h"

#include <algorithm>
#include <memory>
#include <optional>
#include <string>
#include <utility>
@@ -414,9 +415,9 @@ void TF_MaybeLockVariableInputMutexesInOrder(
std::sort(acquire_order.begin(), acquire_order.end(),
[&mutexes](int a, int b) { return mutexes[a] < mutexes[b]; });

auto locks = absl::make_unique<std::vector<tensorflow::mutex_lock>>();
auto locks = std::make_unique<std::vector<tensorflow::mutex_lock>>();
auto shared_locks =
absl::make_unique<std::vector<tensorflow::tf_shared_lock>>();
std::make_unique<std::vector<tensorflow::tf_shared_lock>>();
locks->reserve(acquire_order.size());

for (auto acquire : acquire_order) {