Merge branch 'master' into multistream-streammerge
buptzyb committed Apr 12, 2024
2 parents efe56d7 + 8479d5f commit 5aabb58
Showing 1,487 changed files with 43,669 additions and 21,660 deletions.
29 changes: 17 additions & 12 deletions .bazelrc
@@ -253,7 +253,7 @@ build:cuda_clang_official --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-12.3"
build:cuda_clang_official --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc"
build:cuda_clang_official --action_env=CLANG_CUDA_COMPILER_PATH="/usr/lib/llvm-17/bin/clang"
build:cuda_clang_official --action_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
build:cuda_clang_official --crosstool_top="@sigbuild-r2.16-clang_config_cuda//crosstool:toolchain"
build:cuda_clang_official --crosstool_top="@sigbuild-r2.17-clang_config_cuda//crosstool:toolchain"

# Build with nvcc for CUDA and clang for host
build:nvcc_clang --config=cuda
@@ -293,6 +293,11 @@ build:rocm --define=using_rocm_hipcc=true
build:rocm --define=tensorflow_mkldnn_contraction_kernel=0
build:rocm --repo_env TF_NEED_ROCM=1

build:sycl --crosstool_top=@local_config_sycl//crosstool:toolchain
build:sycl --define=using_sycl=true
build:sycl --define=tensorflow_mkldnn_contraction_kernel=0
build:sycl --repo_env TF_NEED_SYCL=1

# Options to disable default on features
build:noaws --define=no_aws_support=true
build:nogcp --define=no_gcp_support=true
@@ -497,12 +502,12 @@ build:rbe_linux --host_linkopt=-lm

build:rbe_linux_cpu --config=rbe_linux
# Linux cpu and cuda builds share the same toolchain now.
build:rbe_linux_cpu --host_crosstool_top="@sigbuild-r2.16-clang_config_cuda//crosstool:toolchain"
build:rbe_linux_cpu --crosstool_top="@sigbuild-r2.16-clang_config_cuda//crosstool:toolchain"
build:rbe_linux_cpu --extra_toolchains="@sigbuild-r2.16-clang_config_cuda//crosstool:toolchain-linux-x86_64"
build:rbe_linux_cpu --extra_execution_platforms="@sigbuild-r2.16-clang_config_platform//:platform"
build:rbe_linux_cpu --host_platform="@sigbuild-r2.16-clang_config_platform//:platform"
build:rbe_linux_cpu --platforms="@sigbuild-r2.16-clang_config_platform//:platform"
build:rbe_linux_cpu --host_crosstool_top="@sigbuild-r2.17-clang_config_cuda//crosstool:toolchain"
build:rbe_linux_cpu --crosstool_top="@sigbuild-r2.17-clang_config_cuda//crosstool:toolchain"
build:rbe_linux_cpu --extra_toolchains="@sigbuild-r2.17-clang_config_cuda//crosstool:toolchain-linux-x86_64"
build:rbe_linux_cpu --extra_execution_platforms="@sigbuild-r2.17-clang_config_platform//:platform"
build:rbe_linux_cpu --host_platform="@sigbuild-r2.17-clang_config_platform//:platform"
build:rbe_linux_cpu --platforms="@sigbuild-r2.17-clang_config_platform//:platform"
# This is needed for all Clang17 builds but must not be present in GCC builds.
build:rbe_linux_cpu --copt=-Wno-error=unused-command-line-argument
# This was added in clang-16 by https://reviews.llvm.org/D133574.
@@ -511,7 +516,7 @@ build:rbe_linux_cpu --copt=-Wno-error=unused-command-line-argument
# See https://github.com/protocolbuffers/upb/blob/9effcbcb27f0a665f9f345030188c0b291e32482/upb/upb.c#L183.
build:rbe_linux_cpu --copt=-Wno-gnu-offsetof-extensions
# Python config is the same across all containers because the binary is the same
build:rbe_linux_cpu --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.16-clang_config_python"
build:rbe_linux_cpu --repo_env=TF_PYTHON_CONFIG_REPO="@sigbuild-r2.17-clang_config_python"
build:rbe_linux_cpu --python_path="/usr/bin/python3"
# These you may need to change for your own GCP project.
common:rbe_linux_cpu --remote_instance_name=projects/tensorflow-testing/instances/default_instance
@@ -532,9 +537,9 @@ build:rbe_linux_cuda --config=cuda_clang_official
build:rbe_linux_cuda --config=rbe_linux_cpu
# For Remote build execution -- GPU configuration
build:rbe_linux_cuda --repo_env=REMOTE_GPU_TESTING=1
build:rbe_linux_cuda --repo_env=TF_CUDA_CONFIG_REPO="@sigbuild-r2.16-clang_config_cuda"
build:rbe_linux_cuda --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.16-clang_config_tensorrt"
build:rbe_linux_cuda --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.16-clang_config_nccl"
build:rbe_linux_cuda --repo_env=TF_CUDA_CONFIG_REPO="@sigbuild-r2.17-clang_config_cuda"
build:rbe_linux_cuda --repo_env=TF_TENSORRT_CONFIG_REPO="@sigbuild-r2.17-clang_config_tensorrt"
build:rbe_linux_cuda --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.17-clang_config_nccl"
test:rbe_linux_cuda --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"

build:rbe_linux_cuda_nvcc --config=rbe_linux_cuda
@@ -639,7 +644,7 @@ test:release_linux_base --test_summary=short

# Use the Clang toolchain to compile
build:release_cpu_linux --config=release_linux_base
build:release_cpu_linux --crosstool_top="@sigbuild-r2.16-clang_config_cuda//crosstool:toolchain"
build:release_cpu_linux --crosstool_top="@sigbuild-r2.17-clang_config_cuda//crosstool:toolchain"

build:release_gpu_linux --config=release_cpu_linux
# Set up compilation CUDA version and paths and use the CUDA Clang toolchain.
12 changes: 12 additions & 0 deletions .github/workflows/update-rbe.yml
@@ -117,6 +117,18 @@ jobs:
map sigbuild-r2.16-clang-python3.10 2.16-python3.10
map sigbuild-r2.16-clang-python3.11 2.16-python3.11
map sigbuild-r2.16-clang-python3.12 2.16-python3.12
# TF 2.17
map sigbuild-r2.17 2.17-python3.11
map sigbuild-r2.17-python3.9 2.17-python3.9
map sigbuild-r2.17-python3.10 2.17-python3.10
map sigbuild-r2.17-python3.11 2.17-python3.11
map sigbuild-r2.17-python3.12 2.17-python3.12
# TF 2.17 + Clang (containers are the same, but env vars in configs.bzl are different)
map sigbuild-r2.17-clang 2.17-python3.11
map sigbuild-r2.17-clang-python3.9 2.17-python3.9
map sigbuild-r2.17-clang-python3.10 2.17-python3.10
map sigbuild-r2.17-clang-python3.11 2.17-python3.11
map sigbuild-r2.17-clang-python3.12 2.17-python3.12
- name: Create Pull Request with changes
uses: peter-evans/create-pull-request@2b011faafdcbc9ceb11414d64d0573f37c774b04 # v4.2.3
with:
16 changes: 16 additions & 0 deletions RELEASE.md
@@ -59,6 +59,15 @@
built with support for a given CPU target. This can be useful for skipping
target-specific tests if a target is not supported.

* `tf.data`
    * Support `tf.data.experimental.distributed_save`. `distributed_save` uses
      the tf.data service
      (https://www.tensorflow.org/api_docs/python/tf/data/experimental/service)
      to write distributed dataset snapshots. The call is non-blocking and
      returns without waiting for the snapshot to finish. Passing `wait=True`
      to `tf.data.Dataset.load` allows the snapshots to be read while they are
      still being written; see the sketch below.
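
      For illustration, a minimal write-side sketch. The dispatcher address and
      snapshot path are hypothetical, and the call assumes `distributed_save`
      takes the dataset, a snapshot directory, and the tf.data service
      dispatcher address, as described above:

      ```python
      import tensorflow as tf

      # Hypothetical values for illustration only.
      dispatcher_address = "grpc://localhost:5000"
      snapshot_path = "/tmp/range_snapshot"

      dataset = tf.data.Dataset.range(100)

      # Kicks off a distributed snapshot via the tf.data service and returns
      # immediately, without waiting for the snapshot to finish.
      tf.data.experimental.distributed_save(
          dataset, snapshot_path, dispatcher_address)
      ```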

### Bug Fixes and Other Changes

* <SIMILAR TO ABOVE SECTION, BUT FOR OTHER IMPORTANT CHANGES / BUG FIXES>
@@ -79,6 +88,13 @@
`experimental_default_delegate_latest_features` to enable all default
delegate features.

* `tf.data`
    * Add a `wait` argument to `tf.data.Dataset.load`. If `True`, snapshots
      written with `distributed_save` are read while they are still being
      written, and snapshots written with regular `save` are waited on until
      they finish. The default is `False` for backward compatibility. Users of
      `distributed_save` should set it to `True`; see the sketch below.
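
      A minimal read-side sketch, assuming the snapshot at the hypothetical
      path below was (or is still being) written by `distributed_save`:

      ```python
      import tensorflow as tf

      # Hypothetical path, matching the one passed to `distributed_save`.
      snapshot_path = "/tmp/range_snapshot"

      # With wait=True, elements can be read while the snapshot is still
      # being written; the default (wait=False) keeps the old behavior.
      loaded = tf.data.Dataset.load(snapshot_path, wait=True)

      for element in loaded.take(3):
          print(element)
      ```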

## Thanks to our Contributors

This release contains contributions from many people at Google, as well as:
9 changes: 9 additions & 0 deletions WORKSPACE
@@ -2,6 +2,8 @@

workspace(name = "org_tensorflow")

# buildifier: disable=load-on-top

# We must initialize hermetic python first.
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")

@@ -14,13 +16,20 @@ http_archive(
],
)

http_archive(
name = "rules_java",
sha256 = "c73336802d0b4882e40770666ad055212df4ea62cfa6edf9cb0f9d29828a0934",
url = "https://github.com/bazelbuild/rules_java/releases/download/5.3.5/rules_java-5.3.5.tar.gz",
)

http_archive(
name = "rules_python",
sha256 = "9d04041ac92a0985e344235f5d946f71ac543f1b1565f2cdbc9a2aaee8adf55b",
strip_prefix = "rules_python-0.26.0",
url = "https://github.com/bazelbuild/rules_python/releases/download/0.26.0/rules_python-0.26.0.tar.gz",
)

# buildifier: disable=same-origin-load
load("@rules_python//python:repositories.bzl", "py_repositories")

py_repositories()
3 changes: 3 additions & 0 deletions ci/official/requirements_updater/WORKSPACE
@@ -2,6 +2,8 @@

workspace(name = "requirements_updater")

# buildifier: disable=load-on-top

load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")

http_archive(
@@ -20,6 +22,7 @@ http_archive(
url = "https://github.com/bazelbuild/rules_python/releases/download/0.26.0/rules_python-0.26.0.tar.gz",
)

# buildifier: disable=same-origin-load
load("@rules_python//python:repositories.bzl", "py_repositories")

py_repositories()
3 changes: 3 additions & 0 deletions ci/official/wheel_test/WORKSPACE
@@ -2,6 +2,8 @@

workspace(name = "wheel_test")

# buildifier: disable=load-on-top

load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")

http_archive(
@@ -20,6 +22,7 @@ http_archive(
url = "https://github.com/bazelbuild/rules_python/releases/download/0.26.0/rules_python-0.26.0.tar.gz",
)

# buildifier: disable=same-origin-load
load("@rules_python//python:repositories.bzl", "py_repositories")

py_repositories()
4 changes: 3 additions & 1 deletion tensorflow/BUILD
@@ -1068,6 +1068,7 @@ package_group(
"//third_party/py/keras/...",
"//third_party/py/tf_keras/...",
"//third_party/yggdrasil_decision_forests/...",
"//waymo/accelerator/...",
"//waymo/ml/cn/...",
"//waymo/ml/models/...",
],
@@ -1116,9 +1117,10 @@ bzl_library(
"@local_config_cuda//cuda:build_defs_bzl",
"@local_config_rocm//rocm:build_defs_bzl",
"@local_config_tensorrt//:build_defs_bzl",
"@local_tsl//tsl:tsl_bzl",
"@local_tsl//tsl/platform/default:cuda_build_defs_bzl",
"@local_xla//xla/tsl:tsl_bzl",
"@local_xla//xla/tsl/mkl:build_defs_bzl",
"@rules_java//java:rules",
],
)

2 changes: 1 addition & 1 deletion tensorflow/c/eager/BUILD
@@ -913,8 +913,8 @@ tf_cuda_library(
"@com_google_absl//absl/memory",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/time",
"@local_tsl//tsl/distributed_runtime/coordination:coordination_service_agent",
"@local_xla//xla/tsl/c:tsl_status_internal",
"@local_xla//xla/tsl/distributed_runtime/coordination:coordination_service_agent",
],
alwayslink = 1,
)
2 changes: 1 addition & 1 deletion tensorflow/c/eager/c_api_experimental.cc
@@ -31,6 +31,7 @@ limitations under the License.
#include "tensorflow/c/tf_status.h"
#include "tensorflow/c/tf_status_helper.h"
#include "xla/tsl/c/tsl_status_internal.h"
#include "xla/tsl/distributed_runtime/coordination/coordination_service_agent.h"
#include "tensorflow/core/common_runtime/composite_device.h"
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/common_runtime/eager/eager_operation.h"
@@ -44,7 +45,6 @@ limitations under the License.
#include "tensorflow/core/platform/errors.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/strcat.h"
#include "tsl/distributed_runtime/coordination/coordination_service_agent.h"
#include "tsl/framework/cancellation.h"

using tensorflow::string;
1 change: 0 additions & 1 deletion tensorflow/c/experimental/gradients/nn_grad.cc
@@ -26,7 +26,6 @@ limitations under the License.

using std::vector;
using tensorflow::ops::BiasAddGrad;
using tensorflow::ops::Mul;
using tensorflow::ops::ReluGrad;

namespace tensorflow {
2 changes: 1 addition & 1 deletion tensorflow/c/experimental/next_pluggable_device/BUILD
@@ -31,11 +31,11 @@ cc_library(
"@com_google_absl//absl/strings",
"@com_google_absl//absl/time",
"@com_google_absl//absl/types:span",
"@local_tsl//tsl/distributed_runtime/coordination:coordination_service_agent",
"@local_xla//xla/pjrt:pjrt_c_api_client",
"@local_xla//xla/pjrt:pjrt_client",
"@local_xla//xla/pjrt/c:pjrt_c_api_hdrs",
"@local_xla//xla/pjrt/c:pjrt_c_api_helpers",
"@local_xla//xla/tsl/distributed_runtime/coordination:coordination_service_agent",
],
)

2 changes: 1 addition & 1 deletion tensorflow/c/experimental/next_pluggable_device/c_api.cc
@@ -42,6 +42,7 @@ limitations under the License.
#include "xla/pjrt/c/pjrt_c_api_helpers.h"
#include "xla/pjrt/pjrt_c_api_client.h"
#include "xla/pjrt/pjrt_client.h"
#include "xla/tsl/distributed_runtime/coordination/coordination_service_agent.h"
#include "tensorflow/core/common_runtime/next_pluggable_device/plugin_resource.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/resource_handle.h"
@@ -51,7 +52,6 @@ limitations under the License.
#include "tensorflow/core/platform/refcount.h"
#include "tensorflow/core/platform/status.h"
#include "tensorflow/core/tfrt/common/pjrt_util.h"
#include "tsl/distributed_runtime/coordination/coordination_service_agent.h"

TF_Device* TF_GetDevice(TF_OpKernelContext* ctx) {
auto* cc_ctx = reinterpret_cast<tensorflow::OpKernelContext*>(ctx);
2 changes: 2 additions & 0 deletions tensorflow/c/experimental/ops/gen/cpp/renderers/BUILD
@@ -36,10 +36,12 @@ tf_cc_tests(
),
deps = [
":renderers",
"//tensorflow/c/experimental/ops/gen/common",
"//tensorflow/core:core_cpu",
"//tensorflow/core:framework",
"//tensorflow/core:test",
"//tensorflow/core:test_main",
"//tensorflow/core:testlib",
"//tensorflow/core/platform:types",
],
)
2 changes: 2 additions & 0 deletions tensorflow/c/experimental/stream_executor/BUILD
@@ -45,6 +45,8 @@ cc_library(
"//tensorflow/core/common_runtime/device:device_utils",
"//tensorflow/core/platform:strcat",
"@com_google_absl//absl/functional:any_invocable",
"@com_google_absl//absl/status",
"@com_google_absl//absl/strings:str_format",
"@local_tsl//tsl/platform:status",
"@local_xla//xla/stream_executor",
"@local_xla//xla/stream_executor:platform",
24 changes: 16 additions & 8 deletions tensorflow/c/experimental/stream_executor/stream_executor.cc
@@ -21,15 +21,20 @@ limitations under the License.
// device.
#include "tensorflow/c/experimental/stream_executor/stream_executor.h"

#include <memory>
#include <string>
#include <utility>

#include "absl/functional/any_invocable.h"
#include "absl/status/status.h"
#include "absl/strings/str_format.h"
#include "tensorflow/c/c_api_macros.h"
#include "tensorflow/c/c_api_macros_internal.h"
#include "tensorflow/c/experimental/stream_executor/stream_executor_internal.h"
#include "tensorflow/c/tf_status_helper.h"
#include "xla/stream_executor/executor_cache.h"
#include "xla/stream_executor/host_memory_allocation.h"
#include "xla/stream_executor/memory_allocation.h"
#include "xla/stream_executor/platform.h"
#include "xla/stream_executor/platform_manager.h"
#include "xla/stream_executor/stream.h"
@@ -215,9 +220,7 @@ class CStreamExecutor : public internal::StreamExecutorInterface {
platform_fns_->destroy_device(platform_, &device_);
}

absl::Status Init(int device_ordinal) override {
return ::tensorflow::OkStatus();
}
absl::Status Init() override { return absl::OkStatus(); }

DeviceMemoryBase Allocate(uint64 size, int64_t memory_space) override {
SP_DeviceMemoryBase mem = {SP_DEVICE_MEMORY_BASE_STRUCT_SIZE};
@@ -237,8 +240,14 @@ stream_executor_->deallocate(&device_, &device_memory_base);
stream_executor_->deallocate(&device_, &device_memory_base);
}

void* HostMemoryAllocate(uint64 size) override {
return stream_executor_->host_memory_allocate(&device_, size);
absl::StatusOr<std::unique_ptr<MemoryAllocation>> HostMemoryAllocate(
uint64 size) override {
auto* buffer = stream_executor_->host_memory_allocate(&device_, size);
if (buffer == nullptr && size > 0) {
return absl::InternalError(
absl::StrFormat("Failed to allocate HostMemory of size %d", size));
}
return std::make_unique<HostMemoryAllocation>(buffer, size, this);
}

void HostMemoryDeallocate(void* mem) override {
@@ -655,11 +664,10 @@ absl::StatusOr<std::unique_ptr<StreamExecutor>> CPlatform::GetUncachedExecutor(
c_status.get());
TF_RETURN_IF_ERROR(StatusFromTF_Status(c_status.get()));

auto executor = absl::make_unique<CStreamExecutor>(
auto executor = std::make_unique<CStreamExecutor>(
std::move(device), &device_fns_, &stream_executor_, &platform_,
&platform_fns_, &timer_fns_, name_, visible_device_count);
auto result = absl::make_unique<StreamExecutor>(this, std::move(executor),
config.ordinal);
auto result = std::make_unique<StreamExecutor>(this, std::move(executor));
return result;
}

tensorflow/c/experimental/stream_executor/stream_executor_test.cc
@@ -185,7 +185,6 @@ TEST_F(StreamExecutorTest, Allocate) {
ASSERT_NE(mem.opaque(), nullptr);
ASSERT_EQ(mem.size(), 2 * sizeof(int));
executor->Deallocate(&mem);
ASSERT_EQ(mem.opaque(), nullptr);
}

TEST_F(StreamExecutorTest, HostMemoryAllocate) {
@@ -530,7 +529,7 @@ TEST_F(StreamExecutorTest, SyncMemcpyDeviceToDevice) {
int dst_data = 0;
DeviceMemoryBase device_dst(&dst_data, size);
DeviceMemoryBase device_src(&src_data, size);
ASSERT_TRUE(executor->SynchronousMemcpy(&device_dst, device_src, size));
TF_ASSERT_OK(executor->SynchronousMemcpy(&device_dst, device_src, size));
ASSERT_EQ(dst_data, 18);
}

5 changes: 3 additions & 2 deletions tensorflow/c/kernels_experimental.cc
@@ -16,6 +16,7 @@ limitations under the License.
#include "tensorflow/c/kernels_experimental.h"

#include <algorithm>
#include <memory>
#include <optional>
#include <string>
#include <utility>
@@ -414,9 +415,9 @@ void TF_MaybeLockVariableInputMutexesInOrder(
std::sort(acquire_order.begin(), acquire_order.end(),
[&mutexes](int a, int b) { return mutexes[a] < mutexes[b]; });

auto locks = absl::make_unique<std::vector<tensorflow::mutex_lock>>();
auto locks = std::make_unique<std::vector<tensorflow::mutex_lock>>();
auto shared_locks =
absl::make_unique<std::vector<tensorflow::tf_shared_lock>>();
std::make_unique<std::vector<tensorflow::tf_shared_lock>>();
locks->reserve(acquire_order.size());

for (auto acquire : acquire_order) {