
Commit

Remove unused GpuVirtualMemAllocator.
PiperOrigin-RevId: 615560419
klucke authored and tensorflower-gardener committed Mar 13, 2024
1 parent 7611533 commit ae567e4
Showing 8 changed files with 2 additions and 755 deletions.
20 changes: 0 additions & 20 deletions tensorflow/core/common_runtime/gpu/BUILD
@@ -133,7 +133,6 @@ filegroup(
         "gpu_managed_allocator.h",
         "gpu_process_state.h",
         "gpu_util.h",
-        "gpu_virtual_mem_allocator.h",
         "//tensorflow/core/common_runtime:gpu_runtime_headers",
         "//tensorflow/core/common_runtime/device:device_runtime_headers",
         "@local_tsl//tsl/framework:bfc_allocator.h",
@@ -159,7 +158,6 @@ tf_cuda_library(
         "@local_config_cuda//cuda:cudnn_header",
         "@local_xla//xla/stream_executor/cuda:cuda_platform",
         "@local_xla//xla/stream_executor/gpu:gpu_stream",
-        ":gpu_virtual_mem_allocator",
     ],
     defines = if_linux_x86_64(["TF_PLATFORM_LINUX_X86_64"]),
     features = ["-layering_check"],
@@ -253,7 +251,6 @@ tf_cuda_library(
     ],
     visibility = ["//visibility:public"],
     deps = [
-        ":gpu_virtual_mem_allocator",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
@@ -262,22 +259,6 @@ tf_cuda_library(
     ],
 )
 
-tf_cuda_library(
-    name = "gpu_virtual_mem_allocator",
-    hdrs = [
-        "gpu_virtual_mem_allocator.h",
-    ],
-    copts = tf_copts(),
-    features = [
-        "-layering_check",
-        "parse_headers",
-    ],
-    visibility = ["//visibility:public"],
-    deps = [
-        "@local_xla//xla/stream_executor/integrations:gpu_virtual_mem_allocator",
-    ],
-)
-
 # -----------------------------------------------------------------------------
 # Tests
 
@@ -318,7 +299,6 @@ tf_cuda_cc_test(
         "//tensorflow/core/common_runtime:direct_session_internal",
         "//tensorflow/core/kernels:ops_util",
         "@local_xla//xla/stream_executor/integrations:device_mem_allocator",
-        "@local_xla//xla/stream_executor/integrations:gpu_virtual_mem_allocator",
     ],
 )
 
102 changes: 1 addition & 101 deletions tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include "xla/stream_executor/gpu/gpu_init.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "tensorflow/core/common_runtime/device/device_mem_allocator.h"
-#include "tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h"
 #include "tensorflow/core/framework/typed_allocator.h"
 #include "tensorflow/core/protobuf/bfc_memory_map.pb.h"
 #include "tensorflow/core/protobuf/config.pb.h"
@@ -67,20 +66,6 @@ class GPUBFCAllocatorTest
     : public ::testing::TestWithParam<std::unique_ptr<SubAllocator> (*)(
           size_t)> {};
 
-#if CUDA_VERSION >= 10020
-std::unique_ptr<SubAllocator> CreateVirtualMemorySubAllocator(
-    size_t virtual_address_space_size = 1ull << 32) {
-  PlatformDeviceId gpu_id(0);
-  auto executor =
-      GPUMachineManager()->ExecutorForDevice(gpu_id.value()).value();
-  auto* gpu_context = reinterpret_cast<stream_executor::gpu::GpuContext*>(
-      executor->platform_specific_handle().context);
-  return tensorflow::GpuVirtualMemAllocator::Create(
-             {}, {}, *gpu_context, gpu_id, virtual_address_space_size, {})
-      .value();
-}
-#endif
-
 std::unique_ptr<SubAllocator> CreateGPUMemAllocator(size_t) {
   PlatformDeviceId gpu_id(0);
   return absl::WrapUnique(new DeviceMemAllocator(
@@ -90,21 +75,10 @@ std::unique_ptr<SubAllocator> CreateGPUMemAllocator(size_t) {
 
 std::unique_ptr<SubAllocator> CreateSubAllocator(
     size_t virtual_address_space_size = 1ull << 32) {
-#if CUDA_VERSION >= 10020
-  return CreateVirtualMemorySubAllocator(virtual_address_space_size);
-#else
   return CreateGPUMemAllocator(virtual_address_space_size);
-#endif
 }
 
-auto TestSuiteValues() {
-#if CUDA_VERSION >= 10020
-  return ::testing::Values(&CreateGPUMemAllocator,
-                           &CreateVirtualMemorySubAllocator);
-#else
-  return ::testing::Values(&CreateGPUMemAllocator);
-#endif
-}
+auto TestSuiteValues() { return ::testing::Values(&CreateGPUMemAllocator); }
 
 TEST_P(GPUBFCAllocatorTest, NoDups) {
   GPUBFCAllocator a(GetParam()(1ull << 32), 1 << 30, "GPU_0_bfc", {});
@@ -603,34 +577,6 @@ INSTANTIATE_TEST_SUITE_P(GPUBFCAllocatorPrivateMethodTestSuite,
 // Tests that cannot be trivially parameterized for both suballocator types.
 class GPUBFCAllocatorTest_SubAllocatorSpecific : public ::testing::Test {};
 
-#if CUDA_VERSION >= 10020
-// Benchmark for measuring "high water mark" for BFCAllocator owned memory.
-TEST_F(GPUBFCAllocatorTest_SubAllocatorSpecific,
-       VirtualAllocatorPromotesReuse) {
-  GPUBFCAllocator::Options options;
-  options.allow_growth = true;
-
-  constexpr size_t k512MiB = 512ull << 20;
-
-  // 512 MiB allocator.
-  GPUBFCAllocator a(CreateVirtualMemorySubAllocator(1ull << 32), k512MiB,
-                    "GPU_0_bfc", options);
-  // Allocate 128 raw pointers of 4 megs.
-  const size_t size = 1LL << 22;
-  std::vector<void*> initial_ptrs;
-  for (size_t s = 0; s < 128; s++) {
-    void* raw = a.AllocateRaw(1, size);
-    initial_ptrs.push_back(raw);
-  }
-  // Deallocate all but the last one so the big chunk cannot be GC'd
-  for (int i = 0; i < 127; ++i) {
-    a.DeallocateRaw(initial_ptrs[i]);
-  }
-  void* big_alloc = a.AllocateRaw(1, k512MiB - size);
-  EXPECT_NE(big_alloc, nullptr);
-}
-#endif
-
 TEST_F(GPUBFCAllocatorTest_SubAllocatorSpecific,
        PhysicalAllocatorOomsFragmentation) {
   GPUBFCAllocator::Options options;
@@ -706,59 +652,13 @@ class GPUBFCAllocatorPrivateMethodsTest_SubAllocatorSpecific
     }
     EXPECT_EQ(1, num_chunks_in_bins);
   }
-
-#if CUDA_VERSION >= 10020
-  // Counterpart to the GPUMemAllocator test suite TestRegionDeallocation tests.
-  // Here we expect no deallocations because all allocations are coalesced into
-  // a single region.
-  void TestNoRegionDeallocation() {
-    GPUBFCAllocator::Options options;
-    options.allow_growth = true;
-
-    // Max of 2GiB, but starts out small.
-    GPUBFCAllocator a(CreateVirtualMemorySubAllocator(1uLL << 32), 1LL << 31,
-                      "GPU_0_bfc", options);
-
-    // Allocate 128 raw pointers of 4 megs.
-    const size_t size = 1LL << 22;
-    std::vector<void*> initial_ptrs;
-    for (size_t s = 0; s < 128; s++) {
-      void* raw = a.AllocateRaw(1, size);
-      initial_ptrs.push_back(raw);
-    }
-
-    {
-      mutex_lock l(a.lock_);
-      EXPECT_EQ(1, a.region_manager_.regions().size());
-    }
-
-    // Deallocate all the memories except the last one.
-    for (size_t i = 0; i < initial_ptrs.size() - 1; i++) {
-      a.DeallocateRaw(initial_ptrs[i]);
-    }
-
-    // Deallocate free regions and there should still be only one.
-    EXPECT_EQ(false, a.DeallocateFreeRegions(/*rounded_bytes=*/0));
-    {
-      mutex_lock l(a.lock_);
-      EXPECT_EQ(1, a.region_manager_.regions().size());
-    }
-  }
-#endif
 };
 
 TEST_F(GPUBFCAllocatorPrivateMethodsTest_SubAllocatorSpecific,
        TestRegionDeallocation) {
   TestRegionDeallocation();
 }
 
-#if CUDA_VERSION >= 10020
-TEST_F(GPUBFCAllocatorPrivateMethodsTest_SubAllocatorSpecific,
-       TestNoRegionDeallocation) {
-  TestNoRegionDeallocation();
-}
-#endif
-
 }  // namespace tsl
 
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
40 changes: 0 additions & 40 deletions tensorflow/core/common_runtime/gpu/gpu_process_state.cc
@@ -38,7 +38,6 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
-#include "tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h"
 #include "tensorflow/core/common_runtime/pool_allocator.h"
 #include "tensorflow/core/common_runtime/shared_counter.h"
 #include "tensorflow/core/framework/log_memory.h"
@@ -116,52 +115,13 @@ static std::unique_ptr<SubAllocator> CreateSubAllocator(
                       ->ExecutorForDevice(platform_device_id.value())
                       .value();
 
-  // FIXME(imintz): Observed OOM issues when using the virtual memory
-  // allocators. This should be reenabled when resolved.
-#if 0 && defined(GOOGLE_CUDA) && CUDA_VERSION >= 10020
-  // Use the old allocator when unified memory is required.
-  // TODO(imintz): Remove the cuMemAlloc capability of this allocator.
-  if (options.per_process_gpu_memory_fraction() > 1.0 ||
-      options.experimental().use_unified_memory()) {
-    return new se::DeviceMemAllocator(executor, platform_device_id,
-                                      /*use_unified_memory=*/true, alloc_visitors,
-                                      {});
-  } else {
-    auto* gpu_context = reinterpret_cast<stream_executor::gpu::GpuContext*>(
-        executor->platform_specific_handle().context);
-
-    absl::flat_hash_set<tsl::PlatformDeviceId> platform_peer_gpu_ids;
-    platform_peer_gpu_ids.reserve(peer_gpu_ids.size());
-    for (const tsl::TfDeviceId tf_device_id : peer_gpu_ids) {
-      tsl::PlatformDeviceId platform_device_id;
-      TF_CHECK_OK(GpuIdManager::TfToPlatformDeviceId(tf_device_id,
-                                                     &platform_device_id));
-      platform_peer_gpu_ids.insert(platform_device_id);
-    }
-    std::vector<tsl::PlatformDeviceId> platform_peer_gpu_ids_vec(
-        platform_peer_gpu_ids.begin(), platform_peer_gpu_ids.end());
-
-    // Adjust virtual address space to be slightly larger than the physical
-    // address space in case the BFC allocator performs suboptimal garbage
-    // collection.
-    // TODO(imintz): Update BFC allocator to ensure it doesn't create holes in
-    // the va space.
-    return GpuVirtualMemAllocator::Create(
-               alloc_visitors, {}, *gpu_context, platform_device_id,
-               /*virtual_address_space_size=*/total_bytes * 2,
-               platform_peer_gpu_ids_vec)
-        .value()
-        .release();
-  }
-#else
   bool use_unified_memory = (options.per_process_gpu_memory_fraction() > 1.0 ||
                              options.experimental().use_unified_memory());
   return absl::WrapUnique(new se::DeviceMemAllocator(
       executor, platform_device_id,
       use_unified_memory ? stream_executor::MemoryType::kUnified
                          : stream_executor::MemoryType::kDevice,
       alloc_visitors, {}));
-#endif
 }
 
 Allocator* GPUProcessState::GetGPUAllocator(
28 changes: 0 additions & 28 deletions tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h

This file was deleted.

40 changes: 1 addition & 39 deletions third_party/xla/xla/stream_executor/integrations/BUILD
@@ -1,6 +1,5 @@
-load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 load("//xla:xla.bzl", "xla_cc_test")
-load("//xla/stream_executor:build_defs.bzl", "if_gpu_is_configured", "stream_executor_friends")
+load("//xla/stream_executor:build_defs.bzl", "stream_executor_friends")
 load("@local_tsl//tsl:tsl.bzl", "if_google", "internal_visibility")
 load("@local_tsl//tsl:tsl.default.bzl", "filegroup")
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
@@ -79,25 +78,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "gpu_virtual_mem_allocator",
-    srcs = ["gpu_virtual_mem_allocator.cc"],
-    hdrs = ["gpu_virtual_mem_allocator.h"],
-    defines = if_cuda(["GOOGLE_CUDA=1"]),
-    deps = [
-        "//xla/stream_executor:stream_executor_headers",
-        "@com_google_absl//absl/strings:str_format",
-        "@local_tsl//tsl/framework:allocator",
-        "@local_tsl//tsl/framework:device_id_impl",
-        "@local_tsl//tsl/platform:numbers",
-        "@local_tsl//tsl/platform:statusor",
-        "@local_tsl//tsl/profiler/lib:traceme",
-    ] + if_cuda([
-        "//xla/stream_executor/gpu:gpu_driver_header",
-        "//xla/stream_executor/gpu:gpu_types_header",
-    ]),
-)
-
 xla_cc_test(
     name = "tf_allocator_adapter_test",
     srcs = ["tf_allocator_adapter_test.cc"],
@@ -116,21 +96,3 @@ xla_cc_test(
         "@local_tsl//tsl/framework:allocator",
     ]),
 )
-
-xla_cc_test(
-    name = "gpu_virtual_mem_allocator_test",
-    srcs = if_gpu_is_configured(["gpu_virtual_mem_allocator_test.cc"]),
-    tags = [
-        "gpu",
-        "no_oss",
-        "requires-gpu-nvidia",
-    ],
-    deps = [
-        ":gpu_virtual_mem_allocator",
-        "//xla/stream_executor/gpu:gpu_init",
-        "@local_tsl//tsl/framework:device_id_impl",
-        "@local_tsl//tsl/platform:test",
-        "@local_tsl//tsl/platform:test_benchmark",
-        "@local_tsl//tsl/platform:test_main",
-    ],
-)
