
Commit

Remove unused GpuVirtualMemAllocator.
PiperOrigin-RevId: 615560419
klucke authored and tensorflower-gardener committed Mar 13, 2024
1 parent 7611533 commit ae567e4
Showing 8 changed files with 2 additions and 755 deletions.
20 changes: 0 additions & 20 deletions tensorflow/core/common_runtime/gpu/BUILD
@@ -133,7 +133,6 @@ filegroup(
         "gpu_managed_allocator.h",
         "gpu_process_state.h",
         "gpu_util.h",
-        "gpu_virtual_mem_allocator.h",
         "//tensorflow/core/common_runtime:gpu_runtime_headers",
         "//tensorflow/core/common_runtime/device:device_runtime_headers",
         "@local_tsl//tsl/framework:bfc_allocator.h",
@@ -159,7 +158,6 @@ tf_cuda_library(
         "@local_config_cuda//cuda:cudnn_header",
         "@local_xla//xla/stream_executor/cuda:cuda_platform",
         "@local_xla//xla/stream_executor/gpu:gpu_stream",
-        ":gpu_virtual_mem_allocator",
     ],
     defines = if_linux_x86_64(["TF_PLATFORM_LINUX_X86_64"]),
     features = ["-layering_check"],
@@ -253,7 +251,6 @@ tf_cuda_library(
     ],
     visibility = ["//visibility:public"],
     deps = [
-        ":gpu_virtual_mem_allocator",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
@@ -262,22 +259,6 @@ tf_cuda_library(
     ],
 )
 
-tf_cuda_library(
-    name = "gpu_virtual_mem_allocator",
-    hdrs = [
-        "gpu_virtual_mem_allocator.h",
-    ],
-    copts = tf_copts(),
-    features = [
-        "-layering_check",
-        "parse_headers",
-    ],
-    visibility = ["//visibility:public"],
-    deps = [
-        "@local_xla//xla/stream_executor/integrations:gpu_virtual_mem_allocator",
-    ],
-)
-
 # -----------------------------------------------------------------------------
 # Tests
 
@@ -318,7 +299,6 @@ tf_cuda_cc_test(
         "//tensorflow/core/common_runtime:direct_session_internal",
         "//tensorflow/core/kernels:ops_util",
         "@local_xla//xla/stream_executor/integrations:device_mem_allocator",
-        "@local_xla//xla/stream_executor/integrations:gpu_virtual_mem_allocator",
     ],
 )
 
102 changes: 1 addition & 101 deletions tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include "xla/stream_executor/gpu/gpu_init.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "tensorflow/core/common_runtime/device/device_mem_allocator.h"
-#include "tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h"
 #include "tensorflow/core/framework/typed_allocator.h"
 #include "tensorflow/core/protobuf/bfc_memory_map.pb.h"
 #include "tensorflow/core/protobuf/config.pb.h"
@@ -67,20 +66,6 @@ class GPUBFCAllocatorTest
     : public ::testing::TestWithParam<std::unique_ptr<SubAllocator> (*)(
           size_t)> {};
 
-#if CUDA_VERSION >= 10020
-std::unique_ptr<SubAllocator> CreateVirtualMemorySubAllocator(
-    size_t virtual_address_space_size = 1ull << 32) {
-  PlatformDeviceId gpu_id(0);
-  auto executor =
-      GPUMachineManager()->ExecutorForDevice(gpu_id.value()).value();
-  auto* gpu_context = reinterpret_cast<stream_executor::gpu::GpuContext*>(
-      executor->platform_specific_handle().context);
-  return tensorflow::GpuVirtualMemAllocator::Create(
-             {}, {}, *gpu_context, gpu_id, virtual_address_space_size, {})
-      .value();
-}
-#endif
-
 std::unique_ptr<SubAllocator> CreateGPUMemAllocator(size_t) {
   PlatformDeviceId gpu_id(0);
   return absl::WrapUnique(new DeviceMemAllocator(
@@ -90,21 +75,10 @@ std::unique_ptr<SubAllocator> CreateGPUMemAllocator(size_t) {
 
 std::unique_ptr<SubAllocator> CreateSubAllocator(
     size_t virtual_address_space_size = 1ull << 32) {
-#if CUDA_VERSION >= 10020
-  return CreateVirtualMemorySubAllocator(virtual_address_space_size);
-#else
   return CreateGPUMemAllocator(virtual_address_space_size);
-#endif
 }
 
-auto TestSuiteValues() {
-#if CUDA_VERSION >= 10020
-  return ::testing::Values(&CreateGPUMemAllocator,
-                           &CreateVirtualMemorySubAllocator);
-#else
-  return ::testing::Values(&CreateGPUMemAllocator);
-#endif
-}
+auto TestSuiteValues() { return ::testing::Values(&CreateGPUMemAllocator); }
 
 TEST_P(GPUBFCAllocatorTest, NoDups) {
   GPUBFCAllocator a(GetParam()(1ull << 32), 1 << 30, "GPU_0_bfc", {});
@@ -603,34 +577,6 @@ INSTANTIATE_TEST_SUITE_P(GPUBFCAllocatorPrivateMethodTestSuite,
 // Tests that cannot be trivially parameterized for both suballocator types.
 class GPUBFCAllocatorTest_SubAllocatorSpecific : public ::testing::Test {};
 
-#if CUDA_VERSION >= 10020
-// Benchmark for measuring "high water mark" for BFCAllocator owned memory.
-TEST_F(GPUBFCAllocatorTest_SubAllocatorSpecific,
-       VirtualAllocatorPromotesReuse) {
-  GPUBFCAllocator::Options options;
-  options.allow_growth = true;
-
-  constexpr size_t k512MiB = 512ull << 20;
-
-  // 512 MiB allocator.
-  GPUBFCAllocator a(CreateVirtualMemorySubAllocator(1ull << 32), k512MiB,
-                    "GPU_0_bfc", options);
-  // Allocate 128 raw pointers of 4 megs.
-  const size_t size = 1LL << 22;
-  std::vector<void*> initial_ptrs;
-  for (size_t s = 0; s < 128; s++) {
-    void* raw = a.AllocateRaw(1, size);
-    initial_ptrs.push_back(raw);
-  }
-  // Deallocate all but the last one so the big chunk cannot be GC'd
-  for (int i = 0; i < 127; ++i) {
-    a.DeallocateRaw(initial_ptrs[i]);
-  }
-  void* big_alloc = a.AllocateRaw(1, k512MiB - size);
-  EXPECT_NE(big_alloc, nullptr);
-}
-#endif
-
 TEST_F(GPUBFCAllocatorTest_SubAllocatorSpecific,
        PhysicalAllocatorOomsFragmentation) {
   GPUBFCAllocator::Options options;
@@ -706,59 +652,13 @@ class GPUBFCAllocatorPrivateMethodsTest_SubAllocatorSpecific
     }
     EXPECT_EQ(1, num_chunks_in_bins);
   }
-
-#if CUDA_VERSION >= 10020
-  // Counterpart to the GPUMemAllocator test suite TestRegionDeallocation tests.
-  // Here we expect no deallocations because all allocations are coalesced into
-  // a single region.
-  void TestNoRegionDeallocation() {
-    GPUBFCAllocator::Options options;
-    options.allow_growth = true;
-
-    // Max of 2GiB, but starts out small.
-    GPUBFCAllocator a(CreateVirtualMemorySubAllocator(1uLL << 32), 1LL << 31,
-                      "GPU_0_bfc", options);
-
-    // Allocate 128 raw pointers of 4 megs.
-    const size_t size = 1LL << 22;
-    std::vector<void*> initial_ptrs;
-    for (size_t s = 0; s < 128; s++) {
-      void* raw = a.AllocateRaw(1, size);
-      initial_ptrs.push_back(raw);
-    }
-
-    {
-      mutex_lock l(a.lock_);
-      EXPECT_EQ(1, a.region_manager_.regions().size());
-    }
-
-    // Deallocate all the memories except the last one.
-    for (size_t i = 0; i < initial_ptrs.size() - 1; i++) {
-      a.DeallocateRaw(initial_ptrs[i]);
-    }
-
-    // Deallocate free regions and there should still be only one.
-    EXPECT_EQ(false, a.DeallocateFreeRegions(/*rounded_bytes=*/0));
-    {
-      mutex_lock l(a.lock_);
-      EXPECT_EQ(1, a.region_manager_.regions().size());
-    }
-  }
-#endif
 };
 
 TEST_F(GPUBFCAllocatorPrivateMethodsTest_SubAllocatorSpecific,
        TestRegionDeallocation) {
   TestRegionDeallocation();
 }
 
-#if CUDA_VERSION >= 10020
-TEST_F(GPUBFCAllocatorPrivateMethodsTest_SubAllocatorSpecific,
-       TestNoRegionDeallocation) {
-  TestNoRegionDeallocation();
-}
-#endif
-
 }  // namespace tsl
 
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
40 changes: 0 additions & 40 deletions tensorflow/core/common_runtime/gpu/gpu_process_state.cc
@@ -38,7 +38,6 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
-#include "tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h"
 #include "tensorflow/core/common_runtime/pool_allocator.h"
 #include "tensorflow/core/common_runtime/shared_counter.h"
 #include "tensorflow/core/framework/log_memory.h"
@@ -116,52 +115,13 @@ static std::unique_ptr<SubAllocator> CreateSubAllocator(
                       ->ExecutorForDevice(platform_device_id.value())
                       .value();
 
-  // FIXME(imintz): Observed OOM issues when using the virtual memory
-  // allocators. This should be reenabled when resolved.
-#if 0 && defined(GOOGLE_CUDA) && CUDA_VERSION >= 10020
-  // Use the old allocator when unified memory is required.
-  // TODO(imintz): Remove the cuMemAlloc capability of this allocator.
-  if (options.per_process_gpu_memory_fraction() > 1.0 ||
-      options.experimental().use_unified_memory()) {
-    return new se::DeviceMemAllocator(executor, platform_device_id,
-                                      /*use_unified_memory=*/true, alloc_visitors,
-                                      {});
-  } else {
-    auto* gpu_context = reinterpret_cast<stream_executor::gpu::GpuContext*>(
-        executor->platform_specific_handle().context);
-
-    absl::flat_hash_set<tsl::PlatformDeviceId> platform_peer_gpu_ids;
-    platform_peer_gpu_ids.reserve(peer_gpu_ids.size());
-    for (const tsl::TfDeviceId tf_device_id : peer_gpu_ids) {
-      tsl::PlatformDeviceId platform_device_id;
-      TF_CHECK_OK(GpuIdManager::TfToPlatformDeviceId(tf_device_id,
-                                                     &platform_device_id));
-      platform_peer_gpu_ids.insert(platform_device_id);
-    }
-    std::vector<tsl::PlatformDeviceId> platform_peer_gpu_ids_vec(
-        platform_peer_gpu_ids.begin(), platform_peer_gpu_ids.end());
-
-    // Adjust virtual address space to be slightly larger than the physical
-    // address space in case the BFC allocator performs suboptimal garbage
-    // collection.
-    // TODO(imintz): Update BFC allocator to ensure it doesn't create holes in
-    // the va space.
-    return GpuVirtualMemAllocator::Create(
-               alloc_visitors, {}, *gpu_context, platform_device_id,
-               /*virtual_address_space_size=*/total_bytes * 2,
-               platform_peer_gpu_ids_vec)
-        .value()
-        .release();
-  }
-#else
   bool use_unified_memory = (options.per_process_gpu_memory_fraction() > 1.0 ||
                              options.experimental().use_unified_memory());
   return absl::WrapUnique(new se::DeviceMemAllocator(
       executor, platform_device_id,
       use_unified_memory ? stream_executor::MemoryType::kUnified
                          : stream_executor::MemoryType::kDevice,
       alloc_visitors, {}));
-#endif
 }
 
 Allocator* GPUProcessState::GetGPUAllocator(
28 changes: 0 additions & 28 deletions tensorflow/core/common_runtime/gpu/gpu_virtual_mem_allocator.h

This file was deleted.

40 changes: 1 addition & 39 deletions third_party/xla/xla/stream_executor/integrations/BUILD
@@ -1,6 +1,5 @@
-load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 load("//xla:xla.bzl", "xla_cc_test")
-load("//xla/stream_executor:build_defs.bzl", "if_gpu_is_configured", "stream_executor_friends")
+load("//xla/stream_executor:build_defs.bzl", "stream_executor_friends")
 load("@local_tsl//tsl:tsl.bzl", "if_google", "internal_visibility")
 load("@local_tsl//tsl:tsl.default.bzl", "filegroup")
 load("@local_tsl//tsl/platform:rules_cc.bzl", "cc_library")
@@ -79,25 +78,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "gpu_virtual_mem_allocator",
-    srcs = ["gpu_virtual_mem_allocator.cc"],
-    hdrs = ["gpu_virtual_mem_allocator.h"],
-    defines = if_cuda(["GOOGLE_CUDA=1"]),
-    deps = [
-        "//xla/stream_executor:stream_executor_headers",
-        "@com_google_absl//absl/strings:str_format",
-        "@local_tsl//tsl/framework:allocator",
-        "@local_tsl//tsl/framework:device_id_impl",
-        "@local_tsl//tsl/platform:numbers",
-        "@local_tsl//tsl/platform:statusor",
-        "@local_tsl//tsl/profiler/lib:traceme",
-    ] + if_cuda([
-        "//xla/stream_executor/gpu:gpu_driver_header",
-        "//xla/stream_executor/gpu:gpu_types_header",
-    ]),
-)
-
 xla_cc_test(
     name = "tf_allocator_adapter_test",
     srcs = ["tf_allocator_adapter_test.cc"],
@@ -116,21 +96,3 @@ xla_cc_test(
         "@local_tsl//tsl/framework:allocator",
     ]),
 )
-
-xla_cc_test(
-    name = "gpu_virtual_mem_allocator_test",
-    srcs = if_gpu_is_configured(["gpu_virtual_mem_allocator_test.cc"]),
-    tags = [
-        "gpu",
-        "no_oss",
-        "requires-gpu-nvidia",
-    ],
-    deps = [
-        ":gpu_virtual_mem_allocator",
-        "//xla/stream_executor/gpu:gpu_init",
-        "@local_tsl//tsl/framework:device_id_impl",
-        "@local_tsl//tsl/platform:test",
-        "@local_tsl//tsl/platform:test_benchmark",
-        "@local_tsl//tsl/platform:test_main",
-    ],
-)
