From 6aed4827b349773618c2995049f5a31cc1d46220 Mon Sep 17 00:00:00 2001
From: Vladimir Mironov <vmi@terraquantum.swiss>
Date: Tue, 5 Nov 2024 13:20:33 +0000
Subject: [PATCH 1/2] Enable CUDA GPU oversubscription

Split memory evenly between processes on one GPU
---
 .../device/cuda/cuda_backend.cxx              | 41 +++++++++++++++++--
 .../device/cuda/cuda_backend.hpp              |  9 ++++
 .../device/device_backend.hpp                 |  4 +-
 .../device_runtime_environment_impl.hpp       |  2 +-
 .../device/hip/hip_backend.cxx                |  3 +-
 5 files changed, 52 insertions(+), 7 deletions(-)
diff --git a/src/runtime_environment/device/cuda/cuda_backend.cxx b/src/runtime_environment/device/cuda/cuda_backend.cxx
index 610f33f42..4c196cdfc 100644
--- a/src/runtime_environment/device/cuda/cuda_backend.cxx
+++ b/src/runtime_environment/device/cuda/cuda_backend.cxx
@@ -28,6 +28,33 @@ CUDABackend::CUDABackend() {
 
 }
 
+#ifdef GAUXC_HAS_MPI
+CUDABackend::CUDABackend(MPI_Comm c)
+{
+  comm = c;
+  MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0,
+                      MPI_INFO_NULL, &local_comm);
+  MPI_Comm_size(local_comm, &local_size);
+  MPI_Comm_rank(local_comm, &local_rank);
+  int ndev;
+  auto stat = cudaGetDeviceCount(&ndev);
+  GAUXC_CUDA_ERROR("CUDA backend init failed", stat);
+  gpuid = local_rank % ndev;
+  cudaSetDevice(gpuid);
+
+  // Create CUDA Stream and CUBLAS Handles and make them talk to eachother
+  master_stream = std::make_shared< util::cuda_stream >();
+  master_handle = std::make_shared< util::cublas_handle >();
+
+  cublasSetStream( *master_handle, *master_stream );
+
+#ifdef GAUXC_HAS_MAGMA
+  // Setup MAGMA queue with CUDA stream / cuBLAS handle
+  master_magma_queue_ = std::make_shared< util::magma_queue >(0, *master_stream, *master_handle);
+#endif
+}
+#endif
+
 CUDABackend::~CUDABackend() noexcept = default;
 
 CUDABackend::device_buffer_t CUDABackend::allocate_device_buffer(int64_t sz) {
@@ -41,6 +68,15 @@ size_t CUDABackend::get_available_mem() {
   size_t cuda_avail, cuda_total;
   auto stat = cudaMemGetInfo( &cuda_avail, &cuda_total );
   GAUXC_CUDA_ERROR( "MemInfo Failed", stat );
+#ifdef GAUXC_HAS_MPI
+  int ndev;
+  stat = cudaGetDeviceCount(&ndev);
+  GAUXC_CUDA_ERROR("MemInfo Failed while getting number of devices", stat);
+  double factor = 1.0 / ((local_size - 1) / ndev + 1);
+  factor = (factor > 1.0 ? 1.0 : factor);
+  cuda_avail = size_t(cuda_avail * factor);
+  MPI_Barrier(local_comm);
+#endif
   return cuda_avail;
 }
 
@@ -137,8 +173,7 @@ void CUDABackend::check_error_(std::string msg) {
   GAUXC_CUDA_ERROR("CUDA Failed ["+msg+"]", stat );
 }
 
-
-std::unique_ptr<DeviceBackend> make_device_backend() {
-  return std::make_unique<CUDABackend>();
+std::unique_ptr<DeviceBackend> make_device_backend(GAUXC_MPI_CODE(MPI_Comm c)) {
+  return std::make_unique<CUDABackend>(GAUXC_MPI_CODE(c));
 }
 }
diff --git a/src/runtime_environment/device/cuda/cuda_backend.hpp b/src/runtime_environment/device/cuda/cuda_backend.hpp
index 8e47a6a9a..916cad88c 100644
--- a/src/runtime_environment/device/cuda/cuda_backend.hpp
+++ b/src/runtime_environment/device/cuda/cuda_backend.hpp
@@ -52,6 +52,15 @@ struct CUDABackend : public DeviceBackend {
 
   std::vector<std::shared_ptr<util::cuda_stream>>   blas_streams;
   std::vector<std::shared_ptr<util::cublas_handle>> blas_handles;
+
+#ifdef GAUXC_HAS_MPI
+  MPI_Comm comm = MPI_COMM_NULL;
+  MPI_Comm local_comm = MPI_COMM_NULL;
+  int gpuid = 0;
+  int local_rank = 0;
+  int local_size = 1;
+  CUDABackend(MPI_Comm comm);
+#endif
 };
 
 }
diff --git a/src/runtime_environment/device/device_backend.hpp b/src/runtime_environment/device/device_backend.hpp
index 594b79888..f506543f2 100644
--- a/src/runtime_environment/device/device_backend.hpp
+++ b/src/runtime_environment/device/device_backend.hpp
@@ -17,6 +17,7 @@
 #include "device_queue.hpp"
 #include "device_blas_handle.hpp"
 #include <gauxc/gauxc_config.hpp>
+#include "gauxc/runtime_environment.hpp"
 
 #ifdef GAUXC_HAS_MAGMA
 #include "device_specific/magma_util.hpp"
@@ -99,6 +100,5 @@ class DeviceBackend {
 
 
 /// Generate the default device backend for this platform
-std::unique_ptr<DeviceBackend> make_device_backend();
-
+std::unique_ptr<DeviceBackend> make_device_backend(GAUXC_MPI_CODE(MPI_Comm c));
 }
diff --git a/src/runtime_environment/device/device_runtime_environment_impl.hpp b/src/runtime_environment/device/device_runtime_environment_impl.hpp
index 9831c5c21..7d5d0ddf5 100644
--- a/src/runtime_environment/device/device_runtime_environment_impl.hpp
+++ b/src/runtime_environment/device/device_runtime_environment_impl.hpp
@@ -35,7 +35,7 @@ class DeviceRuntimeEnvironmentImpl : public RuntimeEnvironmentImpl {
    size_t sz) : parent_type(GAUXC_MPI_CODE(c)), 
      i_own_this_memory_(false), device_memory_(p), 
      device_memory_size_(sz),
-     device_backend_{make_device_backend()} {}
+     device_backend_{make_device_backend(GAUXC_MPI_CODE(c))} {}
 
 
   explicit DeviceRuntimeEnvironmentImpl(GAUXC_MPI_CODE(MPI_Comm c,)
diff --git a/src/runtime_environment/device/hip/hip_backend.cxx b/src/runtime_environment/device/hip/hip_backend.cxx
index 69c3fd286..be16c8f8a 100644
--- a/src/runtime_environment/device/hip/hip_backend.cxx
+++ b/src/runtime_environment/device/hip/hip_backend.cxx
@@ -128,7 +128,8 @@ void HIPBackend::check_error_(std::string msg) {
   GAUXC_HIP_ERROR("HIP Failed ["+msg+"]", stat );
 }
 
-std::unique_ptr<DeviceBackend> make_device_backend() {
+std::unique_ptr<DeviceBackend> make_device_backend(GAUXC_MPI_CODE(MPI_Comm c))
+{
   return std::make_unique<HIPBackend>();
 }
 }

From 500a40e49b0b7188f13aafd5577d8737f0458e55 Mon Sep 17 00:00:00 2001
From: Vladimir Mironov <vmi@terraquantum.swiss>
Date: Fri, 17 Apr 2026 12:58:06 +0200
Subject: [PATCH 2/2] Address PR #185 review comments

- Free local_comm in CUDABackend destructor
- Guard against ndev <= 0
- Move MPI_Barrier from get_available_mem() to the call site
- Narrow include in device_backend.hpp
- Suppress unused MPI_Comm parameter warning in HIP backend
---
 .../device/cuda/cuda_backend.cxx                    | 13 ++++++++++---
 src/runtime_environment/device/device_backend.hpp   |  2 +-
 .../device/device_runtime_environment_impl.hpp      |  1 +
 src/runtime_environment/device/hip/hip_backend.cxx  |  1 +
 4 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/runtime_environment/device/cuda/cuda_backend.cxx b/src/runtime_environment/device/cuda/cuda_backend.cxx
index 4c196cdfc..2ff21f6af 100644
--- a/src/runtime_environment/device/cuda/cuda_backend.cxx
+++ b/src/runtime_environment/device/cuda/cuda_backend.cxx
@@ -10,6 +10,7 @@
  * See LICENSE.txt for details
  */
 #include "cuda_backend.hpp"
+#include <gauxc/exceptions.hpp>
 
 namespace GauXC {
 
@@ -39,8 +40,10 @@ CUDABackend::CUDABackend(MPI_Comm c)
   int ndev;
   auto stat = cudaGetDeviceCount(&ndev);
   GAUXC_CUDA_ERROR("CUDA backend init failed", stat);
+  if(ndev <= 0) GAUXC_GENERIC_EXCEPTION("No CUDA devices found");
   gpuid = local_rank % ndev;
-  cudaSetDevice(gpuid);
+  stat = cudaSetDevice(gpuid);
+  GAUXC_CUDA_ERROR("cudaSetDevice failed", stat);
 
   // Create CUDA Stream and CUBLAS Handles and make them talk to eachother
   master_stream = std::make_shared< util::cuda_stream >();
@@ -55,7 +58,12 @@ CUDABackend::CUDABackend(MPI_Comm c)
 }
 #endif
 
-CUDABackend::~CUDABackend() noexcept = default;
+CUDABackend::~CUDABackend() noexcept {
+#ifdef GAUXC_HAS_MPI
+  if(local_comm != MPI_COMM_NULL)
+    MPI_Comm_free(&local_comm);
+#endif
+}
 
 CUDABackend::device_buffer_t CUDABackend::allocate_device_buffer(int64_t sz) {
   void* ptr;
@@ -75,7 +83,6 @@ size_t CUDABackend::get_available_mem() {
   double factor = 1.0 / ((local_size - 1) / ndev + 1);
   factor = (factor > 1.0 ? 1.0 : factor);
   cuda_avail = size_t(cuda_avail * factor);
-  MPI_Barrier(local_comm);
 #endif
   return cuda_avail;
 }
diff --git a/src/runtime_environment/device/device_backend.hpp b/src/runtime_environment/device/device_backend.hpp
index f506543f2..b5f566ec1 100644
--- a/src/runtime_environment/device/device_backend.hpp
+++ b/src/runtime_environment/device/device_backend.hpp
@@ -17,7 +17,7 @@
 #include "device_queue.hpp"
 #include "device_blas_handle.hpp"
 #include <gauxc/gauxc_config.hpp>
-#include "gauxc/runtime_environment.hpp"
+#include <gauxc/util/mpi.hpp>
 
 #ifdef GAUXC_HAS_MAGMA
 #include "device_specific/magma_util.hpp"
diff --git a/src/runtime_environment/device/device_runtime_environment_impl.hpp b/src/runtime_environment/device/device_runtime_environment_impl.hpp
index 7d5d0ddf5..15edf5730 100644
--- a/src/runtime_environment/device/device_runtime_environment_impl.hpp
+++ b/src/runtime_environment/device/device_runtime_environment_impl.hpp
@@ -44,6 +44,7 @@ class DeviceRuntimeEnvironmentImpl : public RuntimeEnvironmentImpl {
 
     // Allocate Device Memory
     auto avail = device_backend_->get_available_mem();
+    GAUXC_MPI_CODE(MPI_Barrier(c);)
     avail = std::min( avail, detail::memory_cap() );
 
     std::tie( device_memory_, device_memory_size_ ) = 
diff --git a/src/runtime_environment/device/hip/hip_backend.cxx b/src/runtime_environment/device/hip/hip_backend.cxx
index be16c8f8a..79bf739bf 100644
--- a/src/runtime_environment/device/hip/hip_backend.cxx
+++ b/src/runtime_environment/device/hip/hip_backend.cxx
@@ -130,6 +130,7 @@ void HIPBackend::check_error_(std::string msg) {
 
 std::unique_ptr<DeviceBackend> make_device_backend(GAUXC_MPI_CODE(MPI_Comm c))
 {
+  GAUXC_MPI_CODE((void)c;)
   return std::make_unique<HIPBackend>();
 }
 }