zama-ai · antoniupop · Jun 24, 2024 · Jun 19, 2024 · Jun 21, 2024
diff --git a/compilers/concrete-compiler/compiler/include/concretelang/Runtime/GPUDFG.hpp b/compilers/concrete-compiler/compiler/include/concretelang/Runtime/GPUDFG.hpp
@@ -0,0 +1,28 @@
+// Part of the Concrete Compiler Project, under the BSD3 License with Zama
+// Exceptions. See
+// https://github.com/zama-ai/concrete/blob/main/LICENSE.txt
+// for license information.
+
+#ifndef CONCRETELANG_GPUDFG_HPP
+#define CONCRETELANG_GPUDFG_HPP
+
+#ifdef CONCRETELANG_CUDA_SUPPORT
+#include "bootstrap.h"
+#include "device.h"
+#include "keyswitch.h"
+#include "linear_algebra.h"
+
+#endif // CONCRETELANG_CUDA_SUPPORT
+
+namespace mlir {
+namespace concretelang {
+namespace gpu_dfg {
+
+bool check_cuda_device_available(); // true iff a CUDA device is detected
+bool check_cuda_runtime_enabled();  // true iff built with CUDA support
+
+} // namespace gpu_dfg
+} // namespace concretelang
+} // namespace mlir
+
+#endif // CONCRETELANG_GPUDFG_HPP
diff --git a/compilers/concrete-compiler/compiler/include/concretelang/Support/CompilerEngine.h b/compilers/concrete-compiler/compiler/include/concretelang/Support/CompilerEngine.h
@@ -26,8 +26,6 @@ using concretelang::protocol::Message;
 namespace mlir {
 namespace concretelang {
 
-bool getEmitGPUOption();
-
 /// Compilation context that acts as the root owner of LLVM and MLIR
 /// data structures directly and indirectly referenced by artefacts
 /// produced by the `CompilerEngine`.

diff --git a/compilers/concrete-compiler/compiler/lib/Bindings/Python/CompilerAPIModule.cpp b/compilers/concrete-compiler/compiler/lib/Bindings/Python/CompilerAPIModule.cpp
@@ -12,6 +12,7 @@
 #include "concretelang/Common/Keysets.h"
 #include "concretelang/Dialect/FHE/IR/FHEOpsDialect.h.inc"
 #include "concretelang/Runtime/DFRuntime.hpp"
+#include "concretelang/Runtime/GPUDFG.hpp"
 #include "concretelang/ServerLib/ServerLib.h"
 #include "concretelang/Support/logging.h"
 #include <llvm/Support/Debug.h>
@@ -462,6 +463,14 @@ void initDataflowParallelization() {
   mlir::concretelang::dfr::_dfr_set_required(true);
 }
 
+bool checkGPURuntimeEnabled() { // bound as check_gpu_runtime_enabled
+  return mlir::concretelang::gpu_dfg::check_cuda_runtime_enabled();
+}
+
+bool checkCudaDeviceAvailable() { // bound as check_cuda_device_available
+  return mlir::concretelang::gpu_dfg::check_cuda_device_available();
+}
+
 std::string roundTrip(const char *module) {
   std::shared_ptr<mlir::concretelang::CompilationContext> ccx =
       mlir::concretelang::CompilationContext::createShared();
@@ -673,6 +682,8 @@ void mlir::concretelang::python::populateCompilerAPISubmodule(
   m.def("terminate_df_parallelization", &terminateDataflowParallelization);
 
   m.def("init_df_parallelization", &initDataflowParallelization);
+  m.def("check_gpu_runtime_enabled", &checkGPURuntimeEnabled);
+  m.def("check_cuda_device_available", &checkCudaDeviceAvailable);
 
   pybind11::enum_<mlir::concretelang::Backend>(m, "Backend")
       .value("CPU", mlir::concretelang::Backend::CPU)

diff --git a/compilers/concrete-compiler/compiler/lib/Bindings/Python/concrete/compiler/__init__.py b/compilers/concrete-compiler/compiler/lib/Bindings/Python/concrete/compiler/__init__.py
@@ -8,6 +8,8 @@
 from mlir._mlir_libs._concretelang._compiler import (
     terminate_df_parallelization as _terminate_df_parallelization,
     init_df_parallelization as _init_df_parallelization,
+    check_gpu_runtime_enabled as _check_gpu_runtime_enabled,
+    check_cuda_device_available as _check_cuda_device_available,
 )
 from mlir._mlir_libs._concretelang._compiler import round_trip as _round_trip
 from mlir._mlir_libs._concretelang._compiler import (
@@ -49,6 +51,18 @@ def init_dfr():
     _init_df_parallelization()
 
 
+def check_gpu_enabled() -> bool:
+    """Return True if this compiler/runtime build supports GPU offloading.
+
+    GPU offloading is not always available, in particular in non-GPU wheels."""
+    return _check_gpu_runtime_enabled()
+
+
+def check_gpu_available() -> bool:
+    """Return True if at least one CUDA device is available and online."""
+    return _check_cuda_device_available()
+
+
 # Cleanly terminate the dataflow runtime if it has been initialized
 # (does nothing otherwise)
 atexit.register(_terminate_df_parallelization)

diff --git a/compilers/concrete-compiler/compiler/lib/Runtime/CMakeLists.txt b/compilers/concrete-compiler/compiler/lib/Runtime/CMakeLists.txt
@@ -6,7 +6,7 @@ if(CONCRETELANG_CUDA_SUPPORT)
   target_link_libraries(ConcretelangRuntime PRIVATE hwloc)
 else()
   add_library(ConcretelangRuntime SHARED context.cpp simulation.cpp wrappers.cpp DFRuntime.cpp key_manager.cpp
-                                         StreamEmulator.cpp)
+                                         GPUDFG.cpp)
 endif()
 
 add_dependencies(ConcretelangRuntime concrete_cpu concrete_cpu_noise_model concrete-protocol)

diff --git a/compilers/concrete-compiler/compiler/lib/Runtime/GPUDFG.cpp b/compilers/concrete-compiler/compiler/lib/Runtime/GPUDFG.cpp
@@ -3,6 +3,7 @@
 // https://github.com/zama-ai/concrete/blob/main/LICENSE.txt
 // for license information.
 
+#ifdef CONCRETELANG_CUDA_SUPPORT
 #include <atomic>
 #include <cmath>
 #include <cstdarg>
@@ -18,15 +19,10 @@
 #include <utility>
 #include <vector>
 
+#include <concretelang/Runtime/GPUDFG.hpp>
 #include <concretelang/Runtime/stream_emulator_api.h>
 #include <concretelang/Runtime/wrappers.h>
 
-#ifdef CONCRETELANG_CUDA_SUPPORT
-#include "bootstrap.h"
-#include "device.h"
-#include "keyswitch.h"
-#include "linear_algebra.h"
-
 using RuntimeContext = mlir::concretelang::RuntimeContext;
 
 namespace mlir {
@@ -1652,3 +1648,30 @@ void *stream_emulator_init() {
 void stream_emulator_run(void *dfg) {}
 void stream_emulator_delete(void *dfg) { delete (GPU_DFG *)dfg; }
 #endif
+
+namespace mlir {
+namespace concretelang {
+namespace gpu_dfg {
+
+bool check_cuda_device_available() {
+#ifdef CONCRETELANG_CUDA_SUPPORT
+  int num; // only read after cudaGetDeviceCount reports success
+  if (cudaGetDeviceCount(&num) != cudaSuccess)
+    return false; // a failed device query is reported as "no device"
+  return num > 0;
+#else
+  return false; // built without CUDA support: never any device
+#endif
+}
+
+bool check_cuda_runtime_enabled() {
+#ifdef CONCRETELANG_CUDA_SUPPORT
+  return true; // decided at build time, no device is queried here
+#else
+  return false;
+#endif
+}
+
+} // namespace gpu_dfg
+} // namespace concretelang
+} // namespace mlir
diff --git a/compilers/concrete-compiler/compiler/lib/Runtime/context.cpp b/compilers/concrete-compiler/compiler/lib/Runtime/context.cpp
@@ -41,14 +41,15 @@ RuntimeContext::RuntimeContext(ServerKeyset serverKeyset)
   }
 
 #ifdef CONCRETELANG_CUDA_SUPPORT
-  assert(cudaGetDeviceCount(&num_devices) == cudaSuccess);
-  bsk_gpu.resize(num_devices);
-  ksk_gpu.resize(num_devices);
-  for (int i = 0; i < num_devices; ++i) {
-    bsk_gpu[i].resize(serverKeyset.lweBootstrapKeys.size(), nullptr);
-    ksk_gpu[i].resize(serverKeyset.lweKeyswitchKeys.size(), nullptr);
-    bsk_gpu_mutex.push_back(std::make_unique<std::mutex>());
-    ksk_gpu_mutex.push_back(std::make_unique<std::mutex>());
+  if (cudaGetDeviceCount(&num_devices) == cudaSuccess) {
+    bsk_gpu.resize(num_devices);
+    ksk_gpu.resize(num_devices);
+    for (int i = 0; i < num_devices; ++i) {
+      bsk_gpu[i].resize(serverKeyset.lweBootstrapKeys.size(), nullptr);
+      ksk_gpu[i].resize(serverKeyset.lweKeyswitchKeys.size(), nullptr);
+      bsk_gpu_mutex.push_back(std::make_unique<std::mutex>());
+      ksk_gpu_mutex.push_back(std::make_unique<std::mutex>());
+    }
   }
 #endif
 }

diff --git a/compilers/concrete-compiler/compiler/lib/Support/CompilerEngine.cpp b/compilers/concrete-compiler/compiler/lib/Support/CompilerEngine.cpp
@@ -63,14 +63,10 @@
 #include "concretelang/Support/LLVMEmitFile.h"
 #include "concretelang/Support/Pipeline.h"
 #include "concretelang/Support/Utils.h"
+#include <concretelang/Runtime/GPUDFG.hpp>
 
 namespace mlir {
 namespace concretelang {
-// TODO: should be removed when bufferization is not related to CAPI lowering
-// Control whether we should call a cpu of gpu function when lowering
-// to CAPI
-static bool EMIT_GPU_OPS;
-bool getEmitGPUOption() { return EMIT_GPU_OPS; }
 
 /// Creates a new compilation context that can be shared across
 /// compilation engines and results
@@ -297,9 +293,6 @@ CompilerEngine::compile(mlir::ModuleOp moduleOp, Target target,
 
   mlir::MLIRContext &mlirContext = *this->compilationContext->getMLIRContext();
 
-  // enable/disable usage of gpu functions during bufferization
-  EMIT_GPU_OPS = options.emitGPUOps;
-
   auto dataflowParallelize =
       options.autoParallelize || options.dataflowParallelize;
   auto loopParallelize = options.autoParallelize || options.loopParallelize;
@@ -310,6 +303,45 @@ CompilerEngine::compile(mlir::ModuleOp moduleOp, Target target,
   if (dataflowParallelize)
     mlir::concretelang::dfr::_dfr_set_required(true);
 
+  // Sanity checks for enabling GPU usage: the compiler must have been
+  // compiled with Cuda support (especially important when building
+  // python wheels), and at least one device must be available to
+  // execute on.
+  if (options.emitGPUOps) {
+    // If this compiler was not built with Cuda support, requesting GPU
+    // cannot be honoured - instead of a hard error, issue a warning and,
+    // when targeting a library, disable the GPU-related options.
+    if (!mlir::concretelang::gpu_dfg::check_cuda_runtime_enabled()) {
+      // Allow compilation to complete if only code generation is expected.
+      if (target != Target::LIBRARY) {
+        warnx("This instance of the Concrete compiler does not support GPU "
+              "acceleration."
+              " Allowing code generation to proceed, but execution will not be "
+              "possible.");
+      } else {
+        warnx("This instance of the Concrete compiler does not support GPU "
+              "acceleration."
+              " If you are using Concrete-Python, it means that the module "
+              "installed is not GPU enabled.\n"
+              "Continuing without GPU acceleration.");
+        options.emitGPUOps = false;
+        options.emitSDFGOps = false;
+        options.batchTFHEOps = false;
+      }
+    } else {
+      // Ensure that at least one Cuda device is available if GPU option
+      // is used
+      if (!mlir::concretelang::gpu_dfg::check_cuda_device_available()) {
+        warnx("No Cuda device available on this system (either not present or "
+              "the driver is not online).\n"
+              "Continuing without GPU acceleration.");
+        options.emitGPUOps = false;
+        options.emitSDFGOps = false;
+        options.batchTFHEOps = false;
+      }
+    }
+  }
+
   mlir::OwningOpRef<mlir::ModuleOp> mlirModuleRef(moduleOp);
   res.mlirModuleRef = std::move(mlirModuleRef);
   mlir::ModuleOp module = res.mlirModuleRef->get();