From 9750fe3cece2cab917317e3dba79bce658a8c4b4 Mon Sep 17 00:00:00 2001
From: Antoniu Pop <antoniu.pop@zama.ai>
Date: Wed, 19 Jun 2024 14:20:02 +0100
Subject: [PATCH] fix(compiler): [GPU backend] add sanity checks when the
 emitGPUOps option is selected to ensure that the compiler/runtime have GPU
 capability and that at least one device is available to run on.

---
 .../include/concretelang/Runtime/GPUDFG.hpp   | 29 ++++++++++++
 .../concretelang/Support/CompilerEngine.h     |  2 -
 .../compiler/lib/Runtime/GPUDFG.cpp           |  8 +---
 .../compiler/lib/Support/CompilerEngine.cpp   | 44 +++++++++++++++----
 4 files changed, 67 insertions(+), 16 deletions(-)
 create mode 100644 compilers/concrete-compiler/compiler/include/concretelang/Runtime/GPUDFG.hpp
diff --git a/compilers/concrete-compiler/compiler/include/concretelang/Runtime/GPUDFG.hpp b/compilers/concrete-compiler/compiler/include/concretelang/Runtime/GPUDFG.hpp
new file mode 100644
index 000000000..19ca0e0ee
--- /dev/null
+++ b/compilers/concrete-compiler/compiler/include/concretelang/Runtime/GPUDFG.hpp
@@ -0,0 +1,29 @@
+// Part of the Concrete Compiler Project, under the BSD3 License with Zama
+// Exceptions. See
+// https://github.com/zama-ai/concrete/blob/main/LICENSE.txt
+// for license information.
+
+#ifndef CONCRETELANG_GPUDFG_HPP
+#define CONCRETELANG_GPUDFG_HPP
+
+#ifdef CONCRETELANG_CUDA_SUPPORT
+#include "bootstrap.h"
+#include "device.h"
+#include "keyswitch.h"
+#include "linear_algebra.h"
+
+namespace mlir {
+namespace concretelang {
+namespace dfr {
+
+/// Returns true when cudaGetDeviceCount succeeds and reports at least one
+/// device. `inline` because this header is included by several .cpp files.
+inline bool check_cuda_device_available() {
+  int num = 0;
+  return cudaGetDeviceCount(&num) == cudaSuccess && num > 0;
+}
+} // namespace dfr
+} // namespace concretelang
+} // namespace mlir
+#endif
+#endif
diff --git a/compilers/concrete-compiler/compiler/include/concretelang/Support/CompilerEngine.h b/compilers/concrete-compiler/compiler/include/concretelang/Support/CompilerEngine.h
index ae56ea941..a9fbfcc97 100644
--- a/compilers/concrete-compiler/compiler/include/concretelang/Support/CompilerEngine.h
+++ b/compilers/concrete-compiler/compiler/include/concretelang/Support/CompilerEngine.h
@@ -26,8 +26,6 @@ using concretelang::protocol::Message;
 namespace mlir {
 namespace concretelang {
 
-bool getEmitGPUOption();
-
 /// Compilation context that acts as the root owner of LLVM and MLIR
 /// data structures directly and indirectly referenced by artefacts
 /// produced by the `CompilerEngine`.
diff --git a/compilers/concrete-compiler/compiler/lib/Runtime/GPUDFG.cpp b/compilers/concrete-compiler/compiler/lib/Runtime/GPUDFG.cpp
index 3cb3c078c..7b5b6128a 100644
--- a/compilers/concrete-compiler/compiler/lib/Runtime/GPUDFG.cpp
+++ b/compilers/concrete-compiler/compiler/lib/Runtime/GPUDFG.cpp
@@ -18,15 +18,11 @@
 #include <utility>
 #include <vector>
 
+#ifdef CONCRETELANG_CUDA_SUPPORT
+#include <concretelang/Runtime/GPUDFG.hpp>
 #include <concretelang/Runtime/stream_emulator_api.h>
 #include <concretelang/Runtime/wrappers.h>
 
-#ifdef CONCRETELANG_CUDA_SUPPORT
-#include "bootstrap.h"
-#include "device.h"
-#include "keyswitch.h"
-#include "linear_algebra.h"
-
 using RuntimeContext = mlir::concretelang::RuntimeContext;
 
 namespace mlir {
diff --git a/compilers/concrete-compiler/compiler/lib/Support/CompilerEngine.cpp b/compilers/concrete-compiler/compiler/lib/Support/CompilerEngine.cpp
index 3260d920f..5a5cf25e2 100644
--- a/compilers/concrete-compiler/compiler/lib/Support/CompilerEngine.cpp
+++ b/compilers/concrete-compiler/compiler/lib/Support/CompilerEngine.cpp
@@ -63,14 +63,10 @@
 #include "concretelang/Support/LLVMEmitFile.h"
 #include "concretelang/Support/Pipeline.h"
 #include "concretelang/Support/Utils.h"
+#include <concretelang/Runtime/GPUDFG.hpp>
 
 namespace mlir {
 namespace concretelang {
-// TODO: should be removed when bufferization is not related to CAPI lowering
-// Control whether we should call a cpu of gpu function when lowering
-// to CAPI
-static bool EMIT_GPU_OPS;
-bool getEmitGPUOption() { return EMIT_GPU_OPS; }
 
 /// Creates a new compilation context that can be shared across
 /// compilation engines and results
@@ -297,9 +293,6 @@ CompilerEngine::compile(mlir::ModuleOp moduleOp, Target target,
 
   mlir::MLIRContext &mlirContext = *this->compilationContext->getMLIRContext();
 
-  // enable/disable usage of gpu functions during bufferization
-  EMIT_GPU_OPS = options.emitGPUOps;
-
   auto dataflowParallelize =
       options.autoParallelize || options.dataflowParallelize;
   auto loopParallelize = options.autoParallelize || options.loopParallelize;
@@ -310,6 +303,41 @@ CompilerEngine::compile(mlir::ModuleOp moduleOp, Target target,
   if (dataflowParallelize)
     mlir::concretelang::dfr::_dfr_set_required(true);
 
+  // Sanity checks for enabling GPU usage: the compiler must have been
+  // compiled with Cuda support (especially important when building
+  // python wheels), and at least one device must be available to
+  // execute on.
+  if (options.emitGPUOps) {
+    // Test the macro with #ifndef so this matches the #ifdef guard in
+    // GPUDFG.hpp: check_cuda_device_available() is only declared when
+    // CONCRETELANG_CUDA_SUPPORT is defined. Without it, warn (no hard error).
+#ifndef CONCRETELANG_CUDA_SUPPORT
+    // Non-library targets: keep the GPU option so code generation completes.
+    if (target != Target::LIBRARY) {
+      warnx("This instance of the Concrete compiler does not support GPU "
+            "acceleration."
+            " Allowing code generation to proceed, but execution will not be "
+            "possible.");
+    } else {
+      warnx("This instance of the Concrete compiler does not support GPU "
+            "acceleration."
+            " If you are using Concrete-Python, it means that the module "
+            "installed is not GPU enabled.\n"
+            "Continuing without GPU acceleration.");
+      options.emitGPUOps = false;
+    }
+#else
+    // Ensure that at least one Cuda device is available if GPU option
+    // is used; otherwise fall back to compiling without GPU ops.
+    if (!mlir::concretelang::dfr::check_cuda_device_available()) {
+      warnx("No Cuda device available on this system (either not present or "
+            "the driver is not online).\n"
+            "Continuing without GPU acceleration.");
+      options.emitGPUOps = false;
+    }
+#endif
+  }
+
   mlir::OwningOpRef<mlir::ModuleOp> mlirModuleRef(moduleOp);
   res.mlirModuleRef = std::move(mlirModuleRef);
   mlir::ModuleOp module = res.mlirModuleRef->get();