[CUDA] Multi-GPU for CUDA Version #6138
Open
shiyu1994 wants to merge 82 commits into master from nccl-dev
+2,074 −1,009
Changes from 1 commit
82 commits (all by shiyu1994):
ee3923b  initialize nccl
82668d0  Merge branch 'master' into nccl-dev
6189cbb  Merge branch 'master' into nccl-dev
f39f877  change year in header
e513662  Merge branch 'master' into nccl-dev
47f3e50  Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
985780f  add implementation of nccl gbdt
35b0ca1  add nccl topology
7d36a14  clean up
5470d99  Merge branch 'master' into nccl-dev
7b47a1e  clean up
839c375  Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
8eaf3ad  Merge branch 'master' into nccl-dev
cc72fc8  Merge branch 'master' into nccl-dev
209e25d  set nccl info
431f967  support quantized training with categorical features on cpu
b07caf2  remove white spaces
cf60467  add tests for quantized training with categorical features
bf2f649  skip tests for cuda version
2fc9525  fix cases when only 1 data block in row-wise quantized histogram cons…
dce770c  remove useless capture
f0c44fc  Merge branch 'master' into nccl-dev
e2cb41f  Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
f3985ef  fix inconsistency of gpu devices
d000a41  fix creating boosting object from file
ecdccd5  change num_gpu to num_gpus in test case
dfa4419  fix objective initialization
f4b8906  Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
f0b22d1  fix c++ compilation warning
617b3b2  fix lint errors
6d090b2  Merge branch 'master' into fix-6257
736ab8a  Merge branch 'master' into nccl-dev
ad72d9f  Merge branch 'fix-6257' into nccl-dev
2670f48  fix compilation warnings
02b725b  change num_gpu to num_gpus in R test case
3bfb784  add nccl synchronization in tree training
fe1f592  fix global num data update
a528bd6  merge master
996d70b  fix ruff-format issues
671bed3  merge master
34610fb  use global num data in split finder
041018b  Merge branch 'master' into nccl-dev
e1b4512  explicit initialization of NCCLInfo members
0a21b5f  Merge branch 'master' into nccl-dev
be29624  Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
06cfde4  Merge branch 'master' into nccl-dev
75afe5e  Merge branch 'master' into nccl-dev
1e6e4a1  Merge branch 'master' into nccl-dev
614605c  merge master
18babb0  Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
11f4062  fix compilation
b4c21c2  use CUDAVector
70fe10f  use CUDAVector
849a554  merge master
19a2662  merge master
6db879a  use CUDAVector
b43f88b  use CUDAVector for cuda tree and column data
582c760  update gbdt
b9e143b  changes for cuda tree
483e521  use CUDAVector for cuda column data
950199d  fix bug in GetDataByColumnPointers
f30ee85  Merge branch 'master' into nccl-dev
d11991a  disable cuda by default
4bb4411  Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
b56b39e  fix single machine gbdt
3bebc19  merge main
47b4364  clean up
a326c87  fix typo
5f999e7  fix lint issues
d8ea043  Merge branch 'master' into nccl-dev
a0864dc  Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
2f040b7  Merge branch 'master' into nccl-dev
0cf1062  Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
ae4cce6  use num_gpu instead of num_gpus
266e02b  Merge branch 'master' into nccl-dev
6aa2aff  fix compilation error
b137216  fix cpp lint errors
3e1452a  Merge branch 'master' into nccl-dev
a86bb42  fix reset config for cuda data partition
37fb144  fix subrow copy in cuda column data
4fa4837  Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
5bf50de  fix cmakelint errors
Commit shown below: 35b0ca147b011b9453130c55b42aad197bc0bc88 ("add nccl topology")
@@ -0,0 +1,225 @@
/*!
 * Copyright (c) 2023 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */

#include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/network.h>
#include <LightGBM/utils/common.h>

#include <nccl.h>

#include <functional>
#include <memory>
#include <set>
#include <string>
#include <thread>
#include <vector>

namespace LightGBM {

class NCCLTopology {
 public:
  NCCLTopology(const int master_gpu_device_id, const int num_gpus, const std::string& gpu_device_id_list, const data_size_t global_num_data) {
    num_gpus_ = num_gpus;
    master_gpu_device_id_ = master_gpu_device_id;
    global_num_data_ = global_num_data;
    int max_num_gpu = 0;
    CUDASUCCESS_OR_FATAL(cudaGetDeviceCount(&max_num_gpu));
    if (gpu_device_id_list != std::string("")) {
      std::set<int> gpu_id_set;
      std::vector<std::string> gpu_list_str = Common::Split(gpu_device_id_list.c_str(), ",");
      for (const auto& gpu_str : gpu_list_str) {
        int gpu_id = 0;
        Common::Atoi<int>(gpu_str.c_str(), &gpu_id);
        if (gpu_id < 0 || gpu_id >= max_num_gpu) {
          Log::Warning("Invalid GPU device ID %d in gpu_device_list is ignored.", gpu_id);
        } else {
          gpu_id_set.insert(gpu_id);
        }
      }
      for (const int gpu_id : gpu_id_set) {
        gpu_list_.push_back(gpu_id);
      }
    }
    if (!gpu_list_.empty() && num_gpus_ != static_cast<int>(gpu_list_.size())) {
      Log::Warning("num_gpus_ = %d is different from the number of valid device IDs in gpu_device_list (%d), using %d GPUs instead.",
                   num_gpus_, static_cast<int>(gpu_list_.size()), static_cast<int>(gpu_list_.size()));
      num_gpus_ = static_cast<int>(gpu_list_.size());
    }

    if (!gpu_list_.empty()) {
      bool check_master_gpu = false;
      for (int i = 0; i < static_cast<int>(gpu_list_.size()); ++i) {
        const int gpu_id = gpu_list_[i];
        if (gpu_id == master_gpu_device_id_) {
          check_master_gpu = true;
          master_gpu_index_ = i;
          break;
        }
      }
      if (!check_master_gpu) {
        Log::Warning("Master GPU index not in gpu_device_list. Using %d as the master GPU instead.", gpu_list_[0]);
        master_gpu_device_id_ = gpu_list_[0];
        master_gpu_index_ = 0;
      }
    } else {
      if (num_gpus_ <= 0) {
        num_gpus_ = 1;
      } else if (num_gpus_ > max_num_gpu) {
        Log::Warning("Only %d GPUs available, using num_gpu = %d.", max_num_gpu, max_num_gpu);
        num_gpus_ = max_num_gpu;
      }
      if (master_gpu_device_id_ < 0 || master_gpu_device_id_ >= num_gpus_) {
        Log::Warning("Invalid gpu_device_id = %d for master GPU index, using gpu_device_id = 0 instead.", master_gpu_device_id_);
        master_gpu_device_id_ = 0;
        master_gpu_index_ = 0;
      }
      for (int i = 0; i < num_gpus_; ++i) {
        gpu_list_.push_back(i);
      }
    }

    Log::Info("Using GPU devices %s, and local master GPU device %d.", Common::Join<int>(gpu_list_, ",").c_str(), master_gpu_device_id_);

    const int num_threads = OMP_NUM_THREADS();
    if (num_gpus_ > num_threads) {
      Log::Fatal("Number of GPUs %d is greater than the number of threads %d. Please use more threads.", num_gpus_, num_threads);
    }

    host_threads_.resize(num_gpus_);
  }

  void InitNCCL() {
    nccl_gpu_rank_.resize(num_gpus_, -1);
    nccl_communicators_.resize(num_gpus_);
    ncclUniqueId nccl_unique_id;
    if (Network::num_machines() == 1 || Network::rank() == 0) {
      NCCLCHECK(ncclGetUniqueId(&nccl_unique_id));
    }
    if (Network::num_machines() > 1) {
      // share the unique id generated on rank 0 with all other machines
      std::vector<ncclUniqueId> output_buffer(Network::num_machines());
      Network::Allgather(
        reinterpret_cast<char*>(&nccl_unique_id),
        sizeof(ncclUniqueId) / sizeof(char),
        reinterpret_cast<char*>(output_buffer.data()));
      if (Network::rank() > 0) {
        nccl_unique_id = output_buffer[0];
      }
    }

    if (Network::num_machines() > 1) {
      // gather the per-machine GPU counts and turn them into prefix-sum offsets of global NCCL ranks
      node_rank_offset_.resize(Network::num_machines() + 1, 0);
      Network::Allgather(
        reinterpret_cast<char*>(&num_gpus_),
        sizeof(int) / sizeof(char),
        reinterpret_cast<char*>(node_rank_offset_.data() + 1));
      for (int rank = 1; rank < Network::num_machines() + 1; ++rank) {
        node_rank_offset_[rank] += node_rank_offset_[rank - 1];
      }
      CHECK_EQ(node_rank_offset_[Network::rank() + 1] - node_rank_offset_[Network::rank()], num_gpus_);
      NCCLCHECK(ncclGroupStart());
      for (int gpu_index = 0; gpu_index < num_gpus_; ++gpu_index) {
        SetCUDADevice(gpu_list_[gpu_index], __FILE__, __LINE__);
        nccl_gpu_rank_[gpu_index] = gpu_index + node_rank_offset_[Network::rank()];
        NCCLCHECK(ncclCommInitRank(&nccl_communicators_[gpu_index], node_rank_offset_.back(), nccl_unique_id, nccl_gpu_rank_[gpu_index]));
      }
      NCCLCHECK(ncclGroupEnd());
    } else {
      NCCLCHECK(ncclGroupStart());
      for (int gpu_index = 0; gpu_index < num_gpus_; ++gpu_index) {
        SetCUDADevice(gpu_list_[gpu_index], __FILE__, __LINE__);
        nccl_gpu_rank_[gpu_index] = gpu_index;
        NCCLCHECK(ncclCommInitRank(&nccl_communicators_[gpu_index], num_gpus_, nccl_unique_id, gpu_index));
      }
      NCCLCHECK(ncclGroupEnd());
    }

    // return to master gpu device
    CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_));
  }

  template <typename ARG_T, typename RET_T>
  void RunPerDevice(const std::vector<std::unique_ptr<ARG_T>>& objs, const std::function<RET_T(ARG_T*)>& func) {
    #pragma omp parallel for schedule(static) num_threads(num_gpus_)
    for (int i = 0; i < num_gpus_; ++i) {
      CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_list_[i]));
      func(objs[i].get());
    }
    CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_));
  }

  template <typename RET_T>
  void InitPerDevice(std::vector<std::unique_ptr<RET_T>>& vec) {
    vec.resize(num_gpus_);
    #pragma omp parallel for schedule(static) num_threads(num_gpus_)
    for (int i = 0; i < num_gpus_; ++i) {
      CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_list_[i]));
      // construct one object per device and attach its NCCL communicator, ranks and device id
      vec[i].reset(new RET_T());
      vec[i]->SetNCCLInfo(nccl_communicators_[i], nccl_gpu_rank_[i], i, gpu_list_[i], global_num_data_);
    }
    CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_));
  }

  template <typename ARG_T>
  void DispatchPerDevice(std::vector<std::unique_ptr<ARG_T>>& objs, const std::function<void(ARG_T*)>& func) {
    // run func on every device concurrently, one host thread per GPU
    for (int i = 0; i < num_gpus_; ++i) {
      host_threads_[i] = std::thread([this, i, &func, &objs] () {
        CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_list_[i]));
        func(objs[i].get());
      });
    }
    for (int i = 0; i < num_gpus_; ++i) {
      host_threads_[i].join();
    }
    CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_));
  }

  template <typename ARG_T, typename RET_T>
  void RunOnMasterDevice(const std::vector<std::unique_ptr<ARG_T>>& objs, const std::function<RET_T(ARG_T*)>& func) {
    CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_));
    func(objs[master_gpu_index_].get());
  }

  template <typename ARG_T, typename RET_T>
  void RunOnNonMasterDevice(const std::vector<std::unique_ptr<ARG_T>>& objs, const std::function<RET_T(ARG_T*)>& func) {
    for (int i = 0; i < num_gpus_; ++i) {
      if (i != master_gpu_index_) {
        CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_list_[i]));
        func(objs[i].get());
      }
    }
    CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_));
  }

  int num_gpus() const { return num_gpus_; }

  int master_gpu_index() const { return master_gpu_index_; }

  int master_gpu_device_id() const { return master_gpu_device_id_; }

  const std::vector<int>& gpu_list() const { return gpu_list_; }

 private:
  int num_gpus_;
  int master_gpu_index_;
  int master_gpu_device_id_;
  std::vector<int> gpu_list_;
  data_size_t global_num_data_;

  ncclUniqueId nccl_unique_id_;
  std::vector<int> node_rank_offset_;
  std::vector<int> nccl_gpu_rank_;
  std::vector<ncclComm_t> nccl_communicators_;
  std::vector<std::thread> host_threads_;
};

}  // namespace LightGBM
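
For orientation, here is a minimal usage sketch of the class above. It is illustrative only: PerDeviceWorker, its members, and the parameter values are hypothetical placeholders and are not part of this PR; in the PR itself the per-device objects are the CUDA training components (see the "add implementation of nccl gbdt" commit).

// Hypothetical illustration only; PerDeviceWorker is a made-up placeholder type.
struct PerDeviceWorker {
  // signature mirrors the call made by NCCLTopology::InitPerDevice above
  void SetNCCLInfo(ncclComm_t comm, int nccl_rank, int local_index, int gpu_device_id, LightGBM::data_size_t global_num_data) {
    comm_ = comm;
    nccl_rank_ = nccl_rank;
    local_index_ = local_index;
    gpu_device_id_ = gpu_device_id;
    global_num_data_ = global_num_data;
  }
  void TrainOneIter() { /* per-device training work would go here */ }

  ncclComm_t comm_{};
  int nccl_rank_ = 0;
  int local_index_ = 0;
  int gpu_device_id_ = 0;
  LightGBM::data_size_t global_num_data_ = 0;
};

void ExampleMultiGPURun() {
  // 4 local GPUs, device 0 as the local master, no explicit device-id list, 1M rows
  LightGBM::NCCLTopology topology(/*master_gpu_device_id=*/0, /*num_gpus=*/4,
                                  /*gpu_device_id_list=*/std::string(""), /*global_num_data=*/1000000);
  topology.InitNCCL();  // one NCCL communicator per local GPU

  std::vector<std::unique_ptr<PerDeviceWorker>> workers;
  topology.InitPerDevice<PerDeviceWorker>(workers);  // one worker per GPU, tagged with its NCCL rank

  // run one training iteration on every GPU concurrently
  topology.DispatchPerDevice<PerDeviceWorker>(workers,
      [](PerDeviceWorker* worker) { worker->TrainOneIter(); });
}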
I'm ok with changing the main parameter name to num_gpus, but can we please keep num_gpu as a parameter alias, so that existing code using that parameter isn't broken?
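
For reference, LightGBM declares parameter aliases through annotation comments in include/LightGBM/config.h, which the parameter generator script turns into the alias table used when parameters are parsed. A rough sketch of what keeping num_gpu as an alias of num_gpus could look like under that convention; this is illustrative only, and the description wording is assumed rather than taken from this PR:

// Sketch for include/LightGBM/config.h (illustrative, not the actual change in this PR).
// The "alias =" annotation is what lets existing configs keep using num_gpu.

// alias = num_gpu
// desc = number of GPUs to use for training (assumed wording; CUDA device type only)
int num_gpus = 1;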