[CUDA] Multi-GPU for CUDA Version #6138

Status: Open. Wants to merge 82 commits into base: master.

82 commits:
ee3923b
initialize nccl
shiyu1994 Oct 10, 2023
82668d0
Merge branch 'master' into nccl-dev
shiyu1994 Oct 26, 2023
6189cbb
Merge branch 'master' into nccl-dev
shiyu1994 Oct 27, 2023
f39f877
change year in header
shiyu1994 Nov 8, 2023
e513662
Merge branch 'master' into nccl-dev
shiyu1994 Nov 8, 2023
47f3e50
Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
shiyu1994 Nov 8, 2023
985780f
add implementation of nccl gbdt
shiyu1994 Nov 8, 2023
35b0ca1
add nccl topology
shiyu1994 Nov 9, 2023
7d36a14
clean up
shiyu1994 Nov 9, 2023
5470d99
Merge branch 'master' into nccl-dev
shiyu1994 Nov 9, 2023
7b47a1e
clean up
shiyu1994 Nov 9, 2023
839c375
Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
shiyu1994 Nov 9, 2023
8eaf3ad
Merge branch 'master' into nccl-dev
shiyu1994 Dec 15, 2023
cc72fc8
Merge branch 'master' into nccl-dev
shiyu1994 Dec 22, 2023
209e25d
set nccl info
shiyu1994 Jan 25, 2024
431f967
support quantized training with categorical features on cpu
shiyu1994 Feb 5, 2024
b07caf2
remove white spaces
shiyu1994 Feb 5, 2024
cf60467
add tests for quantized training with categorical features
shiyu1994 Feb 5, 2024
bf2f649
skip tests for cuda version
shiyu1994 Feb 5, 2024
2fc9525
fix cases when only 1 data block in row-wise quantized histogram cons…
shiyu1994 Feb 6, 2024
dce770c
remove useless capture
shiyu1994 Feb 6, 2024
f0c44fc
Merge branch 'master' into nccl-dev
shiyu1994 Feb 6, 2024
e2cb41f
Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
shiyu1994 Feb 6, 2024
f3985ef
fix inconsistency of gpu devices
shiyu1994 Feb 7, 2024
d000a41
fix creating boosting object from file
shiyu1994 Feb 7, 2024
ecdccd5
change num_gpu to num_gpus in test case
shiyu1994 Feb 7, 2024
dfa4419
fix objective initialization
shiyu1994 Feb 9, 2024
f4b8906
Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
shiyu1994 Feb 9, 2024
f0b22d1
fix c++ compilation warning
shiyu1994 Feb 9, 2024
617b3b2
fix lint errors
shiyu1994 Feb 9, 2024
6d090b2
Merge branch 'master' into fix-6257
shiyu1994 Feb 20, 2024
736ab8a
Merge branch 'master' into nccl-dev
shiyu1994 Feb 20, 2024
ad72d9f
Merge branch 'fix-6257' into nccl-dev
shiyu1994 Feb 20, 2024
2670f48
fix compilation warnings
shiyu1994 Feb 20, 2024
02b725b
change num_gpu to num_gpus in R test case
shiyu1994 Feb 20, 2024
3bfb784
add nccl synchronization in tree training
shiyu1994 Feb 20, 2024
fe1f592
fix global num data update
shiyu1994 Feb 21, 2024
a528bd6
merge master
shiyu1994 Feb 22, 2024
996d70b
fix ruff-format issues
shiyu1994 Feb 22, 2024
671bed3
merge master
shiyu1994 Feb 23, 2024
34610fb
use global num data in split finder
shiyu1994 Feb 23, 2024
041018b
Merge branch 'master' into nccl-dev
shiyu1994 Mar 6, 2024
e1b4512
explicit initialization of NCCLInfo members
shiyu1994 Mar 11, 2024
0a21b5f
Merge branch 'master' into nccl-dev
shiyu1994 Mar 25, 2024
be29624
Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
shiyu1994 Mar 25, 2024
06cfde4
Merge branch 'master' into nccl-dev
shiyu1994 Apr 11, 2024
75afe5e
Merge branch 'master' into nccl-dev
shiyu1994 May 20, 2024
1e6e4a1
Merge branch 'master' into nccl-dev
shiyu1994 Jun 30, 2024
614605c
merge master
shiyu1994 Oct 8, 2024
18babb0
Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
shiyu1994 Oct 8, 2024
11f4062
fix compilation
shiyu1994 Oct 8, 2024
b4c21c2
use CUDAVector
shiyu1994 Oct 9, 2024
70fe10f
use CUDAVector
shiyu1994 Oct 9, 2024
849a554
merge master
shiyu1994 Oct 18, 2024
19a2662
merge master
shiyu1994 Oct 18, 2024
6db879a
use CUDAVector
shiyu1994 Oct 25, 2024
b43f88b
use CUDAVector for cuda tree and column data
shiyu1994 Oct 25, 2024
582c760
update gbdt
shiyu1994 Oct 25, 2024
b9e143b
changes for cuda tree
shiyu1994 Oct 25, 2024
483e521
use CUDAVector for cuda column data
shiyu1994 Oct 25, 2024
950199d
fix bug in GetDataByColumnPointers
shiyu1994 Oct 25, 2024
f30ee85
Merge branch 'master' into nccl-dev
shiyu1994 Oct 25, 2024
d11991a
disable cuda by default
shiyu1994 Oct 25, 2024
4bb4411
Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
shiyu1994 Oct 25, 2024
b56b39e
fix single machine gbdt
shiyu1994 Oct 25, 2024
3bebc19
merge main
shiyu1994 Dec 17, 2024
47b4364
clean up
shiyu1994 Dec 17, 2024
a326c87
fix typo
shiyu1994 Dec 17, 2024
5f999e7
fix lint issues
shiyu1994 Dec 17, 2024
d8ea043
Merge branch 'master' into nccl-dev
shiyu1994 Dec 18, 2024
a0864dc
Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
shiyu1994 Dec 18, 2024
2f040b7
Merge branch 'master' into nccl-dev
shiyu1994 Dec 23, 2024
0cf1062
Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
shiyu1994 Dec 24, 2024
ae4cce6
use num_gpu instead of num_gpus
shiyu1994 Dec 24, 2024
266e02b
Merge branch 'master' into nccl-dev
shiyu1994 Feb 25, 2025
6aa2aff
fix compilation error
shiyu1994 Feb 25, 2025
b137216
fix cpp lint errors
shiyu1994 Feb 25, 2025
3e1452a
Merge branch 'master' into nccl-dev
shiyu1994 Mar 7, 2025
a86bb42
fix reset config for cuda data partition
shiyu1994 Mar 7, 2025
37fb144
fix subrow copy in cuda column data
shiyu1994 Mar 7, 2025
4fa4837
Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
shiyu1994 Mar 7, 2025
5bf50de
fix cmakelint errors
shiyu1994 Mar 7, 2025
Changes from commit ee3923b5d6018292f853df90238d20f6e3c62d13: initialize nccl
shiyu1994 committed Oct 10, 2023
5 changes: 5 additions & 0 deletions CMakeLists.txt
@@ -204,6 +204,7 @@ endif()

if(USE_CUDA)
    find_package(CUDA 11.0 REQUIRED)
    find_package(Nccl REQUIRED)
    include_directories(${CUDA_INCLUDE_DIRS})
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=${OpenMP_CXX_FLAGS} -Xcompiler=-fPIC -Xcompiler=-Wall")

@@ -561,6 +562,10 @@ if(USE_GPU)
    target_link_libraries(lightgbm_objs PUBLIC ${OpenCL_LIBRARY} ${Boost_LIBRARIES})
endif()

if(USE_CUDA)
    target_link_libraries(lightgbm_objs PUBLIC ${NCCL_LIBRARY})
endif(USE_CUDA)

if(__INTEGRATE_OPENCL)
    # targets OpenCL and Boost are added in IntegratedOpenCL.cmake
    add_dependencies(lightgbm_objs OpenCL Boost)
70 changes: 70 additions & 0 deletions cmake/modules/FindNccl.cmake
@@ -0,0 +1,70 @@
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Tries to find NCCL headers and libraries.
#
# Usage of this module as follows:
#
# find_package(NCCL)
#
# Variables used by this module, they can change the default behaviour and need
# to be set before calling find_package:
#
# NCCL_ROOT - When set, this path is inspected instead of standard library
# locations as the root of the NCCL installation.
# The environment variable NCCL_ROOT overrides this variable.
#
# This module defines
# Nccl_FOUND, whether nccl has been found
# NCCL_INCLUDE_DIR, directory containing header
# NCCL_LIBRARY, directory containing nccl library
# NCCL_LIB_NAME, nccl library name
# USE_NCCL_LIB_PATH, when set, NCCL_LIBRARY path is also inspected for the
# location of the nccl library. This would disable
# switching between static and shared.
#
# This module assumes that the user has already called find_package(CUDA)

if (NCCL_LIBRARY)
  if(NOT USE_NCCL_LIB_PATH)
    # Don't cache NCCL_LIBRARY to enable switching between static and shared.
    unset(NCCL_LIBRARY CACHE)
  endif(NOT USE_NCCL_LIB_PATH)
endif()

if (BUILD_WITH_SHARED_NCCL)
  # libnccl.so
  set(NCCL_LIB_NAME nccl)
else ()
  # libnccl_static.a
  set(NCCL_LIB_NAME nccl_static)
endif (BUILD_WITH_SHARED_NCCL)

find_path(NCCL_INCLUDE_DIR
  NAMES nccl.h
  PATHS $ENV{NCCL_ROOT}/include ${NCCL_ROOT}/include)

find_library(NCCL_LIBRARY
  NAMES ${NCCL_LIB_NAME}
  PATHS $ENV{NCCL_ROOT}/lib/ ${NCCL_ROOT}/lib)

message(STATUS "Using nccl library: ${NCCL_LIBRARY}")

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(Nccl DEFAULT_MSG
  NCCL_INCLUDE_DIR NCCL_LIBRARY)

mark_as_advanced(
  NCCL_INCLUDE_DIR
  NCCL_LIBRARY
)
169 changes: 169 additions & 0 deletions src/boosting/cuda/nccl_gbdt.hpp
@@ -0,0 +1,169 @@
/*!
 * Copyright (c) 2021 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */

#ifndef LIGHTGBM_BOOSTING_CUDA_NCCL_GBDT_HPP_
#define LIGHTGBM_BOOSTING_CUDA_NCCL_GBDT_HPP_

#ifdef USE_CUDA

#include "../gbdt.h"
#include <LightGBM/objective_function.h>
#include <LightGBM/network.h>
#include "cuda_score_updater.hpp"
#include <pthread.h>

namespace LightGBM {

template <typename GBDT_T>
class NCCLGBDT: public GBDT_T {
 public:
  NCCLGBDT();

  ~NCCLGBDT();

  void Init(const Config* gbdt_config, const Dataset* train_data,
            const ObjectiveFunction* objective_function,
            const std::vector<const Metric*>& training_metrics) override;

  void Boosting() override;

  void RefitTree(const std::vector<std::vector<int>>& /*tree_leaf_prediction*/) override {
    Log::Fatal("RefitTree is not supported for NCCLGBDT.");
  }

  bool TrainOneIter(const score_t* gradients, const score_t* hessians) override;

  const double* GetTrainingScore(int64_t* /*out_len*/) override {
    Log::Fatal("GetTrainingScore is not supported for NCCLGBDT.");
  }

  void ResetTrainingData(const Dataset* /*train_data*/, const ObjectiveFunction* /*objective_function*/,
                         const std::vector<const Metric*>& /*training_metrics*/) override {
    Log::Fatal("ResetTrainingData is not supported for NCCLGBDT.");
  }

  void ResetConfig(const Config* /*gbdt_config*/) override {
    Log::Fatal("ResetConfig is not supported for NCCLGBDT.");
  }

 private:
  struct BoostingThreadData {
    int gpu_index;
    ObjectiveFunction* gpu_objective_function;
    score_t* gradients;
    score_t* hessians;
    const double* score;

    BoostingThreadData() {
      gpu_index = 0;
      gpu_objective_function = nullptr;
    }
  };

  struct TrainTreeLearnerThreadData {
    int gpu_index;
    TreeLearner* gpu_tree_learner;
    const score_t* gradients;
    const score_t* hessians;
    bool is_first_time;
    int class_id;
    data_size_t num_data_in_gpu;
    std::unique_ptr<Tree> tree;

    TrainTreeLearnerThreadData() {
      gpu_index = 0;
      gpu_tree_learner = nullptr;
      gradients = nullptr;
      hessians = nullptr;
      is_first_time = false;
      class_id = 0;
      num_data_in_gpu = 0;
      tree.reset(nullptr);
    }
  };

  struct UpdateScoreThreadData {
    int gpu_index;
    ScoreUpdater* gpu_score_updater;
    TreeLearner* gpu_tree_learner;
    Tree* tree;
    int cur_tree_id;

    UpdateScoreThreadData() {
      gpu_index = 0;
      gpu_score_updater = nullptr;
      gpu_tree_learner = nullptr;
      tree = nullptr;
      cur_tree_id = 0;
    }
  };

  static void* BoostingThread(void* thread_data);

  static void* TrainTreeLearnerThread(void* thread_data);

  static void* UpdateScoreThread(void* thread_data);

  void Bagging(int /*iter*/) override {
    Log::Fatal("Bagging is not supported for NCCLGBDT.");
  }

  void InitNCCL();

  double BoostFromAverage(int class_id, bool update_scorer) override;

  void UpdateScore(const std::vector<std::unique_ptr<Tree>>& tree, const int cur_tree_id);

  void UpdateScore(const Tree* /*tree*/, const int /*cur_tree_id*/) {
    Log::Fatal("UpdateScore is not supported for NCCLGBDT.");
  }

  void RollbackOneIter() override {
    Log::Fatal("RollbackOneIter is not supported for NCCLGBDT.");
  }

  std::vector<double> EvalOneMetric(const Metric* metric, const double* score, const data_size_t num_data) const override;

  void SetCUDADevice(int gpu_id) const {
    if (gpu_list_.empty()) {
      CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_id));
    } else {
      CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_list_[gpu_id]));
    }
  }

  int GetCUDADevice(int gpu_id) const {
    if (gpu_list_.empty()) {
      return gpu_id;
    } else {
      return gpu_list_[gpu_id];
    }
  }

  int num_gpu_;
  int num_threads_;
  int master_gpu_device_id_;
  int master_gpu_index_;
  std::vector<int> gpu_list_;
  std::vector<std::unique_ptr<ObjectiveFunction>> per_gpu_objective_functions_;
  std::vector<std::unique_ptr<ScoreUpdater>> per_gpu_train_score_updater_;
  std::vector<std::unique_ptr<CUDAVector<score_t>>> per_gpu_gradients_;
  std::vector<std::unique_ptr<CUDAVector<score_t>>> per_gpu_hessians_;
  std::vector<std::unique_ptr<Dataset>> per_gpu_datasets_;
  std::vector<data_size_t> per_gpu_data_start_;
  std::vector<data_size_t> per_gpu_data_end_;
  std::vector<pthread_t> host_threads_;
  std::vector<BoostingThreadData> boosting_thread_data_;
  std::vector<TrainTreeLearnerThreadData> train_tree_learner_thread_data_;
  std::vector<UpdateScoreThreadData> update_score_thread_data_;
  std::vector<int> nccl_gpu_rank_;
  std::vector<ncclComm_t> nccl_communicators_;
  std::vector<std::unique_ptr<TreeLearner>> per_gpu_tree_learners_;
};

}  // namespace LightGBM

#endif  // USE_CUDA
#endif  // LIGHTGBM_BOOSTING_CUDA_NCCL_GBDT_HPP_
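The header keeps per-GPU contiguous row ranges (`per_gpu_data_start_`, `per_gpu_data_end_`) and a `GetCUDADevice` mapping that falls back to the logical index when `gpu_list_` is empty. A minimal standalone sketch of both, assuming an even contiguous split with the remainder given to the leading GPUs (the exact partitioning rule is not visible in this file, so that part is an assumption):

```cpp
#include <cstdint>
#include <vector>

// Hypothetical standalone sketch; data_size_t mirrors LightGBM's typedef.
using data_size_t = int32_t;

// Map a logical GPU index to a physical device id, as in GetCUDADevice():
// identity when no explicit gpu_list is configured.
int GetCUDADevice(const std::vector<int>& gpu_list, int gpu_index) {
  return gpu_list.empty() ? gpu_index : gpu_list[gpu_index];
}

// Split num_data rows into num_gpu contiguous [start, end) ranges,
// giving one extra row to each of the first (num_data % num_gpu) GPUs.
void PartitionRows(data_size_t num_data, int num_gpu,
                   std::vector<data_size_t>* start,
                   std::vector<data_size_t>* end) {
  start->resize(num_gpu);
  end->resize(num_gpu);
  const data_size_t base = num_data / num_gpu;
  const data_size_t rem = num_data % num_gpu;
  data_size_t cur = 0;
  for (int i = 0; i < num_gpu; ++i) {
    (*start)[i] = cur;
    cur += base + (i < rem ? 1 : 0);
    (*end)[i] = cur;
  }
}
```

With this rule, 10 rows over 4 GPUs yield ranges of sizes 3, 3, 2, 2, and every row belongs to exactly one GPU's range.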