[CUDA] Multi-GPU for CUDA Version #6138

Open
wants to merge 82 commits into base: master
Changes from 1 commit (82 commits total)
Commits
ee3923b
initialize nccl
shiyu1994 Oct 10, 2023
82668d0
Merge branch 'master' into nccl-dev
shiyu1994 Oct 26, 2023
6189cbb
Merge branch 'master' into nccl-dev
shiyu1994 Oct 27, 2023
f39f877
change year in header
shiyu1994 Nov 8, 2023
e513662
Merge branch 'master' into nccl-dev
shiyu1994 Nov 8, 2023
47f3e50
Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
shiyu1994 Nov 8, 2023
985780f
add implementation of nccl gbdt
shiyu1994 Nov 8, 2023
35b0ca1
add nccl topology
shiyu1994 Nov 9, 2023
7d36a14
clean up
shiyu1994 Nov 9, 2023
5470d99
Merge branch 'master' into nccl-dev
shiyu1994 Nov 9, 2023
7b47a1e
clean up
shiyu1994 Nov 9, 2023
839c375
Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
shiyu1994 Nov 9, 2023
8eaf3ad
Merge branch 'master' into nccl-dev
shiyu1994 Dec 15, 2023
cc72fc8
Merge branch 'master' into nccl-dev
shiyu1994 Dec 22, 2023
209e25d
set nccl info
shiyu1994 Jan 25, 2024
431f967
support quantized training with categorical features on cpu
shiyu1994 Feb 5, 2024
b07caf2
remove white spaces
shiyu1994 Feb 5, 2024
cf60467
add tests for quantized training with categorical features
shiyu1994 Feb 5, 2024
bf2f649
skip tests for cuda version
shiyu1994 Feb 5, 2024
2fc9525
fix cases when only 1 data block in row-wise quantized histogram cons…
shiyu1994 Feb 6, 2024
dce770c
remove useless capture
shiyu1994 Feb 6, 2024
f0c44fc
Merge branch 'master' into nccl-dev
shiyu1994 Feb 6, 2024
e2cb41f
Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
shiyu1994 Feb 6, 2024
f3985ef
fix inconsistency of gpu devices
shiyu1994 Feb 7, 2024
d000a41
fix creating boosting object from file
shiyu1994 Feb 7, 2024
ecdccd5
change num_gpu to num_gpus in test case
shiyu1994 Feb 7, 2024
dfa4419
fix objective initialization
shiyu1994 Feb 9, 2024
f4b8906
Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
shiyu1994 Feb 9, 2024
f0b22d1
fix c++ compilation warning
shiyu1994 Feb 9, 2024
617b3b2
fix lint errors
shiyu1994 Feb 9, 2024
6d090b2
Merge branch 'master' into fix-6257
shiyu1994 Feb 20, 2024
736ab8a
Merge branch 'master' into nccl-dev
shiyu1994 Feb 20, 2024
ad72d9f
Merge branch 'fix-6257' into nccl-dev
shiyu1994 Feb 20, 2024
2670f48
fix compilation warnings
shiyu1994 Feb 20, 2024
02b725b
change num_gpu to num_gpus in R test case
shiyu1994 Feb 20, 2024
3bfb784
add nccl synchronization in tree training
shiyu1994 Feb 20, 2024
fe1f592
fix global num data update
shiyu1994 Feb 21, 2024
a528bd6
merge master
shiyu1994 Feb 22, 2024
996d70b
fix ruff-format issues
shiyu1994 Feb 22, 2024
671bed3
merge master
shiyu1994 Feb 23, 2024
34610fb
use global num data in split finder
shiyu1994 Feb 23, 2024
041018b
Merge branch 'master' into nccl-dev
shiyu1994 Mar 6, 2024
e1b4512
explicit initialization of NCCLInfo members
shiyu1994 Mar 11, 2024
0a21b5f
Merge branch 'master' into nccl-dev
shiyu1994 Mar 25, 2024
be29624
Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
shiyu1994 Mar 25, 2024
06cfde4
Merge branch 'master' into nccl-dev
shiyu1994 Apr 11, 2024
75afe5e
Merge branch 'master' into nccl-dev
shiyu1994 May 20, 2024
1e6e4a1
Merge branch 'master' into nccl-dev
shiyu1994 Jun 30, 2024
614605c
merge master
shiyu1994 Oct 8, 2024
18babb0
Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
shiyu1994 Oct 8, 2024
11f4062
fix compilation
shiyu1994 Oct 8, 2024
b4c21c2
use CUDAVector
shiyu1994 Oct 9, 2024
70fe10f
use CUDAVector
shiyu1994 Oct 9, 2024
849a554
merge master
shiyu1994 Oct 18, 2024
19a2662
merge master
shiyu1994 Oct 18, 2024
6db879a
use CUDAVector
shiyu1994 Oct 25, 2024
b43f88b
use CUDAVector for cuda tree and column data
shiyu1994 Oct 25, 2024
582c760
update gbdt
shiyu1994 Oct 25, 2024
b9e143b
changes for cuda tree
shiyu1994 Oct 25, 2024
483e521
use CUDAVector for cuda column data
shiyu1994 Oct 25, 2024
950199d
fix bug in GetDataByColumnPointers
shiyu1994 Oct 25, 2024
f30ee85
Merge branch 'master' into nccl-dev
shiyu1994 Oct 25, 2024
d11991a
disable cuda by default
shiyu1994 Oct 25, 2024
4bb4411
Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
shiyu1994 Oct 25, 2024
b56b39e
fix single machine gbdt
shiyu1994 Oct 25, 2024
3bebc19
merge main
shiyu1994 Dec 17, 2024
47b4364
clean up
shiyu1994 Dec 17, 2024
a326c87
fix typo
shiyu1994 Dec 17, 2024
5f999e7
fix lint issues
shiyu1994 Dec 17, 2024
d8ea043
Merge branch 'master' into nccl-dev
shiyu1994 Dec 18, 2024
a0864dc
Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
shiyu1994 Dec 18, 2024
2f040b7
Merge branch 'master' into nccl-dev
shiyu1994 Dec 23, 2024
0cf1062
Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
shiyu1994 Dec 24, 2024
ae4cce6
use num_gpu instead of num_gpus
shiyu1994 Dec 24, 2024
266e02b
Merge branch 'master' into nccl-dev
shiyu1994 Feb 25, 2025
6aa2aff
fix compilation error
shiyu1994 Feb 25, 2025
b137216
fix cpp lint errors
shiyu1994 Feb 25, 2025
3e1452a
Merge branch 'master' into nccl-dev
shiyu1994 Mar 7, 2025
a86bb42
fix reset config for cuda data partition
shiyu1994 Mar 7, 2025
37fb144
fix subrow copy in cuda column data
shiyu1994 Mar 7, 2025
4fa4837
Merge branch 'nccl-dev' of https://github.com/Microsoft/LightGBM into…
shiyu1994 Mar 7, 2025
5bf50de
fix cmakelint errors
shiyu1994 Mar 7, 2025
add nccl topology
shiyu1994 committed Nov 9, 2023
commit 35b0ca147b011b9453130c55b42aad197bc0bc88
24 changes: 16 additions & 8 deletions docs/Parameters.rst
@@ -1280,31 +1280,39 @@ GPU Parameters

- ``gpu_platform_id`` :raw-html:`<a id="gpu_platform_id" title="Permalink to this parameter" href="#gpu_platform_id">&#x1F517;&#xFE0E;</a>`, default = ``-1``, type = int

- OpenCL platform ID. Usually each GPU vendor exposes one OpenCL platform
- OpenCL platform ID with device_type=gpu. Usually each GPU vendor exposes one OpenCL platform

- ``-1`` means the system-wide default platform

- **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details

- ``gpu_device_id`` :raw-html:`<a id="gpu_device_id" title="Permalink to this parameter" href="#gpu_device_id">&#x1F517;&#xFE0E;</a>`, default = ``-1``, type = int

- OpenCL device ID in the specified platform. Each GPU in the selected platform has a unique device ID
- Master CUDA device ID with device_type=cuda or OpenCL device ID in the specified platform with device_type=gpu.

- Each GPU in the selected platform has a unique device ID

- ``-1`` means the default device in the selected platform

- **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details

- ``gpu_use_dp`` :raw-html:`<a id="gpu_use_dp" title="Permalink to this parameter" href="#gpu_use_dp">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool
- ``num_gpus`` :raw-html:`<a id="num_gpus" title="Permalink to this parameter" href="#num_gpus">&#x1F517;&#xFE0E;</a>`, default = ``1``, type = int

- set this to ``true`` to use double precision math on GPU (by default single precision is used)
- Number of GPUs to use for training, used with device_type=cuda

- **Note**: can be used only in OpenCL implementation, in CUDA implementation only double precision is currently supported
- When <= 0, only 1 GPU will be used

- ``gpu_device_id_list`` :raw-html:`<a id="gpu_device_id_list" title="Permalink to this parameter" href="#gpu_device_id_list">&#x1F517;&#xFE0E;</a>`, default = ``""``, type = string

- ``num_gpu`` :raw-html:`<a id="num_gpu" title="Permalink to this parameter" href="#num_gpu">&#x1F517;&#xFE0E;</a>`, default = ``1``, type = int, constraints: ``num_gpu > 0``
Collaborator comment:
I'm ok with changing the main parameter name to num_gpus, but can we please keep num_gpu as a parameter alias? So that existing code using that parameter isn't broken?

- List of CUDA device IDs used when device_type=cuda

- number of GPUs
- When empty, the devices with the smallest IDs will be used

- **Note**: can be used only in CUDA implementation
- ``gpu_use_dp`` :raw-html:`<a id="gpu_use_dp" title="Permalink to this parameter" href="#gpu_use_dp">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool

- set this to ``true`` to use double precision math on GPU (by default single precision is used)

- **Note**: can be used only in OpenCL implementation, in CUDA implementation only double precision is currently supported

.. end params list

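To make the new options concrete, here is a minimal sketch (not part of this PR) of driving multi-GPU CUDA training through LightGBM's existing C API with the parameters documented above. The data file "train.csv" and its label column name are placeholders, and error handling is reduced to a single check macro; only the LGBM_* calls and the parameter names come from the library and this diff.

// Hypothetical driver program; paths and column names are placeholders.
#include <LightGBM/c_api.h>
#include <cstdio>
#include <cstdlib>

#define SAFE(call) do { if ((call) != 0) { std::fprintf(stderr, "LightGBM error: %s\n", LGBM_GetLastError()); std::exit(1); } } while (0)

int main() {
  DatasetHandle train_data = nullptr;
  // label column assumed to be named "label" in the hypothetical CSV
  SAFE(LGBM_DatasetCreateFromFile("train.csv", "header=true label=name:label", nullptr, &train_data));

  BoosterHandle booster = nullptr;
  // device_type=cuda with the new multi-GPU parameters:
  //   num_gpus selects how many devices, gpu_device_id_list pins specific IDs,
  //   gpu_device_id picks the master device among them.
  SAFE(LGBM_BoosterCreate(train_data,
                          "objective=binary device_type=cuda num_gpus=2 "
                          "gpu_device_id_list=0,1 gpu_device_id=0",
                          &booster));

  int is_finished = 0;
  for (int iter = 0; iter < 100 && !is_finished; ++iter) {
    SAFE(LGBM_BoosterUpdateOneIter(booster, &is_finished));
  }

  SAFE(LGBM_BoosterFree(booster));
  SAFE(LGBM_DatasetFree(train_data));
  return 0;
}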
4 changes: 3 additions & 1 deletion include/LightGBM/boosting.h
@@ -309,9 +309,11 @@ class LIGHTGBM_EXPORT Boosting {
* \param format Format of model
* \param config config for boosting
* \param filename name of model file, if existing will continue to train from this model
* \param device_type type of device, can be cpu, gpu or cuda
* \param num_gpu number of GPUs to use
* \return The boosting object
*/
static Boosting* CreateBoosting(const std::string& type, const char* filename);
static Boosting* CreateBoosting(const std::string& type, const char* filename, const std::string& device_type, const int num_gpu);

virtual std::string GetLoadedParam() const = 0;

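For context, a minimal sketch of how a caller might forward the two new arguments to the extended factory function; "config" is assumed to be an already-parsed LightGBM::Config, and the surrounding setup is omitted.

// Hypothetical call site: forward the device type and GPU count from the parsed config.
Boosting* boosting = Boosting::CreateBoosting(
    "gbdt",
    nullptr,             // no existing model file, train from scratch
    config.device_type,  // "cpu", "gpu" or "cuda"
    config.num_gpus);    // number of CUDA devices requested (the num_gpu argument)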
18 changes: 11 additions & 7 deletions include/LightGBM/config.h
@@ -1091,25 +1091,29 @@ struct Config {
#pragma region GPU Parameters
#endif // __NVCC__

// desc = OpenCL platform ID. Usually each GPU vendor exposes one OpenCL platform
// desc = OpenCL platform ID with device_type=gpu. Usually each GPU vendor exposes one OpenCL platform
// desc = ``-1`` means the system-wide default platform
// desc = **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details
int gpu_platform_id = -1;

// desc = OpenCL device ID in the specified platform. Each GPU in the selected platform has a unique device ID
// desc = Master CUDA device ID with device_type=cuda or OpenCL device ID in the specified platform with device_type=gpu.
// desc = Each GPU in the selected platform has a unique device ID
// desc = ``-1`` means the default device in the selected platform
// desc = **Note**: refer to `GPU Targets <./GPU-Targets.rst#query-opencl-devices-in-your-system>`__ for more details
int gpu_device_id = -1;

// desc = Number of GPUs to use for training, used with device_type=cuda
// desc = When <= 0, only 1 GPU will be used
int num_gpus = 1;

// desc = List of CUDA device IDs used when device_type=cuda
// desc = When empty, the devices with the smallest IDs will be used
std::string gpu_device_id_list = "";

// desc = set this to ``true`` to use double precision math on GPU (by default single precision is used)
// desc = **Note**: can be used only in OpenCL implementation, in CUDA implementation only double precision is currently supported
bool gpu_use_dp = false;

// check = >0
// desc = number of GPUs
// desc = **Note**: can be used only in CUDA implementation
int num_gpu = 1;

#ifndef __NVCC__
#pragma endregion

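As requested in the collaborator comment on the Parameters.rst diff above (keep ``num_gpu`` working as an alias of ``num_gpus`` so existing code is not broken), a possible follow-up in config.h could use the existing ``// alias =`` comment convention. This is only a sketch of what that entry might look like, not part of this commit.

// alias = num_gpu
// desc = Number of GPUs to use for training, used with device_type=cuda
// desc = When <= 0, only 1 GPU will be used
int num_gpus = 1;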
225 changes: 225 additions & 0 deletions include/LightGBM/cuda/cuda_nccl_topology.hpp
@@ -0,0 +1,225 @@
/*!
* Copyright (c) 2023 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/

#ifndef LIGHTGBM_CUDA_CUDA_NCCL_TOPOLOGY_HPP_
#define LIGHTGBM_CUDA_CUDA_NCCL_TOPOLOGY_HPP_

#include <nccl.h>

#include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/network.h>
#include <LightGBM/utils/common.h>

#include <functional>
#include <memory>
#include <set>
#include <string>
#include <thread>
#include <vector>

namespace LightGBM {

class NCCLTopology {
public:
NCCLTopology(const int master_gpu_device_id, const int num_gpus, const std::string& gpu_device_id_list, const data_size_t global_num_data) {
num_gpus_ = num_gpus;
master_gpu_device_id_ = master_gpu_device_id;
global_num_data_ = global_num_data;
int max_num_gpu = 0;
CUDASUCCESS_OR_FATAL(cudaGetDeviceCount(&max_num_gpu));
if (gpu_device_id_list != std::string("")) {
std::set<int> gpu_id_set;
std::vector<std::string> gpu_list_str = Common::Split(gpu_device_id_list.c_str(), ",");
for (const auto& gpu_str : gpu_list_str) {
int gpu_id = 0;
Common::Atoi<int>(gpu_str.c_str(), &gpu_id);
if (gpu_id < 0 || gpu_id >= max_num_gpu) {
Log::Warning("Invalid GPU device ID %d in gpu_device_list is ignored.", gpu_id);
} else {
gpu_id_set.insert(gpu_id);
}
}
for (const int gpu_id : gpu_id_set) {
gpu_list_.push_back(gpu_id);
}
}
if (!gpu_list_.empty() && num_gpus_ != static_cast<int>(gpu_list_.size())) {
Log::Warning("num_gpus_ = %d is different from the number of valid device IDs in gpu_device_list (%d), using %d GPUs instead.",\
num_gpus_, static_cast<int>(gpu_list_.size()), static_cast<int>(gpu_list_.size()));
num_gpus_ = static_cast<int>(gpu_list_.size());
}

if (!gpu_list_.empty()) {
bool check_master_gpu = false;
for (int i = 0; i < static_cast<int>(gpu_list_.size()); ++i) {
const int gpu_id = gpu_list_[i];
if (gpu_id == master_gpu_device_id_) {
check_master_gpu = true;
master_gpu_index_ = i;
break;
}
}
if (!check_master_gpu) {
Log::Warning("Master GPU index not in gpu_device_list. Using %d as the master GPU instead.", gpu_list_[0]);
master_gpu_device_id_ = gpu_list_[0];
master_gpu_index_ = 0;
}
} else {
if (num_gpus_ <= 0) {
num_gpus_ = 1;
} else if (num_gpus_ > max_num_gpu) {
Log::Warning("Only %d GPUs available, using num_gpu = %d.", max_num_gpu, max_num_gpu);
num_gpus_ = max_num_gpu;
}
if (master_gpu_device_id_ < 0 || master_gpu_device_id_ >= num_gpus_) {
Log::Warning("Invalid gpu_device_id = %d for master GPU index, using gpu_device_id = 0 instead.", master_gpu_device_id_);
master_gpu_device_id_ = 0;
master_gpu_index_ = 0;
}
for (int i = 0; i < num_gpus_; ++i) {
gpu_list_.push_back(i);
}
}

Log::Info("Using GPU devices %s, and local master GPU device %d.", Common::Join<int>(gpu_list_, ","), master_gpu_device_id_);

const int num_threads = OMP_NUM_THREADS();
if (num_gpus_ > num_threads) {
Log::Fatal("Number of GPUs %d is greather than the number of threads %d. Please use more threads.", num_gpus_, num_threads);
}

host_threads_.resize(num_gpus_);
}

void InitNCCL() {
nccl_gpu_rank_.resize(num_gpus_, -1);
nccl_communicators_.resize(num_gpus_);
ncclUniqueId nccl_unique_id;
if (Network::num_machines() == 1 || Network::rank() == 0) {
NCCLCHECK(ncclGetUniqueId(&nccl_unique_id));
}
if (Network::num_machines() > 1) {
std::vector<ncclUniqueId> output_buffer(Network::num_machines());
Network::Allgather(
reinterpret_cast<char*>(&nccl_unique_id),
sizeof(ncclUniqueId) / sizeof(char),
reinterpret_cast<char*>(output_buffer.data()));
if (Network::rank() > 0) {
nccl_unique_id = output_buffer[0];
}
}

if (Network::num_machines() > 1) {
node_rank_offset_.resize(Network::num_machines() + 1, 0);
Network::Allgather(
reinterpret_cast<char*>(&num_gpus_),
sizeof(int) / sizeof(char),
reinterpret_cast<char*>(node_rank_offset_.data() + 1));
for (int rank = 1; rank < Network::num_machines() + 1; ++rank) {
node_rank_offset_[rank] += node_rank_offset_[rank - 1];
}
CHECK_EQ(node_rank_offset_[Network::rank() + 1] - node_rank_offset_[Network::rank()], num_gpus_);
NCCLCHECK(ncclGroupStart());
for (int gpu_index = 0; gpu_index < num_gpus_; ++gpu_index) {
SetCUDADevice(gpu_list_[gpu_index], __FILE__, __LINE__);
nccl_gpu_rank_[gpu_index] = gpu_index + node_rank_offset_[Network::rank()];
NCCLCHECK(ncclCommInitRank(&nccl_communicators_[gpu_index], node_rank_offset_.back(), nccl_unique_id, nccl_gpu_rank_[gpu_index]));
}
NCCLCHECK(ncclGroupEnd());
} else {
NCCLCHECK(ncclGroupStart());
for (int gpu_index = 0; gpu_index < num_gpus_; ++gpu_index) {
SetCUDADevice(gpu_list_[gpu_index], __FILE__, __LINE__);
nccl_gpu_rank_[gpu_index] = gpu_index;
NCCLCHECK(ncclCommInitRank(&nccl_communicators_[gpu_index], num_gpus_, nccl_unique_id, gpu_index));
}
NCCLCHECK(ncclGroupEnd());
}

// return to master gpu device
CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_));
}

template <typename ARG_T, typename RET_T>
void RunPerDevice(const std::vector<std::unique_ptr<ARG_T>>& objs, const std::function<RET_T(ARG_T*)>& func) {
#pragma omp parallel for schedule(static) num_threads(num_gpus_)
for (int i = 0; i < num_gpus_; ++i) {
CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_list_[i]));
func(objs[i].get());
}
CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_));
}

template <typename RET_T>
void InitPerDevice(std::vector<std::unique_ptr<RET_T>>& vec) {
vec.resize(num_gpus_);
#pragma omp parallel for schedule(static) num_threads(num_gpus_)
for (int i = 0; i < num_gpus_; ++i) {
CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_list_[i]));
vec[i].reset(new RET_T());  // store the per-device object in the output vector so the caller owns it
vec[i]->SetNCCLInfo(nccl_communicators_[i], nccl_gpu_rank_[i], i, gpu_list_[i], global_num_data_);
}
CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_));
}

template <typename ARG_T>
void DispatchPerDevice(std::vector<std::unique_ptr<ARG_T>>& objs, const std::function<void(ARG_T*)>& func) {
for (int i = 0; i < num_gpus_; ++i) {
host_threads_[i] = std::thread([this, i, &func, &objs] () {
CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_list_[i]));
func(objs[i].get());
});
}
// wait for all per-device host threads before returning to the master device
for (int i = 0; i < num_gpus_; ++i) {
host_threads_[i].join();
}
CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_));
}

template <typename ARG_T, typename RET_T>
void RunOnMasterDevice(const std::vector<std::unique_ptr<ARG_T>>& objs, const std::function<RET_T(ARG_T*)>& func) {
CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_));
func(objs[master_gpu_index_].get());
}

template <typename ARG_T, typename RET_T>
void RunOnNonMasterDevice(const std::vector<std::unique_ptr<ARG_T>>& objs, const std::function<RET_T(ARG_T*)>& func) {
for (int i = 0; i < num_gpus_; ++i) {
if (i != master_gpu_index_) {
CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_list_[i]));
func(objs[i].get());
}
}
CUDASUCCESS_OR_FATAL(cudaSetDevice(master_gpu_device_id_));
}

int num_gpus() const { return num_gpus_; }

int master_gpu_index() const { return master_gpu_index_; }

int master_gpu_device_id() const { return master_gpu_device_id_; }

const std::vector<int>& gpu_list() const { return gpu_list_; }

private:
int num_gpus_;
int master_gpu_index_;
int master_gpu_device_id_;
std::vector<int> gpu_list_;
data_size_t global_num_data_;

ncclUniqueId nccl_unique_id_;
std::vector<int> node_rank_offset_;
std::vector<int> nccl_gpu_rank_;
std::vector<ncclComm_t> nccl_communicators_;
std::vector<std::thread> host_threads_;
};

}  // namespace LightGBM

#endif  // LIGHTGBM_CUDA_CUDA_NCCL_TOPOLOGY_HPP_
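For orientation, a sketch of how a multi-GPU booster might drive the NCCLTopology helper declared above. The PerDeviceBoosting type and its methods are hypothetical stand-ins for the per-GPU objects added elsewhere in this PR; only the NCCLTopology, NCCLInfo, and Config members mirror the code in this diff.

// Assumes cuda_nccl_topology.hpp and LightGBM/config.h are included.
#include <memory>
#include <vector>

class PerDeviceBoosting : public LightGBM::NCCLInfo {
 public:
  void Init() { /* allocate per-GPU gradients, histograms, ... */ }
  void TrainOneIter() { /* grow trees on this device, synchronize via nccl_communicator_ */ }
};

void TrainMultiGPU(const LightGBM::Config& config, LightGBM::data_size_t num_data) {
  LightGBM::NCCLTopology topology(config.gpu_device_id, config.num_gpus,
                                  config.gpu_device_id_list, num_data);
  topology.InitNCCL();  // one NCCL communicator per local GPU, spanning machines if distributed

  std::vector<std::unique_ptr<PerDeviceBoosting>> workers;
  topology.InitPerDevice<PerDeviceBoosting>(workers);  // one worker per GPU, SetNCCLInfo called on each

  // Each boosting iteration runs on every GPU, one host thread per device.
  topology.DispatchPerDevice<PerDeviceBoosting>(workers,
      [](PerDeviceBoosting* worker) { worker->TrainOneIter(); });
}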
2 changes: 1 addition & 1 deletion include/LightGBM/cuda/cuda_objective_function.hpp
@@ -19,7 +19,7 @@
namespace LightGBM {

template <typename HOST_OBJECTIVE>
class CUDAObjectiveInterface: public HOST_OBJECTIVE {
class CUDAObjectiveInterface: public HOST_OBJECTIVE, NCCLInfo {
public:
explicit CUDAObjectiveInterface(const Config& config): HOST_OBJECTIVE(config) {
const int gpu_device_id = config.gpu_device_id >= 0 ? config.gpu_device_id : 0;
37 changes: 37 additions & 0 deletions include/LightGBM/cuda/cuda_utils.hu
@@ -11,8 +11,10 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <nccl.h>

#include <LightGBM/utils/log.h>
#include <LightGBM/meta.h>

#include <algorithm>
#include <vector>
@@ -32,6 +34,15 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort =

#define CUDASUCCESS_OR_FATAL_OUTER(ans) { gpuAssert((ans), file, line); }

#define NCCLCHECK(cmd) do { \
ncclResult_t r = cmd; \
if (r != ncclSuccess) { \
printf("Failed, NCCL error %s:%d '%s'\n", \
__FILE__, __LINE__, ncclGetErrorString(r)); \
exit(EXIT_FAILURE); \
} \
} while (0)

void SetCUDADevice(int gpu_device_id, const char* file, int line);

int GetCUDADevice(const char* file, int line);
@@ -205,6 +216,32 @@ static __device__ T SafeLog(T x) {
}
}

class NCCLInfo {
public:
NCCLInfo() {}

void SetNCCLInfo(
ncclComm_t nccl_communicator,
int nccl_gpu_rank,
int local_gpu_rank,
int gpu_device_id,
data_size_t global_num_data) {
nccl_communicator_ = nccl_communicator;
nccl_gpu_rank_ = nccl_gpu_rank;
local_gpu_rank_ = local_gpu_rank;
gpu_device_id_ = gpu_device_id;
global_num_data_ = global_num_data;
}

protected:
ncclComm_t nccl_communicator_ = nullptr;
int nccl_gpu_rank_ = -1;
int local_gpu_rank_ = -1;
int gpu_device_id_ = -1;
int num_gpu_in_node_ = 0;
data_size_t global_num_data_ = 0;
};

} // namespace LightGBM

#endif // USE_CUDA
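Finally, a sketch of how a per-device component deriving from NCCLInfo could use its communicator together with the NCCLCHECK macro above, for example to sum per-GPU gradient statistics. The class name, buffer, and stream are hypothetical; ncclAllReduce is the standard NCCL collective, and NCCLCHECK, CUDASUCCESS_OR_FATAL, and the inherited members come from cuda_utils.hu.

// Assumes cuda_utils.hu is included.
#include <cuda_runtime.h>
#include <nccl.h>

class CUDAGradientReducer : public LightGBM::NCCLInfo {
 public:
  // grad_sums: device pointer holding `count` doubles on this GPU
  void AllReduceSums(double* grad_sums, size_t count, cudaStream_t stream) {
    // In-place sum across all ranks that joined the communicator in NCCLTopology::InitNCCL()
    NCCLCHECK(ncclAllReduce(grad_sums, grad_sums, count, ncclDouble, ncclSum,
                            nccl_communicator_, stream));
    CUDASUCCESS_OR_FATAL(cudaStreamSynchronize(stream));
  }
};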