From 02378d7d35821d88271f2ddeb166dbbb7dbe8cb2 Mon Sep 17 00:00:00 2001 From: TJ Xu Date: Mon, 8 Nov 2021 11:26:40 -0800 Subject: [PATCH] Call process_set._setup in init() to point to the correct native lib path (#3258) * call setup for common process_set in remote trainers moved _setup call to init() Signed-off-by: TJ Signed-off-by: weihanmines --- CHANGELOG.md | 1 + horovod/mxnet/mpi_ops.py | 6 +++++- horovod/spark/keras/remote.py | 4 ++++ horovod/spark/lightning/remote.py | 6 ++++++ horovod/spark/torch/remote.py | 4 ++++ horovod/tensorflow/mpi_ops.py | 6 +++++- horovod/torch/mpi_ops.py | 4 +++- 7 files changed, 28 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 65fe710543..a0f5a0a1aa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). - fix the example of pytorch_lightning_mnist.py ([#3245](https://github.com/horovod/horovod/pull/3245)) +- Call _setup in remote trainers to point to the correct shared lib path ([#3258](https://github.com/horovod/horovod/pull/3258)) ## [v0.23.0] - 2021-10-06 ### Added diff --git a/horovod/mxnet/mpi_ops.py b/horovod/mxnet/mpi_ops.py index c4b6d9cd2b..6994a26556 100644 --- a/horovod/mxnet/mpi_ops.py +++ b/horovod/mxnet/mpi_ops.py @@ -39,7 +39,6 @@ check_installed_version('mxnet', mx.__version__) # import basic methods -init = _basics.init shutdown = _basics.shutdown is_initialized = _basics.is_initialized start_timeline = _basics.start_timeline @@ -61,6 +60,11 @@ cuda_built = _basics.cuda_built rocm_built = _basics.rocm_built +def init(*args, **kwargs): + _basics.init(*args, **kwargs) + # Call set up again to make sure the basics is in sync + _setup_process_sets(_basics) + dll_path = os.path.join(os.path.dirname(__file__), 'mpi_lib' + get_ext_suffix()) MPI_MXNET_LIB_CTYPES = ctypes.CDLL(dll_path, ctypes.RTLD_GLOBAL) diff --git a/horovod/spark/keras/remote.py b/horovod/spark/keras/remote.py index 8e098bdd18..879e80e12b 100644 --- a/horovod/spark/keras/remote.py +++ b/horovod/spark/keras/remote.py @@ -109,6 +109,7 @@ def train(serialized_model, train_rows, val_rows, avg_row_size): hvd = get_horovod() hvd.init() + pin_gpu(hvd, tf, k) if not user_shuffle_buffer_size: @@ -129,6 +130,9 @@ def train(serialized_model, train_rows, val_rows, avg_row_size): # Verbose mode 1 will print a progress bar verbose = user_verbose if hvd.rank() == 0 else 0 + if verbose: + print(f"Shared lib path is pointing to: {_horovod.common.process_sets._basics.MPI_LIB_CTYPES}") + transform_spec = None if transformation: transform_spec = TransformSpec(transformation) diff --git a/horovod/spark/lightning/remote.py b/horovod/spark/lightning/remote.py index 28550c9d99..dce397907e 100644 --- a/horovod/spark/lightning/remote.py +++ b/horovod/spark/lightning/remote.py @@ -100,8 +100,14 @@ def RemoteTrainer(estimator, metadata, ckpt_bytes, run_id, dataset_idx, train_ro def train(serialized_model): import horovod.torch as hvd + # Horovod: initialize library. hvd.init() + + if verbose: + import horovod as _horovod + print(f"Shared lib path is pointing to: {_horovod.common.process_sets._basics.MPI_LIB_CTYPES}") + _checkpoint_callback = None require_checkpoint = False diff --git a/horovod/spark/torch/remote.py b/horovod/spark/torch/remote.py index fe38237949..6b2f325caa 100644 --- a/horovod/spark/torch/remote.py +++ b/horovod/spark/torch/remote.py @@ -117,6 +117,10 @@ def train(serialized_model, optimizer_cls, model_opt_state_serialized, # Horovod: initialize library. hvd.init() + if user_verbose: + import horovod as _horovod + print(f"Shared lib path is pointing to: {_horovod.common.process_sets._basics.MPI_LIB_CTYPES}") + if not user_shuffle_buffer_size: shuffle_buffer_size = \ calculate_shuffle_buffer_size(hvd, avg_row_size, train_rows / hvd.size()) diff --git a/horovod/tensorflow/mpi_ops.py b/horovod/tensorflow/mpi_ops.py index ccb9f34705..8865e5843d 100644 --- a/horovod/tensorflow/mpi_ops.py +++ b/horovod/tensorflow/mpi_ops.py @@ -57,7 +57,6 @@ def _load_library(name): _basics = _HorovodBasics(__file__, 'mpi_lib') # import basic methods -init = _basics.init shutdown = _basics.shutdown is_initialized = _basics.is_initialized start_timeline = _basics.start_timeline @@ -84,6 +83,11 @@ def _load_library(name): Sum = _basics.Sum Adasum = _basics.Adasum +def init(*args, **kwargs): + _basics.init(*args, **kwargs) + # Call set up again to make sure the basics is in sync + _setup_process_sets(_basics) + is_homogeneous = _basics.is_homogeneous handle_average_backwards_compatibility = get_average_backwards_compatibility_fun(_basics) diff --git a/horovod/torch/mpi_ops.py b/horovod/torch/mpi_ops.py index ba798a2d59..1fbcc3bc3d 100644 --- a/horovod/torch/mpi_ops.py +++ b/horovod/torch/mpi_ops.py @@ -69,7 +69,9 @@ def shutdown(*args, **kwargs): def init(*args, **kwargs): global _handle_map _handle_map = {} - return _basics.init(*args, **kwargs) + _basics.init(*args, **kwargs) + # Call set up again to make sure the basics is in sync + _setup_process_sets(_basics) # import reduction op values Average = _basics.Average