Skip to content

Commit

Permalink
fix torch op handles lazy release which may cause oom in elastic scen…
Browse files Browse the repository at this point in the history
…ario (horovod#3110)

* Fix lazy release of torch op handles, which may cause OOM in elastic scenarios

Signed-off-by: guoze.lin <guozelin@tencent.com>

* Update mpi_ops.py

Co-authored-by: guoze.lin <guozelin@tencent.com>
Co-authored-by: Travis Addair <tgaddair@gmail.com>
Signed-off-by: weihanmines <weihan13@amd.com>
  • Loading branch information
3 people authored and weihanmines committed Dec 10, 2021
1 parent e11a5a9 commit 927497c
Showing 1 changed file with 8 additions and 1 deletion.
9 changes: 8 additions & 1 deletion horovod/torch/mpi_ops.py
Expand Up @@ -40,8 +40,8 @@
_NULL = ""

_basics = _HorovodBasics(__file__, 'mpi_lib_v2')

# import basic methods
init = _basics.init
is_initialized = _basics.is_initialized
start_timeline = _basics.start_timeline
stop_timeline = _basics.stop_timeline
Expand All @@ -61,10 +61,16 @@
ccl_built = _basics.ccl_built
cuda_built = _basics.cuda_built
rocm_built = _basics.rocm_built

def shutdown(*args, **kwargs):
    """Call ``mpi_lib.horovod_torch_reset()``, then delegate to ``_basics.shutdown``.

    Every positional and keyword argument is forwarded unchanged to
    ``_basics.shutdown``, and whatever that call returns is returned here.

    NOTE(review): the reset is presumably what releases pending torch op
    handles eagerly (to avoid OOM across elastic restarts) -- confirm against
    the native ``horovod_torch_reset`` implementation.
    """
    # The reset runs first, before the basics-level shutdown.
    mpi_lib.horovod_torch_reset()
    result = _basics.shutdown(*args, **kwargs)
    return result

def init(*args, **kwargs):
    """Rebind ``_handle_map`` to an empty dict, then delegate to ``_basics.init``.

    Every positional and keyword argument is forwarded unchanged to
    ``_basics.init``, and whatever that call returns is returned here.
    """
    global _handle_map
    # Rebind the module global to a brand-new dict (not an in-place clear),
    # so entries recorded before this call are no longer reachable through it.
    _handle_map = dict()
    outcome = _basics.init(*args, **kwargs)
    return outcome

# import reduction op values
Average = _basics.Average
Sum = _basics.Sum
Expand Down Expand Up @@ -939,6 +945,7 @@ def synchronize(handle):
output = _handle_map.pop(handle)[-1]
return output
except RuntimeError as e:
_handle_map.pop(handle, None)
raise HorovodInternalError(e)


Expand Down

0 comments on commit 927497c

Please sign in to comment.