Skip to content

Commit

Permalink
Removing MPI barriers in TemporaryDirectoryChanger and Output cache (#…
Browse files Browse the repository at this point in the history
…984)

These utilities aren't always used in a way that has all MPI nodes
synced up. For example if worker x has 3 tasks and worker y has 5,
that could lead to a deadlock if they use these utilities.
Rather than requiring all clients of these utlities know the MPI
global state, we just remove the barriers, and will deal with
file lock issues in more forgiving ways.
  • Loading branch information
ntouran committed Nov 18, 2022
1 parent 474534b commit 323e949
Show file tree
Hide file tree
Showing 7 changed files with 20 additions and 26 deletions.
16 changes: 10 additions & 6 deletions armi/bookkeeping/mainInterface.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from armi import operators
from armi.utils.customExceptions import InputError
from armi.bookkeeping.db.database3 import Database3
from armi import context


ORDER = interfaces.STACK_ORDER.PREPROCESSING
Expand Down Expand Up @@ -150,11 +151,17 @@ def interactEOL(self):
runLog.warningReport()

def cleanARMIFiles(self):
r"""delete ARMI run files like MC**2 and REBUS inputs/outputs. Useful
if running a clean job that doesn't require restarts."""
"""
Delete temporary ARMI run files like simulation inputs/outputs.
Useful if running a clean job that doesn't require restarts.
"""
if context.MPI_RANK != 0:
# avoid inadvertently calling from worker nodes which could cause filesystem lockups.
raise ValueError("Only the master node is allowed to clean files here.")
runLog.important("Cleaning ARMI files due to smallRun option")
for fileName in os.listdir(os.getcwd()):
# clean MC**2 and REBUS inputs and outputs
# clean simulation inputs and outputs
for candidate in [".BCD", ".inp", ".out", "ISOTXS-"]:
if candidate in fileName:
if ".htos.out" in fileName:
Expand All @@ -174,20 +181,17 @@ def cleanARMIFiles(self):
node = int(snapText[3:])
newFolder = "snapShot{0}_{1}".format(cycle, node)
utils.pathTools.cleanPath(newFolder)
context.waitAll()

# delete database if it's SQLlite
# no need to delete because the database won't have copied it back if using fastpath.

# clean temp directories.
if os.path.exists("shuffleBranches"):
utils.pathTools.cleanPath("shuffleBranches")
context.waitAll()
# Potentially, wait for all the processes to catch up.

if os.path.exists("failedRuns"):
utils.pathTools.cleanPath("failedRuns")
context.waitAll()

# pylint: disable=no-self-use
def cleanLastCycleFiles(self):
Expand Down
8 changes: 0 additions & 8 deletions armi/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,14 +285,6 @@ def cleanAllArmiTempDirs(olderThanDays: int) -> None:
pass


def waitAll() -> None:
"""
If there are parallel processes running, wait for all to catch up to the checkpoint.
"""
if MPI_SIZE > 1 and MPI_DISTRIBUTABLE:
MPI_COMM.barrier()


def disconnectAllHdfDBs() -> None:
"""
Forcibly disconnect all instances of HdfDB objects
Expand Down
10 changes: 6 additions & 4 deletions armi/tests/test_mpiFeatures.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@
elif find_executable("mpiexec") is not None:
MPI_EXE = "mpiexec"

MPI_COMM = context.MPI_COMM


class FailingInterface1(Interface):
"""utility classes to make sure the logging system fails properly"""
Expand Down Expand Up @@ -262,15 +264,15 @@ def test_cleanPathMpi(self):
self.assertTrue(os.path.exists(filePath0))
with self.assertRaises(Exception):
pathTools.cleanPath(filePath0, mpiRank=context.MPI_RANK)
context.waitAll()
MPI_COMM.barrier()

# TEST 1: Delete a single file
filePath1 = "test1_cleanPathNoMpi_mongoose"
open(filePath1, "w").write("something")

self.assertTrue(os.path.exists(filePath1))
pathTools.cleanPath(filePath1, mpiRank=context.MPI_RANK)
context.waitAll()
MPI_COMM.barrier()
self.assertFalse(os.path.exists(filePath1))

# TEST 2: Delete an empty directory
Expand All @@ -279,7 +281,7 @@ def test_cleanPathMpi(self):

self.assertTrue(os.path.exists(dir2))
pathTools.cleanPath(dir2, mpiRank=context.MPI_RANK)
context.waitAll()
MPI_COMM.barrier()
self.assertFalse(os.path.exists(dir2))

# TEST 3: Delete a directory with two files inside
Expand All @@ -296,7 +298,7 @@ def test_cleanPathMpi(self):
self.assertTrue(os.path.exists(os.path.join(dir3, "file1.txt")))
self.assertTrue(os.path.exists(os.path.join(dir3, "file2.txt")))
pathTools.cleanPath(dir3, mpiRank=context.MPI_RANK)
context.waitAll()
MPI_COMM.barrier()
self.assertFalse(os.path.exists(dir3))


Expand Down
1 change: 0 additions & 1 deletion armi/utils/directoryChangers.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,6 @@ def __exit__(self, exc_type, exc_value, traceback):
"That is, if you create a directory with a name starting with a period, the "
"TempDirChanger will not be able to clean it (for instance, a '.git' dir)."
)
context.waitAll()


class ForcedCreationDirectoryChanger(DirectoryChanger):
Expand Down
1 change: 0 additions & 1 deletion armi/utils/outputCache.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,6 @@ def deleteCache(cachedFolder):
raise RuntimeError("Cache location must contain safeword: `Output_Cache`.")

cleanPath(cachedFolder)
context.waitAll()


def cacheCall(
Expand Down
8 changes: 3 additions & 5 deletions armi/utils/tests/test_pathTools.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,9 @@ def test_moduleAndAttributeExist(self):

@unittest.skipUnless(context.MPI_RANK == 0, "test only on root node")
def test_cleanPathNoMpi(self):
"""Simple tests of cleanPath(), in the no-MPI scenario"""
"""
Simple tests of cleanPath(), in the no-MPI scenario
"""
with TemporaryDirectoryChanger():
# TEST 0: File is not safe to delete, due to name pathing
filePath0 = "test0_cleanPathNoMpi"
Expand All @@ -84,15 +86,13 @@ def test_cleanPathNoMpi(self):
self.assertTrue(os.path.exists(filePath0))
with self.assertRaises(Exception):
pathTools.cleanPath(filePath0, mpiRank=0)
context.waitAll()

# TEST 1: Delete a single file
filePath1 = "test1_cleanPathNoMpi_mongoose"
open(filePath1, "w").write("something")

self.assertTrue(os.path.exists(filePath1))
pathTools.cleanPath(filePath1, mpiRank=0)
context.waitAll()
self.assertFalse(os.path.exists(filePath1))

# TEST 2: Delete an empty directory
Expand All @@ -101,7 +101,6 @@ def test_cleanPathNoMpi(self):

self.assertTrue(os.path.exists(dir2))
pathTools.cleanPath(dir2, mpiRank=0)
context.waitAll()
self.assertFalse(os.path.exists(dir2))

# TEST 3: Delete a directory with two files inside
Expand All @@ -118,7 +117,6 @@ def test_cleanPathNoMpi(self):
self.assertTrue(os.path.exists(os.path.join(dir3, "file1.txt")))
self.assertTrue(os.path.exists(os.path.join(dir3, "file2.txt")))
pathTools.cleanPath(dir3, mpiRank=0)
context.waitAll()
self.assertFalse(os.path.exists(dir3))

def test_isFilePathNewer(self):
Expand Down
2 changes: 1 addition & 1 deletion doc/release/0.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ What's new in ARMI

Bug fixes
---------
#. TBD
#. Removed Barriers in temp directory changers and output cache to avoid deadlocks in MPI cases


ARMI v0.2.5
Expand Down

0 comments on commit 323e949

Please sign in to comment.