Skip to content

Commit

Permalink
Merge pull request #4758 from nilsvu/exit_codes
Browse files Browse the repository at this point in the history
Support exit codes
  • Loading branch information
nilsdeppe committed Feb 22, 2023
2 parents e2984f9 + bcb2ab3 commit a84801f
Show file tree
Hide file tree
Showing 10 changed files with 139 additions and 66 deletions.
4 changes: 4 additions & 0 deletions docs/Tutorials/CheckpointRestart.md
Expand Up @@ -21,6 +21,10 @@ Executables can checkpoint when:
This reduces the disc space taken up by checkpoint files and stops using
up the allocation's CPU-hours on work that would be redone anyway after the
run is restarted.
The executable will return exit code 2 when it terminates from
`CheckpointAndExitAfterWallclock`, meaning it is incomplete and should
continue from the checkpoint. See `Parallel::ExitCode` for the definitions of
all exit codes.
- using `VisitAndReturn(WriteCheckpoint)`. This is useful for writing more
frequent checkpoint files, which could help when debugging a run by
restarting it from just before the failure.
Expand Down
1 change: 1 addition & 0 deletions src/Parallel/CMakeLists.txt
Expand Up @@ -30,6 +30,7 @@ spectre_target_headers(
CharmRegistration.hpp
CreateFromOptions.hpp
DistributedObject.hpp
ExitCode.hpp
GetSection.hpp
GlobalCache.hpp
GlobalCacheDeclare.hpp
Expand Down
38 changes: 38 additions & 0 deletions src/Parallel/ExitCode.hpp
@@ -0,0 +1,38 @@
// Distributed under the MIT License.
// See LICENSE.txt for details.

#pragma once

#include "DataStructures/DataBox/Tag.hpp"

namespace Parallel {

/*!
 * \brief Exit code returned by an executable
 *
 * \warning The integer value of every enumerator is consumed by external
 * code and therefore forms a public interface that should remain stable. Do
 * not change these values unless you have a very good reason to do so.
 */
enum class ExitCode : int {
  /// The program ran to completion
  Complete = 0,
  /// The program was aborted because of an error
  Abort = 1,
  /// The program has not finished and should be continued from the last
  /// checkpoint
  ContinueFromCheckpoint = 2
};

namespace Tags {

/*!
* \brief Tag for the exit code of an executable
*
* The value associated with this tag is a `Parallel::ExitCode`.
*
* \see Parallel::ExitCode
*/
struct ExitCode : db::SimpleTag {
using type = Parallel::ExitCode;
};

} // namespace Tags
} // namespace Parallel
7 changes: 6 additions & 1 deletion src/Parallel/Main.hpp
Expand Up @@ -22,6 +22,7 @@
#include "Parallel/AlgorithmMetafunctions.hpp"
#include "Parallel/CharmRegistration.hpp"
#include "Parallel/CreateFromOptions.hpp"
#include "Parallel/ExitCode.hpp"
#include "Parallel/GlobalCache.hpp"
#include "Parallel/Local.hpp"
#include "Parallel/ParallelComponentHelpers.hpp"
Expand Down Expand Up @@ -584,6 +585,8 @@ Main<Metavariables>::Main(CkArgMsg* msg) {
global_cache_proxy_.set_parallel_components(the_parallel_components,
callback);

get<Tags::ExitCode>(phase_change_decision_data_) =
Parallel::ExitCode::Complete;
PhaseControl::initialize_phase_change_decision_data(
make_not_null(&phase_change_decision_data_),
*Parallel::local_branch(global_cache_proxy_));
Expand Down Expand Up @@ -921,7 +924,9 @@ void Main<Metavariables>::post_deadlock_analysis_termination() {
if (not components_that_did_not_terminate_.empty()) {
sys::abort("");
} else {
sys::exit();
const Parallel::ExitCode exit_code =
get<Tags::ExitCode>(phase_change_decision_data_);
sys::exit(static_cast<int>(exit_code));
}
}

Expand Down
4 changes: 4 additions & 0 deletions src/Parallel/PhaseControl/CheckpointAndExitAfterWallclock.hpp
Expand Up @@ -13,6 +13,7 @@
#include "Options/Options.hpp"
#include "Parallel/AlgorithmMetafunctions.hpp"
#include "Parallel/CharmPupable.hpp"
#include "Parallel/ExitCode.hpp"
#include "Parallel/GlobalCache.hpp"
#include "Parallel/Phase.hpp"
#include "Parallel/PhaseControl/ContributeToPhaseChangeReduction.hpp"
Expand Down Expand Up @@ -204,6 +205,8 @@ CheckpointAndExitAfterWallclock::arbitrate_phase_change_impl(
auto& wallclock_hours_at_checkpoint =
tuples::get<Tags::WallclockHoursAtCheckpoint>(
*phase_change_decision_data);
auto& exit_code =
tuples::get<Parallel::Tags::ExitCode>(*phase_change_decision_data);
if (restart_phase.has_value()) {
ASSERT(wallclock_hours_at_checkpoint.has_value(),
"Consistency error: Should have recorded the Wallclock time "
Expand All @@ -215,6 +218,7 @@ CheckpointAndExitAfterWallclock::arbitrate_phase_change_impl(
// - restart_phase, if the time is small
if (elapsed_hours >= wallclock_hours_at_checkpoint.value()) {
// Preserve restart_phase for use after restarting from the checkpoint
exit_code = Parallel::ExitCode::ContinueFromCheckpoint;
return std::make_pair(Parallel::Phase::Exit,
ArbitrationStrategy::RunPhaseImmediately);
} else {
Expand Down
3 changes: 2 additions & 1 deletion src/Parallel/PhaseControl/PhaseControlTags.hpp
Expand Up @@ -9,6 +9,7 @@
#include <vector>

#include "Options/Options.hpp"
#include "Parallel/ExitCode.hpp"
#include "Parallel/PhaseControl/PhaseChange.hpp"
#include "Parallel/Serialize.hpp"
#include "ParallelAlgorithms/EventsAndTriggers/Trigger.hpp"
Expand Down Expand Up @@ -121,5 +122,5 @@ using get_phase_change_tags = tmpl::push_back<
tmpl::flatten<tmpl::transform<
typename detail::phase_change_derived_classes<Metavariables>::type,
detail::get_phase_change_tags_and_combines<tmpl::_1>>>,
TagsAndCombines::UsePhaseChangeArbitration>;
TagsAndCombines::UsePhaseChangeArbitration, Parallel::Tags::ExitCode>;
} // namespace PhaseControl
4 changes: 2 additions & 2 deletions src/Utilities/System/Exit.hpp
Expand Up @@ -11,8 +11,8 @@ namespace sys {
/// \ingroup UtilitiesGroup
/// \brief Exit the program normally.
/// This should only be called once over all processors.
[[noreturn]] inline void exit() {
CkExit();
[[noreturn]] inline void exit(const int exit_code = 0) {
CkExit(exit_code);
// the following call is never reached, but suppresses the warning that
// a 'noreturn' function does return
std::terminate(); // LCOV_EXCL_LINE
Expand Down
Expand Up @@ -9,6 +9,7 @@
#include "DataStructures/DataBox/DataBox.hpp"
#include "Framework/TestCreation.hpp"
#include "Options/Protocols/FactoryCreation.hpp"
#include "Parallel/ExitCode.hpp"
#include "Parallel/GlobalCache.hpp"
#include "Parallel/Phase.hpp"
#include "Parallel/PhaseControl/CheckpointAndExitAfterWallclock.hpp"
Expand Down Expand Up @@ -46,44 +47,47 @@ SPECTRE_TEST_CASE("Unit.Parallel.PhaseControl.CheckpointAndExitAfterWallclock",

Parallel::GlobalCache<Metavariables> cache{};

using phase_change_decision_data_type = tuples::tagged_tuple_from_typelist<
using PhaseChangeDecisionData = tuples::tagged_tuple_from_typelist<
PhaseControl::get_phase_change_tags<Metavariables>>;
phase_change_decision_data_type phase_change_decision_data{
Parallel::Phase::Execute, true, 1.0, true};

const PhaseControl::CheckpointAndExitAfterWallclock phase_change0(0.0);
const PhaseControl::CheckpointAndExitAfterWallclock phase_change1(1.0);
{
INFO("Test initialize phase change decision data");
PhaseChangeDecisionData phase_change_decision_data{
Parallel::Phase::Execute, true, 1.0, true,
Parallel::ExitCode::Complete};
phase_change0.initialize_phase_data<Metavariables>(
make_not_null(&phase_change_decision_data));
// extra parens in the check prevent Catch from trying to stream the tuple
CHECK((phase_change_decision_data ==
phase_change_decision_data_type{std::nullopt, std::nullopt, false,
true}));
PhaseChangeDecisionData{std::nullopt, std::nullopt, false, true,
Parallel::ExitCode::Complete}));
}
{
INFO("Test arbitrate phase control");
INFO("Wallclock time < big trigger time");
// Check behavior when a checkpoint-and-exit has been requested
// First check case where wallclock time < trigger wallclock time, using
// the PhaseChange with a big trigger time.
// (this assumes the test doesn't take 1h to get here)
phase_change_decision_data =
phase_change_decision_data_type{std::nullopt, std::nullopt, true, true};
auto decision_result = phase_change1.arbitrate_phase_change(
PhaseChangeDecisionData phase_change_decision_data{
std::nullopt, std::nullopt, true, true, Parallel::ExitCode::Complete};
const auto decision_result = phase_change1.arbitrate_phase_change(
make_not_null(&phase_change_decision_data), Parallel::Phase::Execute,
cache);
CHECK((decision_result == std::nullopt));
CHECK((phase_change_decision_data ==
phase_change_decision_data_type{std::nullopt, std::nullopt, false,
true}));

// Now check case where wallclock time < trigger wallclock time, using
PhaseChangeDecisionData{std::nullopt, std::nullopt, false, true,
Parallel::ExitCode::Complete}));
}
{
INFO("Wallclock time > small trigger time");
// Now check case where wallclock time > trigger wallclock time, using
// the PhaseChange with a tiny trigger time.
// (this assumes the test takes at least a few cycles to get here)
phase_change_decision_data =
phase_change_decision_data_type{std::nullopt, std::nullopt, true, true};
decision_result = phase_change0.arbitrate_phase_change(
PhaseChangeDecisionData phase_change_decision_data{
std::nullopt, std::nullopt, true, true, Parallel::ExitCode::Complete};
const auto decision_result = phase_change0.arbitrate_phase_change(
make_not_null(&phase_change_decision_data), Parallel::Phase::Execute,
cache);
CHECK((decision_result ==
Expand All @@ -103,38 +107,45 @@ SPECTRE_TEST_CASE("Unit.Parallel.PhaseControl.CheckpointAndExitAfterWallclock",
phase_change_decision_data) < one_second);
CHECK(tuples::get<PhaseControl::Tags::CheckpointAndExitRequested>(
phase_change_decision_data) == false);

}
{
INFO("Restarting from checkpoint");
// Check behavior following the checkpoint phase
// First check case where wallclock time < recorded time, which corresponds
// to restarting from a checkpoint.
// (this assumes the test doesn't take 1h to get here)
phase_change_decision_data = phase_change_decision_data_type{
Parallel::Phase::Execute, 1.0, false, true};
decision_result = phase_change0.arbitrate_phase_change(
PhaseChangeDecisionData phase_change_decision_data{
Parallel::Phase::Execute, 1.0, false, true,
Parallel::ExitCode::Complete};
const auto decision_result = phase_change0.arbitrate_phase_change(
make_not_null(&phase_change_decision_data),
Parallel::Phase::WriteCheckpoint, cache);
CHECK((decision_result ==
std::make_pair(
Parallel::Phase::Execute,
PhaseControl::ArbitrationStrategy::PermitAdditionalJumps)));
CHECK((phase_change_decision_data ==
phase_change_decision_data_type{std::nullopt, std::nullopt, false,
true}));

PhaseChangeDecisionData{std::nullopt, std::nullopt, false, true,
Parallel::ExitCode::Complete}));
}
{
INFO("Exiting after checkpoint");
// Now check case where wallclock time > recorded time, which corresponds to
// having just written a checkpoint. We want to exit now.
// having just written a checkpoint. We want to exit with exit code 2 now.
// (this assumes the test takes at least a few cycles to get here)
phase_change_decision_data = phase_change_decision_data_type{
Parallel::Phase::Execute, 1e-15, false, true};
decision_result = phase_change0.arbitrate_phase_change(
PhaseChangeDecisionData phase_change_decision_data{
Parallel::Phase::Execute, 1e-15, false, true,
Parallel::ExitCode::Complete};
const auto decision_result = phase_change0.arbitrate_phase_change(
make_not_null(&phase_change_decision_data),
Parallel::Phase::WriteCheckpoint, cache);
CHECK((decision_result ==
std::make_pair(
Parallel::Phase::Exit,
PhaseControl::ArbitrationStrategy::RunPhaseImmediately)));
CHECK((phase_change_decision_data ==
phase_change_decision_data_type{Parallel::Phase::Execute, 1e-15,
false, true}));
CHECK(
(phase_change_decision_data ==
PhaseChangeDecisionData{Parallel::Phase::Execute, 1e-15, false, true,
Parallel::ExitCode::ContinueFromCheckpoint}));
}
}
12 changes: 7 additions & 5 deletions tests/Unit/Parallel/PhaseControl/Test_PhaseControlTags.cpp
Expand Up @@ -14,6 +14,7 @@
#include "Options/Options.hpp"
#include "Options/Protocols/FactoryCreation.hpp"
#include "Parallel/CharmPupable.hpp"
#include "Parallel/ExitCode.hpp"
#include "Parallel/PhaseControl/PhaseChange.hpp"
#include "Parallel/PhaseControl/PhaseControlTags.hpp"
#include "Parallel/RegisterDerivedClassesWithCharm.hpp"
Expand Down Expand Up @@ -123,10 +124,11 @@ SPECTRE_TEST_CASE("Unit.Parallel.PhaseControl.PhaseControlTags",
CHECK(dynamic_cast<TestCreatable<2_st>*>(second_creatable.get())
->option_value_ == 2);

static_assert(std::is_same_v<
PhaseControl::get_phase_change_tags<Metavariables>,
tmpl::list<
Tags::DummyDecisionTag2, Tags::DummyDecisionTag1,
PhaseControl::TagsAndCombines::UsePhaseChangeArbitration>>);
static_assert(
std::is_same_v<
PhaseControl::get_phase_change_tags<Metavariables>,
tmpl::list<Tags::DummyDecisionTag2, Tags::DummyDecisionTag1,
PhaseControl::TagsAndCombines::UsePhaseChangeArbitration,
Parallel::Tags::ExitCode>>);
}
} // namespace

0 comments on commit a84801f

Please sign in to comment.