Skip to content

Commit

Permalink
Merge pull request #4633 from nilsvu/fix_hdf5_locking
Browse files Browse the repository at this point in the history
Disable HDF5 file locking to prevent crashes
  • Loading branch information
wthrowe committed Jan 27, 2023
2 parents 19712dd + 39a6cd0 commit 990e3c5
Show file tree
Hide file tree
Showing 12 changed files with 265 additions and 182 deletions.
28 changes: 28 additions & 0 deletions cmake/SetupHdf5.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,34 @@ if(HDF5_IS_PARALLEL)
endif()
endif()

# Check if file locking API is available. The versions supporting this feature
# are listed here:
# https://github.com/HDFGroup/hdf5/blob/develop/doc/file-locking.md
include(CheckCXXSourceCompiles)
set(CMAKE_REQUIRED_LIBRARIES hdf5::hdf5)
check_cxx_source_compiles(
"#include <hdf5.h>\n\
int main() {\n\
const hid_t fapl_id = H5Pcopy(H5P_DEFAULT);\n\
H5Pset_file_locking(fapl_id, false, true);\n\
}"
HDF5_SUPPORTS_SET_FILE_LOCKING)
if(${HDF5_SUPPORTS_SET_FILE_LOCKING})
set_property(
TARGET hdf5::hdf5
APPEND PROPERTY INTERFACE_COMPILE_DEFINITIONS
HDF5_SUPPORTS_SET_FILE_LOCKING)
else()
message(WARNING "The HDF5 library does not support 'H5Pset_file_locking'. "
"This means that simulations may crash when you read H5 files while "
"the simulation is trying to access them. To avoid this, set the "
"environment variable\n"
" HDF5_USE_FILE_LOCKING=FALSE\n"
"when running simulations, or load an HDF5 module that supports "
"'H5Pset_file_locking'. Supporting versions are listed here:\n"
"https://github.com/HDFGroup/hdf5/blob/develop/doc/file-locking.md")
endif()

set_property(
GLOBAL APPEND PROPERTY SPECTRE_THIRD_PARTY_LIBS
hdf5::hdf5
Expand Down
16 changes: 13 additions & 3 deletions containers/Dockerfile.buildenv
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ RUN apt-get update -y \
gdb git ninja-build autoconf automake \
bison flex \
libopenblas-dev liblapack-dev \
libhdf5-dev hdf5-tools \
libgsl0-dev \
clang-10 clang-format-10 clang-tidy-10 \
lld \
Expand Down Expand Up @@ -153,6 +152,15 @@ RUN wget https://github.com/doxygen/doxygen/archive/Release_1_9_3.tar.gz -O doxy
&& make $PARALLEL_MAKE_ARG \
&& make install \
&& cd ../.. && rm -rf doxygen*
# - HDF5
RUN wget https://support.hdfgroup.org/ftp/HDF5/releases/hdf5-1.10/hdf5-1.10.9/src/hdf5-1.10.9.tar.gz -O hdf5.tar.gz \
&& tar -xzf hdf5.tar.gz \
&& mv hdf5-* hdf5 \
&& cd hdf5 \
&& ./configure --prefix=/usr/local \
&& make $PARALLEL_MAKE_ARG \
&& make install \
&& cd .. && rm -rf hdf5*
# - Libsharp
RUN wget https://github.com/Libsharp/libsharp/archive/v1.0.0.tar.gz -O libsharp.tar.gz \
&& tar -xzf libsharp.tar.gz \
Expand Down Expand Up @@ -193,8 +201,7 @@ RUN wget https://github.com/jbeder/yaml-cpp/archive/yaml-cpp-0.6.3.tar.gz -O yam
&& make install \
&& cd ../.. \
&& rm -rf yaml-cpp*

# Install xsimd https://github.com/xtensor-stack/xsimd
# - xsimd https://github.com/xtensor-stack/xsimd
RUN wget http://github.com/xtensor-stack/xsimd/archive/refs/tags/8.1.0.tar.gz \
&& tar -xzf 8.1.0.tar.gz \
&& cd ./xsimd-8.1.0 \
Expand All @@ -206,6 +213,9 @@ RUN wget http://github.com/xtensor-stack/xsimd/archive/refs/tags/8.1.0.tar.gz \
&& cd ../.. \
&& rm -rf /work/8.1.0.tar.gz /work/xsimd-8.1.0

# Update ld cache to find shared libs in /usr/local/lib/
RUN ldconfig

WORKDIR /work
# Charm doesn't support compiling with clang without symbolic links
RUN ln -s $(which clang++-10) /usr/local/bin/clang++ \
Expand Down
29 changes: 26 additions & 3 deletions src/IO/H5/File.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
namespace h5 {
template <AccessType Access_t>
H5File<Access_t>::H5File(std::string file_name, bool append_to_file,
const std::string& input_source)
const std::string& input_source, bool use_file_locking)
: file_name_(std::move(file_name)) {
if (file_name_.size() - 3 != file_name_.find(".h5")) {
ERROR("All HDF5 file names must end in '.h5'. The path and file name '"
Expand All @@ -48,13 +48,24 @@ H5File<Access_t>::H5File(std::string file_name, bool append_to_file,
"explicitly delete the file first using the file_system "
"library in SpECTRE or through your shell.");
}

const hid_t fapl_id = H5Pcreate(H5P_FILE_ACCESS);
CHECK_H5(fapl_id, "Failed to create file access property list.");
#ifdef HDF5_SUPPORTS_SET_FILE_LOCKING
CHECK_H5(H5Pset_file_locking(
fapl_id, use_file_locking,
// Ignore file locks when they are disabled on the file system
true),
"Failed to configure file locking.");
#endif

file_id_ = file_exists
? H5Fopen(file_name_.c_str(),
AccessType::ReadOnly == Access_t ? h5f_acc_rdonly()
: h5f_acc_rdwr(),
h5p_default())
fapl_id)
: H5Fcreate(file_name_.c_str(), h5f_acc_trunc(), h5p_default(),
h5p_default());
fapl_id);
CHECK_H5(file_id_, "Failed to open file '"
<< file_name_ << "'. The file exists status is: "
<< std::boolalpha << file_exists
Expand Down Expand Up @@ -120,6 +131,18 @@ H5File<Access_t>::~H5File() {
}
}
}

template <AccessType Access_t>
void H5File<Access_t>::close() const {
// Need to close current object because `H5Fclose` keeps the file open
// internally until all objects are closed
close_current_object();
if (file_id_ != -1) {
CHECK_H5(H5Fclose(file_id_), "Failed to close the file.");
file_id_ = -1;
}
}

template <AccessType Access_t>
std::string H5File<Access_t>::input_source() const {
return h5::read_value_attribute<std::string>(file_id_, "InputSource.yaml"s);
Expand Down
25 changes: 23 additions & 2 deletions src/IO/H5/File.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,23 @@ class H5File {
* formatted). Defaults to an empty string; when writing, specify the provided
* yaml input options (if any) to write them to the output file's
* `InputSource.yaml` attribute.
* @param use_file_locking Toggle file locking (default false).
* HDF5 file locking is explained here:
* https://github.com/HDFGroup/hdf5/blob/develop/doc/file-locking.md.
* This toggle only has an effect if the HDF5 library supports
* 'H5Pset_file_locking'. Otherwise, file locking is enabled if the HDF5
* library was built with it, which it probably was. If file locking is
* enabled, simulations may crash when the file they try to access is being
* read by another process (like an analysis tool). We could make this more
* resilient in the future by waiting to acquire the file lock with a timeout,
* and/or retrying IO operations after progressively longer wait times (e.g.
* first try again right away, then also print to terminal after some retries,
* then eventually abort to avoid wasting compute time on a run that can't do
* IO).
*/
explicit H5File(std::string file_name, bool append_to_file = false,
const std::string& input_source = ""s);
const std::string& input_source = ""s,
bool use_file_locking = false);

/// \cond HIDDEN_SYMBOLS
~H5File();
Expand Down Expand Up @@ -167,6 +181,12 @@ class H5File {
*/
void close_current_object() const { current_object_ = nullptr; }

/*!
* \effects Closes the H5 file. No H5 operations are permitted after this
* operation.
*/
void close() const;

template <typename ObjectType>
bool exists(const std::string& path) const {
auto exists_group_name = check_if_object_exists<ObjectType>(path);
Expand All @@ -192,7 +212,8 @@ class H5File {
const std::string& path) const;

std::string file_name_;
hid_t file_id_{-1};
// NOLINTNEXTLINE(spectre-mutable)
mutable hid_t file_id_{-1};
// NOLINTNEXTLINE(spectre-mutable)
mutable std::unique_ptr<h5::Object> current_object_{nullptr};
std::vector<std::string> h5_groups_;
Expand Down
5 changes: 3 additions & 2 deletions src/IO/H5/Python/File.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@ void bind_h5file_impl(py::module& m) { // NOLINT
return f.template get<h5::Dat>(path);
},
py::return_value_policy::reference, py::arg("path"))
.def("close", &H5File::close_current_object)
.def("close_current_object", &H5File::close_current_object)
.def("close", &H5File::close)
.def("all_files",
[](const H5File& f) -> const std::vector<std::string> {
return f.template all_files<void>("/");
Expand All @@ -63,7 +64,7 @@ void bind_h5file_impl(py::module& m) { // NOLINT
.def("__exit__", [](H5File& f, const py::object& /* exception_type */,
const py::object& /* val */,
const py::object& /* traceback */) {
f.close_current_object();
f.close();
});

if constexpr (Access_t == h5::AccessType::ReadWrite) {
Expand Down
10 changes: 5 additions & 5 deletions src/Visualization/Python/InterpolateToMesh.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,14 +98,14 @@ def interpolate_to_mesh(source_file_path,
if obs_start <= source_vol.get_observation_value(obs) <= obs_end
][::obs_stride]

source_file.close()
source_file.close_current_object()
target_file.insert_vol(target_volume_data, version)
target_file.close()
target_file.close_current_object()

for obs in observations:
# the vols memory address may shift as we write to file,
# so we need to get them every iteration
source_file.close()
source_file.close_current_object()
source_vol = source_file.get_vol(source_volume_data)
extents = source_vol.get_extents(obs)
bases = source_vol.get_bases(obs)
Expand All @@ -126,7 +126,7 @@ def interpolate_to_mesh(source_file_path,
copy=False) for name in tensor_names
]

source_file.close()
source_file.close_current_object()

volume_data = []
# iterate over elements
Expand Down Expand Up @@ -157,7 +157,7 @@ def interpolate_to_mesh(source_file_path,
extents=target_mesh.extents(),
basis=target_mesh.basis(),
quadrature=target_mesh.quadrature()))
target_file.close()
target_file.close_current_object()
target_vol = target_file.get_vol(target_volume_data)
target_vol.write_volume_data(obs, obs_value, volume_data)

Expand Down
4 changes: 2 additions & 2 deletions support/Environments/caltech_hpc_gcc.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ spectre_load_sys_modules() {
module load oneapi/mpi/2021.5.1
module load mkl/18.1
module load gsl/2.4
module load hdf5/1.10.1
module load hdf5/1.12.1
module load boost/1_68_0-gcc730
module load cmake/3.18.0
module load python3/3.8.5
Expand All @@ -19,7 +19,7 @@ spectre_load_sys_modules() {
# Unload system modules
spectre_unload_sys_modules() {
module unload boost/1_68_0-gcc730
module unload hdf5/1.10.1
module unload hdf5/1.12.1
module unload gsl/2.4
module unload mkl/18.1
module unload oneapi/mpi/2021.5.1
Expand Down
2 changes: 1 addition & 1 deletion support/Python/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
# Click: to compose our Python CLI, load subcommands lazily, and for
# autocompletion
click
h5py >= 3.0.0
h5py >= 3.5.0
# - We constrain the Numpy version because v1.22+ segfaults in Pypp tests, see
# issue: https://github.com/sxs-collaboration/spectre/issues/3844
numpy < 1.22.0
Expand Down
14 changes: 6 additions & 8 deletions tests/Unit/IO/H5/Python/Test_ExtractDatFromH5.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,12 @@ def tearDown(self):
shutil.rmtree(self.created_dir)

def test_extract_dat_files(self):
h5file = spectre_h5.H5File(self.h5_filename, "r")

expected_memory_data = np.array(
h5file.get_dat("/Group0/MemoryData").get_data())
h5file.close()
expected_timestep_data = np.array(
h5file.get_dat("/TimeSteps2").get_data())
h5file.close()
with spectre_h5.H5File(self.h5_filename, "r") as h5file:
expected_memory_data = np.array(
h5file.get_dat("/Group0/MemoryData").get_data())
h5file.close_current_object()
expected_timestep_data = np.array(
h5file.get_dat("/TimeSteps2").get_data())

# All defaults, data should be same as expected
extract_dat_files(self.h5_filename,
Expand Down

0 comments on commit 990e3c5

Please sign in to comment.