Skip to content

Commit

Permalink
Adding "equal" and "allequal", bugfix for "root" in "copydatasets" (#14)
Browse files Browse the repository at this point in the history
  • Loading branch information
tdegeus committed Dec 6, 2019
1 parent 3de7648 commit 181790a
Show file tree
Hide file tree
Showing 2 changed files with 123 additions and 21 deletions.
141 changes: 120 additions & 21 deletions GooseHDF5/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
import warnings
warnings.filterwarnings("ignore")

import posixpath
import re
import h5py

__version__ = '0.2.0'
__version__ = '0.3.0'

# ==================================================================================================

Expand All @@ -14,6 +12,8 @@ def abspath(path):
Return absolute path.
'''

import posixpath

return posixpath.normpath(posixpath.join('/', path))

# ==================================================================================================
Expand All @@ -23,10 +23,20 @@ def join(*args, root=False):
Join path components.
'''

import posixpath

lst = []

for i, arg in enumerate(args):
if i == 0:
lst += [arg]
else:
lst += [arg.strip('/')]

if root:
return posixpath.join('/', *args)
return posixpath.join('/', *lst)

return posixpath.join(*args)
return posixpath.join(*lst)

# ==================================================================================================

Expand All @@ -36,19 +46,17 @@ def getdatasets(data, root='/'):
.. code-block:: python
data = h5py.File('...', 'r')
with h5py.File('...', 'r') as data:
# loop over all paths
for path in GooseHDF5.getdatasets(data):
print(path)
# get a set of all datasets
paths = set(GooseHDF5.getdatasets(data))
# loop over all paths
for path in GooseHDF5.getdatasets(data):
print(path)
# get a list of all datasets
paths = list(GooseHDF5.getdatasets(data))
# get a set of all datasets
paths = set(GooseHDF5.getdatasets(data))
data.close()
# get a list of all datasets
paths = list(GooseHDF5.getdatasets(data))
Read more in `this answer <https://stackoverflow.com/a/50720736/2646505>`_.
Expand Down Expand Up @@ -136,10 +144,10 @@ def getpaths(data, root='/', max_depth=None, fold=None):
.. code-block:: python
data = h5py.File('...', 'r')
with h5py.File('...', 'r') as data:
for path in GooseHDF5.getpaths(data, max_depth=2, fold='/data'):
print(path)
for path in GooseHDF5.getpaths(data, max_depth=2, fold='/data'):
print(path)
Will print:
Expand All @@ -149,8 +157,9 @@ def getpaths(data, root='/', max_depth=None, fold=None):
/data/...
/e
The ``...`` indicate it concerns a folded group, not a dataset. The first group was folded because
of the maximum depth, and the second because if was specifically requested to be folded.
The ``...`` indicate that it concerns a folded group, not a dataset.
Here, the first group was folded because of the maximum depth, and the second because it was
specifically requested to be folded.
'''

if max_depth and fold:
Expand Down Expand Up @@ -317,6 +326,29 @@ def iterator(g, prefix, fold, max_depth):

# ==================================================================================================

def filter_datasets(data, paths):
    r'''
    Filter, from a list of paths, only those paths that point to datasets.
    Folded groups (paths ending in ``/...``, as produced by ``getpaths``) and
    plain groups are discarded.

    Can for example be used in conjunction with "getpaths":

    .. code-block:: python

        with h5py.File('...', 'r') as data:

            datasets = GooseHDF5.filter_datasets(data,
                GooseHDF5.getpaths(data, max_depth=2, fold='/data'))
    '''

    import re

    # matches any path containing the "/..." fold marker
    folded = re.compile(r'(.*)(/\.\.\.)')

    # short-circuit: only non-folded paths are looked up in the file
    return [path for path in paths
            if not folded.match(path) and isinstance(data[path], h5py.Dataset)]

# ==================================================================================================

def verify(data, datasets, error=False):
r'''
Try reading each dataset of a list of datasets. Return a list with only those datasets that can be
Expand Down Expand Up @@ -394,13 +426,15 @@ def copydatasets(source, dest, source_datasets, dest_datasets=None, root=None):
In addition a 'root' (path prefix) for the destination datasets name can be specified.
'''

import posixpath

source_datasets = [abspath(path) for path in source_datasets]

if not dest_datasets:
dest_datasets = [path for path in source_datasets]

if root:
dest_datasets = [join(path, root=True) for path in dest_datasets]
dest_datasets = [join(root, path, root=True) for path in dest_datasets]

for dest_path in dest_datasets:
if exists(dest, dest_path):
Expand All @@ -420,3 +454,68 @@ def copydatasets(source, dest, source_datasets, dest_datasets=None, root=None):
for source_path, dest_path in zip(source_datasets, dest_datasets):
group = posixpath.split(dest_path)[0]
source.copy(source_path, dest[group], posixpath.split(dest_path)[1])

# ==================================================================================================

def _equal(a, b):
    r'''
    Return ``True`` if two h5py objects hold equal data.

    :param a: h5py Group or Dataset.
    :param b: h5py Group or Dataset.
    :returns: ``True`` if equal, ``False`` otherwise.
    :raises IOError: If one object is a Dataset and the other is not.
    '''

    import numpy as np

    # two groups are considered equal: only datasets carry data to compare
    if isinstance(a, h5py.Group) and isinstance(b, h5py.Group):
        return True

    if not isinstance(a, h5py.Dataset) or not isinstance(b, h5py.Dataset):
        raise IOError('Not a Dataset')

    # numeric data: compare with floating-point tolerance
    if np.issubdtype(a.dtype, np.number) and np.issubdtype(b.dtype, np.number):
        return bool(np.allclose(a, b))

    if a.size != b.size:
        return False

    # scalar / single-item datasets: compare the stored values directly
    if a.size == 1:
        return bool(a[...] == b[...])

    # generic fallback: element-wise comparison
    # NOTE(review): iterating a Dataset yields its first-axis slices, so this
    # presumably assumes 1-D (e.g. string) data -- confirm for N-D non-numeric data
    return list(a) == list(b)

# --------------------------------------------------------------------------------------------------

def equal(source, dest, source_dataset, dest_dataset=None):
    r'''
    Check that a dataset is identical in both files.

    :param source: Source h5py file.
    :param dest: Destination h5py file.
    :param source_dataset: Path of the dataset in the source file.
    :param dest_dataset: Path of the dataset in the destination file
        (default: same as ``source_dataset``).
    :returns: ``True`` if the datasets are equal, ``False`` otherwise.
    :raises IOError: If either path is not present in its file.
    '''

    if not dest_dataset:
        dest_dataset = source_dataset

    # bugfix: close the quote around the dataset name in the error message
    if source_dataset not in source:
        raise IOError('"{0:s}" not in "{1:s}"'.format(source_dataset, source.filename))

    if dest_dataset not in dest:
        raise IOError('"{0:s}" not in "{1:s}"'.format(dest_dataset, dest.filename))

    return _equal(source[source_dataset], dest[dest_dataset])

# --------------------------------------------------------------------------------------------------

def allequal(source, dest, source_datasets, dest_datasets=None):
    r'''
    Check that all listed datasets are equal in both files.

    :param source: Source h5py file.
    :param dest: Destination h5py file.
    :param source_datasets: List of dataset paths in the source file.
    :param dest_datasets: List of dataset paths in the destination file
        (default: same as ``source_datasets``).
    :returns: ``True`` if all datasets are equal, ``False`` otherwise.
    '''

    if not dest_datasets:
        dest_datasets = list(source_datasets)

    # short-circuits on the first mismatch, like an explicit loop would
    return all(
        equal(source, dest, src, dst)
        for src, dst in zip(source_datasets, dest_datasets))
3 changes: 3 additions & 0 deletions docs/module.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ Iterators

GooseHDF5.getdatasets
GooseHDF5.getpaths
GooseHDF5.filter_datasets
GooseHDF5.copydatasets

Verify
Expand All @@ -32,6 +33,8 @@ Verify
GooseHDF5.exists
GooseHDF5.exists_any
GooseHDF5.exists_all
GooseHDF5.equal
GooseHDF5.allequal

Documentation
=============
Expand Down

0 comments on commit 181790a

Please sign in to comment.