Feature: Find datadicts matching a set of conditions #379

Closed
109 changes: 105 additions & 4 deletions plottr/data/datadict_storage.py
@@ -7,6 +7,7 @@
The lock file has the following format: ~<file_name>.lock. The file lock will get deleted even if the program
crashes. If the process is suddenly stopped, however, we cannot guarantee that the file lock will be deleted.
"""
from operator import itemgetter
import os
import logging
import time
@@ -15,7 +16,7 @@
import json
import shutil
from enum import Enum
from typing import Any, Union, Optional, Dict, Type, Collection
from typing import Any, Iterator, Tuple, Union, Optional, Dict, Type, Collection
from types import TracebackType
from pathlib import Path

@@ -279,7 +280,7 @@ def datadict_from_hdf5(path: Union[str, Path],

            if stopidx is None or stopidx > min(lens):
                stopidx = min(lens)
        else:
        elif len(set(lens)) == 1:
            if stopidx is None or stopidx > lens[0]:
                stopidx = lens[0]

@@ -561,7 +562,7 @@ class DDH5Writer(object):
    :param basedir: The root directory in which data is stored.
        :meth:`.create_file_structure` creates the structure inside this root and
        determines the file name of the data. The default structure implemented here is
        ``<root>/YYYY-MM-DD/YYYY-mm-dd_THHMMSS_<ID>-<name>/<filename>.ddh5``,
        ``<root>/YYYY-mm-dd/YYYY-mm-ddTHHMMSS_<ID>-<name>/<filename>.ddh5``,
        where <ID> is a short identifier string and <name> is the value of parameter `name`.
        To change this, re-implement :meth:`.data_folder` and/or
        :meth:`.create_file_structure`.
@@ -639,7 +640,7 @@ def data_folder(self) -> Path:
        be saved.

        Default format:
        ``<basedir>/YYYY-MM-DD/YYYY-mm-ddTHHMMSS_<ID>-<name>``.
        ``<basedir>/YYYY-mm-dd/YYYY-mm-ddTHHMMSS_<ID>-<name>``.
        In this implementation we use the first 8 characters of a UUID as ID.

        :returns: The folder path.
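
As an illustration of the default layout (the date, ID, and name below are hypothetical, not taken from the diff), a dataset named `cooldown` written on 2023-07-14 at 10:15:30 with ID `1b2f3a4c` would end up in:

<basedir>/2023-07-14/2023-07-14T101530_1b2f3a4c-cooldown/data.ddh5

with tag files such as `__complete__.tag` placed alongside the ddh5 file; these are the markers the search helpers below look for.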
@@ -716,3 +717,103 @@ def save_dict(self, name: str, d: dict) -> None:
        assert self.filepath is not None
        with open(self.filepath.parent / name, "x") as f:
            json.dump(d, f, indent=4, ensure_ascii=False, cls=NumpyJSONEncoder)


def search_datadicts(
    basedir: Union[str, Path],
    since: str,
    until: Optional[str] = None,
    name: Optional[str] = None,
    groupname: str = 'data',
    filename: str = 'data',
    structure_only: bool = False,
    only_complete: bool = True,
    skip_trash: bool = True,
) -> Iterator[Tuple[str, DataDict]]:
    """Iterate over datadicts matching a set of conditions.

    :param basedir: The root directory in which data is stored.
    :param since: Date (and time) in the format `YYYY-mm-dd` (or `YYYY-mm-ddTHHMMSS`).
    :param until: Date (and time) in the format `YYYY-mm-dd` (or `YYYY-mm-ddTHHMMSS`).
        If not given, defaults to `until = since`.
    :param name: Name of the dataset (if not given, match all datasets).
    :param groupname: Name of the hdf5 group.
    :param filename: Name of the ddh5 file without the extension.
    :param structure_only: If `True`, don't load the data values.
    :param only_complete: If `True`, only return datadicts tagged as complete.
    :param skip_trash: If `True`, skip datadicts tagged as trash.
    :return: Iterator over (foldername, datadict) pairs.
    """
    basedir = Path(basedir)
    if until is None:
        until = since
    assert len(since) == len(until)
    date = datetime.datetime.strptime(since[:10], "%Y-%m-%d")
    until_date = datetime.datetime.strptime(until[:10], "%Y-%m-%d")

    while date <= until_date:
        date_str = datetime.datetime.strftime(date, "%Y-%m-%d")
        date_dir = basedir / date_str
        if not date_dir.is_dir():
            # skip days for which no data folder exists
            date += datetime.timedelta(days=1)
            continue
        for folder_path in sorted(date_dir.iterdir()):
            if not folder_path.is_dir():
                continue
            foldername = folder_path.name
            if not (name is None or foldername.endswith(name)):
                continue
            if not (since <= foldername[:len(since)] <= until):
                continue
            if only_complete and not (folder_path / "__complete__.tag").is_file():
                continue
            if skip_trash and (folder_path / "__trash__.tag").is_file():
                continue
            datadict = datadict_from_hdf5(
                folder_path / filename,
                groupname,
                structure_only=structure_only
            )
            yield foldername, datadict
        date += datetime.timedelta(days=1)


def search_datadict(
    basedir: Union[str, Path],
    since: str,
    until: Optional[str] = None,
    name: Optional[str] = None,
    groupname: str = 'data',
    filename: str = 'data',
    structure_only: bool = False,
    only_complete: bool = True,
    skip_trash: bool = True,
    newest: bool = False,
) -> Tuple[str, DataDict]:
    """Find the single datadict matching a set of conditions.

    An `AssertionError` is raised if no datadict matches, or if multiple match
    and `newest` is `False`.

    :param basedir: The root directory in which data is stored.
    :param since: Date (and time) in the format `YYYY-mm-dd` (or `YYYY-mm-ddTHHMMSS`).
    :param until: Date (and time) in the format `YYYY-mm-dd` (or `YYYY-mm-ddTHHMMSS`).
        If not given, defaults to `until = since`.
    :param name: Name of the dataset (if not given, match all datasets).
    :param groupname: Name of the hdf5 group.
    :param filename: Name of the ddh5 file without the extension.
    :param structure_only: If `True`, don't load the data values.
    :param only_complete: If `True`, only return datadicts tagged as complete.
    :param skip_trash: If `True`, skip datadicts tagged as trash.
    :param newest: If `True`, return the newest matching datadict instead of
        requiring a unique match.
    :return: (foldername, datadict).
    """
    result = list(search_datadicts(
        basedir,
        since,
        until=until,
        name=name,
        groupname=groupname,
        filename=filename,
        structure_only=structure_only,
        only_complete=only_complete,
        skip_trash=skip_trash,
    ))
    assert len(result) > 0, "no matching datadict found"
    if not newest:
        assert len(result) == 1, f"{len(result)} matching datadicts found"
    return max(result, key=itemgetter(0))
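
Below is a minimal usage sketch of the two new helpers (not part of the diff); the base directory, dates, and dataset name `cooldown` are hypothetical.

from plottr.data.datadict_storage import search_datadict, search_datadicts

# Iterate over all complete, non-trashed datasets taken on a given day,
# loading only the structure (no data values):
for foldername, dd in search_datadicts("/path/to/data", "2023-07-14",
                                        structure_only=True):
    print(foldername)

# Fetch the newest dataset named "cooldown" taken within a date range:
foldername, dd = search_datadict(
    "/path/to/data", "2023-07-10", until="2023-07-14",
    name="cooldown", newest=True,
)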