diff --git a/plottr/data/datadict_storage.py b/plottr/data/datadict_storage.py index fbf612cb..384abe40 100644 --- a/plottr/data/datadict_storage.py +++ b/plottr/data/datadict_storage.py @@ -7,6 +7,7 @@ The lock file has the following format: ~.lock. The file lock will get deleted even if the program crashes. If the process is suddenly stopped however, we cannot guarantee that the file lock will be deleted. """ +from operator import itemgetter import os import logging import time @@ -15,7 +16,7 @@ import json import shutil from enum import Enum -from typing import Any, Union, Optional, Dict, Type, Collection +from typing import Any, Iterator, Tuple, Union, Optional, Dict, Type, Collection from types import TracebackType from pathlib import Path @@ -279,7 +280,7 @@ def datadict_from_hdf5(path: Union[str, Path], if stopidx is None or stopidx > min(lens): stopidx = min(lens) - else: + elif len(set(lens)) == 1: if stopidx is None or stopidx > lens[0]: stopidx = lens[0] @@ -561,7 +562,7 @@ class DDH5Writer(object): :param basedir: The root directory in which data is stored. :meth:`.create_file_structure` is creating the structure inside this root and determines the file name of the data. The default structure implemented here is - ``/YYYY-MM-DD/YYYY-mm-dd_THHMMSS_-/.ddh5``, + ``/YYYY-mm-dd/YYYY-mm-ddTHHMMSS_-/.ddh5``, where is a short identifier string and is the value of parameter `name`. To change this, re-implement :meth:`.data_folder` and/or :meth:`.create_file_structure`. @@ -639,7 +640,7 @@ def data_folder(self) -> Path: be saved. Default format: - ``/YYYY-MM-DD/YYYY-mm-ddTHHMMSS_-``. + ``/YYYY-mm-dd/YYYY-mm-ddTHHMMSS_-``. In this implementation we use the first 8 characters of a UUID as ID. :returns: The folder path. 
# (patch context — unchanged tail of DDH5Writer.save_dict:)
#     assert self.filepath is not None
#     with open(self.filepath.parent / name, "x") as f:
#         json.dump(d, f, indent=4, ensure_ascii=False, cls=NumpyJSONEncoder)


def search_datadicts(
    basedir: Union[str, Path],
    since: str,
    until: Optional[str] = None,
    name: Optional[str] = None,
    groupname: str = 'data',
    filename: str = 'data',
    structure_only: bool = False,
    only_complete: bool = True,
    skip_trash: bool = True,
) -> Iterator[Tuple[str, DataDict]]:
    """Iterate over datadicts matching a set of conditions.

    :param basedir: The root directory in which data is stored.
    :param since: Date (and time) in the format `YYYY-mm-dd` (or `YYYY-mm-ddTHHMMSS`).
    :param until: Date (and time) in the format `YYYY-mm-dd` (or `YYYY-mm-ddTHHMMSS`).
        If not given, default to `until = since`. Must have the same format
        (length) as `since`.
    :param name: Name of the dataset (if not given, match all datasets).
    :param groupname: Name of hdf5 group.
    :param filename: Name of the ddh5 file without the extension.
    :param structure_only: If `True`, don't load the data values.
    :param only_complete: If `True`, only return datadicts tagged as complete.
    :param skip_trash: If `True`, skip datadicts tagged as trash.
    :return: Iterator over (foldername, datadict).
    """
    basedir = Path(basedir)
    if until is None:
        until = since
    # `since`/`until` are compared lexicographically against folder-name
    # prefixes below; that comparison is only meaningful when both bounds
    # have the same length (same format).
    assert len(since) == len(until)
    date = datetime.datetime.strptime(since[:10], "%Y-%m-%d")
    until_date = datetime.datetime.strptime(until[:10], "%Y-%m-%d")

    while date <= until_date:
        date_str = datetime.datetime.strftime(date, "%Y-%m-%d")
        day_folder = basedir / date_str
        # A day inside the range may have no data at all: skip it instead
        # of letting iterdir() raise FileNotFoundError.
        if day_folder.is_dir():
            for folder_path in sorted(day_folder.iterdir()):
                if not folder_path.is_dir():
                    continue
                foldername = folder_path.name
                if not (name is None or foldername.endswith(name)):
                    continue
                # Data folder names begin with the creation timestamp, so a
                # lexicographic prefix comparison filters on creation time.
                if not (since <= foldername[:len(since)] <= until):
                    continue
                if only_complete and not ((folder_path / "__complete__.tag").is_file()):
                    continue
                if skip_trash and (folder_path / "__trash__.tag").is_file():
                    continue
                datadict = datadict_from_hdf5(
                    folder_path / filename,
                    groupname,
                    structure_only=structure_only
                )
                yield foldername, datadict
        date += datetime.timedelta(days=1)


def search_datadict(
    basedir: Union[str, Path],
    since: str,
    until: Optional[str] = None,
    name: Optional[str] = None,
    groupname: str = 'data',
    filename: str = 'data',
    structure_only: bool = False,
    only_complete: bool = True,
    skip_trash: bool = True,
    newest: bool = False,
) -> Tuple[str, DataDict]:
    """Find the datadict which matches a set of conditions.

    `AssertionError` is raised if no datadict matches, or if multiple
    datadicts match while `newest` is `False`.

    :param basedir: The root directory in which data is stored.
    :param since: Date (and time) in the format `YYYY-mm-dd` (or `YYYY-mm-ddTHHMMSS`).
    :param until: Date (and time) in the format `YYYY-mm-dd` (or `YYYY-mm-ddTHHMMSS`).
        If not given, default to `until = since`.
    :param name: Name of the dataset (if not given, match all datasets).
    :param groupname: Name of hdf5 group.
    :param filename: Name of the ddh5 file without the extension.
    :param structure_only: If `True`, don't load the data values.
    :param only_complete: If `True`, only return datadicts tagged as complete.
    :param skip_trash: If `True`, skip datadicts tagged as trash.
    :param newest: If `True`, return the newest matching datadict instead of
        requiring the match to be unique.
    :return: (foldername, datadict).
    """
    result = list(search_datadicts(
        basedir,
        since,
        until=until,
        name=name,
        groupname=groupname,
        filename=filename,
        structure_only=structure_only,
        only_complete=only_complete,
        skip_trash=skip_trash,
    ))
    assert len(result) > 0, "no matching datadict found"
    if not newest:
        assert len(result) == 1, f"{len(result)} matching datadicts found"
    # Folder names begin with the creation timestamp, so the lexicographic
    # maximum of the folder names is the newest match.
    return max(result, key=itemgetter(0))