From d07b5736b6c164bed0af6184af347783f828cd04 Mon Sep 17 00:00:00 2001 From: yoshi74ls181 Date: Tue, 21 Feb 2023 11:14:48 +0900 Subject: [PATCH 1/6] Feature: Find datadicts matching a set of conditions --- plottr/data/datadict_storage.py | 82 ++++++++++++++++++++++++++++++++- 1 file changed, 80 insertions(+), 2 deletions(-) diff --git a/plottr/data/datadict_storage.py b/plottr/data/datadict_storage.py index 2d266326..a7245fd6 100644 --- a/plottr/data/datadict_storage.py +++ b/plottr/data/datadict_storage.py @@ -15,7 +15,7 @@ import json import shutil from enum import Enum -from typing import Any, Union, Optional, Dict, Type, Collection +from typing import Any, Iterator, Tuple, Union, Optional, Dict, Type, Collection from types import TracebackType from pathlib import Path @@ -561,7 +561,7 @@ class DDH5Writer(object): :param basedir: The root directory in which data is stored. :meth:`.create_file_structure` is creating the structure inside this root and determines the file name of the data. The default structure implemented here is - ``/YYYY-MM-DD/YYYY-mm-dd_THHMMSS_-/.ddh5``, + ``/YYYY-MM-DD/YYYY-mm-ddTHHMMSS_-/.ddh5``, where is a short identifier string and is the value of parameter `name`. To change this, re-implement :meth:`.data_folder` and/or :meth:`.create_file_structure`. @@ -724,3 +724,81 @@ def save_dict(self, name: str, d: dict) -> None: assert self.filepath is not None with open(self.filepath.parent / name, "x") as f: json.dump(d, f, indent=4, ensure_ascii=False, cls=NumpyJSONEncoder) + + +def search_datadicts( + since: str, + until: Optional[str] = None, + name: Optional[str] = None, + basedir: Union[str, Path] = '.', + groupname: str = 'data', + filename: str = 'data', + structure_only: bool = False, +) -> Iterator[Tuple[str, DataDict]]: + """Iterate over datadicts matching a set of conditions. + :param since: Date (and time) in the format `YYYY-MM-DD` (or `YYYY-mm-ddTHHMMSS`). + :param until: Date (and time) in the format `YYYY-MM-DD` (or `YYYY-mm-ddTHHMMSS`). Defaults to `since`. + :param name: Name of the dataset (if not given, match all datasets). + :param basedir: The root directory in which data is stored. + :param groupname: Name of hdf5 group. + :param filename: Name of the ddh5 file without the extension. + :param structure_only: If `True`, don't load the data values. + :return: Iterator over (foldername, datadict). + """ + basedir = Path(basedir) + if until is None: + until = since + assert len(since) == len(until) + date = datetime.datetime.strptime(since[:10], "%Y-%m-%d") + until_date = datetime.datetime.strptime(until[:10], "%Y-%m-%d") + + while date <= until_date: + date_str = datetime.datetime.strftime(date, "%Y-%m-%d") + for folder_path in sorted((basedir / date_str).iterdir()): + if not folder_path.is_dir(): + continue + foldername = folder_path.name + if not (name is None or foldername.endswith(name)): + continue + if not (since <= foldername[:len(since)] <= until): + continue + datadict = datadict_from_hdf5( + folder_path / filename, + groupname, + structure_only=structure_only + ) + yield foldername, datadict + date += datetime.timedelta(days=1) + + +def search_datadict( + since: str, + until: Optional[str] = None, + name: Optional[str] = None, + basedir: Union[str, Path] = '.', + groupname: str = 'data', + filename: str = 'data', + structure_only: bool = False, +) -> Tuple[str, DataDict]: + """Find the datadict which matches a set of conditions. + `AssertionError` is raised if there are zero or multiple matching datadicts. + :param since: Date (and time) in the format `YYYY-MM-DD` (or `YYYY-mm-ddTHHMMSS`). + :param until: Date (and time) in the format `YYYY-MM-DD` (or `YYYY-mm-ddTHHMMSS`). Defaults to `since`. + :param name: Name of the dataset (if not given, match all datasets). + :param basedir: The root directory in which data is stored. + :param groupname: Name of hdf5 group. + :param filename: Name of the ddh5 file without the extension. + :param structure_only: If `True`, don't load the data values. + :return: (foldername, datadict). + """ + result = list(search_datadicts( + since, + until=until, + name=name, + basedir=basedir, + groupname=groupname, + filename=filename, + structure_only=structure_only, + )) + assert len(result) == 1, f"{len(result)} matching datadicts found" + return result[0] From b2fbee99c5874572eb784000833341660a25ff94 Mon Sep 17 00:00:00 2001 From: yoshi74ls181 Date: Tue, 21 Feb 2023 11:27:02 +0900 Subject: [PATCH 2/6] Second thought: reorder arguments --- plottr/data/datadict_storage.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/plottr/data/datadict_storage.py b/plottr/data/datadict_storage.py index a7245fd6..9c1ec83c 100644 --- a/plottr/data/datadict_storage.py +++ b/plottr/data/datadict_storage.py @@ -727,19 +727,20 @@ def save_dict(self, name: str, d: dict) -> None: def search_datadicts( + basedir: Union[str, Path], since: str, until: Optional[str] = None, name: Optional[str] = None, - basedir: Union[str, Path] = '.', groupname: str = 'data', filename: str = 'data', structure_only: bool = False, ) -> Iterator[Tuple[str, DataDict]]: """Iterate over datadicts matching a set of conditions. + + :param basedir: The root directory in which data is stored. :param since: Date (and time) in the format `YYYY-MM-DD` (or `YYYY-mm-ddTHHMMSS`). :param until: Date (and time) in the format `YYYY-MM-DD` (or `YYYY-mm-ddTHHMMSS`). Defaults to `since`. :param name: Name of the dataset (if not given, match all datasets). - :param basedir: The root directory in which data is stored. :param groupname: Name of hdf5 group. :param filename: Name of the ddh5 file without the extension. :param structure_only: If `True`, don't load the data values. @@ -772,30 +773,31 @@ def search_datadicts( def search_datadict( + basedir: Union[str, Path], since: str, until: Optional[str] = None, name: Optional[str] = None, - basedir: Union[str, Path] = '.', groupname: str = 'data', filename: str = 'data', structure_only: bool = False, ) -> Tuple[str, DataDict]: """Find the datadict which matches a set of conditions. `AssertionError` is raised if there are zero or multiple matching datadicts. + + :param basedir: The root directory in which data is stored. :param since: Date (and time) in the format `YYYY-MM-DD` (or `YYYY-mm-ddTHHMMSS`). :param until: Date (and time) in the format `YYYY-MM-DD` (or `YYYY-mm-ddTHHMMSS`). Defaults to `since`. :param name: Name of the dataset (if not given, match all datasets). - :param basedir: The root directory in which data is stored. :param groupname: Name of hdf5 group. :param filename: Name of the ddh5 file without the extension. :param structure_only: If `True`, don't load the data values. :return: (foldername, datadict). """ result = list(search_datadicts( + basedir, since, until=until, name=name, - basedir=basedir, groupname=groupname, filename=filename, structure_only=structure_only, From 570a1de115e8ea58c26572592d02abf67713f0f1 Mon Sep 17 00:00:00 2001 From: yoshi74ls181 Date: Fri, 24 Feb 2023 15:08:53 +0900 Subject: [PATCH 3/6] Add a keyword argument `newest` to search_datadict --- plottr/data/datadict_storage.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/plottr/data/datadict_storage.py b/plottr/data/datadict_storage.py index 9c1ec83c..0ad75b9e 100644 --- a/plottr/data/datadict_storage.py +++ b/plottr/data/datadict_storage.py @@ -7,6 +7,7 @@ The lock file has the following format: ~.lock. The file lock will get deleted even if the program crashes. If the process is suddenly stopped however, we cannot guarantee that the file lock will be deleted. """ +from operator import itemgetter import os import logging import time @@ -780,6 +781,7 @@ def search_datadict( groupname: str = 'data', filename: str = 'data', structure_only: bool = False, + newest: bool = False, ) -> Tuple[str, DataDict]: """Find the datadict which matches a set of conditions. `AssertionError` is raised if there are zero or multiple matching datadicts. @@ -791,6 +793,7 @@ def search_datadict( :param groupname: Name of hdf5 group. :param filename: Name of the ddh5 file without the extension. :param structure_only: If `True`, don't load the data values. + :param newest: If `True`, return the newest matching datadict :return: (foldername, datadict). """ result = list(search_datadicts( @@ -802,5 +805,7 @@ def search_datadict( filename=filename, structure_only=structure_only, )) - assert len(result) == 1, f"{len(result)} matching datadicts found" - return result[0] + assert len(result) > 0, "no matching datadict found" + if not newest: + assert len(result) == 1, f"{len(result)} matching datadicts found" + return max(result, key=itemgetter(0)) From 067fe42f62be0d00490a3b2c36d18ccd22088c2f Mon Sep 17 00:00:00 2001 From: yoshi74ls181 Date: Sat, 25 Feb 2023 12:35:57 +0900 Subject: [PATCH 4/6] Fix a bug in datadict_from_hdf5 --- plottr/data/datadict_storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plottr/data/datadict_storage.py b/plottr/data/datadict_storage.py index 7e3253c4..d642a43c 100644 --- a/plottr/data/datadict_storage.py +++ b/plottr/data/datadict_storage.py @@ -280,7 +280,7 @@ def datadict_from_hdf5(path: Union[str, Path], if stopidx is None or stopidx > min(lens): stopidx = min(lens) - else: + elif len(set(lens)) == 1: if stopidx is None or stopidx > lens[0]: stopidx = lens[0] From c1add113c2440380cc25b1c0ab221cc18508e980 Mon Sep 17 00:00:00 2001 From: yoshi74ls181 Date: Sat, 25 Feb 2023 12:50:31 +0900 Subject: [PATCH 5/6] Fix typos in docstring --- plottr/data/datadict_storage.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/plottr/data/datadict_storage.py b/plottr/data/datadict_storage.py index d642a43c..434a03b3 100644 --- a/plottr/data/datadict_storage.py +++ b/plottr/data/datadict_storage.py @@ -562,7 +562,7 @@ class DDH5Writer(object): :param basedir: The root directory in which data is stored. :meth:`.create_file_structure` is creating the structure inside this root and determines the file name of the data. The default structure implemented here is - ``/YYYY-MM-DD/YYYY-mm-ddTHHMMSS_-/.ddh5``, + ``/YYYY-mm-dd/YYYY-mm-ddTHHMMSS_-/.ddh5``, where is a short identifier string and is the value of parameter `name`. To change this, re-implement :meth:`.data_folder` and/or :meth:`.create_file_structure`. @@ -642,7 +642,7 @@ def data_folder(self) -> Path: be saved. Default format: - ``/YYYY-MM-DD/YYYY-mm-ddTHHMMSS_-``. + ``/YYYY-mm-dd/YYYY-mm-ddTHHMMSS_-``. In this implementation we use the first 8 characters of a UUID as ID. :returns: The folder path. @@ -739,8 +739,9 @@ def search_datadicts( """Iterate over datadicts matching a set of conditions. :param basedir: The root directory in which data is stored. - :param since: Date (and time) in the format `YYYY-MM-DD` (or `YYYY-mm-ddTHHMMSS`). - :param until: Date (and time) in the format `YYYY-MM-DD` (or `YYYY-mm-ddTHHMMSS`). Defaults to `since`. + :param since: Date (and time) in the format `YYYY-mm-dd` (or `YYYY-mm-ddTHHMMSS`). + :param until: Date (and time) in the format `YYYY-mm-dd` (or `YYYY-mm-ddTHHMMSS`). + If not given, default to `until = since`. :param name: Name of the dataset (if not given, match all datasets). :param groupname: Name of hdf5 group. :param filename: Name of the ddh5 file without the extension. @@ -787,8 +788,9 @@ def search_datadict( `AssertionError` is raised if there are zero or multiple matching datadicts. :param basedir: The root directory in which data is stored. - :param since: Date (and time) in the format `YYYY-MM-DD` (or `YYYY-mm-ddTHHMMSS`). - :param until: Date (and time) in the format `YYYY-MM-DD` (or `YYYY-mm-ddTHHMMSS`). Defaults to `since`. + :param since: Date (and time) in the format `YYYY-mm-dd` (or `YYYY-mm-ddTHHMMSS`). + :param until: Date (and time) in the format `YYYY-mm-dd` (or `YYYY-mm-ddTHHMMSS`). + If not given, default to `until = since`. :param name: Name of the dataset (if not given, match all datasets). :param groupname: Name of hdf5 group. :param filename: Name of the ddh5 file without the extension. From aac8112a66c80550408315f8277ab343e264ffca Mon Sep 17 00:00:00 2001 From: yoshi74ls181 Date: Sun, 26 Feb 2023 00:54:37 +0900 Subject: [PATCH 6/6] Add `only_complete` and `skip_trash` features to search_datadicts --- plottr/data/datadict_storage.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/plottr/data/datadict_storage.py b/plottr/data/datadict_storage.py index 434a03b3..5b9c218f 100644 --- a/plottr/data/datadict_storage.py +++ b/plottr/data/datadict_storage.py @@ -735,6 +735,8 @@ def search_datadicts( groupname: str = 'data', filename: str = 'data', structure_only: bool = False, + only_complete: bool = True, + skip_trash: bool = True, ) -> Iterator[Tuple[str, DataDict]]: """Iterate over datadicts matching a set of conditions. @@ -746,6 +748,8 @@ def search_datadicts( :param groupname: Name of hdf5 group. :param filename: Name of the ddh5 file without the extension. :param structure_only: If `True`, don't load the data values. + :param only_complete: If `True`, only return datadicts tagged as complete. + :param skip_trash: If `True`, skip datadicts tagged as trash. :return: Iterator over (foldername, datadict). """ basedir = Path(basedir) @@ -765,6 +769,10 @@ def search_datadicts( continue if not (since <= foldername[:len(since)] <= until): continue + if only_complete and not ((folder_path / "__complete__.tag").is_file()): + continue + if skip_trash and (folder_path / "__trash__.tag").is_file(): + continue datadict = datadict_from_hdf5( folder_path / filename, groupname, @@ -782,6 +790,8 @@ def search_datadict( groupname: str = 'data', filename: str = 'data', structure_only: bool = False, + only_complete: bool = True, + skip_trash: bool = True, newest: bool = False, ) -> Tuple[str, DataDict]: """Find the datadict which matches a set of conditions. @@ -795,6 +805,8 @@ def search_datadict( :param groupname: Name of hdf5 group. :param filename: Name of the ddh5 file without the extension. :param structure_only: If `True`, don't load the data values. + :param only_complete: If `True`, only return datadicts tagged as complete. + :param skip_trash: If `True`, skip datadicts tagged as trash. :param newest: If `True`, return the newest matching datadict :return: (foldername, datadict). """ @@ -806,6 +818,8 @@ def search_datadict( groupname=groupname, filename=filename, structure_only=structure_only, + only_complete=only_complete, + skip_trash=skip_trash, )) assert len(result) > 0, "no matching datadict found" if not newest: