Feature: Find datadicts matching a set of conditions #379

Closed
109 changes: 105 additions & 4 deletions plottr/data/datadict_storage.py
@@ -7,6 +7,7 @@
The lock file has the following format: ~<file_name>.lock. The file lock will get deleted even if the program
crashes. If the process is suddenly stopped, however, we cannot guarantee that the file lock will be deleted.
"""
from operator import itemgetter
import os
import logging
import time
@@ -15,7 +16,7 @@
import json
import shutil
from enum import Enum
from typing import Any, Union, Optional, Dict, Type, Collection
from typing import Any, Iterator, Tuple, Union, Optional, Dict, Type, Collection
from types import TracebackType
from pathlib import Path

@@ -279,7 +280,7 @@ def datadict_from_hdf5(path: Union[str, Path],

            if stopidx is None or stopidx > min(lens):
                stopidx = min(lens)
        else:
        elif len(set(lens)) == 1:
            if stopidx is None or stopidx > lens[0]:
                stopidx = lens[0]

@@ -561,7 +562,7 @@ class DDH5Writer(object):
    :param basedir: The root directory in which data is stored.
        :meth:`.create_file_structure` creates the structure inside this root and
        determines the file name of the data. The default structure implemented here is
        ``<root>/YYYY-MM-DD/YYYY-mm-dd_THHMMSS_<ID>-<name>/<filename>.ddh5``,
        ``<root>/YYYY-mm-dd/YYYY-mm-ddTHHMMSS_<ID>-<name>/<filename>.ddh5``,
        where <ID> is a short identifier string and <name> is the value of parameter `name`.
        To change this, re-implement :meth:`.data_folder` and/or
        :meth:`.create_file_structure`.
@@ -639,7 +640,7 @@ def data_folder(self) -> Path:
        be saved.

        Default format:
        ``<basedir>/YYYY-MM-DD/YYYY-mm-ddTHHMMSS_<ID>-<name>``.
        ``<basedir>/YYYY-mm-dd/YYYY-mm-ddTHHMMSS_<ID>-<name>``.
        In this implementation we use the first 8 characters of a UUID as ID.

        :returns: The folder path.
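
As an illustration of the default layout (the date, ID, and name below are hypothetical, not taken from the diff), a dataset named `cooldown` written on 2023-07-14 at 10:15:30 with ID `1b2f3a4c` would end up in:

<basedir>/2023-07-14/2023-07-14T101530_1b2f3a4c-cooldown/data.ddh5

with tag files such as `__complete__.tag` placed alongside the ddh5 file; these are the markers the search helpers below look for.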
@@ -716,3 +717,103 @@ def save_dict(self, name: str, d: dict) -> None:
        assert self.filepath is not None
        with open(self.filepath.parent / name, "x") as f:
            json.dump(d, f, indent=4, ensure_ascii=False, cls=NumpyJSONEncoder)


def search_datadicts(
    basedir: Union[str, Path],
    since: str,
    until: Optional[str] = None,
    name: Optional[str] = None,
    groupname: str = 'data',
    filename: str = 'data',
    structure_only: bool = False,
    only_complete: bool = True,
    skip_trash: bool = True,
) -> Iterator[Tuple[str, DataDict]]:
    """Iterate over datadicts matching a set of conditions.

    :param basedir: The root directory in which data is stored.
    :param since: Date (and time) in the format `YYYY-mm-dd` (or `YYYY-mm-ddTHHMMSS`).
    :param until: Date (and time) in the format `YYYY-mm-dd` (or `YYYY-mm-ddTHHMMSS`).
        If not given, defaults to `until = since`.
    :param name: Name of the dataset (if not given, match all datasets).
    :param groupname: Name of the hdf5 group.
    :param filename: Name of the ddh5 file without the extension.
    :param structure_only: If `True`, don't load the data values.
    :param only_complete: If `True`, only return datadicts tagged as complete.
    :param skip_trash: If `True`, skip datadicts tagged as trash.
    :return: Iterator over (foldername, datadict) pairs.
    """
    basedir = Path(basedir)
    if until is None:
        until = since
    assert len(since) == len(until)
    date = datetime.datetime.strptime(since[:10], "%Y-%m-%d")
    until_date = datetime.datetime.strptime(until[:10], "%Y-%m-%d")

    while date <= until_date:
        date_str = datetime.datetime.strftime(date, "%Y-%m-%d")
        date_dir = basedir / date_str
        if not date_dir.is_dir():
            # skip days for which no data folder exists
            date += datetime.timedelta(days=1)
            continue
        for folder_path in sorted(date_dir.iterdir()):
            if not folder_path.is_dir():
                continue
            foldername = folder_path.name
            if not (name is None or foldername.endswith(name)):
                continue
            if not (since <= foldername[:len(since)] <= until):
                continue
            if only_complete and not (folder_path / "__complete__.tag").is_file():
                continue
            if skip_trash and (folder_path / "__trash__.tag").is_file():
                continue
            datadict = datadict_from_hdf5(
                folder_path / filename,
                groupname,
                structure_only=structure_only
            )
            yield foldername, datadict
        date += datetime.timedelta(days=1)


def search_datadict(
    basedir: Union[str, Path],
    since: str,
    until: Optional[str] = None,
    name: Optional[str] = None,
    groupname: str = 'data',
    filename: str = 'data',
    structure_only: bool = False,
    only_complete: bool = True,
    skip_trash: bool = True,
    newest: bool = False,
) -> Tuple[str, DataDict]:
    """Find the single datadict matching a set of conditions.

    An `AssertionError` is raised if no datadict matches, or if multiple match
    and `newest` is `False`.

    :param basedir: The root directory in which data is stored.
    :param since: Date (and time) in the format `YYYY-mm-dd` (or `YYYY-mm-ddTHHMMSS`).
    :param until: Date (and time) in the format `YYYY-mm-dd` (or `YYYY-mm-ddTHHMMSS`).
        If not given, defaults to `until = since`.
    :param name: Name of the dataset (if not given, match all datasets).
    :param groupname: Name of the hdf5 group.
    :param filename: Name of the ddh5 file without the extension.
    :param structure_only: If `True`, don't load the data values.
    :param only_complete: If `True`, only return datadicts tagged as complete.
    :param skip_trash: If `True`, skip datadicts tagged as trash.
    :param newest: If `True`, return the newest matching datadict instead of
        requiring a unique match.
    :return: (foldername, datadict).
    """
    result = list(search_datadicts(
        basedir,
        since,
        until=until,
        name=name,
        groupname=groupname,
        filename=filename,
        structure_only=structure_only,
        only_complete=only_complete,
        skip_trash=skip_trash,
    ))
    assert len(result) > 0, "no matching datadict found"
    if not newest:
        assert len(result) == 1, f"{len(result)} matching datadicts found"
    return max(result, key=itemgetter(0))
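
Below is a minimal usage sketch of the two new helpers (not part of the diff); the base directory, dates, and dataset name `cooldown` are hypothetical.

from plottr.data.datadict_storage import search_datadict, search_datadicts

# Iterate over all complete, non-trashed datasets taken on a given day,
# loading only the structure (no data values):
for foldername, dd in search_datadicts("/path/to/data", "2023-07-14",
                                        structure_only=True):
    print(foldername)

# Fetch the newest dataset named "cooldown" taken within a date range:
foldername, dd = search_datadict(
    "/path/to/data", "2023-07-10", until="2023-07-14",
    name="cooldown", newest=True,
)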