Merge pull request #185 from vocalpy/unpack-example-data-locally

Unpack example data locally
vocalpy · May 22, 2022 · 6ac9576 · 6ac9576
2 parents 8a98782 + 9688423
commit 6ac9576
Show file tree

Hide file tree

Showing 12 changed files with 403 additions and 93 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -10,6 +10,7 @@ authors = [
 ]
 requires-python = ">=3.8"
 dependencies = [
+    "appdirs >=1.4.4",
     "attrs >=19.3.0",
     "evfuncs >=0.3.5",
     "birdsong-recognition-dataset >=0.3.2",

diff --git a/src/crowsetta/data/__init__.py b/src/crowsetta/data/__init__.py
@@ -8,4 +8,9 @@
     timit,
 )
 
-from .data import get, available_formats
+from .data import (
+    available_formats,
+    ExampleAnnotFile,
+    extract_data_files,
+    get,
+)
diff --git a/src/crowsetta/data/data.py b/src/crowsetta/data/data.py
@@ -1,11 +1,20 @@
+import contextlib
 try:
     from importlib.resources import as_file, files, open_text 
 except ImportError:
     from importlib_resources import as_file, files, open_text 
+import shutil
+from typing import Union
 
 import pathlib
 
 import attr
+import appdirs
+
+from ..__about__ import __version__ as version
+from ..typing import PathLike
+
+APP_DIRS = appdirs.AppDirs(appname="crowsetta", appauthor="vocalpy", version=version)
 
 
 @attr.define
@@ -23,24 +32,25 @@ class FormatPathArgs:
 @attr.define
 class ExampleAnnotFile:
     """class representing
-    an example annotation file
+    an example annotation file.
+    Returned by ``crowsetta.data.get``.
 
     Attributes
     ----------
-    annot_path : pathlib.Path
-        to annotation file,
-        can be used to load
-    about : str
-        brief description of dataset
-        from which example is taken
-    doi : str
-        DOI for dataset
-        from which example is taken
+    annot_path : pathlib.Path, contextlib._GeneratorContextManager
+        Path to annotation file,
+        can be used to load.
+        If annotation files have not been extracted to
+        the local file system using the function
+        ``crowsetta.data.extract_data_files``,
+        then ``crowsetta.data.get`` will return 
+        ``annot_path`` as a context manager 
+        that will provide a path to a temporary file.
     citation : str
         citation for dataset
         from which example is taken
     """
-    annot_path: str
+    annot_path: Union[PathLike, contextlib._GeneratorContextManager]
     citation: str
 
 
@@ -64,30 +74,96 @@ class ExampleAnnotFile:
 }
 
 
-def get(format: str) -> pathlib.Path:
-    """get example annotation files
+def extract_data_files(user_data_dir: PathLike = APP_DIRS.user_data_dir):
+    """extract example annotation files from ``crowsetta.data``
+    to a local directory on the file system.
+
+    For every entry in ``DATA``, the annotation file and the
+    ``citation.txt`` of its data package are copied into a
+    sub-directory of ``user_data_dir`` that is named after
+    the last component of the package name.
+
+    Parameters
+    ----------
+    user_data_dir : str, pathlib.Path
+        Location where example annotation files should be extracted to.
+        Default is ``crowsetta.data.data.APP_DIRS.user_data_dir``.
+        Directories that already exist are re-used, and files
+        already present in them are overwritten by the copy.
+    """
+    user_data_dir = pathlib.Path(user_data_dir)
+    # ``exist_ok=True`` so that extraction can be re-run,
+    # e.g. to repair a partially extracted directory,
+    # instead of raising ``FileExistsError``
+    user_data_dir.mkdir(parents=True, exist_ok=True)
+    for path_args in DATA.values():
+        # don't use full name `importlib.resources` here
+        # because we need to use backport package, not stdlib, on Python 3.8
+        package_files = files(path_args.package)
+        dst_annot_dir = user_data_dir / path_args.package.split('.')[-1]
+        dst_annot_dir.mkdir(exist_ok=True)
+        # the annotation file and its citation are copied the same way
+        for resource in (path_args.resource, 'citation.txt'):
+            # ``as_file`` gives a real file system path for the resource
+            # that is only guaranteed to exist inside the ``with`` block
+            with as_file(package_files.joinpath(resource)) as src_path:
+                shutil.copy(src_path, dst_annot_dir / resource)
+
+
+def _get_example_from_user_data_dir(format: str,
+                                    user_data_dir: PathLike = APP_DIRS.user_data_dir) -> ExampleAnnotFile:
+    """returns example from ``user_data_dir``.
+    Assumes that example data has already been copied to
+    ``user_data_dir`` by calling ``extract_data_files``.
+
+    Helper function used by ``crowsetta.data.get``.
 
     Parameters
     ----------
     format : str
-        name of annotation format.
+        Name of annotation format.
         Should be the shorthand string name,
         as listed by ``crowsetta.formats.as_list``.
+    user_data_dir : str, pathlib.Path
+        Location where example annotation files have been extracted to,
+        by calling ``crowsetta.data.extract_data_files``.
+        Default is ``crowsetta.data.data.APP_DIRS.user_data_dir``.
 
     Returns
     -------
-    example_annot_file : ExampleAnnotFile
+    example : ExampleAnnotFile
+        with ``annot_path`` and ``citation`` attributes.
+
+    Raises
+    ------
+    ValueError
+        If ``format`` is not a key of ``DATA``.
+    """
+    try:
+        path_args = DATA[format]
+    except KeyError as e:
+        raise ValueError(
+            f'format not recognized: {format}'
+        ) from e
+
+    # the default value (and any caller-supplied ``str``) must be
+    # converted before using the ``/`` operator to build paths below
+    user_data_dir = pathlib.Path(user_data_dir)
+    format_pkg = path_args.package.split('.')[-1]
+    annot_path = user_data_dir / format_pkg / path_args.resource
+    citation_txt = user_data_dir / format_pkg / 'citation.txt'
+    with citation_txt.open('r') as fp:
+        # citation is stored as a single line of text, without newlines
+        citation = fp.read().replace("\n", "")
+
+    return ExampleAnnotFile(annot_path=annot_path,
+                            citation=citation)
+
+
+def _get_example_as_context_manager(format: str) -> ExampleAnnotFile:
+    """gets an example annotation file
+    as a context manager, that can be used 
+    as shown in the example below.
+
+    Helper function used by ``crowsetta.data.get``.
+
+    Parameters
+    ----------
+    format : str
+        Name of annotation format.
+        Should be the shorthand string name,
+        as listed by ``crowsetta.formats.as_list``.
+
+    Returns
+    -------
+    example_annot_file : crowsetta.data.ExampleAnnotFile
         class instance with attributes ``annot_path`` 
         and ``citation``. The ``annot_path`` 
         attribute should be used as part of a ``with`` 
         statement to open the file; see Examples below
         or examples in the docstrings.
-
-    Examples
-    --------
-    >>> example = crowsetta.data.get('textgrid')
-    >>> with example.annot_path as annot_path:
-    ...     textgrid = crowsetta.formats.seq.TextGrid.from_file(annot_path)
     """
     try:
         path_args = DATA[format]
@@ -111,6 +187,81 @@ class instance with attributes ``annot_path``
                             citation=citation)
 
 
+def get(format: str,
+        user_data_dir: PathLike = APP_DIRS.user_data_dir) -> ExampleAnnotFile:
+    """get example annotation files
+
+    If ``user_data_dir`` does not exist, the user is asked
+    (through ``input``) whether it should be created and the
+    example annotation files extracted into it.
+
+    Parameters
+    ----------
+    format : str
+        Name of annotation format.
+        Should be the shorthand string name,
+        as listed by ``crowsetta.formats.as_list``.
+    user_data_dir : str, pathlib.Path
+        Location where example annotation files
+        are stored.
+        Default is ``crowsetta.data.data.APP_DIRS.user_data_dir``.
+        This default can be changed, but will require
+        passing the same path in every time
+        this function is called to avoid
+        being prompted about extracting the example files
+        to the default location.
+
+    Returns
+    -------
+    example_annot_file : ExampleAnnotFile
+        class instance with attributes ``annot_path``
+        and ``citation``.
+        If the annotation files have been
+        extracted to the local file system,
+        either before this call or because the user
+        agreed to extract them when prompted,
+        then ``annot_path`` will be a path
+        to a file.
+        Otherwise, ``annot_path`` will be
+        a context manager that should be
+        used as part of a ``with``
+        statement to open the file; see Examples below
+        or examples in the docstrings.
+
+    Raises
+    ------
+    ValueError
+        If ``format`` is not a key of ``DATA``.
+
+    Examples
+    --------
+    >>> # example of a context manager
+    >>> example = crowsetta.data.get('textgrid')
+    >>> with example.annot_path as annot_path:
+    ...     textgrid = crowsetta.formats.seq.TextGrid.from_file(annot_path)
+    """
+    if format not in DATA:
+        raise ValueError(
+            f'format not recognized: {format}'
+        )
+
+    user_data_dir = pathlib.Path(user_data_dir)
+    if not user_data_dir.exists():
+        y_or_n = input(
+            f"``user_data_dir`` does not exist at default location:\n{user_data_dir}\n"
+            "(To choose a location besides the default, call this function with that location "
+            "as the argument for ``user_data_dir``.)\n\n"
+            "Do you want to create this ``user_data_dir`` and extract example annotation files into it?\n"
+            "[yes]/no >>>"
+        )
+        # an empty answer counts as the default, "yes"
+        declined = not (y_or_n.lower().startswith('y') or y_or_n == "")
+        if declined:
+            print(
+                """Not extracting data. Will return a context manager.\n
+                Use the context manager to get a path to a temporary path 
+                like in the following example:\n
+
+                >>> example = crowsetta.data.get('timit')
+                >>> with example.annot_path as annot_path:
+                ...     timit = crowsetta.formats.seq.Timit.from_file(annot_path=annot_path)
+                >>> annot = timit.to_annot()
+            """
+            )
+            return _get_example_as_context_manager(format)
+        extract_data_files(user_data_dir)
+
+    # reached when ``user_data_dir`` already existed, or was just
+    # created and filled above; without this shared return the
+    # "just extracted" case would fall through and return ``None``
+    return _get_example_from_user_data_dir(format, user_data_dir)
+
+
 def available_formats() -> list:
     """return list of string names
     of annotation formats.

diff --git a/src/crowsetta/formats/bbox/raven.py b/src/crowsetta/formats/bbox/raven.py
@@ -94,8 +94,7 @@ def from_file(cls,
         Examples
         --------
         >>> example = crowsetta.data.get('raven')
-        >>> with example.annot_path as annot_path:
-        ...     raven = crowsetta.formats.bbox.Raven.from_file(annot_path=annot_path)
+        >>> raven = crowsetta.formats.bbox.Raven.from_file(example.annot_path)
         """
         annot_path = pathlib.Path(annot_path)
         crowsetta.validation.validate_ext(annot_path, extension=cls.ext)
@@ -130,8 +129,7 @@ def to_bbox(self) -> List[crowsetta.BBox]:
         Examples
         --------
         >>> example = crowsetta.data.get('raven')
-        >>> with example.annot_path as annot_path:
-        ...     raven = crowsetta.formats.bbox.Raven.from_file(annot_path=annot_path)
+        >>> raven = crowsetta.formats.bbox.Raven.from_file(example.annot_path)
         >>> bboxes = raven.to_bbox()
         """
         bboxes = []
@@ -161,8 +159,7 @@ def to_annot(self) -> crowsetta.Annotation:
         Examples
         --------
         >>> example = crowsetta.data.get('raven')
-        >>> with example.annot_path as annot_path:
-        ...     raven = crowsetta.formats.bbox.Raven.from_file(annot_path=annot_path)
+        >>> raven = crowsetta.formats.bbox.Raven.from_file(example.annot_path)
         >>> annot = raven.to_annot()
         """
         bboxes = self.to_bbox()

diff --git a/src/crowsetta/formats/seq/audtxt.py b/src/crowsetta/formats/seq/audtxt.py
@@ -98,8 +98,7 @@ def from_file(cls,
         Examples
         --------
         >>> example = crowsetta.data.get('aud-txt')
-        >>> with example.annot_path as annot_path:
-        ...    simple = crowsetta.formats.seq.AudTxt.from_file(annot_path)
+        >>> audtxt = crowsetta.formats.seq.AudTxt.from_file(example.annot_path)
         """
         annot_path = pathlib.Path(annot_path)
         crowsetta.validation.validate_ext(annot_path, extension=cls.ext)
@@ -137,9 +136,8 @@ def to_seq(self,
         Examples
         --------
         >>> example = crowsetta.data.get('aud-txt')
-        >>> with example.annot_path as annot_path:
-        ...    simple = crowsetta.formats.seq.AudTxt.from_file(annot_path)
-        >>> seq = simple.to_seq()
+        >>> audtxt = crowsetta.formats.seq.AudTxt.from_file(example.annot_path)
+        >>> seq = audtxt.to_seq()
 
         Notes
         -----
@@ -183,9 +181,8 @@ def to_annot(self,
         Examples
         --------
         >>> example = crowsetta.data.get('aud-txt')
-        >>> with example.annot_path as annot_path:
-        ...    simple = crowsetta.formats.seq.AudTxt.from_file(annot_path)
-        >>> annot = simple.to_annot()
+        >>> audtxt = crowsetta.formats.seq.AudTxt.from_file(example.annot_path)
+        >>> annot = audtxt.to_annot()
 
         Notes
         -----

diff --git a/src/crowsetta/formats/seq/birdsongrec.py b/src/crowsetta/formats/seq/birdsongrec.py
@@ -120,8 +120,7 @@ def from_file(cls,
         Examples
         --------
         >>> example = crowsetta.data.get('birdsong-recognition-dataset')
-        >>> with example.annot_path as annot_path:
-        ...     birdsongrec = crowsetta.formats.seq.BirdsongRec.from_file(annot_path=annot_path)
+        >>> birdsongrec = crowsetta.formats.seq.BirdsongRec.from_file(example.annot_path)
         """
         annot_path = pathlib.Path(annot_path)
         crowsetta.validation.validate_ext(annot_path, extension=cls.ext)
@@ -175,8 +174,7 @@ def to_seq(self,
         Examples
         --------
         >>> example = crowsetta.data.get('birdsong-recognition-dataset')
-        >>> with example.annot_path as annot_path:
-        ...     birdsongrec = crowsetta.formats.seq.BirdsongRec.from_file(annot_path=annot_path)
+        >>> birdsongrec = crowsetta.formats.seq.BirdsongRec.from_file(example.annot_path)
         >>> seqs = birdsongrec.to_seq()
 
         Notes
@@ -195,7 +193,7 @@ def to_seq(self,
         If you need to specify some other location for the ``.wav`` files,
         pass in the ``wavpath`` argument when you first load the annotations:
 
-        >>> birdsongrec = crowsetta.formats.BirdsongRec.from_file(wav_path='./actually/wavs/are/here')
+        >>> birdsongrec = crowsetta.formats.BirdsongRec.from_file(annot_path, wav_path='./actually/wavs/are/here')  # doctest: +SKIP
         """
         seqs = []
         for birdsongrec_seq in self.sequences:
@@ -277,8 +275,7 @@ def to_annot(self,
         Examples
         --------
         >>> example = crowsetta.data.get('birdsong-recognition-dataset')
-        >>> with example.annot_path as annot_path:
-        ...     birdsongrec = crowsetta.formats.seq.BirdsongRec.from_file(annot_path=annot_path)
+        >>> birdsongrec = crowsetta.formats.seq.BirdsongRec.from_file(example.annot_path)
         >>> annots = birdsongrec.to_annot()
 
         Notes
@@ -297,7 +294,7 @@ def to_annot(self,
         If you need to specify some other location for the ``.wav`` files,
         pass in the ``wavpath`` argument when you first load the annotations:
 
-        >>> birdsongrec = crowsetta.formats.BirdsongRec.from_file(wav_path='./actually/wavs/are/here')  # doctest: +SKIP
+        >>> birdsongrec = crowsetta.formats.BirdsongRec.from_file(annot_path, wav_path='./actually/wavs/are/here')  # doctest: +SKIP
         """
         seqs = self.to_seq(round_times=round_times,
                            decimals=decimals,