Skip to content

Commit

Permalink
Merge pull request #185 from vocalpy/unpack-example-data-locally
Browse files Browse the repository at this point in the history
Unpack example data locally
  • Loading branch information
NickleDave committed May 22, 2022
2 parents 8a98782 + 9688423 commit 6ac9576
Show file tree
Hide file tree
Showing 12 changed files with 403 additions and 93 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ authors = [
]
requires-python = ">=3.8"
dependencies = [
"appdirs >=1.4.4",
"attrs >=19.3.0",
"evfuncs >=0.3.5",
"birdsong-recognition-dataset >=0.3.2",
Expand Down
7 changes: 6 additions & 1 deletion src/crowsetta/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,9 @@
timit,
)

from .data import get, available_formats
from .data import (
available_formats,
ExampleAnnotFile,
extract_data_files,
get,
)
193 changes: 172 additions & 21 deletions src/crowsetta/data/data.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,20 @@
import contextlib
try:
from importlib.resources import as_file, files, open_text
except ImportError:
from importlib_resources import as_file, files, open_text
import shutil
from typing import Union

import pathlib

import attr
import appdirs

from ..__about__ import __version__ as version
from ..typing import PathLike

# Application directories for crowsetta, computed by ``appdirs``.
# ``APP_DIRS.user_data_dir`` is the default location that example
# annotation files are extracted to and loaded from in this module.
APP_DIRS = appdirs.AppDirs(appname="crowsetta", appauthor="vocalpy", version=version)


@attr.define
Expand All @@ -23,24 +32,25 @@ class FormatPathArgs:
@attr.define
class ExampleAnnotFile:
    """Class representing an example annotation file.

    Returned by ``crowsetta.data.get``.

    Attributes
    ----------
    annot_path : str, pathlib.Path, or context manager
        Path to annotation file,
        can be used to load.
        If annotation files have not been extracted to
        the local file system using the function
        ``crowsetta.data.extract_data_files``,
        then ``crowsetta.data.get`` will return
        ``annot_path`` as a context manager
        that will provide a path to a temporary file.
    citation : str
        Citation for dataset
        from which example is taken.
    """
    # Annotated with the public abstract base class rather than the private
    # ``contextlib._GeneratorContextManager`` implementation detail.
    annot_path: Union[PathLike, contextlib.AbstractContextManager]
    citation: str


Expand All @@ -64,30 +74,96 @@ class ExampleAnnotFile:
}


def get(format: str) -> pathlib.Path:
"""get example annotation files
def extract_data_files(user_data_dir: PathLike = APP_DIRS.user_data_dir):
    """Extract example annotation files from ``crowsetta.data``
    to a local directory on the file system.

    For every entry in ``DATA``, the annotation file and the
    ``citation.txt`` from the same package are copied into a
    sub-directory of ``user_data_dir`` named after the last
    component of the package name.

    Can be called more than once with the same ``user_data_dir``:
    directories that already exist are re-used, and files that
    were already extracted are overwritten.

    Parameters
    ----------
    user_data_dir : str, pathlib.Path
        Location where example annotation files should be extracted to.
        Default is ``crowsetta.data.data.APP_DIRS.user_data_dir``.
    """
    user_data_dir = pathlib.Path(user_data_dir)
    # ``exist_ok`` so that a partially-populated or previously-used
    # directory does not make extraction fail with ``FileExistsError``
    user_data_dir.mkdir(parents=True, exist_ok=True)
    for path_args in DATA.values():
        # don't use full name `importlib.resources` here
        # because we need to use backport package, not stdlib, on Python 3.8
        package_files = files(path_args.package)
        dst_annot_dir = user_data_dir / path_args.package.split('.')[-1]
        dst_annot_dir.mkdir(exist_ok=True)
        # copy the annotation file itself, then the citation that goes with it
        for resource in (path_args.resource, 'citation.txt'):
            with as_file(package_files.joinpath(resource)) as src_path:
                shutil.copy(src_path, dst_annot_dir / resource)


def _get_example_from_user_data_dir(format: str,
                                    user_data_dir: PathLike = APP_DIRS.user_data_dir) -> ExampleAnnotFile:
    """Return example from ``user_data_dir``.

    Assumes that example data has already been copied to
    ``user_data_dir`` by calling ``crowsetta.data.extract_data_files``.
    Helper function used by ``crowsetta.data.get``.

    Parameters
    ----------
    format : str
        Name of annotation format.
        Should be the shorthand string name,
        as listed by ``crowsetta.formats.as_list``.
    user_data_dir : str, pathlib.Path
        Location where example annotation files have been extracted to,
        by calling ``crowsetta.data.extract_data_files``.
        Default is ``crowsetta.data.data.APP_DIRS.user_data_dir``.

    Returns
    -------
    example : ExampleAnnotFile
        with ``annot_path`` and ``citation`` attributes.

    Raises
    ------
    ValueError
        If ``format`` is not a key in ``DATA``.
    """
    try:
        path_args = DATA[format]
    except KeyError as e:
        raise ValueError(
            f'format not recognized: {format}'
        ) from e

    # ``user_data_dir`` may be passed as a string,
    # so convert it before building paths with the ``/`` operator
    user_data_dir = pathlib.Path(user_data_dir)
    format_dir = user_data_dir / path_args.package.split('.')[-1]
    annot_path = format_dir / path_args.resource
    # citation is stored as text; newlines are removed so it reads as a single line
    citation = (format_dir / 'citation.txt').read_text().replace("\n", "")

    return ExampleAnnotFile(annot_path=annot_path,
                            citation=citation)


def _get_example_as_context_manager(format: str) -> ExampleAnnotFile:
"""gets an example annotation file
as a context manager, that can be used
as shown in the example below.
Helper function used by ``crowsetta.data.get``.
Parameters
----------
format : str
Name of annotation format.
Should be the shorthand string name,
as listed by ``crowsetta.formats.as_list``.
Returns
-------
example_annot_file : crowsetta.data.ExampleAnnotFile
class instance with attributes ``annot_path``
and ``citation``. The ``annot_path``
attribute should be used as part of a ``with``
statement to open the file; see Examples below
or examples in the docstrings.
Examples
--------
>>> example = crowsetta.data.get('textgrid')
>>> with example.annot_path as annot_path:
... textgrid = crowsetta.formats.seq.TextGrid.from_file(annot_path)
"""
try:
path_args = DATA[format]
Expand All @@ -111,6 +187,81 @@ class instance with attributes ``annot_path``
citation=citation)


def get(format: str,
        user_data_dir: PathLike = APP_DIRS.user_data_dir) -> ExampleAnnotFile:
    """Get example annotation files.

    If ``user_data_dir`` exists, the example is loaded from it.
    Otherwise the user is prompted (via ``input``) about whether
    to create the directory and extract the example files into it.

    Parameters
    ----------
    format : str
        Name of annotation format.
        Should be the shorthand string name,
        as listed by ``crowsetta.formats.as_list``.
    user_data_dir : str, pathlib.Path
        Location where example annotation files
        are stored.
        Default is ``crowsetta.data.data.APP_DIRS.user_data_dir``.
        This default can be changed, but will require
        passing the same path in every time
        this function is called to avoid
        being prompted about extracting the example files
        to the default location.

    Returns
    -------
    example_annot_file : ExampleAnnotFile
        class instance with attributes ``annot_path``
        and ``citation``.
        If the annotation files have been
        extracted to the local file system,
        then ``annot_path`` will be a path
        to a file.
        Otherwise, ``annot_path`` will be
        a context manager that should be
        used as part of a ``with``
        statement to open the file; see Examples below
        or examples in the docstrings.

    Raises
    ------
    ValueError
        If ``format`` is not a key in ``DATA``.

    Examples
    --------
    >>> # example of a context manager
    >>> example = crowsetta.data.get('textgrid')
    >>> with example.annot_path as annot_path:
    ...     textgrid = crowsetta.formats.seq.TextGrid.from_file(annot_path)
    """
    if format not in DATA:
        raise ValueError(
            f'format not recognized: {format}'
        )

    user_data_dir = pathlib.Path(user_data_dir)
    if user_data_dir.exists():
        return _get_example_from_user_data_dir(format, user_data_dir)

    y_or_n = input(
        f"``user_data_dir`` does not exist at default location:\n{user_data_dir}\n"
        "(To choose a location besides the default, call this function with that location "
        "as the argument for ``user_data_dir``.)\n\n"
        "Do you want to create this ``user_data_dir`` and extract example annotation files into it?\n"
        "[yes]/no >>>"
    )
    # an empty answer means the default, "yes"
    if y_or_n.lower().startswith('y') or y_or_n == "":
        extract_data_files(user_data_dir)
        # return the freshly extracted example,
        # instead of falling off the end of the function and returning None
        return _get_example_from_user_data_dir(format, user_data_dir)

    # the message text is kept flush-left inside the literal
    print(
        """Not extracting data. Will return a context manager.\n
Use the context manager to get a path to a temporary path
like in the following example:\n
>>> example = crowsetta.data.get('timit')
>>> with example.annot_path as annot_path:
... timit = crowsetta.formats.seq.Timit.from_file(annot_path=annot_path)
>>> annot = timit.to_annot()
"""
    )
    return _get_example_as_context_manager(format)


def available_formats() -> list:
"""return list of string names
of annotation formats.
Expand Down
9 changes: 3 additions & 6 deletions src/crowsetta/formats/bbox/raven.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,7 @@ def from_file(cls,
Examples
--------
>>> example = crowsetta.data.get('raven')
>>> with example.annot_path as annot_path:
... raven = crowsetta.formats.bbox.Raven.from_file(annot_path=annot_path)
>>> raven = crowsetta.formats.bbox.Raven.from_file(example.annot_path)
"""
annot_path = pathlib.Path(annot_path)
crowsetta.validation.validate_ext(annot_path, extension=cls.ext)
Expand Down Expand Up @@ -130,8 +129,7 @@ def to_bbox(self) -> List[crowsetta.BBox]:
Examples
--------
>>> example = crowsetta.data.get('raven')
>>> with example.annot_path as annot_path:
... raven = crowsetta.formats.bbox.Raven.from_file(annot_path=annot_path)
>>> raven = crowsetta.formats.bbox.Raven.from_file(example.annot_path)
>>> bboxes = raven.to_bbox()
"""
bboxes = []
Expand Down Expand Up @@ -161,8 +159,7 @@ def to_annot(self) -> crowsetta.Annotation:
Examples
--------
>>> example = crowsetta.data.get('raven')
>>> with example.annot_path as annot_path:
... raven = crowsetta.formats.bbox.Raven.from_file(annot_path=annot_path)
>>> raven = crowsetta.formats.bbox.Raven.from_file(example.annot_path)
>>> annot = raven.to_annot()
"""
bboxes = self.to_bbox()
Expand Down
13 changes: 5 additions & 8 deletions src/crowsetta/formats/seq/audtxt.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,7 @@ def from_file(cls,
Examples
--------
>>> example = crowsetta.data.get('aud-txt')
>>> with example.annot_path as annot_path:
... simple = crowsetta.formats.seq.AudTxt.from_file(annot_path)
>>> audtxt = crowsetta.formats.seq.AudTxt.from_file(example.annot_path)
"""
annot_path = pathlib.Path(annot_path)
crowsetta.validation.validate_ext(annot_path, extension=cls.ext)
Expand Down Expand Up @@ -137,9 +136,8 @@ def to_seq(self,
Examples
--------
>>> example = crowsetta.data.get('aud-txt')
>>> with example.annot_path as annot_path:
... simple = crowsetta.formats.seq.AudTxt.from_file(annot_path)
>>> seq = simple.to_seq()
>>> audtxt = crowsetta.formats.seq.AudTxt.from_file(example.annot_path)
>>> seq = audtxt.to_seq()
Notes
-----
Expand Down Expand Up @@ -183,9 +181,8 @@ def to_annot(self,
Examples
--------
>>> example = crowsetta.data.get('aud-txt')
>>> with example.annot_path as annot_path:
... simple = crowsetta.formats.seq.AudTxt.from_file(annot_path)
>>> annot = simple.to_annot()
>>> audtxt = crowsetta.formats.seq.AudTxt.from_file(example.annot_path)
>>> annot = audtxt.to_annot()
Notes
-----
Expand Down
13 changes: 5 additions & 8 deletions src/crowsetta/formats/seq/birdsongrec.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,8 +120,7 @@ def from_file(cls,
Examples
--------
>>> example = crowsetta.data.get('birdsong-recognition-dataset')
>>> with example.annot_path as annot_path:
... birdsongrec = crowsetta.formats.seq.BirdsongRec.from_file(annot_path=annot_path)
>>> birdsongrec = crowsetta.formats.seq.BirdsongRec.from_file(example.annot_path)
"""
annot_path = pathlib.Path(annot_path)
crowsetta.validation.validate_ext(annot_path, extension=cls.ext)
Expand Down Expand Up @@ -175,8 +174,7 @@ def to_seq(self,
Examples
--------
>>> example = crowsetta.data.get('birdsong-recognition-dataset')
>>> with example.annot_path as annot_path:
... birdsongrec = crowsetta.formats.seq.BirdsongRec.from_file(annot_path=annot_path)
>>> birdsongrec = crowsetta.formats.seq.BirdsongRec.from_file(example.annot_path)
>>> seqs = birdsongrec.to_seq()
Notes
Expand All @@ -195,7 +193,7 @@ def to_seq(self,
If you need to specify some other location for the ``.wav`` files,
pass in the ``wav_path`` argument when you first load the annotations:
>>> birdsongrec = crowsetta.formats.BirdsongRec.from_file(wav_path='./actually/wavs/are/here')
>>> birdsongrec = crowsetta.formats.BirdsongRec.from_file(annot_path, wav_path='./actually/wavs/are/here') # doctest: +SKIP
"""
seqs = []
for birdsongrec_seq in self.sequences:
Expand Down Expand Up @@ -277,8 +275,7 @@ def to_annot(self,
Examples
--------
>>> example = crowsetta.data.get('birdsong-recognition-dataset')
>>> with example.annot_path as annot_path:
... birdsongrec = crowsetta.formats.seq.BirdsongRec.from_file(annot_path=annot_path)
>>> birdsongrec = crowsetta.formats.seq.BirdsongRec.from_file(example.annot_path)
>>> annots = birdsongrec.to_annot()
Notes
Expand All @@ -297,7 +294,7 @@ def to_annot(self,
If you need to specify some other location for the ``.wav`` files,
pass in the ``wav_path`` argument when you first load the annotations:
>>> birdsongrec = crowsetta.formats.BirdsongRec.from_file(wav_path='./actually/wavs/are/here') # doctest: +SKIP
>>> birdsongrec = crowsetta.formats.BirdsongRec.from_file(annot_path, wav_path='./actually/wavs/are/here') # doctest: +SKIP
"""
seqs = self.to_seq(round_times=round_times,
decimals=decimals,
Expand Down

0 comments on commit 6ac9576

Please sign in to comment.