Merge pull request #31 from NickleDave/rewrite-transcriber

Rewrite transcriber
vocalpy · May 5, 2019 · 26a8ae6 · 26a8ae6
2 parents 2c36d04 + 964ca42
commit 26a8ae6
Show file tree

Hide file tree

Showing 13 changed files with 277 additions and 445 deletions.
diff --git a/setup.py b/setup.py
@@ -25,6 +25,12 @@
 REQUIRES_PYTHON = '>=3.6.0'
 VERSION = '0.2.0a5'
 LICENSE = 'BSD'
+ENTRY_POINTS = {
+    'crowsetta.format': [
+        'notmat = crowsetta.notmat',
+        'koumura = crowsetta.koumura',
+    ]
+}
 
 REQUIRED = [
     'numpy', 'attrs', 'evfuncs', 'koumura',
@@ -121,9 +127,7 @@ def run(self):
     package_dir={"": "src"},
     package_data={'': ['*.csv', '*.json', '*.mat']},  # files types to install
     # scripts=['src/bin/crowsetta-cli.py'],
-    # entry_points={
-    #     'console_scripts': ['crowsetta-cli=crowsetta.__main__:main'],
-    # },
+    entry_points=ENTRY_POINTS,
     install_requires=REQUIRED,
     extras_require=EXTRAS,
     include_package_data=True,

diff --git a/src/crowsetta/__init__.py b/src/crowsetta/__init__.py
@@ -3,6 +3,8 @@
 from .transcriber import Transcriber
 from .segment import Segment
 from .sequence import Sequence
+from .meta import Meta
 from . import csv
 from . import data
-from .data import formats
+from . import formats
+
diff --git a/src/crowsetta/config.json b/src/crowsetta/config.json
diff --git a/src/crowsetta/data.py b/src/crowsetta/data.py
@@ -398,12 +398,6 @@ def _fetch_file(url, file_name, print_destination=True, resume=True,
 }
 
 
-def formats():
-    """prints annotation formats built in to Crowsetta"""
-    formats_str = ', '.join([key for key in FORMATS.keys()])
-    return f'Annotation formats built in to Crowsetta: {formats_str}'
-
-
 def fetch(format, destination_path='.', remove_compressed_file=True):
     """fetches data from repositories
 

diff --git a/src/crowsetta/formats.py b/src/crowsetta/formats.py
@@ -0,0 +1,17 @@
+"""module for formats."""
+import sys
+import pkg_resources
+
+_INSTALLED = []
+current_module = sys.modules[__name__]
+for entry_point in pkg_resources.iter_entry_points('crowsetta.format'):
+    setattr(current_module, entry_point.name, entry_point.load())
+    _INSTALLED.append(entry_point.name)
+
+
+def show():
+    """shows what vocal annotation formats are currently installed"""
+    formats_str = ', '.join(_INSTALLED)
+    print(
+        f'installed vocal annotation formats:\n{formats_str}'
+    )
diff --git a/src/crowsetta/koumura.py b/src/crowsetta/koumura.py
@@ -15,15 +15,16 @@
 
 from .sequence import Sequence
 from . import csv
+from .meta import Meta
 
 
-def koumura2seq(xml_file='Annotation.xml', concat_seqs_into_songs=True,
+def koumura2seq(file='Annotation.xml', concat_seqs_into_songs=True,
                 wavpath='./Wave'):
     """converts Annotation.xml from [1]_ into an annotation list
 
     Parameters
     ----------
-    xml_file : str or pathlib.Path
+    file : str or pathlib.Path
         Path to Annotation.xml
     concat_seqs_into_songs : bool
-        if True, concatenate sequences from xml_file, so that
+        if True, concatenate sequences from file, so that
@@ -47,15 +48,15 @@ def koumura2seq(xml_file='Annotation.xml', concat_seqs_into_songs=True,
         raise NotADirectoryError('Path specified for wavpath, {}, not recognized as an '
                                  'existing directory'.format(wavpath))
 
-    if not xml_file.endswith('.xml'):
+    if not str(file).endswith('.xml'):
         raise ValueError('Name of annotation file should end with .xml, '
-                         'but name passed was {}'.format(xml_file))
+                         'but name passed was {}'.format(file))
 
     # confusingly, koumura also has an object named 'Sequence'
     # (which is where I borrowed the idea from)
     # but it has a totally different structure
-    seq_list_xml = koumura.parse_xml(xml_file,
-                                     concat_seqs_into_songs=concat_seqs_into_songs)
+    seq_list_xml = koumura.parse_xml(file, concat_seqs_into_songs=concat_seqs_into_songs)
+
     seq_list_out = []
     for seq_xml in seq_list_xml:
         onsets_Hz = np.asarray([syl.position for syl in seq_xml.syls])
@@ -86,7 +87,7 @@ def koumura2seq(xml_file='Annotation.xml', concat_seqs_into_songs=True,
     return seq_list_out
 
 
-def koumura2csv(xml_file, concat_seqs_into_songs=True, wavpath='./Wave',
+def koumura2csv(file, concat_seqs_into_songs=True, wavpath='./Wave',
                 csv_filename=None, abspath=False, basename=False):
     """takes Annotation.xml file from Koumura dataset into and saves the
     annotation from all files in one comma-separated values (csv)
@@ -128,9 +129,17 @@ def koumura2csv(xml_file, concat_seqs_into_songs=True, wavpath='./Wave',
-    see seq2scv function for explanation of when you would want to use
+    see seq2csv function for explanation of when you would want to use
     the abspath and basename parameters
     """
-    seq_list = koumura2seq(xml_file, concat_seqs_into_songs=concat_seqs_into_songs,
+    seq_list = koumura2seq(file, concat_seqs_into_songs=concat_seqs_into_songs,
                            wavpath=wavpath)
     if csv_filename is None:
-        csv_filename = os.path.abspath(xml_file)
-        csv_filename = csv_filename.replace('xml', 'csv')
+        csv_filename = os.path.abspath(file)
+        csv_filename = os.path.splitext(csv_filename)[0] + '.csv'
     csv.seq2csv(seq_list, csv_filename, abspath=abspath, basename=basename)
+
+
+meta = Meta(
+    name='koumura',
+    ext='xml',
+    to_seq=koumura2seq,
+    to_csv=koumura2csv,
+)
diff --git a/src/crowsetta/meta.py b/src/crowsetta/meta.py
@@ -0,0 +1,41 @@
+import typing
+
+import attr
+from attr.validators import instance_of, optional
+
+
+@attr.s
+class Meta:
+    """class that represents metadata about a vocal annotation format
+    and functions for working with it using Crowsetta
+
+    Attributes
+    ----------
+    name : str
+        name of vocal annotation format. E.g., "textgrid"
+    ext : str
+        extension of files associated with format, e.g. "TextGrid"
+    to_seq : typing.Callable
+        a function that accepts the name of a file containing
+        annotations in the format and returns a Sequence or list of
+        Sequences. Required.
+    to_csv : typing.Callable
+        a function that accepts a Sequence or list of Sequences and
+        saves them as a comma-separated value file. Default is None.
+    to_format : typing.Callable
+        a function that accepts a Sequence or list of Sequences and
+        saves files in the format. Default is None.
+
+    module : str
+        path to module (a .py file) containing functions for working with format,
+        e.g. 'home/users/me/Documents/code/textgrid/textgrid.py'.
+        Default is None. Optional; enables format to be loaded without
+        making it part of a package that adds it as
+        a 'crowsetta.format' entry point in a setup.py file.
+    """
+    name = attr.ib(validator=instance_of(str))
+    ext = attr.ib(validator=instance_of(str))
+    to_seq = attr.ib(validator=instance_of(typing.Callable))
+    to_csv = attr.ib(validator=optional(instance_of(typing.Callable)), default=None)
+    to_format = attr.ib(validator=optional(instance_of(typing.Callable)), default=None)
+    module = attr.ib(validator=optional(instance_of(str)), default=None)
diff --git a/src/crowsetta/notmat.py b/src/crowsetta/notmat.py
@@ -10,43 +10,42 @@
 
 from .sequence import Sequence
 from .csv import seq2csv
+from .meta import Meta
 
 
-def _parse_notmat(notmat):
-    """helper function that parses/validates value for notmat argument;
+def _parse_file(file):
+    """helper function that parses/validates value for file argument;
     puts a single string or Path into a list to iterate over it (cheap hack
     that lets functions accept multiple types), and checks list to make sure
     all types are consistent
     """
-    if type(notmat) == str or type(notmat) == Path:
+    if isinstance(file, (str, Path)):
         # put in a list to iterate over
-        notmat = [notmat]
+        file = [file]
 
-    for a_notmat in notmat:
-        if type(a_notmat) == str:
-            if not a_notmat.endswith('.not.mat'):
+    for a_file in file:
+        if isinstance(a_file, str):
+            if not a_file.endswith('.not.mat'):
                 raise ValueError("all filenames in .not.mat must end with '.not.mat' "
-                                 f"but {a_notmat} does not")
-        elif type(a_notmat) == Path:
-            if not a_notmat.suffixes == ['.not', '.mat']:
+                                 f"but {a_file} does not")
+        elif isinstance(a_file, Path):
+            if not a_file.suffixes == ['.not', '.mat']:
                 raise ValueError("all filenames in .not.mat must end with '.not.mat' "
-                                 f"but {a_notmat} does not")
+                                 f"but {a_file} does not")
 
-    return notmat
+    return file
 
 
-def notmat2seq(notmat,
+def notmat2seq(file,
                abspath=False,
                basename=False,
                round_times=True,
                decimals=3):
-    """open .not.mat file and return as Sequence
-    (data structure that used internally to represent
-    annotation for one audio file)
+    """parse annotation from .not.mat and return as Sequence
 
     Parameters
     ----------
-    notmat : str, Path, or list
+    file : str, Path, or list
         filename of a .not.mat annotation file,
         created by the evsonganaly GUI for MATLAB
     abspath : bool
@@ -77,15 +76,15 @@ def notmat2seq(notmat,
     due to floating point error, e.g. when loading .not.mat files and then sending them to
     a csv file, the result should be the same on Windows and Linux
     """
-    notmat = _parse_notmat(notmat)
+    file = _parse_file(file)
 
     if abspath and basename:
         raise ValueError('abspath and basename arguments cannot both be set to True, '
                          'unclear whether absolute path should be saved or if no path '
                          'information (just base filename) should be saved.')
 
     seq = []
-    for a_notmat in notmat:
+    for a_notmat in file:
         notmat_dict = evfuncs.load_notmat(a_notmat)
         # in .not.mat files saved by evsonganaly,
         # onsets and offsets are in units of ms, have to convert to s
@@ -132,14 +131,14 @@ def notmat2seq(notmat,
         return seq
 
 
-def notmat2csv(notmat, csv_filename, abspath=False, basename=False):
+def notmat2csv(file, csv_filename, abspath=False, basename=False):
     """saves annotation from .not.mat file(s) in a comma-separated values
     (csv) file, where each row represents one syllable from one
     .not.mat file.
 
     Parameters
     ----------
-    notmat : str, Path, or list
+    file : str, Path, or list
         if list, list of strings or Path objects pointing to .not.mat files
     csv_filename : str
         name for csv file that is created
@@ -159,14 +158,14 @@ def notmat2csv(notmat, csv_filename, abspath=False, basename=False):
     -------
     None
     """
-    notmat = _parse_notmat(notmat)
+    file = _parse_file(file)
 
     if abspath and basename:
         raise ValueError('abspath and basename arguments cannot both be set to True, '
                          'unclear whether absolute path should be saved or if no path '
                          'information (just base filename) should be saved.')
 
-    seq = notmat2seq(notmat)
+    seq = notmat2seq(file)
     seq2csv(seq, csv_filename, abspath=abspath, basename=basename)
 
 
@@ -278,3 +277,11 @@ def make_notmat(filename,
                                   .format(notmat_name))
     else:
         scipy.io.savemat(notmat_name, notmat_dict)
+
+meta = Meta(
+    name='notmat',
+    ext='not.mat',
+    to_seq=notmat2seq,
+    to_csv=notmat2csv,
+    to_format=make_notmat,
+)