Skip to content

Commit

Permalink
Merge pull request #31 from NickleDave/rewrite-transcriber
Browse files Browse the repository at this point in the history
Rewrite transcriber
  • Loading branch information
NickleDave committed May 5, 2019
2 parents 2c36d04 + 964ca42 commit 26a8ae6
Show file tree
Hide file tree
Showing 13 changed files with 277 additions and 445 deletions.
10 changes: 7 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,12 @@
REQUIRES_PYTHON = '>=3.6.0'
VERSION = '0.2.0a5'
LICENSE = 'BSD'
# Entry points handed to setup(): every built-in annotation format is
# advertised under the 'crowsetta.format' group as '<name> = <module path>'.
ENTRY_POINTS = {
    'crowsetta.format': [
        'notmat = crowsetta.notmat',
        'koumura = crowsetta.koumura',
    ],
}

REQUIRED = [
'numpy', 'attrs', 'evfuncs', 'koumura',
Expand Down Expand Up @@ -121,9 +127,7 @@ def run(self):
package_dir={"": "src"},
package_data={'': ['*.csv', '*.json', '*.mat']}, # files types to install
# scripts=['src/bin/crowsetta-cli.py'],
# entry_points={
# 'console_scripts': ['crowsetta-cli=crowsetta.__main__:main'],
# },
entry_points=ENTRY_POINTS,
install_requires=REQUIRED,
extras_require=EXTRAS,
include_package_data=True,
Expand Down
4 changes: 3 additions & 1 deletion src/crowsetta/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from .transcriber import Transcriber
from .segment import Segment
from .sequence import Sequence
from .meta import Meta
from . import csv
from . import data
from .data import formats
from . import formats

14 changes: 0 additions & 14 deletions src/crowsetta/config.json

This file was deleted.

6 changes: 0 additions & 6 deletions src/crowsetta/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,12 +398,6 @@ def _fetch_file(url, file_name, print_destination=True, resume=True,
}


def formats():
    """return a message listing the annotation formats built in to Crowsetta

    Returns
    -------
    formats_str : str
        the names (keys of the module-level FORMATS mapping) joined by
        commas and prefixed with a short description. Nothing is printed;
        the caller decides what to do with the string.
    """
    # iterating a mapping yields its keys, so no intermediate list is needed
    formats_str = ', '.join(FORMATS)
    return f'Annotation formats built in to Crowsetta: {formats_str}'


def fetch(format, destination_path='.', remove_compressed_file=True):
"""fetches data from repositories
Expand Down
17 changes: 17 additions & 0 deletions src/crowsetta/formats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
"""module for formats."""
import sys
import pkg_resources

# Discover every object registered under the 'crowsetta.format' entry-point
# group. Each one is loaded and attached to this module as an attribute named
# after its entry point, and the name is recorded in _INSTALLED.
_INSTALLED = []
current_module = sys.modules[__name__]
for entry_point in pkg_resources.iter_entry_points('crowsetta.format'):
    setattr(current_module, entry_point.name, entry_point.load())
    _INSTALLED.append(entry_point.name)


def show():
    """print the names of the vocal annotation formats currently installed

    The names are read from the module-level list ``_INSTALLED`` and printed
    on one line, separated by commas, below a short header.

    Returns
    -------
    None
    """
    # _INSTALLED already holds strings; joining it directly avoids a
    # throwaway list and a loop variable that shadowed the builtin ``format``
    formats_str = ', '.join(_INSTALLED)
    print(
        f'installed vocal annotation formats:\n{formats_str}'
    )
25 changes: 17 additions & 8 deletions src/crowsetta/koumura.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,16 @@

from .sequence import Sequence
from . import csv
from .meta import Meta


def koumura2seq(xml_file='Annotation.xml', concat_seqs_into_songs=True,
def koumura2seq(file='Annotation.xml', concat_seqs_into_songs=True,
wavpath='./Wave'):
"""converts Annotation.xml from [1]_ into an annotation list
Parameters
----------
xml_file : str or pathlib.Path
file : str or pathlib.Path
Path to Annotation.xml
concat_seqs_into_songs : bool
if True, concatenate sequences from xml_file, so that
Expand All @@ -47,15 +48,15 @@ def koumura2seq(xml_file='Annotation.xml', concat_seqs_into_songs=True,
raise NotADirectoryError('Path specified for wavpath, {}, not recognized as an '
'existing directory'.format(wavpath))

if not xml_file.endswith('.xml'):
if not file.endswith('.xml'):
raise ValueError('Name of annotation file should end with .xml, '
'but name passed was {}'.format(xml_file))

# confusingly, koumura also has an object named 'Sequence'
# (which is where I borrowed the idea from)
# but it has a totally different structure
seq_list_xml = koumura.parse_xml(xml_file,
concat_seqs_into_songs=concat_seqs_into_songs)
seq_list_xml = koumura.parse_xml(file, concat_seqs_into_songs=concat_seqs_into_songs)

seq_list_out = []
for seq_xml in seq_list_xml:
onsets_Hz = np.asarray([syl.position for syl in seq_xml.syls])
Expand Down Expand Up @@ -86,7 +87,7 @@ def koumura2seq(xml_file='Annotation.xml', concat_seqs_into_songs=True,
return seq_list_out


def koumura2csv(xml_file, concat_seqs_into_songs=True, wavpath='./Wave',
def koumura2csv(file, concat_seqs_into_songs=True, wavpath='./Wave',
csv_filename=None, abspath=False, basename=False):
"""takes Annotation.xml file from Koumura dataset into and saves the
annotation from all files in one comma-separated values (csv)
Expand Down Expand Up @@ -128,9 +129,17 @@ def koumura2csv(xml_file, concat_seqs_into_songs=True, wavpath='./Wave',
see seq2scv function for explanation of when you would want to use
the abspath and basename parameters
"""
seq_list = koumura2seq(xml_file, concat_seqs_into_songs=concat_seqs_into_songs,
seq_list = koumura2seq(file, concat_seqs_into_songs=concat_seqs_into_songs,
wavpath=wavpath)
if csv_filename is None:
csv_filename = os.path.abspath(xml_file)
csv_filename = os.path.abspath(file)
csv_filename = csv_filename.replace('xml', 'csv')
csv.seq2csv(seq_list, csv_filename, abspath=abspath, basename=basename)


# metadata for the 'koumura' format: .xml annotation files, converted to
# Sequences by koumura2seq and to a csv file by koumura2csv
meta = Meta(
    name='koumura',
    ext='xml',
    to_seq=koumura2seq,
    to_csv=koumura2csv,
)
41 changes: 41 additions & 0 deletions src/crowsetta/meta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import typing

import attr
from attr.validators import instance_of, optional


@attr.s
class Meta:
    """class that represents metadata about a vocal annotation format
    and the functions for working with it using Crowsetta

    Attributes
    ----------
    name : str
        name of vocal annotation format. E.g., "textgrid"
    ext : str
        extension of files associated with format, e.g. "TextGrid"
    to_seq : typing.Callable
        a function that accepts the name of a file containing
        annotations in the format and returns a Sequence or list of
        Sequences. Required.
    to_csv : typing.Callable
        a function that accepts the name of file(s) containing
        annotations in the format and saves the annotations as a
        comma-separated value file. Default is None.
    to_format : typing.Callable
        a function that saves annotations as file(s) in the format.
        Default is None.
    module : str
        path to module (a .py file) containing functions for working with format,
        e.g. 'home/users/me/Documents/code/textgrid/textgrid.py'.
        Default is None. Optional; enables format to be loaded without
        making it part of a package that adds it as
        a 'crowsetta.format' entry point in a setup.py file.

    Notes
    -----
    Every attribute is validated when an instance is created: ``name`` and
    ``ext`` must be str, ``to_seq`` must be callable, and the optional
    attributes must be either None or of the stated type.
    """
    # required attributes
    name = attr.ib(validator=instance_of(str))
    ext = attr.ib(validator=instance_of(str))
    to_seq = attr.ib(validator=instance_of(typing.Callable))
    # optional attributes; None means "not provided for this format"
    to_csv = attr.ib(
        validator=optional(instance_of(typing.Callable)),
        default=None,
    )
    to_format = attr.ib(
        validator=optional(instance_of(typing.Callable)),
        default=None,
    )
    module = attr.ib(
        validator=optional(instance_of(str)),
        default=None,
    )
53 changes: 30 additions & 23 deletions src/crowsetta/notmat.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,43 +10,42 @@

from .sequence import Sequence
from .csv import seq2csv
from .meta import Meta


def _parse_notmat(notmat):
"""helper function that parses/validates value for notmat argument;
def _parse_file(file):
"""helper function that parses/validates value for file argument;
puts a single string or Path into a list to iterate over it (cheap hack
that lets functions accept multiple types), and checks list to make sure
all types are consistent
"""
if type(notmat) == str or type(notmat) == Path:
if type(file) == str or type(file) == Path:
# put in a list to iterate over
notmat = [notmat]
file = [file]

for a_notmat in notmat:
if type(a_notmat) == str:
if not a_notmat.endswith('.not.mat'):
for a_file in file:
if type(a_file) == str:
if not a_file.endswith('.not.mat'):
raise ValueError("all filenames in .not.mat must end with '.not.mat' "
f"but {a_notmat} does not")
elif type(a_notmat) == Path:
if not a_notmat.suffixes == ['.not', '.mat']:
f"but {a_file} does not")
elif type(a_file) == Path:
if not a_file.suffixes == ['.not', '.mat']:
raise ValueError("all filenames in .not.mat must end with '.not.mat' "
f"but {a_notmat} does not")
f"but {a_file} does not")

return notmat
return file


def notmat2seq(notmat,
def notmat2seq(file,
abspath=False,
basename=False,
round_times=True,
decimals=3):
"""open .not.mat file and return as Sequence
(data structure that used internally to represent
annotation for one audio file)
"""parse annotation from .not.mat and return as Sequence
Parameters
----------
notmat : str, Path, or list
file : str, Path, or list
filename of a .not.mat annotation file,
created by the evsonganaly GUI for MATLAB
abspath : bool
Expand Down Expand Up @@ -77,15 +76,15 @@ def notmat2seq(notmat,
due to floating point error, e.g. when loading .not.mat files and then sending them to
a csv file, the result should be the same on Windows and Linux
"""
notmat = _parse_notmat(notmat)
file = _parse_file(file)

if abspath and basename:
raise ValueError('abspath and basename arguments cannot both be set to True, '
'unclear whether absolute path should be saved or if no path '
'information (just base filename) should be saved.')

seq = []
for a_notmat in notmat:
for a_notmat in file:
notmat_dict = evfuncs.load_notmat(a_notmat)
# in .not.mat files saved by evsonganaly,
# onsets and offsets are in units of ms, have to convert to s
Expand Down Expand Up @@ -132,14 +131,14 @@ def notmat2seq(notmat,
return seq


def notmat2csv(notmat, csv_filename, abspath=False, basename=False):
def notmat2csv(file, csv_filename, abspath=False, basename=False):
"""saves annotation from .not.mat file(s) in a comma-separated values
(csv) file, where each row represents one syllable from one
.not.mat file.
Parameters
----------
notmat : str, Path, or list
file : str, Path, or list
if list, list of strings or Path objects pointing to .not.mat files
csv_filename : str
name for csv file that is created
Expand All @@ -159,14 +158,14 @@ def notmat2csv(notmat, csv_filename, abspath=False, basename=False):
-------
None
"""
notmat = _parse_notmat(notmat)
file = _parse_file(file)

if abspath and basename:
raise ValueError('abspath and basename arguments cannot both be set to True, '
'unclear whether absolute path should be saved or if no path '
'information (just base filename) should be saved.')

seq = notmat2seq(notmat)
seq = notmat2seq(file)
seq2csv(seq, csv_filename, abspath=abspath, basename=basename)


Expand Down Expand Up @@ -278,3 +277,11 @@ def make_notmat(filename,
.format(notmat_name))
else:
scipy.io.savemat(notmat_name, notmat_dict)

# metadata for the 'notmat' format: .not.mat annotation files, converted to
# Sequences by notmat2seq, to a csv file by notmat2csv, and written in the
# format by make_notmat
meta = Meta(
    name='notmat',
    ext='not.mat',
    to_seq=notmat2seq,
    to_csv=notmat2csv,
    to_format=make_notmat,
)

0 comments on commit 26a8ae6

Please sign in to comment.