
Commit

Merge branch 'master' into publish-workflow
dippedrusk committed May 18, 2020
2 parents 9a256ba + 9191007 commit 43aa844
Showing 6 changed files with 130 additions and 81 deletions.
README.md (10 changes: 5 additions & 5 deletions)
@@ -1,14 +1,15 @@
# Syllabifier

[![Build Status](https://img.shields.io/github/workflow/status/vgautam/arpabet-syllabifier/Test/master)](https://coveralls.io/github/vgautam/arpabet-syllabifier?branch=master) [![Coverage Status](https://coveralls.io/repos/github/vgautam/arpabet-syllabifier/badge.svg?branch=master)](https://coveralls.io/github/vgautam/arpabet-syllabifier?branch=master)

Syllabifier is a Python module for syllabifying English pronunciations. Currently, only
ARPABET syllabification is supported.
It will take an ARPABET transcription in array or string form and return a Pandas Series or Python
list with the syllables chunked.
It will take an ARPABET transcription in array or string form and return a list with the syllables
chunked.

## Dependencies

* python>=3.5
* pandas>=0.20.0
* jupyter>=1.0.0 (only if you want to run the test notebook locally)

## How to Use
@@ -17,7 +18,6 @@ list with the syllables chunked.
* Import the function `from syllabifier import syllabifyARPA`.
* Function parameters
* A 2-letter ARPABET transcription in string form (with phones delimited by spaces) or as a Python list (stress markers on the vowels are optional)
* (Optional) bool return_list to return in Python list form instead of the default Pandas Series
* (Optional) bool silence_warnings to suppress ValueErrors thrown because of unsyllabifiable input
* Sample calls are in the Jupyter Notebook test.ipynb, using CMU Pronouncing Dictionary data.
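A minimal usage sketch of the call described in the list above, reflecting the updated interface in this commit (the transcription and expected output are taken from the test suite further down):

```python
from syllabifier import syllabifyARPA

# Space-delimited string input; returns a list of syllable strings
print(syllabifyARPA('HH AE NG M AE N'))              # ['HH AE NG', 'M AE N']

# Suppress the ValueError for unsyllabifiable input and get an empty list instead
print(syllabifyARPA('K S', silence_warnings=True))   # []
```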

@@ -26,7 +26,7 @@ list with the syllables chunked.
* **tests/test.ipynb**: Jupyter Notebook demonstrating sample calls to syllabifyARPA using CMUDict data
* **tests/cmudict.txt**: Very large text file containing over 100,000 ARPABET-syllabified English words
* **tests/cmusubset.txt**: Subset of ~60 words and transcriptions from the CMU Dictionary text file for testing convenience
* **tests/test_syllabifyARPA.py**: Unit and integration tests for the package
* **tests/test_syllabifier.py**: Unit and integration tests for the package

## ARPABET
ARPABET is a method of transcribing General American English phonetically using only ASCII characters. Refer [here](https://en.wikipedia.org/wiki/ARPABET) for a table of mappings between IPA and ARPABET. This syllabifier accepts only the 2-letter ARPABET codes; case does not matter.
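As noted above, case does not matter; a quick sketch using the lowercase and mixed-case inputs from the test suite:

```python
from syllabifier import syllabifyARPA

# Input case is normalized; returned phones are upper-case 2-letter ARPABET codes
assert syllabifyARPA('ow') == ['OW']
assert syllabifyARPA('oW') == ['OW']
```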
requirements.txt (1 change: 0 additions & 1 deletion)
@@ -1 +0,0 @@
pandas>=0.20.0
src/syllabifier/__init__.py (1 change: 1 addition & 0 deletions)
@@ -1 +1,2 @@
#!/usr/bin/env python3
from syllabifier.syllabifyARPA import syllabifyARPA
src/syllabifier/constants.py (1 change: 1 addition & 0 deletions)
@@ -1,3 +1,4 @@
#!/usr/bin/env python3
import re

# Sets required to check for valid onset and coda clusters
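In this commit the phone-set constants move into src/syllabifier/constants.py and are imported by syllabifyARPA.py below. A minimal sketch of inspecting them; the names are those imported further down, and the underlying definitions appear in the block removed from syllabifyARPA.py:

```python
from syllabifier.constants import CONSONANTS, PHONESET, VOWELS_REGEX

# CONSONANTS is the union of the stop, fricative, affricate, nasal and approximant sets
assert 'NG' in CONSONANTS and 'NG' in PHONESET

# Stress markers (0, 1, 2) after a vowel are optional
assert VOWELS_REGEX.fullmatch('AE1') and VOWELS_REGEX.fullmatch('AE')
```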
src/syllabifier/syllabifyARPA.py (64 changes: 21 additions & 43 deletions)
@@ -8,53 +8,37 @@
# Vasundhara Gautam
# October 3rd, 2017

import pandas as pd
import re
import sys

# Sets required to check for valid onset and coda clusters
VOICELESS = set(['K', 'P', 'T', 'F', 'HH', 'S', 'SH', 'TH', 'CH'])
VOICED = set(['G', 'B', 'D', 'DH', 'V', 'Z', 'ZH', 'JH'])

STOPS = set(['K', 'P', 'T', 'G', 'B', 'D'])
FRICATIVES = set(['F', 'DH', 'HH', 'S', 'SH', 'TH', 'V', 'Z', 'ZH'])
AFFRICATES = set(['CH', 'JH'])
NASALS = set(['M', 'N', 'NG'])
APPROXIMANTS = set(['L', 'R', 'W', 'Y'])
CONSONANTS = STOPS.union(FRICATIVES).union(AFFRICATES).union(NASALS).union(APPROXIMANTS)

S_EXTENDED_CODAS = set(['K', 'P', 'T', 'F', 'TH', 'D', 'NG'])
Z_EXTENDED_CODAS = set(['G', 'B', 'D', 'DH', 'V', 'M', 'N', 'NG', 'L'])

T_EXTENDED_CODAS = set(['K', 'P', 'F', 'S', 'SH', 'TH', 'CH', 'N'])
D_EXTENDED_CODAS = set(['G', 'B', 'DH', 'V', 'Z', 'ZH', 'JH', 'M', 'N', 'NG'])

PHONESET = set(['AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'B',
'CH', 'D', 'DH', 'EH', 'ER', 'EY', 'F', 'G',
'HH', 'IH', 'IY', 'JH', 'K', 'L', 'M', 'N',
'NG', 'OW', 'OY', 'P', 'R', 'S', 'SH', 'T',
'TH', 'UH', 'UW', 'V', 'W', 'Y', 'Z', 'ZH'])

# Optional stress markers (0,1,2) after the vowel for flexibility
VOWELS_REGEX = re.compile(r'(?:AA|AE|AH|AO|AW|AY|EH|ER|EY|IH|IY|OW|OY|UW|UH)[012]?')

def syllabifyARPA(arpa_arr, return_list=False, silence_warnings=False):
from syllabifier.constants import VOICELESS
from syllabifier.constants import VOICED
from syllabifier.constants import STOPS
from syllabifier.constants import FRICATIVES
from syllabifier.constants import AFFRICATES
from syllabifier.constants import NASALS
from syllabifier.constants import APPROXIMANTS
from syllabifier.constants import CONSONANTS
from syllabifier.constants import S_EXTENDED_CODAS
from syllabifier.constants import Z_EXTENDED_CODAS
from syllabifier.constants import T_EXTENDED_CODAS
from syllabifier.constants import D_EXTENDED_CODAS
from syllabifier.constants import PHONESET
from syllabifier.constants import VOWELS_REGEX

def syllabifyARPA(arpa_arr, silence_warnings=False):
"""
Syllabifies ARPABET transcriptions according to General American English
syllabification rules.
Args:
arpa_arr: A string or array of ARPABET phones with optional stress markers
on the vowels.
return_list: Boolean (default False) to return list of syllable strings
silence_warnings: Boolean (default False) to suppress ValueErrors
Returns:
Pandas Series of dtype 'Object' with syllables in each row.
If return_list set to True, returns a Python list of strings containing
the syllables.
In case the input is unsyllabifiable, an empty Series or list is
returned.
List of strings, one syllable per element.
In case the input is unsyllabifiable, an empty list is returned.
Raises:
ValueError if input contains non-ARPABET phonemes, no vowels or if it
@@ -65,9 +49,7 @@ def handleError(string):
if not silence_warnings:
raise ValueError(string)

ret = [] #pd.Series(None)
if return_list:
ret = []
ret = []

try:
arpa_arr = arpa_arr.split() # Allows for phoneme array and string input
@@ -80,7 +62,7 @@ def handleError(string):
word = ' '.join(arpa_arr)

if not (testInPhoneset(arpa_arr)):
handleError('Input %s contains non-ARPABET phonemes' % word)
handleError('Input %s contains non-ARPABET phones' % word)
return ret

final_arr = []
@@ -112,14 +94,10 @@

for i in range(len(final_arr)):
if not testLegalCoda(final_arr[i]):
handleError('Impossible to syllabify %s according to English '
'syllabification rules.' % word)
handleError('Bad coda cluster in %s' % word)
return ret

#ret = pd.Series([' '.join(syllable) for syllable in final_arr])
ret = [' '.join(syllable) for syllable in final_arr]
if return_list:
ret = list(ret)

return ret
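With the Pandas path removed, the function always returns a plain list. A short sketch of the two documented input forms and the error path, using transcriptions that appear in the tests below:

```python
from syllabifier import syllabifyARPA

# Both input forms from the docstring: a phone list and a space-delimited string
assert syllabifyARPA(['K', 'AE', 'T']) == ['K AE T']
assert syllabifyARPA('S IH K S TH S') == ['S IH K S TH S']

# Unsyllabifiable input raises ValueError unless warnings are silenced
try:
    syllabifyARPA('M G L AA')        # illegal onset cluster
except ValueError:
    pass
assert syllabifyARPA('M G L AA', silence_warnings=True) == []
```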

tests/test_syllabifier.py (134 changes: 102 additions & 32 deletions)
@@ -1,5 +1,6 @@
#!/usr/bin/env python3
import itertools
import pytest
from syllabifier import syllabifyARPA
from syllabifier.constants import CONSONANTS
from syllabifier.constants import PHONESET
@@ -11,57 +12,102 @@

def test_syllabifyARPA():
test_string = 'HH AE NG M AE N'
assert syllabifyARPA(test_string, return_list=True) == ['HH AE NG', 'M AE N']
assert syllabifyARPA(test_string) == ['HH AE NG', 'M AE N']


def test_unsyllabifiable():
def test_non_ARPABET_phones():
test_string = 'banana'
with pytest.raises(ValueError, match='contains non-ARPABET phones'):
syllabifyARPA(test_string)
assert not syllabifyARPA(test_string, silence_warnings=True)


def test_no_vowel():
test_string = 'K R F JH'
with pytest.raises(ValueError, match='no vowel in'):
syllabifyARPA(test_string)
assert not syllabifyARPA(test_string, silence_warnings=True)
test_string = 'K S'
with pytest.raises(ValueError, match='no vowel in'):
syllabifyARPA(test_string)
assert not syllabifyARPA(test_string, silence_warnings=True)


def test_bad_onset_cluster():
test_string = 'M G L AA'
assert not syllabifyARPA(test_string, return_list=True, silence_warnings=True)
with pytest.raises(ValueError, match='Bad onset cluster'):
syllabifyARPA(test_string)
assert not syllabifyARPA(test_string, silence_warnings=True)


def test_empty():
test_string = ''
assert not syllabifyARPA(test_string, return_list=True, silence_warnings=True)
def test_NG_onset():
test_string = 'NG OW'
with pytest.raises(ValueError, match='Bad onset cluster'):
syllabifyARPA(test_string)
assert not syllabifyARPA(test_string, silence_warnings=True)


def test_no_vowel():
test_string = 'K S'
assert not syllabifyARPA(test_string, return_list=True, silence_warnings=True)
def test_bad_coda():
test_string = 'AE G R P'
with pytest.raises(ValueError, match='Bad coda cluster'):
syllabifyARPA(test_string)
assert not syllabifyARPA(test_string, silence_warnings=True)


def test_length_5_coda():
test_string = 'AE N G L S F'
with pytest.raises(ValueError, match='Bad coda cluster'):
syllabifyARPA(test_string)
assert not syllabifyARPA(test_string, silence_warnings=True)


def test_length_4_coda():
test_string = 'AE N S G F'
with pytest.raises(ValueError, match='Bad coda cluster'):
syllabifyARPA(test_string)
assert not syllabifyARPA(test_string, silence_warnings=True)
test_string = 'S IH K S TH S'
assert syllabifyARPA(test_string) == [test_string]


def test_empty():
test_string = ''
assert not syllabifyARPA(test_string, silence_warnings=True)


def test_sixths():
test_string = 'S IH K S TH S'
assert syllabifyARPA(test_string, return_list=True) == ['S IH K S TH S']
assert syllabifyARPA(test_string) == ['S IH K S TH S']


def test_lowercase():
test_string = 'ow'
assert syllabifyARPA(test_string, return_list=True) == ['OW']
assert syllabifyARPA(test_string) == ['OW']


def test_mixedcase():
test_string = 'oW'
assert syllabifyARPA(test_string, return_list=True) == ['OW']
assert syllabifyARPA(test_string) == ['OW']


def test_non_arpabet():
test_string = 'GH IY'
assert not syllabifyARPA(test_string, return_list=True, silence_warnings=True)
assert not syllabifyARPA(test_string, silence_warnings=True)


def test_array():
test_array = ['K', 'AE', 'T']
assert syllabifyARPA(test_array, return_list=True) == ['K AE T']
assert syllabifyARPA(test_array) == ['K AE T']


def test_weird_array():
test_array = ['K AE', 'T']
assert not syllabifyARPA(test_array, return_list=True, silence_warnings=True)
assert not syllabifyARPA(test_array, silence_warnings=True)


def test_CVC_syllables():
for syllable in itertools.product(legal_onsets, VOWELS, legal_codas):
assert syllabifyARPA(list(syllable), return_list=True)
assert syllabifyARPA(list(syllable))


def test_SZ_extension():
@@ -70,8 +116,8 @@ def test_SZ_extension():
for syllable in itertools.product(legal_onsets, VOWELS, sz_extendable_codas):
s_extended = list(syllable) + ['S']
z_extended = list(syllable) + ['Z']
assert (syllabifyARPA(s_extended, return_list=True, silence_warnings=True) !=
syllabifyARPA(z_extended, return_list=True, silence_warnings=True))
assert (syllabifyARPA(s_extended, silence_warnings=True) !=
syllabifyARPA(z_extended, silence_warnings=True))


def test_TD_extension():
@@ -80,67 +126,91 @@ def test_TD_extension():
for syllable in itertools.product(legal_onsets, VOWELS, td_extendable_codas):
t_extended = list(syllable) + ['T']
d_extended = list(syllable) + ['D']
assert (syllabifyARPA(t_extended, return_list=True, silence_warnings=True) !=
syllabifyARPA(d_extended, return_list=True, silence_warnings=True))
assert (syllabifyARPA(t_extended, silence_warnings=True) !=
syllabifyARPA(d_extended, silence_warnings=True))


def test_CCVC_syllables():
syllables = ['P L EY', 'D R IY M', 'HH Y UW JH', 'S L IY P', 'G W AA M', 'T W IH N', 'K R AW D']
for syllable in syllables:
assert syllabifyARPA(syllable, return_list=True) == [syllable]
assert syllabifyARPA(syllable) == [syllable]


def test_CVCC_syllables():
syllables = ['HH EH L P', 'W EH L SH', 'F IH F TH', 'L AH NG Z', 'B EH L T', 'T AE K T']
# TODO:FIX for mensch
for syllable in syllables:
assert syllabifyARPA(syllable, return_list=True) == [syllable]
assert syllabifyARPA(syllable) == [syllable]


def test_V_syllables():
syllables = ['OW', 'AY', 'UW', 'AA', 'EY']
for syllable in syllables:
assert syllabifyARPA(syllable, return_list=True) == [syllable]
assert syllabifyARPA(syllable) == [syllable]


def test_CVCCC_syllables():
syllables = ['Y EH L P S', 'K AE L K S', 'W AO R M TH', 'W IH L S T', 'JH IH NG K S']
syllables = [
'Y EH L P S', 'K AE L K S', 'W AO R M TH', 'W IH L S T', 'JH IH NG K S', 'M IH L K T',
'P AH M P T'
]
for syllable in syllables:
assert syllabifyARPA(syllable, return_list=True) == [syllable]
assert syllabifyARPA(syllable) == [syllable]


def test_CCVCCC_syllables():
syllables = [
'K R AE M P T', 'B L IH NG K T', 'T W AE NG G D'
]
for syllable in syllables:
assert syllabifyARPA(syllable) == [syllable]


def test_CCVCC_syllables():
syllables = ['TH W AO R T', 'K L AH T S', 'B L AH N T', 'F Y OW R D', 'G R IH N CH']
syllables = [
'TH W AO R T', 'K L AH T S', 'B L AH N T', 'F Y OW R D', 'G R IH N CH'
]
for syllable in syllables:
assert syllabifyARPA(syllable, return_list=True) == [syllable]
assert syllabifyARPA(syllable) == [syllable]


def test_s_clusters():
syllables = ['S P OW R T', 'S M AY L', 'S F IH NG K S', 'S P Y UW', 'S T AH D', 'S K W EY R']
syllables = [
'S P OW R T', 'S M AY L', 'S F IH NG K S', 'S P Y UW', 'S T AH D', 'S K W EY R',
'S T R AY K']
for syllable in syllables:
assert syllabifyARPA(syllable, return_list=True) == [syllable]
assert syllabifyARPA(syllable) == [syllable]
prons_syllabified = {
'S F R AH JH IH S T IH K S': ['S F R AH', 'JH IH', 'S T IH K S'],
'B L AE S T IH D': ['B L AE', 'S T IH D'],
'S K L EY R AH': ['S K L EY', 'R AH'],
'S T EH TH AH S K OW P': ['S T EH', 'TH AH', 'S K OW P']
}
for pron, syllabification in prons_syllabified.items():
assert syllabifyARPA(pron, return_list=True) == syllabification
assert syllabifyARPA(pron) == syllabification


def test_long_words():
prons_syllabified = {
'AH S F IH K S IY EY T AH D': ['AH', 'S F IH K', 'S IY', 'EY', 'T AH D'],
'M AY K R AH S K AA P IH K': ['M AY', 'K R AH', 'S K AA', 'P IH K'],
'N Y UH M AE T IH K S': ['N Y UH', 'M AE', 'T IH K S'],
'F L AE JH AH L EY SH AH N Z': ['F L AE', 'JH AH', 'L EY', 'SH AH N Z']
'F L AE JH AH L EY SH AH N Z': ['F L AE', 'JH AH', 'L EY', 'SH AH N Z'],
'CH AA R L S T AH N': ['CH AA R L', 'S T AH N'],
'JH AH M P T B AE K': ['JH AH M P T', 'B AE K'],
'K R UW S CH Y AO F': ['K R UW S', 'CH Y AO F'],
'TH AW Z AH N D TH': ['TH AW', 'Z AH N D TH'],
}
for pron, syllabification in prons_syllabified.items():
assert syllabifyARPA(pron, return_list=True) == syllabification
assert syllabifyARPA(pron) == syllabification


# def test_word_boundaries():
# TODO:ADD this feature
# 'S W IY P S T EY K S': ['S W IY P', 'S T EY K S']
# 'S EH S AH M IY S T R IY T': ['S EH', 'S AH', 'M IY', 'S T R IY T'

# def test_syllabic_consonant_nuclei():
# TODO: ADD this feature
# len(syllabifyARPA('B IY T L') == len(syllabifyARPA('B IY T AH L')
# syllabifyARPA('B IY T L') == ['B IY', 'T L']
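To run the renamed test module, a plain `pytest tests/test_syllabifier.py` from the repository root suffices; the equivalent programmatic call is sketched below (assuming pytest is installed):

```python
# Programmatic equivalent of `pytest tests/test_syllabifier.py -q`
import sys
import pytest

if __name__ == '__main__':
    sys.exit(pytest.main(['tests/test_syllabifier.py', '-q']))
```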
