Skip to content

Commit

Permalink
Dump pandas requirement
Browse files Browse the repository at this point in the history
  • Loading branch information
vgautam committed May 17, 2020
1 parent 969bd0a commit 72af029
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 43 deletions.
8 changes: 3 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,12 @@

Syllabifier is a Python module to syllabify your English pronunciations. Currently only
ARPABET syllabification is supported.
It will take an ARPABET transcription in array or string form and return a Pandas Series or Python
list with the syllables chunked.
It takes an ARPABET transcription as a space-delimited string or a Python list of phones and
returns a list of syllable strings.

## Dependencies

* python>=3.5
* pandas>=0.20.0
* jupyter>=1.0.0 (only if you want to run the test notebook locally)

## How to Use
Expand All @@ -17,7 +16,6 @@ list with the syllables chunked.
* Import the function `from syllabifier import syllabifyARPA`.
* Function parameters
* A 2-letter ARPABET transcription in string form (with phones delimited by spaces) or as a Python list (stress markers on the vowels are optional)
* (Optional) bool return_list to return in Python list form instead of the default Pandas Series
* (Optional) bool `silence_warnings` to suppress the ValueErrors raised for unsyllabifiable input
* Sample calls are in the Jupyter Notebook test.ipynb, using CMU Pronouncing Dictionary data.

Expand All @@ -26,7 +24,7 @@ list with the syllables chunked.
* **tests/test.ipynb**: Jupyter Notebook demonstrating sample calls to syllabifyARPA using CMUDict data
* **tests/cmudict.txt**: Very large text file containing over 100,000 ARPABET-syllabified English words
* **tests/cmusubset.txt**: Subset of ~60 words and transcriptions from the CMU Dictionary text file for testing convenience
* **tests/test_syllabifyARPA.py**: Unit and integration tests for the package
* **tests/test_syllabifier.py**: Unit and integration tests for the package

## ARPABET
ARPABET is a method of transcribing General American English phonetically with only ASCII characters. See the [Wikipedia article on ARPABET](https://en.wikipedia.org/wiki/ARPABET) for a table of mappings between IPA and ARPABET. This syllabifier accepts only the 2-letter ARPABET codes; case does not matter.
Expand Down
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +0,0 @@
pandas>=0.20.0
18 changes: 4 additions & 14 deletions src/syllabifier/syllabifyARPA.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
# Vasundhara Gautam
# October 3rd, 2017

import pandas as pd
import re
import sys

Expand Down Expand Up @@ -38,23 +37,19 @@
# Optional stress markers (0,1,2) after the vowel for flexibility
VOWELS_REGEX = re.compile(r'(?:AA|AE|AH|AO|AW|AY|EH|ER|EY|IH|IY|OW|OY|UW|UH)[012]?')

def syllabifyARPA(arpa_arr, return_list=False, silence_warnings=False):
def syllabifyARPA(arpa_arr, silence_warnings=False):
"""
Syllabifies ARPABET transcriptions according to General American English
syllabification rules.
Args:
arpa_arr: A string or array of ARPABET phones with optional stress markers
on the vowels.
return_list: Boolean (default False) to return list of syllable strings
silence_warnings: Boolean (default False) to suppress ValueErrors
Returns:
Pandas Series of dtype 'Object' with syllables in each row.
If return_list set to True, returns a Python list of strings containing
the syllables.
In case the input is unsyllabifiable, an empty Series or list is
returned.
List of strings with syllables in each row.
In case the input is unsyllabifiable, an empty list is returned.
Raises:
ValueError if input contains non-ARPABET phonemes, no vowels or if it
Expand All @@ -65,9 +60,7 @@ def handleError(string):
if not silence_warnings:
raise ValueError(string)

ret = [] #pd.Series(None)
if return_list:
ret = []
ret = []

try:
arpa_arr = arpa_arr.split() # Allows for phoneme array and string input
Expand Down Expand Up @@ -116,10 +109,7 @@ def handleError(string):
'syllabification rules.' % word)
return ret

#ret = pd.Series([' '.join(syllable) for syllable in final_arr])
ret = [' '.join(syllable) for syllable in final_arr]
if return_list:
ret = list(ret)

return ret

Expand Down
46 changes: 23 additions & 23 deletions tests/test_syllabifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,57 +11,57 @@

def test_syllabifyARPA():
test_string = 'HH AE NG M AE N'
assert syllabifyARPA(test_string, return_list=True) == ['HH AE NG', 'M AE N']
assert syllabifyARPA(test_string) == ['HH AE NG', 'M AE N']


def test_unsyllabifiable():
test_string = 'M G L AA'
assert not syllabifyARPA(test_string, return_list=True, silence_warnings=True)
assert not syllabifyARPA(test_string, silence_warnings=True)


def test_empty():
test_string = ''
assert not syllabifyARPA(test_string, return_list=True, silence_warnings=True)
assert not syllabifyARPA(test_string, silence_warnings=True)


def test_no_vowel():
test_string = 'K S'
assert not syllabifyARPA(test_string, return_list=True, silence_warnings=True)
assert not syllabifyARPA(test_string, silence_warnings=True)


def test_sixths():
test_string = 'S IH K S TH S'
assert syllabifyARPA(test_string, return_list=True) == ['S IH K S TH S']
assert syllabifyARPA(test_string) == ['S IH K S TH S']


def test_lowercase():
test_string = 'ow'
assert syllabifyARPA(test_string, return_list=True) == ['OW']
assert syllabifyARPA(test_string) == ['OW']


def test_mixedcase():
test_string = 'oW'
assert syllabifyARPA(test_string, return_list=True) == ['OW']
assert syllabifyARPA(test_string) == ['OW']


def test_non_arpabet():
test_string = 'GH IY'
assert not syllabifyARPA(test_string, return_list=True, silence_warnings=True)
assert not syllabifyARPA(test_string, silence_warnings=True)


def test_array():
test_array = ['K', 'AE', 'T']
assert syllabifyARPA(test_array, return_list=True) == ['K AE T']
assert syllabifyARPA(test_array) == ['K AE T']


def test_weird_array():
test_array = ['K AE', 'T']
assert not syllabifyARPA(test_array, return_list=True, silence_warnings=True)
assert not syllabifyARPA(test_array, silence_warnings=True)


def test_CVC_syllables():
for syllable in itertools.product(legal_onsets, VOWELS, legal_codas):
assert syllabifyARPA(list(syllable), return_list=True)
assert syllabifyARPA(list(syllable))


def test_SZ_extension():
Expand All @@ -70,8 +70,8 @@ def test_SZ_extension():
for syllable in itertools.product(legal_onsets, VOWELS, sz_extendable_codas):
s_extended = list(syllable) + ['S']
z_extended = list(syllable) + ['Z']
assert (syllabifyARPA(s_extended, return_list=True, silence_warnings=True) !=
syllabifyARPA(z_extended, return_list=True, silence_warnings=True))
assert (syllabifyARPA(s_extended, silence_warnings=True) !=
syllabifyARPA(z_extended, silence_warnings=True))


def test_TD_extension():
Expand All @@ -80,53 +80,53 @@ def test_TD_extension():
for syllable in itertools.product(legal_onsets, VOWELS, td_extendable_codas):
t_extended = list(syllable) + ['T']
d_extended = list(syllable) + ['D']
assert (syllabifyARPA(t_extended, return_list=True, silence_warnings=True) !=
syllabifyARPA(d_extended, return_list=True, silence_warnings=True))
assert (syllabifyARPA(t_extended, silence_warnings=True) !=
syllabifyARPA(d_extended, silence_warnings=True))


def test_CCVC_syllables():
syllables = ['P L EY', 'D R IY M', 'HH Y UW JH', 'S L IY P', 'G W AA M', 'T W IH N', 'K R AW D']
for syllable in syllables:
assert syllabifyARPA(syllable, return_list=True) == [syllable]
assert syllabifyARPA(syllable) == [syllable]


def test_CVCC_syllables():
syllables = ['HH EH L P', 'W EH L SH', 'F IH F TH', 'L AH NG Z', 'B EH L T', 'T AE K T']
# TODO:FIX for mensch
for syllable in syllables:
assert syllabifyARPA(syllable, return_list=True) == [syllable]
assert syllabifyARPA(syllable) == [syllable]


def test_V_syllables():
syllables = ['OW', 'AY', 'UW', 'AA', 'EY']
for syllable in syllables:
assert syllabifyARPA(syllable, return_list=True) == [syllable]
assert syllabifyARPA(syllable) == [syllable]


def test_CVCCC_syllables():
syllables = ['Y EH L P S', 'K AE L K S', 'W AO R M TH', 'W IH L S T', 'JH IH NG K S']
for syllable in syllables:
assert syllabifyARPA(syllable, return_list=True) == [syllable]
assert syllabifyARPA(syllable) == [syllable]


def test_CCVCC_syllables():
syllables = ['TH W AO R T', 'K L AH T S', 'B L AH N T', 'F Y OW R D', 'G R IH N CH']
for syllable in syllables:
assert syllabifyARPA(syllable, return_list=True) == [syllable]
assert syllabifyARPA(syllable) == [syllable]


def test_s_clusters():
syllables = ['S P OW R T', 'S M AY L', 'S F IH NG K S', 'S P Y UW', 'S T AH D', 'S K W EY R']
for syllable in syllables:
assert syllabifyARPA(syllable, return_list=True) == [syllable]
assert syllabifyARPA(syllable) == [syllable]
prons_syllabified = {
'S F R AH JH IH S T IH K S': ['S F R AH', 'JH IH', 'S T IH K S'],
'B L AE S T IH D': ['B L AE', 'S T IH D'],
'S K L EY R AH': ['S K L EY', 'R AH'],
'S T EH TH AH S K OW P': ['S T EH', 'TH AH', 'S K OW P']
}
for pron, syllabification in prons_syllabified.items():
assert syllabifyARPA(pron, return_list=True) == syllabification
assert syllabifyARPA(pron) == syllabification


def test_long_words():
Expand All @@ -137,7 +137,7 @@ def test_long_words():
'F L AE JH AH L EY SH AH N Z': ['F L AE', 'JH AH', 'L EY', 'SH AH N Z']
}
for pron, syllabification in prons_syllabified.items():
assert syllabifyARPA(pron, return_list=True) == syllabification
assert syllabifyARPA(pron) == syllabification


# def test_word_boundaries():
Expand Down

0 comments on commit 72af029

Please sign in to comment.