
Commit

Merge branch 'master' into publish-workflow
dippedrusk committed May 18, 2020
2 parents 9a256ba + 9191007 commit 43aa844
Showing 6 changed files with 130 additions and 81 deletions.
README.md (10 changes: 5 additions & 5 deletions)
@@ -1,14 +1,15 @@
# Syllabifier

[![Build Status](https://img.shields.io/github/workflow/status/vgautam/arpabet-syllabifier/Test/master)](https://coveralls.io/github/vgautam/arpabet-syllabifier?branch=master) [![Coverage Status](https://coveralls.io/repos/github/vgautam/arpabet-syllabifier/badge.svg?branch=master)](https://coveralls.io/github/vgautam/arpabet-syllabifier?branch=master)

Syllabifier is a Python module for syllabifying English pronunciations. Currently, only
ARPABET syllabification is supported.
It will take an ARPABET transcription in array or string form and return a Pandas Series or Python
list with the syllables chunked.
It will take an ARPABET transcription in array or string form and return a list with the syllables
chunked.

## Dependencies

* python>=3.5
* pandas>=0.20.0
* jupyter>=1.0.0 (only if you want to run the test notebook locally)

## How to Use
@@ -17,7 +18,6 @@ list with the syllables chunked.
* Import the function `from syllabifier import syllabifyARPA`.
* Function parameters
* A 2-letter ARPABET transcription in string form (with phones delimited by spaces) or as a Python list (stress markers on the vowels are optional)
* (Optional) bool return_list to return in Python list form instead of the default Pandas Series
* (Optional) bool silence_warnings to suppress ValueErrors thrown because of unsyllabifiable input
* Sample calls are in the Jupyter Notebook test.ipynb, using CMU Pronouncing Dictionary data.
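A minimal usage sketch of the call described in the list above, reflecting the updated interface in this commit (the transcription and expected output are taken from the test suite further down):

```python
from syllabifier import syllabifyARPA

# Space-delimited string input; returns a list of syllable strings
print(syllabifyARPA('HH AE NG M AE N'))              # ['HH AE NG', 'M AE N']

# Suppress the ValueError for unsyllabifiable input and get an empty list instead
print(syllabifyARPA('K S', silence_warnings=True))   # []
```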

@@ -26,7 +26,7 @@ list with the syllables chunked.
* **tests/test.ipynb**: Jupyter Notebook demonstrating sample calls to syllabifyARPA using CMUDict data
* **tests/cmudict.txt**: Very large text file containing over 100,000 ARPABET-syllabified English words
* **tests/cmusubset.txt**: Subset of ~60 words and transcriptions from the CMU Dictionary text file for testing convenience
* **tests/test_syllabifyARPA.py**: Unit and integration tests for the package
* **tests/test_syllabifier.py**: Unit and integration tests for the package

## ARPABET
ARPABET is a method of transcribing General American English phonetically using only ASCII characters. Refer [here](https://en.wikipedia.org/wiki/ARPABET) for a table of mappings between IPA and ARPABET. This syllabifier accepts only the 2-letter ARPABET codes; case does not matter.
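As noted above, case does not matter; a quick sketch using the lowercase and mixed-case inputs from the test suite:

```python
from syllabifier import syllabifyARPA

# Input case is normalized; returned phones are upper-case 2-letter ARPABET codes
assert syllabifyARPA('ow') == ['OW']
assert syllabifyARPA('oW') == ['OW']
```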
requirements.txt (1 change: 0 additions & 1 deletion)
@@ -1 +0,0 @@
pandas>=0.20.0
src/syllabifier/__init__.py (1 change: 1 addition & 0 deletions)
@@ -1 +1,2 @@
#!/usr/bin/env python3
from syllabifier.syllabifyARPA import syllabifyARPA
src/syllabifier/constants.py (1 change: 1 addition & 0 deletions)
@@ -1,3 +1,4 @@
#!/usr/bin/env python3
import re

# Sets required to check for valid onset and coda clusters
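In this commit the phone-set constants move into src/syllabifier/constants.py and are imported by syllabifyARPA.py below. A minimal sketch of inspecting them; the names are those imported further down, and the underlying definitions appear in the block removed from syllabifyARPA.py:

```python
from syllabifier.constants import CONSONANTS, PHONESET, VOWELS_REGEX

# CONSONANTS is the union of the stop, fricative, affricate, nasal and approximant sets
assert 'NG' in CONSONANTS and 'NG' in PHONESET

# Stress markers (0, 1, 2) after a vowel are optional
assert VOWELS_REGEX.fullmatch('AE1') and VOWELS_REGEX.fullmatch('AE')
```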
src/syllabifier/syllabifyARPA.py (64 changes: 21 additions & 43 deletions)
@@ -8,53 +8,37 @@
# Vasundhara Gautam
# October 3rd, 2017

import pandas as pd
import re
import sys

# Sets required to check for valid onset and coda clusters
VOICELESS = set(['K', 'P', 'T', 'F', 'HH', 'S', 'SH', 'TH', 'CH'])
VOICED = set(['G', 'B', 'D', 'DH', 'V', 'Z', 'ZH', 'JH'])

STOPS = set(['K', 'P', 'T', 'G', 'B', 'D'])
FRICATIVES = set(['F', 'DH', 'HH', 'S', 'SH', 'TH', 'V', 'Z', 'ZH'])
AFFRICATES = set(['CH', 'JH'])
NASALS = set(['M', 'N', 'NG'])
APPROXIMANTS = set(['L', 'R', 'W', 'Y'])
CONSONANTS = STOPS.union(FRICATIVES).union(AFFRICATES).union(NASALS).union(APPROXIMANTS)

S_EXTENDED_CODAS = set(['K', 'P', 'T', 'F', 'TH', 'D', 'NG'])
Z_EXTENDED_CODAS = set(['G', 'B', 'D', 'DH', 'V', 'M', 'N', 'NG', 'L'])

T_EXTENDED_CODAS = set(['K', 'P', 'F', 'S', 'SH', 'TH', 'CH', 'N'])
D_EXTENDED_CODAS = set(['G', 'B', 'DH', 'V', 'Z', 'ZH', 'JH', 'M', 'N', 'NG'])

PHONESET = set(['AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'B',
'CH', 'D', 'DH', 'EH', 'ER', 'EY', 'F', 'G',
'HH', 'IH', 'IY', 'JH', 'K', 'L', 'M', 'N',
'NG', 'OW', 'OY', 'P', 'R', 'S', 'SH', 'T',
'TH', 'UH', 'UW', 'V', 'W', 'Y', 'Z', 'ZH'])

# Optional stress markers (0,1,2) after the vowel for flexibility
VOWELS_REGEX = re.compile(r'(?:AA|AE|AH|AO|AW|AY|EH|ER|EY|IH|IY|OW|OY|UW|UH)[012]?')

def syllabifyARPA(arpa_arr, return_list=False, silence_warnings=False):
from syllabifier.constants import VOICELESS
from syllabifier.constants import VOICED
from syllabifier.constants import STOPS
from syllabifier.constants import FRICATIVES
from syllabifier.constants import AFFRICATES
from syllabifier.constants import NASALS
from syllabifier.constants import APPROXIMANTS
from syllabifier.constants import CONSONANTS
from syllabifier.constants import S_EXTENDED_CODAS
from syllabifier.constants import Z_EXTENDED_CODAS
from syllabifier.constants import T_EXTENDED_CODAS
from syllabifier.constants import D_EXTENDED_CODAS
from syllabifier.constants import PHONESET
from syllabifier.constants import VOWELS_REGEX

def syllabifyARPA(arpa_arr, silence_warnings=False):
"""
Syllabifies ARPABET transcriptions according to General American English
syllabification rules.
Args:
arpa_arr: A string or array of ARPABET phones with optional stress markers
on the vowels.
return_list: Boolean (default False) to return list of syllable strings
silence_warnings: Boolean (default False) to suppress ValueErrors
Returns:
Pandas Series of dtype 'Object' with syllables in each row.
If return_list set to True, returns a Python list of strings containing
the syllables.
In case the input is unsyllabifiable, an empty Series or list is
returned.
List of strings, one syllable per element.
In case the input is unsyllabifiable, an empty list is returned.
Raises:
ValueError if input contains non-ARPABET phonemes, no vowels or if it
@@ -65,9 +49,7 @@ def handleError(string):
if not silence_warnings:
raise ValueError(string)

ret = [] #pd.Series(None)
if return_list:
ret = []
ret = []

try:
arpa_arr = arpa_arr.split() # Allows for phoneme array and string input
@@ -80,7 +62,7 @@ def handleError(string):
word = ' '.join(arpa_arr)

if not (testInPhoneset(arpa_arr)):
handleError('Input %s contains non-ARPABET phonemes' % word)
handleError('Input %s contains non-ARPABET phones' % word)
return ret

final_arr = []
@@ -112,14 +94,10 @@

for i in range(len(final_arr)):
if not testLegalCoda(final_arr[i]):
handleError('Impossible to syllabify %s according to English '
'syllabification rules.' % word)
handleError('Bad coda cluster in %s' % word)
return ret

#ret = pd.Series([' '.join(syllable) for syllable in final_arr])
ret = [' '.join(syllable) for syllable in final_arr]
if return_list:
ret = list(ret)

return ret
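With the Pandas path removed, the function always returns a plain list. A short sketch of the two documented input forms and the error path, using transcriptions that appear in the tests below:

```python
from syllabifier import syllabifyARPA

# Both input forms from the docstring: a phone list and a space-delimited string
assert syllabifyARPA(['K', 'AE', 'T']) == ['K AE T']
assert syllabifyARPA('S IH K S TH S') == ['S IH K S TH S']

# Unsyllabifiable input raises ValueError unless warnings are silenced
try:
    syllabifyARPA('M G L AA')        # illegal onset cluster
except ValueError:
    pass
assert syllabifyARPA('M G L AA', silence_warnings=True) == []
```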

tests/test_syllabifier.py (134 changes: 102 additions & 32 deletions)
@@ -1,5 +1,6 @@
#!/usr/bin/env python3
import itertools
import pytest
from syllabifier import syllabifyARPA
from syllabifier.constants import CONSONANTS
from syllabifier.constants import PHONESET
@@ -11,57 +12,102 @@

def test_syllabifyARPA():
test_string = 'HH AE NG M AE N'
assert syllabifyARPA(test_string, return_list=True) == ['HH AE NG', 'M AE N']
assert syllabifyARPA(test_string) == ['HH AE NG', 'M AE N']


def test_unsyllabifiable():
def test_non_ARPABET_phones():
test_string = 'banana'
with pytest.raises(ValueError, match='contains non-ARPABET phones'):
syllabifyARPA(test_string)
assert not syllabifyARPA(test_string, silence_warnings=True)


def test_no_vowel():
test_string = 'K R F JH'
with pytest.raises(ValueError, match='no vowel in'):
syllabifyARPA(test_string)
assert not syllabifyARPA(test_string, silence_warnings=True)
test_string = 'K S'
with pytest.raises(ValueError, match='no vowel in'):
syllabifyARPA(test_string)
assert not syllabifyARPA(test_string, silence_warnings=True)


def test_bad_onset_cluster():
test_string = 'M G L AA'
assert not syllabifyARPA(test_string, return_list=True, silence_warnings=True)
with pytest.raises(ValueError, match='Bad onset cluster'):
syllabifyARPA(test_string)
assert not syllabifyARPA(test_string, silence_warnings=True)


def test_empty():
test_string = ''
assert not syllabifyARPA(test_string, return_list=True, silence_warnings=True)
def test_NG_onset():
test_string = 'NG OW'
with pytest.raises(ValueError, match='Bad onset cluster'):
syllabifyARPA(test_string)
assert not syllabifyARPA(test_string, silence_warnings=True)


def test_no_vowel():
test_string = 'K S'
assert not syllabifyARPA(test_string, return_list=True, silence_warnings=True)
def test_bad_coda():
test_string = 'AE G R P'
with pytest.raises(ValueError, match='Bad coda cluster'):
syllabifyARPA(test_string)
assert not syllabifyARPA(test_string, silence_warnings=True)


def test_length_5_coda():
test_string = 'AE N G L S F'
with pytest.raises(ValueError, match='Bad coda cluster'):
syllabifyARPA(test_string)
assert not syllabifyARPA(test_string, silence_warnings=True)


def test_length_4_coda():
test_string = 'AE N S G F'
with pytest.raises(ValueError, match='Bad coda cluster'):
syllabifyARPA(test_string)
assert not syllabifyARPA(test_string, silence_warnings=True)
test_string = 'S IH K S TH S'
assert syllabifyARPA(test_string) == [test_string]


def test_empty():
test_string = ''
assert not syllabifyARPA(test_string, silence_warnings=True)


def test_sixths():
test_string = 'S IH K S TH S'
assert syllabifyARPA(test_string, return_list=True) == ['S IH K S TH S']
assert syllabifyARPA(test_string) == ['S IH K S TH S']


def test_lowercase():
test_string = 'ow'
assert syllabifyARPA(test_string, return_list=True) == ['OW']
assert syllabifyARPA(test_string) == ['OW']


def test_mixedcase():
test_string = 'oW'
assert syllabifyARPA(test_string, return_list=True) == ['OW']
assert syllabifyARPA(test_string) == ['OW']


def test_non_arpabet():
test_string = 'GH IY'
assert not syllabifyARPA(test_string, return_list=True, silence_warnings=True)
assert not syllabifyARPA(test_string, silence_warnings=True)


def test_array():
test_array = ['K', 'AE', 'T']
assert syllabifyARPA(test_array, return_list=True) == ['K AE T']
assert syllabifyARPA(test_array) == ['K AE T']


def test_weird_array():
test_array = ['K AE', 'T']
assert not syllabifyARPA(test_array, return_list=True, silence_warnings=True)
assert not syllabifyARPA(test_array, silence_warnings=True)


def test_CVC_syllables():
for syllable in itertools.product(legal_onsets, VOWELS, legal_codas):
assert syllabifyARPA(list(syllable), return_list=True)
assert syllabifyARPA(list(syllable))


def test_SZ_extension():
@@ -70,8 +116,8 @@ def test_SZ_extension():
for syllable in itertools.product(legal_onsets, VOWELS, sz_extendable_codas):
s_extended = list(syllable) + ['S']
z_extended = list(syllable) + ['Z']
assert (syllabifyARPA(s_extended, return_list=True, silence_warnings=True) !=
syllabifyARPA(z_extended, return_list=True, silence_warnings=True))
assert (syllabifyARPA(s_extended, silence_warnings=True) !=
syllabifyARPA(z_extended, silence_warnings=True))


def test_TD_extension():
@@ -80,67 +126,91 @@ def test_TD_extension():
for syllable in itertools.product(legal_onsets, VOWELS, td_extendable_codas):
t_extended = list(syllable) + ['T']
d_extended = list(syllable) + ['D']
assert (syllabifyARPA(t_extended, return_list=True, silence_warnings=True) !=
syllabifyARPA(d_extended, return_list=True, silence_warnings=True))
assert (syllabifyARPA(t_extended, silence_warnings=True) !=
syllabifyARPA(d_extended, silence_warnings=True))


def test_CCVC_syllables():
syllables = ['P L EY', 'D R IY M', 'HH Y UW JH', 'S L IY P', 'G W AA M', 'T W IH N', 'K R AW D']
for syllable in syllables:
assert syllabifyARPA(syllable, return_list=True) == [syllable]
assert syllabifyARPA(syllable) == [syllable]


def test_CVCC_syllables():
syllables = ['HH EH L P', 'W EH L SH', 'F IH F TH', 'L AH NG Z', 'B EH L T', 'T AE K T']
# TODO:FIX for mensch
for syllable in syllables:
assert syllabifyARPA(syllable, return_list=True) == [syllable]
assert syllabifyARPA(syllable) == [syllable]


def test_V_syllables():
syllables = ['OW', 'AY', 'UW', 'AA', 'EY']
for syllable in syllables:
assert syllabifyARPA(syllable, return_list=True) == [syllable]
assert syllabifyARPA(syllable) == [syllable]


def test_CVCCC_syllables():
syllables = ['Y EH L P S', 'K AE L K S', 'W AO R M TH', 'W IH L S T', 'JH IH NG K S']
syllables = [
'Y EH L P S', 'K AE L K S', 'W AO R M TH', 'W IH L S T', 'JH IH NG K S', 'M IH L K T',
'P AH M P T'
]
for syllable in syllables:
assert syllabifyARPA(syllable, return_list=True) == [syllable]
assert syllabifyARPA(syllable) == [syllable]


def test_CCVCCC_syllables():
syllables = [
'K R AE M P T', 'B L IH NG K T', 'T W AE NG G D'
]
for syllable in syllables:
assert syllabifyARPA(syllable) == [syllable]


def test_CCVCC_syllables():
syllables = ['TH W AO R T', 'K L AH T S', 'B L AH N T', 'F Y OW R D', 'G R IH N CH']
syllables = [
'TH W AO R T', 'K L AH T S', 'B L AH N T', 'F Y OW R D', 'G R IH N CH'
]
for syllable in syllables:
assert syllabifyARPA(syllable, return_list=True) == [syllable]
assert syllabifyARPA(syllable) == [syllable]


def test_s_clusters():
syllables = ['S P OW R T', 'S M AY L', 'S F IH NG K S', 'S P Y UW', 'S T AH D', 'S K W EY R']
syllables = [
'S P OW R T', 'S M AY L', 'S F IH NG K S', 'S P Y UW', 'S T AH D', 'S K W EY R',
'S T R AY K']
for syllable in syllables:
assert syllabifyARPA(syllable, return_list=True) == [syllable]
assert syllabifyARPA(syllable) == [syllable]
prons_syllabified = {
'S F R AH JH IH S T IH K S': ['S F R AH', 'JH IH', 'S T IH K S'],
'B L AE S T IH D': ['B L AE', 'S T IH D'],
'S K L EY R AH': ['S K L EY', 'R AH'],
'S T EH TH AH S K OW P': ['S T EH', 'TH AH', 'S K OW P']
}
for pron, syllabification in prons_syllabified.items():
assert syllabifyARPA(pron, return_list=True) == syllabification
assert syllabifyARPA(pron) == syllabification


def test_long_words():
prons_syllabified = {
'AH S F IH K S IY EY T AH D': ['AH', 'S F IH K', 'S IY', 'EY', 'T AH D'],
'M AY K R AH S K AA P IH K': ['M AY', 'K R AH', 'S K AA', 'P IH K'],
'N Y UH M AE T IH K S': ['N Y UH', 'M AE', 'T IH K S'],
'F L AE JH AH L EY SH AH N Z': ['F L AE', 'JH AH', 'L EY', 'SH AH N Z']
'F L AE JH AH L EY SH AH N Z': ['F L AE', 'JH AH', 'L EY', 'SH AH N Z'],
'CH AA R L S T AH N': ['CH AA R L', 'S T AH N'],
'JH AH M P T B AE K': ['JH AH M P T', 'B AE K'],
'K R UW S CH Y AO F': ['K R UW S', 'CH Y AO F'],
'TH AW Z AH N D TH': ['TH AW', 'Z AH N D TH'],
}
for pron, syllabification in prons_syllabified.items():
assert syllabifyARPA(pron, return_list=True) == syllabification
assert syllabifyARPA(pron) == syllabification


# def test_word_boundaries():
# TODO:ADD this feature
# 'S W IY P S T EY K S': ['S W IY P', 'S T EY K S']
# 'S EH S AH M IY S T R IY T': ['S EH', 'S AH', 'M IY', 'S T R IY T'

# def test_syllabic_consonant_nuclei():
# TODO: ADD this feature
# len(syllabifyARPA('B IY T L') == len(syllabifyARPA('B IY T AH L')
# syllabifyARPA('B IY T L') == ['B IY', 'T L']
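To run the renamed test module, a plain `pytest tests/test_syllabifier.py` from the repository root suffices; the equivalent programmatic call is sketched below (assuming pytest is installed):

```python
# Programmatic equivalent of `pytest tests/test_syllabifier.py -q`
import sys
import pytest

if __name__ == '__main__':
    sys.exit(pytest.main(['tests/test_syllabifier.py', '-q']))
```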
