<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [None]:
#default_exp text.symbols

In [None]:
#export

""" from https://github.com/keithito/tacotron """

"""
Defines the set of symbols used in text input to the model.
The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. """

from uberduck_ml_dev.text import cmudict

# Single-character inventories, grouped by category. Each character of
# these strings becomes one entry in `symbols`, in the order listed below.
_punctuation = "!'\",.:;? "
_math = "#%&*+-/[]()"
_special = "_@©°½—₩€$"
_accented = "áçéêëñöøćž"
_numbers = "0123456789"
_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"

# ARPAbet symbols carry an "@" prefix so they cannot collide with the
# single-character entries (some ARPAbet symbols are the same as uppercase
# letters).
_arpabet = ["@" + phoneme for phoneme in cmudict.valid_symbols]

# Full symbol inventory: every individual character first, then the
# prefixed ARPAbet symbols.
symbols = [
    *_punctuation,
    *_math,
    *_special,
    *_accented,
    *_numbers,
    *_letters,
    *_arpabet,
]

In [None]:
# export
import re

# Lookup tables between symbols and integer ids; a symbol's id is its
# position in `symbols`.
id_to_symbol = dict(enumerate(symbols))
symbol_to_id = {sym: idx for idx, sym in id_to_symbol.items()}
# Matches text containing a {...} group and captures three parts: the text
# before the first "{", the contents of the braces, and everything after the
# closing "}".
curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")

# Tokenizer pattern with two capture groups; each match fills exactly one of
# them and leaves the other empty.
words_re = re.compile(
    # Group 1: a run of letters (range À-ž included), optionally followed by
    # an apostrophe and one or two more letters (e.g. "I'm").
    r"([a-zA-ZÀ-ž]+['][a-zA-ZÀ-ž]{1,2}|[a-zA-ZÀ-ž]+)"
    r"|"
    # Group 2: a complete {...} chunk, or a run of characters that are
    # neither letters nor braces (whitespace, punctuation, digits, ...).
    r"([{][^}]+[}]|[^a-zA-ZÀ-ž{}]+)"
)


def symbols_to_sequence(symbols):
    """Map an iterable of symbols to a list of their integer ids.

    Symbols rejected by ``should_keep_symbol`` are skipped, so the result
    may be shorter than the input. Note that the parameter name shadows the
    module-level ``symbols`` list inside this function.
    """
    ids = []
    for sym in symbols:
        if should_keep_symbol(sym):
            ids.append(symbol_to_id[sym])
    return ids


def arpabet_to_sequence(text):
    """Convert whitespace-separated ARPAbet symbols to a list of ids.

    Each token of ``text`` is prefixed with "@" (the form in which ARPAbet
    symbols are stored in the symbol table) and then passed through
    ``symbols_to_sequence``, which skips any token it does not keep.
    """
    prefixed = ["@" + phoneme for phoneme in text.split()]
    return symbols_to_sequence(prefixed)


def should_keep_symbol(s):
    """Return True if ``s`` is a known symbol other than "_" or "~"."""
    if s not in symbol_to_id:
        return False
    # "_" and "~" are never kept, even when present in the symbol table.
    return not (s == "_" or s == "~")

In [None]:
# Show how words_re tokenizes a few representative inputs: repeated spaces,
# a contraction with punctuation, multiple apostrophes, and brace groups.
for sample in ("The   quick", "I'm blue,", "L'monj'ello", "{ S IY } { EH M }"):
    print(words_re.findall(sample))

[('The', ''), ('', '   '), ('quick', '')]
[("I'm", ''), ('', ' '), ('blue', ''), ('', ',')]
[("L'mo", ''), ("nj'el", ''), ('lo', '')]
[('', '{ S IY }'), ('', ' '), ('', '{ EH M }')]


In [None]:
# Space and period are in the symbol table and must be kept.
for kept in (" ", "."):
    assert should_keep_symbol(kept)
# A newline is not in the symbol table, so it must be dropped.
assert not should_keep_symbol("\n")

In [None]:
#export
def say_hello2():
    """Print the greeting "hello, world" to standard output."""
    greeting = "hello, world"
    print(greeting)

In [None]:
# NOTE: arpabet_to_sequence does not properly handle whitespace; it should
# be given single words only. Here the brace tokens contribute no ids, so
# only the 15 phonemes are counted.
arpabet_phrase = "{ S IY } { EH M } { Y UW } { D IH K SH AH N EH R IY }"
assert len(arpabet_to_sequence(arpabet_phrase)) == 15
assert arpabet_to_sequence("{ S IY }") == [168, 148]
# But symbols_to_sequence handles whitespace: each space is a kept symbol.
assert len(symbols_to_sequence("C M U Dictionary")) == 16