<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [None]:
# default_exp text.symbols

In [None]:
# export

""" from https://github.com/keithito/tacotron """

"""
Defines the set of symbols used in text input to the model.
The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. """

from uberduck_ml_dev.text import cmudict

# Padding symbol; it is the first entry of every symbol set built in this file.
_pad = "_"
# Punctuation (including the space character) used by symbols_nvidia_taco2,
# symbols_portuguese and symbols_dutch.
_punctuation_nvidia_taco2 = "!'(),.:;? "
# Punctuation (including the space character) used by the default set.
_punctuation = "!'\",.:;? "
_math = "#%&*+-/[]()"
_special = "@©°½—₩€$"
# The NVIDIA-Tacotron2-style sets use the hyphen as their only special character.
_special_nvidia_taco2 = "-"
_accented = "áçéêëñöøćž"
_numbers = "0123456789"

_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
# Extra characters appended to the default set to form the IPA set.
# NOTE(review): this string repeats characters that the default set already
# contains: "ç" and "ø" (also in _accented) and what appear to be two plain
# apostrophes (also in _punctuation) -- confirm the apostrophe code points.
# A repeated symbol occupies more than one position in the IPA list, and a
# symbol -> id dict built with enumerate() keeps only the last position.
# Ids are positional, so removing an entry shifts every id after it.
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"


# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as
# uppercase letters):
_arpabet = ["@" + s for s in cmudict.valid_symbols]


# Language-specific symbol sets:

# Key under which the Portuguese set is registered in SYMBOL_SETS.
PORTUGUESE_SYMBOLS = "portuguese"

# Accented vowels and c-cedilla (lowercase, then uppercase), followed by a-z and A-Z.
_portuguese = "áàãâéèêíìîóòõôúùûçÁÀÃÂÉÈÊÍÌÎÓÒÕÔÚÙÛÇabcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

# Layout: pad, hyphen, punctuation, letters, then the "@"-prefixed ARPAbet symbols.
symbols_portuguese = [
    _pad,
    *_special_nvidia_taco2,
    *_punctuation_nvidia_taco2,
    *_portuguese,
    *_arpabet,
]

##

# Key under which the Polish set is registered in SYMBOL_SETS.
POLISH_SYMBOLS = "polish"

# Polish letters, uppercase then lowercase, and a reduced punctuation set
# (including the space character).
_polish = "AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻaąbcćdeęfghijklłmnńoóprsśtuwyzźż"
_punctuation_polish = "!,.? "

# Layout: pad, hyphen, punctuation, letters, then the "@"-prefixed ARPAbet symbols.
symbols_polish = [
    _pad,
    *_special_nvidia_taco2,
    *_punctuation_polish,
    *_polish,
    *_arpabet,
]

##

# Key under which the Dutch set is registered in SYMBOL_SETS.
DUTCH_SYMBOLS = "dutch"

# Accented vowels (lowercase, then uppercase), followed by a-z and A-Z.
_dutch = "éèêëíìîüÉÈÊËÍÌÎÜabcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

# Layout: pad, hyphen, punctuation, letters, then the "@"-prefixed ARPAbet symbols.
symbols_dutch = [
    _pad,
    *_special_nvidia_taco2,
    *_punctuation_nvidia_taco2,
    *_dutch,
    *_arpabet,
]

##


# Names under which the general-purpose symbol sets are registered.
DEFAULT_SYMBOLS = "default"
IPA_SYMBOLS = "ipa"
NVIDIA_TACO2_SYMBOLS = "nvidia_taco2"
GRAD_TTS_SYMBOLS = "gradtts"

# Default set: pad, punctuation, math, special, accented, digits, letters, ARPAbet.
symbols = [
    *_pad,
    *_punctuation,
    *_math,
    *_special,
    *_accented,
    *_numbers,
    *_letters,
    *_arpabet,
]

# NVIDIA Tacotron 2 layout: pad, hyphen, punctuation, ASCII letters, ARPAbet.
symbols_nvidia_taco2 = [
    _pad,
    *_special_nvidia_taco2,
    *_punctuation_nvidia_taco2,
    *_letters,
    *_arpabet,
]

# Default set followed by the IPA characters.
symbols_with_ipa = [*symbols, *_letters_ipa]

# Grad-TTS set, spelled out with its own literals. With the current constants it
# has the same contents as symbols_nvidia_taco2.
grad_tts_symbols = [*_pad, "-", *"!'(),.:;? ", *_letters, *_arpabet]

# Registry mapping each set name to its ordered symbol list; a symbol's position
# in the list is its integer id.
SYMBOL_SETS = {
    DEFAULT_SYMBOLS: symbols,
    IPA_SYMBOLS: symbols_with_ipa,
    NVIDIA_TACO2_SYMBOLS: symbols_nvidia_taco2,
    GRAD_TTS_SYMBOLS: grad_tts_symbols,
    # Language-specific symbol sets:
    PORTUGUESE_SYMBOLS: symbols_portuguese,
    POLISH_SYMBOLS: symbols_polish,
    DUTCH_SYMBOLS: symbols_dutch,
}

In [None]:
# export
import re

# Lookup tables for every registered symbol set, derived from SYMBOL_SETS so a
# newly registered set gets both directions without editing this cell and the
# two tables cannot drift apart.
#   symbol_to_id[set_name][symbol] -> integer id (position in the symbol list)
#   id_to_symbol[set_name][id]     -> symbol
# If a symbol list contains the same symbol more than once, symbol_to_id keeps
# the id of its last occurrence while id_to_symbol keeps every position.
symbol_to_id = {
    set_name: {symbol: index for index, symbol in enumerate(set_symbols)}
    for set_name, set_symbols in SYMBOL_SETS.items()
}
id_to_symbol = {
    set_name: dict(enumerate(set_symbols))
    for set_name, set_symbols in SYMBOL_SETS.items()
}

# Matches text containing a "{...}" group and captures three parts: the text
# before the opening brace (non-greedy), the text inside the braces (non-greedy,
# at least one character), and everything after the closing brace (greedy).
# "." does not match newlines here because no flags are passed.
curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")
# Tokenizer intended for findall(); each match is a (word, non_word) tuple in
# which exactly one element is non-empty:
#   group 1 - a run of characters from a-z, A-Z or the range À-ž, optionally
#             followed by one apostrophe and one or two more such characters
#             (e.g. "I'm");
#   group 2 - either a whole "{...}" group, or a run of characters that are
#             neither in those ranges nor braces (spaces, punctuation, digits).
# NOTE: the À-ž range is contiguous in Unicode, so it also covers the two
# non-letter signs "×" and "÷".
words_re = re.compile(
    r"([a-zA-ZÀ-ž]+['][a-zA-ZÀ-ž]{1,2}|[a-zA-ZÀ-ž]+)|([{][^}]+[}]|[^a-zA-ZÀ-ž{}]+)"
)


def symbols_to_sequence(symbols, symbol_set=DEFAULT_SYMBOLS, ignore_symbols=["_", "~"]):
    """Convert an iterable of symbols into a list of integer ids.

    Args:
        symbols: Iterable of symbols; iterating a string yields its characters.
        symbol_set: Key into ``symbol_to_id`` selecting which id table to use.
        ignore_symbols: Symbols to leave out even when they are in the table.
            The default list is shared between calls and must not be mutated.

    Returns:
        The ids of the kept symbols, in input order. Symbols that are missing
        from the table or listed in ``ignore_symbols`` are skipped silently.
    """
    ids = []
    for candidate in symbols:
        if should_keep_symbol(candidate, symbol_set, ignore_symbols):
            ids.append(symbol_to_id[symbol_set][candidate])
    return ids


def arpabet_to_sequence(text, symbol_set=DEFAULT_SYMBOLS):
    """Convert whitespace-separated ARPAbet tokens into a list of integer ids.

    Each token is prefixed with "@" and then passed to ``symbols_to_sequence``,
    so tokens that are not symbols of the chosen set are dropped silently.

    Args:
        text: String of tokens separated by whitespace.
        symbol_set: Key into ``symbol_to_id`` selecting which id table to use.

    Returns:
        The ids of the recognized tokens, in input order.
    """
    prefixed_tokens = [f"@{token}" for token in text.split()]
    return symbols_to_sequence(prefixed_tokens, symbol_set=symbol_set)


def should_keep_symbol(s, symbol_set=DEFAULT_SYMBOLS, ignore_symbols=["_", "~"]):
    """Tell whether symbol ``s`` should be encoded for the given symbol set.

    Args:
        s: Candidate symbol.
        symbol_set: Key into ``symbol_to_id`` selecting which id table to use.
        ignore_symbols: Symbols to reject even when they are in the table.
            The default list is shared between calls and must not be mutated.

    Returns:
        True when ``s`` is in the table and is not listed in ``ignore_symbols``.
    """
    if s not in symbol_to_id[symbol_set]:
        return False
    return s not in ignore_symbols

In [None]:
# words_re.findall returns (word, non_word) tuples; one slot is filled per match.
print(words_re.findall("The   quick"))
print(words_re.findall("I'm blue,"))
# Only one or two letters are accepted after an apostrophe, so this name is
# split into several word tokens.
print(words_re.findall("L'monj'ello"))
# Each "{...}" group comes back as a single non-word token.
print(words_re.findall("{ S IY } { EH M }"))

[('The', ''), ('', '   '), ('quick', '')]
[("I'm", ''), ('', ' '), ('blue', ''), ('', ',')]
[("L'mo", ''), ("nj'el", ''), ('lo', '')]
[('', '{ S IY }'), ('', ' '), ('', '{ EH M }')]


In [None]:
# With the default set, the space and "." are kept, while a newline is not.
assert should_keep_symbol(" ")
assert not should_keep_symbol("\n")
assert should_keep_symbol(".")

In [None]:
# NOTE: arpabet_to_sequence splits its input on whitespace and silently drops any
# token that is not a symbol once "@" is prepended. It does not parse braces or
# word boundaries: the standalone "{" and "}" tokens below are simply dropped, so
# it should be given the phonemes of a single word only.
assert (
    len(arpabet_to_sequence("{ S IY } { EH M } { Y UW } { D IH K SH AH N EH R IY }"))
    == 15
)
assert arpabet_to_sequence("{ S IY }") == [168, 148]
# symbols_to_sequence, by contrast, keeps spaces: all 16 characters map to an id.
assert len(symbols_to_sequence("C M U Dictionary")) == 16
# Only 5 ids come back for the 7 phoneme-like tokens here, so two of them
# (presumably "H" and the brace-suffixed "D}") are not recognized.
arpabet_to_sequence("{ H AH1 N D R IH D}")

[111, 156, 127, 167, 144]

In [None]:
# Number of symbols (and therefore distinct ids) in the default set.
len(SYMBOL_SETS["default"])

185