<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [None]:
#default_exp text.util

In [None]:
#export
""" from https://github.com/keithito/tacotron """

'''
Cleaners are transformations that run over the input text at both training and eval time.
Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
  1. "english_cleaners" for English text
  2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
     the Unidecode library (https://pypi.python.org/pypi/Unidecode)
  3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
     the symbols in symbols.py to match your data).
'''

import re
from unidecode import unidecode

from uberduck_ml_dev.text.symbols import curly_re


# Pattern for any run of one or more whitespace characters:
_whitespace_re = re.compile(r'\s+')

# Abbreviation table. Each entry pairs a case-insensitive pattern (the
# abbreviation at a word boundary, followed by a literal period) with the
# text that replaces it:
_abbreviations = [
    (re.compile('\\b%s\\.' % abbreviation, re.IGNORECASE), expansion)
    for abbreviation, expansion in (
        ('mrs', 'misess'),
        ('mr', 'mister'),
        ('dr', 'doctor'),
        ('st', 'saint'),
        ('co', 'company'),
        ('jr', 'junior'),
        ('maj', 'major'),
        ('gen', 'general'),
        ('drs', 'doctors'),
        ('rev', 'reverend'),
        ('lt', 'lieutenant'),
        ('hon', 'honorable'),
        ('sgt', 'sergeant'),
        ('capt', 'captain'),
        ('esq', 'esquire'),
        ('ltd', 'limited'),
        ('col', 'colonel'),
        ('ft', 'fort'),
    )
]

import inflect
import re


# Shared inflect engine; its number_to_words() is what the helpers below use to spell out numbers.
_inflect = inflect.engine()
# A digit, then one or more digits or commas, then a digit (e.g. "1,000"); group 1 is the whole run.
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
# Digits on both sides of a dot (e.g. "3.14"); group 1 is the whole number.
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
# A pound sign followed by digits/commas ending in a digit; group 1 is the amount without the sign.
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
# A dollar sign followed by digits/dots/commas ending in a digit; group 1 is the amount without the sign.
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
# Digits immediately followed by an ordinal suffix (st, nd, rd or th).
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
# Any run of one or more digits.
_number_re = re.compile(r'[0-9]+')


def _remove_commas(m):
    """Return group 1 of the match ``m`` with every comma taken out."""
    digits_with_commas = m.group(1)
    return ''.join(digits_with_commas.split(','))


def _expand_decimal_point(m):
    """Return group 1 of the match ``m`` with each '.' replaced by ' point '."""
    pieces = m.group(1).split('.')
    return ' point '.join(pieces)


def _expand_dollars(m):
    """Rewrite a matched dollar amount as '<n> dollar(s)[, <n> cent(s)]'.

    Args:
      m: regex match whose group 1 is the amount that followed the '$' sign.
         It may contain digits, dots and commas.
    Returns:
      The amount with explicit units, still using digits (e.g. '1 dollar, 50 cents').
      The digits are spelled out by a later number-expansion pass.
      An amount with more than one dot is returned unchanged with ' dollars' appended.
      An amount of zero becomes 'zero dollars'.
    """
    match = m.group(1)
    parts = match.split('.')
    if len(parts) > 2:
        return match + ' dollars'  # Unexpected format

    def _count(digits):
        # The pattern admits commas (e.g. "$,5" or "$1,.25"); int() would raise
        # ValueError on them, so drop them first. An empty piece counts as zero.
        digits = digits.replace(',', '')
        return int(digits) if digits else 0

    def _quantity(count, unit):
        # Singular unit only for exactly one.
        return '%s %s' % (count, unit if count == 1 else unit + 's')

    dollars = _count(parts[0])
    cents = _count(parts[1]) if len(parts) > 1 else 0
    if dollars and cents:
        return '%s, %s' % (_quantity(dollars, 'dollar'), _quantity(cents, 'cent'))
    if dollars:
        return _quantity(dollars, 'dollar')
    if cents:
        return _quantity(cents, 'cent')
    return 'zero dollars'


def _expand_ordinal(m):
    """Spell out a matched ordinal by passing the full matched text to inflect's number_to_words."""
    ordinal_token = m.group(0)
    return _inflect.number_to_words(ordinal_token)


def _expand_number(m):
    """Spell out the run of digits matched by ``m``.

    Values strictly between 1000 and 3000 get special readings:
      * 2000 becomes 'two thousand';
      * 2001-2009 become 'two thousand ' plus the words for the last two digits;
      * other multiples of 100 become '<words for value // 100> hundred';
      * anything else is read in two-digit groups with zero spoken as 'oh'.
    Every other value is converted by inflect with no 'and' word.
    """
    value = int(m.group(0))
    if not 1000 < value < 3000:
        return _inflect.number_to_words(value, andword='')
    if value == 2000:
        return 'two thousand'
    if 2000 < value < 2010:
        return 'two thousand ' + _inflect.number_to_words(value % 100)
    hundreds, remainder = divmod(value, 100)
    if remainder == 0:
        return _inflect.number_to_words(hundreds) + ' hundred'
    grouped = _inflect.number_to_words(value, andword='', zero='oh', group=2)
    return grouped.replace(', ', ' ')


def normalize_numbers(text):
    """Rewrite the numeric expressions in ``text`` and return the result.

    The substitutions run in a fixed order: comma removal, pounds, dollars,
    decimal points, ordinals, then plain digit runs.
    """
    substitutions = (
        (_comma_number_re, _remove_commas),
        (_pounds_re, r'\1 pounds'),
        (_dollars_re, _expand_dollars),
        (_decimal_number_re, _expand_decimal_point),
        (_ordinal_re, _expand_ordinal),
        (_number_re, _expand_number),
    )
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)
    return text


def expand_abbreviations(text):
    """Apply every (pattern, replacement) pair in ``_abbreviations`` to ``text``, in list order."""
    expanded = text
    for pattern, full_word in _abbreviations:
        expanded = pattern.sub(full_word, expanded)
    return expanded


def expand_numbers(text):
    """Cleaner-facing alias: hand ``text`` to ``normalize_numbers`` and return its result."""
    normalized = normalize_numbers(text)
    return normalized


def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered


def collapse_whitespace(text):
    """Replace each run of whitespace in ``text`` with a single space."""
    return _whitespace_re.sub(' ', text)


def convert_to_ascii(text):
    """Return the result of running ``text`` through ``unidecode``."""
    transliterated = unidecode(text)
    return transliterated


def basic_cleaners(text):
    '''Minimal pipeline: lowercase the text, then collapse whitespace. No transliteration.'''
    return collapse_whitespace(lowercase(text))


def transliteration_cleaners(text):
    '''Pipeline for non-English text: convert to ASCII, lowercase, then collapse whitespace.'''
    ascii_text = convert_to_ascii(text)
    return collapse_whitespace(lowercase(ascii_text))

def english_cleaners(text):
    """Pipeline for English text.

    Steps, in order: convert to ASCII, lowercase, expand numbers,
    expand abbreviations, collapse whitespace.
    """
    stages = (
        convert_to_ascii,
        lowercase,
        expand_numbers,
        expand_abbreviations,
        collapse_whitespace,
    )
    for stage in stages:
        text = stage(text)
    return text

In [None]:
# export

from g2p_en import G2p

# Shared G2p instance, created once when the module is imported; english_to_arpabet calls it.
g2p = G2p()

from uberduck_ml_dev.text.symbols import (
    id_to_symbol,
    symbols_to_sequence,
    arpabet_to_sequence,
)

# Registry used by clean_text to look up a cleaner pipeline by its name.
CLEANERS = dict(
    english_cleaners=english_cleaners,
    basic_cleaners=basic_cleaners,
    transliteration_cleaners=transliteration_cleaners,
)

def clean_text(text, cleaner_names):
    """Run ``text`` through the named cleaners, in the order given.

    Args:
      text: string to clean
      cleaner_names: iterable of keys into ``CLEANERS``
    Returns:
      The text after every listed cleaner has been applied.
    Raises:
      KeyError: if a name is not a key of ``CLEANERS``.
    """
    cleaned = text
    for cleaner_name in cleaner_names:
        cleaned = CLEANERS[cleaner_name](cleaned)
    return cleaned
        


def english_to_arpabet(english_text):
    """Run ``english_text`` through the module-level ``g2p`` converter.

    Args:
      english_text: English text to convert
    Returns:
      Whatever ``g2p`` returns for the text (presumably a list of ARPAbet
      symbols; confirm against the g2p_en documentation).
    """
    arpabet_symbols = g2p(english_text)
    # The result was previously computed and then dropped, so callers always got None.
    return arpabet_symbols


def text_to_sequence(text, cleaner_names):
    """Turn a string into the list of symbol IDs that represents it.

    Curly braces mark embedded ARPAbet, which is converted directly instead of
    being cleaned, e.g. "Turn left on {HH AW1 S S T AH0 N} Street."
    Args:
      text: string to convert to a sequence
      cleaner_names: names of the cleaner functions applied to the text outside braces
    Returns:
      List of integers corresponding to the symbols in the text
    """
    ids = []
    remaining = text

    # Consume one "plain text + {ARPAbet}" chunk per pass until no braces are left.
    while len(remaining) > 0:
        braces = curly_re.match(remaining)
        if braces:
            plain, arpabet, remaining = braces.group(1, 2, 3)
            ids.extend(symbols_to_sequence(clean_text(plain, cleaner_names)))
            ids.extend(arpabet_to_sequence(arpabet))
            continue
        # No braces left: everything that remains is ordinary text.
        ids.extend(symbols_to_sequence(clean_text(remaining, cleaner_names)))
        break

    return ids


def sequence_to_text(sequence):
    """Converts a sequence of IDs back to a string.

    Args:
      sequence: iterable of symbol IDs; IDs missing from ``id_to_symbol`` are skipped
    Returns:
      The concatenated symbols. A symbol longer than one character that starts
      with "@" is treated as ARPAbet and wrapped in curly braces. Adjacent
      braced symbols are merged into one space-separated group.
    """
    pieces = []
    for symbol_id in sequence:
        # The module imports the table as `id_to_symbol`; the old `_id_to_symbol`
        # name was never defined here and raised NameError.
        if symbol_id in id_to_symbol:
            s = id_to_symbol[symbol_id]
            # Enclose ARPAbet back in curly braces:
            if len(s) > 1 and s[0] == "@":
                s = "{%s}" % s[1:]
            pieces.append(s)
    # "{B}{L}" -> "{B L}"
    return "".join(pieces).replace("}{", " ")

In [None]:
#skip
pen_is_blue_ids = text_to_sequence("The pen is blue.", ["english_cleaners"])
print(pen_is_blue_ids)
# Expected IDs follow the current symbol table (e.g. 't' -> 94, ' ' -> 8, '.' -> 4);
# the previous hard-coded values came from an older table and no longer matched.
assert pen_is_blue_ids == [94, 82, 79, 8, 90, 79, 88, 8, 83, 93, 8, 76, 86, 95, 79, 4]
# "the pen is " accounts for the first 11 IDs; the braces contribute the ARPAbet IDs,
# derived here rather than hard-coded so they track the symbol table.
assert text_to_sequence("The pen is {B L OW0}", ["english_cleaners"]) == pen_is_blue_ids[:11] + list(arpabet_to_sequence("B L OW0"))
assert sequence_to_text(pen_is_blue_ids) == "the pen is blue."
assert sequence_to_text(text_to_sequence("The pen is {B L OW0}.", ["english_cleaners"])) == "the pen is {B L OW0}."
assert len(text_to_sequence("{N AA1 T} {B AE1 D} {B AA1 R T} , {N AA1 T} {B AE1 D} {AE1 T} {AO1 L} .", ["english_cleaners"])) == 30


[94, 82, 79, 8, 90, 79, 88, 8, 83, 93, 8, 76, 86, 95, 79, 4]


AssertionError: 

In [None]:
from uberduck_ml_dev.text.symbols import symbols, symbol_to_id

In [None]:
# Display the imported symbol -> ID mapping for inspection.
symbol_to_id

{'!': 0,
 "'": 1,
 '"': 2,
 ',': 3,
 '.': 4,
 ':': 5,
 ';': 6,
 '?': 7,
 ' ': 8,
 '#': 9,
 '%': 10,
 '&': 11,
 '*': 12,
 '+': 13,
 '-': 14,
 '/': 15,
 '[': 16,
 ']': 17,
 '(': 18,
 ')': 19,
 '_': 20,
 '@': 21,
 '©': 22,
 '°': 23,
 '½': 24,
 '—': 25,
 '₩': 26,
 '€': 27,
 '$': 28,
 'á': 29,
 'ç': 30,
 'é': 31,
 'ê': 32,
 'ë': 33,
 'ñ': 34,
 'ö': 35,
 'ø': 36,
 'ć': 37,
 'ž': 38,
 '0': 39,
 '1': 40,
 '2': 41,
 '3': 42,
 '4': 43,
 '5': 44,
 '6': 45,
 '7': 46,
 '8': 47,
 '9': 48,
 'A': 49,
 'B': 50,
 'C': 51,
 'D': 52,
 'E': 53,
 'F': 54,
 'G': 55,
 'H': 56,
 'I': 57,
 'J': 58,
 'K': 59,
 'L': 60,
 'M': 61,
 'N': 62,
 'O': 63,
 'P': 64,
 'Q': 65,
 'R': 66,
 'S': 67,
 'T': 68,
 'U': 69,
 'V': 70,
 'W': 71,
 'X': 72,
 'Y': 73,
 'Z': 74,
 'a': 75,
 'b': 76,
 'c': 77,
 'd': 78,
 'e': 79,
 'f': 80,
 'g': 81,
 'h': 82,
 'i': 83,
 'j': 84,
 'k': 85,
 'l': 86,
 'm': 87,
 'n': 88,
 'o': 89,
 'p': 90,
 'q': 91,
 'r': 92,
 's': 93,
 't': 94,
 'u': 95,
 'v': 96,
 'w': 97,
 'x': 98,
 'y': 99,
 'z': 100,