# Example for text processing

## Text cleaning / splitting

In [1]:
from utils.text.cleaners import french_cleaners, english_cleaners
from utils.text.cleaners import expand_numbers, collapse_whitespace, fr_convert_to_ascii, convert_to_ascii, lowercase

# Sample sentences with irregular spacing (and an accented character in the
# French one), so that the effect of each cleaner is visible when printed.
en_text = "Hello   World    !"
fr_text = "Bonjour  à Tous   !"

# Each entry pairs a report template with the cleaner it demonstrates.
en_demos = (
    ("Text after english_cleaners    : {}", english_cleaners),
    ("Text after collapse_whitespace : {}", collapse_whitespace),
    ("Text after lowercase           : {}", lowercase),
)
fr_demos = (
    ("Text after french_cleaners     : {}", french_cleaners),
    ("Text after convert_to_ascii    : {}", convert_to_ascii),
    ("Text after fr_convert_to_ascii : {}", fr_convert_to_ascii),
)

for template, cleaner in en_demos:
    print(template.format(cleaner(en_text)))
print()
for template, cleaner in fr_demos:
    print(template.format(cleaner(fr_text)))


Text after english_cleaners    : hello world !
Text after collapse_whitespace : Hello World !
Text after lowercase           : hello   world    !

Text after french_cleaners     : bonjour a tous !
Text after convert_to_ascii    : Bonjour  a Tous   !
Text after fr_convert_to_ascii : Bonjour  a Tous   !


In [2]:
from utils.text import split_text

long_text = """Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore 
et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut 
aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum 
dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia 
deserunt mollit anim id est laborum."""
# Show every part returned by `split_text`, together with its length.
parts = split_text(long_text)
for part_idx, part in enumerate(parts):
    print("Text part {} (length = {}) : {}".format(part_idx, len(part), part))

Text part 0 (length = 124) : Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore  et dolore magna aliqua.
Text part 1 (length = 108) : Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut  aliquip ex ea commodo consequat.
Text part 2 (length = 103) : Duis aute irure dolor in reprehenderit in voluptate velit esse cillum  dolore eu fugiat nulla pariatur.
Text part 3 (length = 111) : Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia  deserunt mollit anim id est laborum.


## Building a text encoder

### Initialize a Text encoder

A `vocab_size` higher than the actual number of tokens can be provided in order to create `ukn_x` tokens. It can be useful if you need a fixed `vocab_size` or if you would like to have the possibility to add tokens in the future. 

For instance, this feature is used in the French version of the `Tacotron-2` model: the English version has a `vocab_size` of 148, while the French one has fewer characters (and thus a smaller vocabulary), so the French encoder is created with `vocab_size = 148` to match.

It is also possible to initialize a `TextEncoder` based on a `transformers` pretrained `AutoTokenizer` : `BERT`, `BART`, `MBart` and `GPT-2` are currently supported.

PS : it may look strange that the `GPT-2` tokenizer uses `<|endoftext|>` as both the start-of-sequence and the end-of-sequence token, but it is not an error ;)

In [3]:
from utils.text import TextEncoder, get_symbols, default_french_encoder, get_encoder

# Cleaning pipeline: a single `french_cleaners` step with lowercasing disabled.
cleaners = [{'name' : 'french_cleaners', 'to_lowercase' : False}]
# Equivalent to :
#cleaners = [
#    'fr_convert_to_ascii',
#    {'name' : 'expand_numbers', 'langue' : 'fr'},
#    'collapse_whitespace'
#]

# `default`     : French encoder with its default configuration
# `encoder`     : French encoder with custom cleaners and a vocabulary of size 148
# `gpt_encoder` : encoder built from the pretrained GPT-2 tokenizer
default     = default_french_encoder()
encoder     = default_french_encoder(cleaners = cleaners, vocab_size = 148)
gpt_encoder = get_encoder(lang = 'en', text_encoder = 'gpt2')

# One blank line between the three descriptions.
print(default, encoder, gpt_encoder, sep = "\n\n")


Using pad_token, but it is not set yet.
Using sep_token, but it is not set yet.
Using mask_token, but it is not set yet.


Vocab (size = 70) : ['_', '-', '!', "'", '(', ')', ',', '.', ':', ';', '?', ' ', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', ...]
Config : {
  "level": 0,
  "lstrip": false,
  "rstrip": false,
  "cleaners": [
    "french_cleaners"
  ],
  "split_pattern": null,
  "bpe_end_of_word": null,
  "pad_token": "",
  "sep_token": null,
  "ukn_token": null,
  "sos_token": "[SOS]",
  "eos_token": "[EOS]",
  "mask_token": null,
  "sub_word_prefix": "",
  "use_sos_and_eos": false,
  "add_special_tokens_at_end": true
}

Vocab (size = 148) : ['_', '-', '!', "'", '(', ')', ',', '.', ':', ';', '?', ' ', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', ...]
Config : {
  "level": 0,
  "lstrip": false,
  "rstrip": false,
  "

### save / load it

In [4]:
# Path of the JSON file used for the save / load round-trip.
filename = 'example_data/example_text_encoder.json'

# Persist the customized encoder, then build a new `TextEncoder` from that file.
encoder.save_to_file(filename)
restored = TextEncoder.load_from_file(filename)

# The printed vocab / config can be compared with the ones of `encoder`.
print(restored)

Vocab (size = 148) : ['_', '-', '!', "'", '(', ')', ',', '.', ':', ';', '?', ' ', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', ...]
Config : {
  "level": 0,
  "lstrip": false,
  "rstrip": false,
  "cleaners": [
    {
      "name": "french_cleaners",
      "to_lowercase": false
    }
  ],
  "split_pattern": null,
  "bpe_end_of_word": null,
  "pad_token": "",
  "sep_token": null,
  "ukn_token": null,
  "sos_token": "[SOS]",
  "eos_token": "[EOS]",
  "mask_token": null,
  "sub_word_prefix": "",
  "use_sos_and_eos": false,
  "add_special_tokens_at_end": true
}


### Use it to encode / decode text

Note that the *manual encoding* is, in this case, equivalent to a simple encoding of each character after cleaning the text. It would not work for the `gpt_encoder` as it is a token-based tokenizer (and not a simple character-based one).

In [5]:
fr_text = "Bonjour  à Tous   !"

# `encode` receives the raw text; `clean_text` is called separately only to
# display the cleaned string and to build the manual encoding from it.
cleaned = default.clean_text(fr_text)
encoded = default.encode(fr_text)
decoded = default.decode(encoded)
# Look up each character of the cleaned text in the encoder, one by one.
manual_ids = [default[char] for char in cleaned]

report = (
    ("Original text    : {}", fr_text),
    ("Cleaned text     : {}", cleaned),
    ("Encoded text     : {}", encoded),
    ("Manually encoded : {}", manual_ids),
    ("Decoded text     : {}", decoded),
)
for template, value in report:
    print(template.format(value))


Original text    : Bonjour  à Tous   !
Cleaned text     : bonjour a tous !
Encoded text     : [39 52 51 47 52 58 55 11 38 11 57 52 58 56 11  2]
Manually encoded : [39, 52, 51, 47, 52, 58, 55, 11, 38, 11, 57, 52, 58, 56, 11, 2]
Decoded text     : bonjour a tous !


### Usage with tensorflow

As the encoder relies on Python objects and regex functions, it cannot be used in pure tensorflow functions (for graph optimization).

The classical way to deal with Python functions is to use `tf.py_function()` or `tf.numpy_function()`, as shown in the `encode` example below.

Note : the `encode` and `decode` functions both handle `tf.Tensor`s (it simply converts them into `np.ndarray`).

In [17]:
import numpy as np
import tensorflow as tf

def encode_text(sentence):
    """Encode `sentence` with the `default` text encoder and return the result.

    Plain-Python helper intended to be handed to `tf.numpy_function`
    (see `encode`).
    """
    token_ids = default.encode(sentence)
    return token_ids

@tf.function
def encode(text):
    """Graph-compatible encoding : runs `encode_text` through `tf.numpy_function`.

    Arguments :
        - text : the text to encode (a `tf.string` tensor in the example call)
    Return :
        - encoded_text : `tf.int32` tensor, declared as 1-D with unknown length
    """
    encoded_text = tf.numpy_function(func = encode_text, inp = [text], Tout = tf.int32)
    # Declare the result as a 1-D tensor of unknown length
    encoded_text.set_shape([None])
    return encoded_text

# Pass the text as a `tf.string` tensor rather than a raw Python `str`.
fr_tensor = tf.cast(fr_text, tf.string)
print(encode(fr_tensor))

tf.Tensor([39 52 51 47 52 58 55 11 38 11 57 52 58 56 11  2], shape=(16,), dtype=int32)


### The new `utils.execute_eagerly` function

A new function in `utils.tensorflow_utils` makes it possible to automatically call a function inside `tf.numpy_function` ! It acts a bit like the `@tf.numpy_function` decorator form added in `tensorflow 2.14`, but it is also available with previous versions ;) It also offers a quite important feature : it sets the shape of the resulting tensor (which is what the `encoded_text.set_shape` call does in the `encode` function above).

To demonstrate that the *original* `TextEncoder.encode` is not compatible with `tf.function`, the `encode_wrong` function tries to call the non-wrapped function (accessible via the `func` attribute), which raises an error, as expected !

Note : for this test, it is important to pass a `tf.Tensor` argument instead of a regular `str`. Otherwise, the graph function will simply call everything as pure Python code, and will always retrace (as passing raw Python objects causes tensorflow retracing) ! Do not hesitate to open a discussion if you want more details about this ;)

In [18]:
@tf.function
def encode_wrong(text):
    """Counter-example : call the non-wrapped `encode` (its `func` attribute)
    from inside a `tf.function`. This call is expected to fail."""
    unwrapped_encode = default.encode.func
    return unwrapped_encode(text)

@tf.function
def encode_new(text):
    """Encode `text` inside a `tf.function` by calling `encode_text` directly,
    without any explicit `tf.numpy_function` wrapper."""
    encoded_text = encode_text(text)
    return encoded_text

# Both calls receive the text as a `tf.string` tensor (not a raw `str`).
text_tensor = tf.cast(fr_text, tf.string)

# Wrapped encoder : works inside a graph function.
print(encode_new(text_tensor))

# Non-wrapped encoder : expected to fail inside a graph function.
try:
    print(encode_wrong(text_tensor))
except AttributeError:
    print('An error occured !')

tf.Tensor([39 52 51 47 52 58 55 11 38 11 57 52 58 56 11  2], shape=(16,), dtype=int32)
An error occured !
