# Create Subword Tokenizer for Transformer

# Library

* tensorflow_datasets is needed to download data from google cloud storage (not used)
* tensorflow_text is needed in the tokenizer library

In [5]:
# %pip install -q tensorflow_datasets
# %pip install -q tensorflow_text tensorflow

In [6]:
import pathlib
import tensorflow as tf
import tensorflow_text as tftxt
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab
from pickle import dump, load
import numpy as np
from numpy.random import rand, shuffle
import re
from unicodedata import normalize

# Load Dataset

In [11]:
filename = '../data/deu-eng-train.pkl'
with open(filename, 'rb') as file:
  dataset = load(file)
for i, pair in enumerate(dataset):
  if i == 2: break
  print(pair[0])
  print(pair[0].decode('utf-8'))
  print(pair[1])
  print(pair[1].decode('utf-8'))
  print('\n')

b'Tom zahlt ein Bu\xc3\x9fgeld.'
Tom zahlt ein Bußgeld.
b'Tom is paying a fine.'
Tom is paying a fine.


b'Unter dem Bett ist eine Katze.'
Unter dem Bett ist eine Katze.
b'There is a cat under the bed.'
There is a cat under the bed.




# Create Vocabulary

## Create tf Dataset datastrcuture
The tf.data.Dataset.from_tensor_slices() method creates a dataset from a tensor or a list of tensors. The tensor or list of tensors is sliced along the first dimension, and the slices are the elements of the dataset.

In [12]:
trainX = np.array(dataset)[:, 0]
trainY = np.array(dataset)[:, 1]
train_deu = tf.data.Dataset.from_tensor_slices(trainX)
train_eng = tf.data.Dataset.from_tensor_slices(trainY)

In [13]:
train_deu

<TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

In [14]:
for i, e in enumerate(train_deu):
  if i == 3: break
  print(e, type(e))

tf.Tensor(b'Tom zahlt ein Bu\xc3\x9fgeld.', shape=(), dtype=string) <class 'tensorflow.python.framework.ops.EagerTensor'>
tf.Tensor(b'Unter dem Bett ist eine Katze.', shape=(), dtype=string) <class 'tensorflow.python.framework.ops.EagerTensor'>
tf.Tensor(b'Tom wollte, dass ich lu\xcc\x88ge.', shape=(), dtype=string) <class 'tensorflow.python.framework.ops.EagerTensor'>


In [15]:
bert_tokenizer_params = dict(lower_case=True)
bert_tokenizer_params

{'lower_case': True}

In [16]:
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]
reserved_tokens

['[PAD]', '[UNK]', '[START]', '[END]']

In [17]:
bert_vocab_args = dict(
    vocab_size = 8000,
    reserved_tokens = reserved_tokens,
    bert_tokenizer_params = bert_tokenizer_params,
    learn_params = {}
)
bert_vocab_args

{'vocab_size': 8000,
 'reserved_tokens': ['[PAD]', '[UNK]', '[START]', '[END]'],
 'bert_tokenizer_params': {'lower_case': True},
 'learn_params': {}}

## Create vocab with bert_vocab.bert_vocab_from_dataset()

In [18]:
%%time
deu_vocab = bert_vocab.bert_vocab_from_dataset(
    train_deu.batch(1000).prefetch(2),
    **bert_vocab_args
)

CPU times: total: 1min 49s
Wall time: 2min 1s


In [19]:
type(deu_vocab)

list

In [20]:
print(deu_vocab[:20])
print(deu_vocab[100:120])
print(deu_vocab[1000:1020])
print(deu_vocab[-20:])

['[PAD]', '[UNK]', '[START]', '[END]', '!', '"', '$', '%', "'", '(', ')', '+', ',', '-', '.', '/', '0', '1', '2', '3']
['maria', 'ihr', 'haben', 'an', 'sind', 'kann', 'noch', 'bin', 'einen', 'so', 'von', 'fur', 'hast', 'dem', 'als', 'sehr', 'sein', 'dir', 'dich', 'hier']
['kleine', 'meinte', 'messer', 'schreibtisch', 'wartet', 'zukunft', 'freunden', 'nun', '##sten', 'durfen', 'fuhlte', 'kleinen', 'schlaft', 'tu', 'gespielt', 'ordnung', 'acht', 'angelegenheit', 'bekam', 'fern']
['##;', '##?', '##@', '##j', '##q', '##°', '##ˋ', '##а', '##–', '##—', '##‘', '##’', '##‚', '##“', '##”', '##„', '##‟', '##‽', '##⁄', '##€']


In [27]:
%%time
eng_vocab = bert_vocab.bert_vocab_from_dataset(
    train_eng.batch(1000).prefetch(2),
    **bert_vocab_args
)

CPU times: total: 24.7 s
Wall time: 27.1 s


In [28]:
print(eng_vocab[:10])
print(eng_vocab[100:110])
print(eng_vocab[1000:1010])
print(eng_vocab[-10:])

['[PAD]', '[UNK]', '[START]', '[END]', '!', '"', '$', '%', "'", '(']
['how', 'there', 'has', 'll', 've', 'here', 'very', 'think', 'go', 'about']
['seeing', 'anywhere', 'suddenly', 'top', 'uncle', '##ment', 'common', 'earlier', 'keys', 'fault']
['##j', '##q', '##v', '##°', '##—', '##‘', '##’', '##“', '##”', '##€']


## Save the vocabulary list

In [29]:
def write_vocab_file(filepath, vocab):
  with open(filepath, 'w', encoding='utf-8') as f:
    for token in vocab:
      print(token, file=f)

In [30]:
write_vocab_file('../data/deu_vocab.txt', deu_vocab)
write_vocab_file('../data/eng_vocab.txt', eng_vocab)

# Build Tokensizer

In [32]:
deu_tokenizer = tftxt.BertTokenizer('../data/deu_vocab.txt', **bert_tokenizer_params)
eng_tokenizer = tftxt.BertTokenizer('../data/eng_vocab.txt', **bert_tokenizer_params)

In [38]:
# en_examples is 1 batch of size 3.
# it is a eagerTensor
print(trainX[:3])

print('\n')
token_batch = deu_tokenizer.tokenize(trainX[:3])
print(token_batch.shape)
print(token_batch)
print(token_batch.to_list())

print('\n')
token_batch = token_batch.merge_dims(-2,-1)
print(token_batch.shape)
print(token_batch)
print(token_batch.to_list())

print('\n')
for ex in token_batch.to_list():
  print(ex)

[b'Tom zahlt ein Bu\xc3\x9fgeld.' b'Unter dem Bett ist eine Katze.'
 b'Tom wollte, dass ich lu\xcc\x88ge.']


(3, None, None)
<tf.RaggedTensor [[[72], [1844], [85], [422, 177, 3316], [14]],
 [[369], [113], [399], [74], [97], [502], [14]],
 [[72], [201], [12], [86], [71], [1812], [14]]]>
[[[72], [1844], [85], [422, 177, 3316], [14]], [[369], [113], [399], [74], [97], [502], [14]], [[72], [201], [12], [86], [71], [1812], [14]]]


(3, None)
<tf.RaggedTensor [[72, 1844, 85, 422, 177, 3316, 14],
 [369, 113, 399, 74, 97, 502, 14],
 [72, 201, 12, 86, 71, 1812, 14]]>
[[72, 1844, 85, 422, 177, 3316, 14], [369, 113, 399, 74, 97, 502, 14], [72, 201, 12, 86, 71, 1812, 14]]


[72, 1844, 85, 422, 177, 3316, 14]
[369, 113, 399, 74, 97, 502, 14]
[72, 201, 12, 86, 71, 1812, 14]


### tf.gather()

In [34]:
another_batch = [0, 1, 2, 3, 4, 5]
txt_tokens = tf.gather(deu_vocab, another_batch)
print(txt_tokens)

tf.Tensor([b'[PAD]' b'[UNK]' b'[START]' b'[END]' b'!' b'"'], shape=(6,), dtype=string)


In [35]:
another_batch = [[1, 2, 3], [4, 5, 6], [7,8,9]]
txt_tokens = tf.gather(deu_vocab, another_batch)
print(type(txt_tokens))
print(txt_tokens)

<class 'tensorflow.python.framework.ops.EagerTensor'>
tf.Tensor(
[[b'[UNK]' b'[START]' b'[END]']
 [b'!' b'"' b'$']
 [b'%' b"'" b'(']], shape=(3, 3), dtype=string)


In [36]:
txt_tokens = tf.gather(deu_vocab, token_batch)
print(txt_tokens)
tf.strings.reduce_join(txt_tokens, separator=' ', axis=-1)

<tf.RaggedTensor [[b'tom', b'zahlt', b'ein', b'bus', b'##s', b'##geld', b'.'],
 [b'unter', b'dem', b'bett', b'ist', b'eine', b'katze', b'.'],
 [b'tom', b'wollte', b',', b'dass', b'ich', b'luge', b'.']]>


<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'tom zahlt ein bus ##s ##geld .',
       b'unter dem bett ist eine katze .',
       b'tom wollte , dass ich luge .'], dtype=object)>

### tokenize and detokenize

In [39]:
words = deu_tokenizer.detokenize(token_batch)
print(words)

<tf.RaggedTensor [[b'tom', b'zahlt', b'ein', b'bussgeld', b'.'],
 [b'unter', b'dem', b'bett', b'ist', b'eine', b'katze', b'.'],
 [b'tom', b'wollte', b',', b'dass', b'ich', b'luge', b'.']]>


In [40]:
tf.strings.reduce_join(words, separator=' ', axis=-1)

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'tom zahlt ein bussgeld .', b'unter dem bett ist eine katze .',
       b'tom wollte , dass ich luge .'], dtype=object)>

## ADD [START], [END]

In [41]:
START = tf.argmax(tf.constant(reserved_tokens) == "[START]")
END = tf.argmax(tf.constant(reserved_tokens) == "[END]")
print(START, END)

def add_start_end(ragged):
  count = ragged.bounding_shape()[0]
  print(count)
  starts = tf.fill([count, 1], START)
  ends = tf.fill([count, 1], END)
  return tf.concat([starts, ragged, ends], axis=1)

tf.Tensor(2, shape=(), dtype=int64) tf.Tensor(3, shape=(), dtype=int64)


In [42]:
words = eng_tokenizer.detokenize(add_start_end(token_batch))
tf.strings.reduce_join(words, separator=' ', axis=-1)

tf.Tensor(3, shape=(), dtype=int64)


<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'[START] do kissing are john lot bent . [END]',
       b'[START] parents all until was she mean . [END]',
       b'[START] do room , your in meaning . [END]'], dtype=object)>

## Cleanup detokenized text

In [43]:
bad_tokens = [re.escape(tok) for tok in reserved_tokens if tok != "[UNK]"]
bad_tokens_re = "|".join(bad_tokens)
print(bad_tokens_re)
token_txt = ["[PAD]", "hello", "world", "[END]"]
bad_cells = tf.strings.regex_full_match(token_txt, bad_tokens_re)
print(bad_cells)
result = tf.ragged.boolean_mask(token_txt, ~bad_cells)
print(result)
result = tf.strings.reduce_join(result, separator=' ', axis=-1)
print(result)


\[PAD\]|\[START\]|\[END\]
tf.Tensor([ True False False  True], shape=(4,), dtype=bool)
tf.Tensor([b'hello' b'world'], shape=(2,), dtype=string)
tf.Tensor(b'hello world', shape=(), dtype=string)


In [44]:
def cleanup_text(reserved_tokens, token_txt):
  bad_tokens = [re.escape(tok) for tok in reserved_tokens if tok != "[UNK]"]
  bad_tokens_re = "|".join(bad_tokens)
  #
  bad_cells = tf.strings.regex_full_match(token_txt, bad_tokens_re)
  result = tf.ragged.boolean_mask(token_txt, ~bad_cells)
  result = tf.strings.reduce_join(result, separator=' ', axis=-1)
  return result


In [45]:
eng_examples = trainY[:3]
print(eng_examples)
print(eng_examples.dtype)
eng_examples = eng_examples.astype('str')
print(eng_examples)
print(eng_examples.dtype)

[b'Tom is paying a fine.' b'There is a cat under the bed.'
 b'Tom wanted me to lie.']
|S537
['Tom is paying a fine.' 'There is a cat under the bed.'
 'Tom wanted me to lie.']
<U537


In [46]:
token_batch = eng_tokenizer.tokenize(eng_examples).merge_dims(-2,-1)
words = eng_tokenizer.detokenize(token_batch)
words

<tf.RaggedTensor [[b'tom', b'is', b'paying', b'a', b'fine', b'.'],
 [b'there', b'is', b'a', b'cat', b'under', b'the', b'bed', b'.'],
 [b'tom', b'wanted', b'me', b'to', b'lie', b'.']]>

In [47]:
cleanup_text(reserved_tokens, words).numpy()

array([b'tom is paying a fine .', b'there is a cat under the bed .',
       b'tom wanted me to lie .'], dtype=object)

# Export the Model

In [50]:
vocab_path = '../data/deu_vocab.txt'
vocab = pathlib.Path(vocab_path).read_text(encoding="utf-8").splitlines()
# tf.Variable(vocab)
print(type(vocab))
tf.Variable(vocab)

<class 'list'>


<tf.Variable 'Variable:0' shape=(7930,) dtype=string, numpy=
array([b'[PAD]', b'[UNK]', b'[START]', ..., b'##\xe2\x80\xbd',
       b'##\xe2\x81\x84', b'##\xe2\x82\xac'], dtype=object)>

In [53]:
class CustomTokenizer(tf.Module):
  def __init__(self, reserved_tokens, vocab_path):
    self.tokenizer = tftxt.BertTokenizer(vocab_path, lower_case=True)
    self._reserved_tokens = reserved_tokens
    self._vocab_path = tf.saved_model.Asset(vocab_path)
    #
    vocab = pathlib.Path(vocab_path).read_text(encoding="utf-8").splitlines()
    self.vocab = tf.Variable(vocab)

    ## Create signatures for export
    self.tokenize.get_concrete_function(
        tf.TensorSpec(shape=[None], dtype=tf.string))
    self.detokenize.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.detokenize.get_concrete_function(
        tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))
    self.lookup.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.lookup.get_concrete_function(
        tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))
    ##
    self.get_vocab_size.get_concrete_function()
    self.get_vocab_path.get_concrete_function()
    self.get_reserved_tokens.get_concrete_function()

  ##
  @tf.function
  def tokenize(self, strings):
    enc = self.tokenizer.tokenize(strings)
    enc = enc.merge_dims(-2,-1)
    enc = add_start_end(enc)
    return enc

  @tf.function
  def detokenize(self, tokenized):
    words = self.tokenizer.detokenize(tokenized)
    words = cleanup_text(self._reserved_tokens, words)
    return words

  @tf.function
  def lookup(self, token_ids):
    return tf.gather(self.vocab, token_ids)

  @tf.function
  def get_vocab_size(self):
    return tf.shape(self.vocab)[0]

  @tf.function
  def get_vocab_path(self):
    return self._vocab_path

  @tf.function
  def get_reserved_tokens(self):
    return tf.constant(self._reserved_tokens)


In [54]:
tokenizers = tf.Module()
tokenizers.deu = CustomTokenizer(reserved_tokens, '../data/deu_vocab.txt')
tokenizers.eng = CustomTokenizer(reserved_tokens, '../data/eng_vocab.txt')

Tensor("strided_slice:0", shape=(), dtype=int64)
Tensor("strided_slice:0", shape=(), dtype=int64)


In [55]:
model_name = './metadata/tokenizer_deu_eng'
tf.saved_model.save(tokenizers, model_name)

Tensor("strided_slice:0", shape=(), dtype=int64)
Tensor("strided_slice:0", shape=(), dtype=int64)
INFO:tensorflow:Assets written to: ./metadata/tokenizer_deu_eng\assets


# Load and Test The Model

In [56]:
reloaded_tokenizers = tf.saved_model.load(model_name)
reloaded_tokenizers.deu.get_vocab_size().numpy()

7930

In [57]:
tokens = reloaded_tokenizers.deu.tokenize(['Ich habe mein Geld für Kleidung, Essen und Bücher ausgegeben.'])
tokens.numpy()

array([[   2,   71,   88,  139,  194,  111, 1424,   12,  225,   99,  408,
        3031,   14,    3]], dtype=int64)

In [58]:
text_tokens = reloaded_tokenizers.deu.lookup(tokens)
text_tokens

<tf.RaggedTensor [[b'[START]', b'ich', b'habe', b'mein', b'geld', b'fur', b'kleidung',
  b',', b'essen', b'und', b'bucher', b'ausgegeben', b'.', b'[END]']]>

In [59]:
round_trip = reloaded_tokenizers.deu.detokenize(tokens)
print(round_trip.numpy()[0].decode('utf-8'))

ich habe mein geld fur kleidung , essen und bucher ausgegeben .


In [60]:
reloaded_tokenizers = tf.saved_model.load('./metadata/tokenizer_deu_eng')
string = "When writing a sentence, generally you start with a capital letter and finish with a period (.), an exclamation mark (!), or a question mark (?)."
tokens = reloaded_tokenizers.eng.tokenize([string])
tokens.numpy()
text_tokens = reloaded_tokenizers.eng.lookup(tokens)
text_tokens
round_trip = reloaded_tokenizers.eng.detokenize(tokens)
print(round_trip.numpy()[0].decode('utf-8'))

when writing a sentence , generally you start with a capital letter and finish with a period ( . ) , an exclamation mark ( ! ) , or a question mark ( ? ) .


# Conclusion