In [16]:
import tensorflow as tf
import tensorflow_text as text
import tensorflow_datasets as tfds
import numpy as np
import pathlib
import re
import collections

In [3]:
# Load the TED Portuguese->English corpus as (pt, en) pairs, plus its metadata.
examples, metadata = tfds.load(
    'ted_hrlr_translate/pt_to_en',
    with_info=True,
    as_supervised=True,
)
train_ex, val_ex = examples['train'], examples['validation']

In [11]:
# Peek at the first training pair: Portuguese line first, then its English translation.
for pt, en in train_ex.take(1):
    print(pt.numpy().decode('utf-8'), en.numpy().decode('utf-8'), sep='\n')

e quando melhoramos a procura , tiramos a única vantagem da impressão , que é a serendipidade .
and when you improve searchability , you actually take away the one advantage of print , which is serendipity .


2024-03-19 09:02:00.559576: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [14]:
# Split the paired dataset into one single-language text dataset per side.
train_en = train_ex.map(lambda _pt, en_text: en_text)
train_pt = train_ex.map(lambda pt_text, _en: pt_text)

In [15]:
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

In [18]:
# Tokenizer options shared by vocabulary learning and the BertTokenizer instances.
bert_tokenizer_params = {'lower_case': True}

# Special tokens placed at the head of each vocabulary; list position is the token id.
# NOTE(review): '[PAD]' is id 1 here, not 0 - any downstream zero-padding/masking
# would collide with '[UNK]'. Confirm this ordering is intended.
reserved_tokens = ['[UNK]', '[PAD]', '[START]', '[END]']

vocab_args = {
    'vocab_size': 8000,
    'reserved_tokens': reserved_tokens,
    'bert_tokenizer_params': bert_tokenizer_params,
    'learn_params': {},
}

# Learn one WordPiece vocabulary per language from the training text.
vocab_en = bert_vocab.bert_vocab_from_dataset(train_en.batch(1000).prefetch(2), **vocab_args)
vocab_pt = bert_vocab.bert_vocab_from_dataset(train_pt.batch(1000).prefetch(2), **vocab_args)

In [19]:
def write_vocab_to_file(vocab, file_path):
    """Write a vocabulary to a text file, one token per line.

    The file is always encoded as UTF-8, independent of the platform's default
    text encoding, so vocabularies containing non-ASCII tokens (e.g. Portuguese
    accented characters) are written identically on every machine.

    Args:
        vocab: Iterable of tokens; each one is written with `print`.
        file_path: Destination path. An existing file is overwritten.
    """
    # Explicit encoding: the default for open() is locale-dependent.
    with open(file_path, "w", encoding="utf-8") as f:
        for token in vocab:
            print(token, file=f)

In [20]:
# Persist both vocabularies, one token per line, so they can be loaded by file path.
write_vocab_to_file(vocab_en, "vocab_en.txt")
write_vocab_to_file(vocab_pt, "vocab_pt.txt")

In [21]:
# Build one tokenizer per language from the saved vocab files, reusing the same
# options (lower_case=True) that were passed to vocabulary generation.
tokenizer_en = text.BertTokenizer("vocab_en.txt", **bert_tokenizer_params)
tokenizer_pt = text.BertTokenizer("vocab_pt.txt", **bert_tokenizer_params)

In [22]:
# Tokenize one batch of three English sentences; the result holds one list of
# word-piece ids per word. `ex` stays bound afterwards and is reused by later cells.
for ex in train_en.batch(3).take(1):
    print(tokenizer_en.tokenize(ex))

<tf.RaggedTensor [[[72], [117], [79], [1259], [1491, 2362], [13], [79], [150], [184], [311],
  [71], [103], [2308], [74], [2679], [13], [148], [80],
  [55, 4840, 1434, 2423, 540], [15]]                                       ,
 [[87], [90], [107], [76], [129], [1852], [30]],
 [[87], [83], [149], [50], [9], [56], [664], [85], [2512], [15]]]>


In [23]:
# Display the raw string batch left bound by the loop above.
ex

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'and when you improve searchability , you actually take away the one advantage of print , which is serendipity .',
       b'but what if it were active ?',
       b"but they did n't test for curiosity ."], dtype=object)>

In [25]:
# Merge the per-word and word-piece axes so each sentence becomes a single row of ids.
token_batch = tokenizer_en.tokenize(ex).merge_dims(-2,-1)
token_batch

<tf.RaggedTensor [[72, 117, 79, 1259, 1491, 2362, 13, 79, 150, 184, 311, 71, 103, 2308, 74,
  2679, 13, 148, 80, 55, 4840, 1434, 2423, 540, 15]                       ,
 [87, 90, 107, 76, 129, 1852, 30],
 [87, 83, 149, 50, 9, 56, 664, 85, 2512, 15]]>

In [29]:
# Map each id back to its word-piece string by indexing into the English vocabulary.
words = tf.gather(vocab_en, token_batch)
words

<tf.RaggedTensor [[b'and', b'when', b'you', b'improve', b'search', b'##ability', b',',
  b'you', b'actually', b'take', b'away', b'the', b'one', b'advantage',
  b'of', b'print', b',', b'which', b'is', b's', b'##ere', b'##nd', b'##ip',
  b'##ity', b'.']                                                          ,
 [b'but', b'what', b'if', b'it', b'were', b'active', b'?'],
 [b'but', b'they', b'did', b'n', b"'", b't', b'test', b'for', b'curiosity',
  b'.']                                                                    ]>

In [35]:
def add_start_end(ragged):
    """Wrap every row of a ragged token-id batch in [START] ... [END] ids.

    The two ids are the positions of '[START]' and '[END]' in the module-level
    `reserved_tokens` list.

    Args:
        ragged: RaggedTensor of token ids, one row per sentence.
            # assumes int64 ids, matching tf.argmax's default output - TODO confirm

    Returns:
        The batch with one extra id prepended and one appended to each row.
    """
    token_names = tf.constant(reserved_tokens)
    start_id = tf.argmax(token_names == '[START]')
    end_id = tf.argmax(token_names == '[END]')

    # One marker per row: build (num_rows, 1) columns to concatenate on either side.
    num_rows = ragged.bounding_shape()[0]
    column_shape = (num_rows, 1)
    start_column = tf.fill(column_shape, start_id)
    end_column = tf.fill(column_shape, end_id)
    return tf.concat([start_column, ragged, end_column], axis=-1)

In [36]:
# Sanity check: each row should now begin with 2 ([START]) and end with 3 ([END]).
add_start_end(token_batch)

<tf.RaggedTensor [[2, 72, 117, 79, 1259, 1491, 2362, 13, 79, 150, 184, 311, 71, 103, 2308,
  74, 2679, 13, 148, 80, 55, 4840, 1434, 2423, 540, 15, 3]               ,
 [2, 87, 90, 107, 76, 129, 1852, 30, 3],
 [2, 87, 83, 149, 50, 9, 56, 664, 85, 2512, 15, 3]]>

In [37]:
# Same vocabulary lookup, now including the [START]/[END] markers.
# This rebinds `words`; the later cleanup_text(words) cell uses this version.
words = tf.gather(vocab_en, add_start_end(token_batch))
words

<tf.RaggedTensor [[b'[START]', b'and', b'when', b'you', b'improve', b'search', b'##ability',
  b',', b'you', b'actually', b'take', b'away', b'the', b'one', b'advantage',
  b'of', b'print', b',', b'which', b'is', b's', b'##ere', b'##nd', b'##ip',
  b'##ity', b'.', b'[END]']                                                 ,
 [b'[START]', b'but', b'what', b'if', b'it', b'were', b'active', b'?',
  b'[END]']                                                           ,
 [b'[START]', b'but', b'they', b'did', b'n', b"'", b't', b'test', b'for',
  b'curiosity', b'.', b'[END]']                                          ]>

In [38]:
# detokenize rejoins '##' continuation pieces into whole words; the markers remain.
tokenizer_en.detokenize(add_start_end(token_batch))

<tf.RaggedTensor [[b'[START]', b'and', b'when', b'you', b'improve', b'searchability', b',',
  b'you', b'actually', b'take', b'away', b'the', b'one', b'advantage',
  b'of', b'print', b',', b'which', b'is', b'serendipity', b'.', b'[END]'] ,
 [b'[START]', b'but', b'what', b'if', b'it', b'were', b'active', b'?',
  b'[END]']                                                           ,
 [b'[START]', b'but', b'they', b'did', b'n', b"'", b't', b'test', b'for',
  b'curiosity', b'.', b'[END]']                                          ]>

In [39]:
def cleanup_text(token_text):
    """Drop reserved marker tokens and join the remaining tokens into sentences.

    Every entry of the module-level `reserved_tokens` list except '[UNK]' is
    removed; a token is removed only when it matches a reserved token exactly.

    Args:
        token_text: Batch of string tokens, one row per sentence.

    Returns:
        String tensor with one space-joined sentence per row.
    """
    removable = (tok for tok in reserved_tokens if tok != '[UNK]')
    # Escape each token so the brackets are matched literally, not as a character class.
    removable_pattern = '|'.join(map(re.escape, removable))
    is_reserved = tf.strings.regex_full_match(token_text, removable_pattern)
    kept_tokens = tf.ragged.boolean_mask(token_text, ~is_reserved)
    return tf.strings.reduce_join(kept_tokens, separator=' ', axis=-1)
    

In [40]:
# Applied to raw word-pieces: markers are removed but '##' pieces stay split.
cleanup_text(words)

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'and when you improve search ##ability , you actually take away the one advantage of print , which is s ##ere ##nd ##ip ##ity .',
       b'but what if it were active ?',
       b"but they did n ' t test for curiosity ."], dtype=object)>

In [41]:
# Detokenize first, then clean up: yields whole words with the markers stripped.
cleanup_text(tokenizer_en.detokenize(add_start_end(token_batch)))

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'and when you improve searchability , you actually take away the one advantage of print , which is serendipity .',
       b'but what if it were active ?',
       b"but they did n ' t test for curiosity ."], dtype=object)>

In [49]:
class CustomTokenizer(tf.Module):
    """Exportable tf.Module bundling a BertTokenizer with its vocabulary.

    Exposes tokenize / detokenize / lookup plus a few getters as tf.functions
    so the object can be written with `tf.saved_model.save` and used after
    `tf.saved_model.load`.

    NOTE(review): `tokenize` and `detokenize` delegate to the module-level
    helpers `add_start_end` and `cleanup_text`, which read the module-level
    `reserved_tokens` list rather than the `reserved_tokens` passed to this
    constructor. The two are the same object in this notebook.
    """

    def __init__(self, reserved_tokens, vocab_path):
        """Build the tokenizer and trace the functions to be exported.

        Args:
            reserved_tokens: List of special token strings; stored and returned
                by `get_reserved_tokens`.
            vocab_path: Path to a vocabulary text file with one token per line.
        """
        self.tokenizer = text.BertTokenizer(vocab_path, lower_case=True)
        self._reserved_tokens = reserved_tokens
        # Track the vocab file as an asset so it is copied into the export directory.
        self._vocab_path = tf.saved_model.Asset(vocab_path)

        # Decode explicitly as UTF-8 instead of the locale default, so the lookup
        # table holds the same tokens on every platform.
        vocab = pathlib.Path(vocab_path).read_text(encoding='utf-8').splitlines()
        self.vocab = tf.Variable(vocab)

        # Trace each tf.function once per supported input signature so the
        # corresponding concrete functions exist before the module is saved.
        # tokenize: a batch of strings.
        self.tokenize.get_concrete_function(tf.TensorSpec(shape=[None], dtype=tf.string))
        # detokenize / lookup: dense and ragged [batch, tokens] int64 ids.
        self.detokenize.get_concrete_function(tf.TensorSpec(shape=[None, None], dtype=tf.int64))
        self.detokenize.get_concrete_function(tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))
        self.lookup.get_concrete_function(tf.TensorSpec(shape=[None, None], dtype=tf.int64))
        self.lookup.get_concrete_function(tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))
        # The getters take no arguments.
        self.get_reserved_tokens.get_concrete_function()
        self.get_vocab_path.get_concrete_function()
        self.get_vocab_size.get_concrete_function()

    @tf.function
    def tokenize(self, txt):
        """Tokenize a batch of strings into ragged ids wrapped in [START]/[END]."""
        # Merge the per-word and word-piece axes into one row of ids per sentence.
        tokens = self.tokenizer.tokenize(txt).merge_dims(-2, -1)
        return add_start_end(tokens)

    @tf.function
    def detokenize(self, tokens):
        """Convert token ids back to space-joined text with reserved markers removed."""
        words = self.tokenizer.detokenize(tokens)
        return cleanup_text(words)

    @tf.function
    def lookup(self, tokens):
        """Return the vocabulary string for each token id, keeping the input's shape."""
        words = tf.gather(self.vocab, tokens)
        return words

    @tf.function
    def get_reserved_tokens(self):
        """Return the reserved tokens supplied at construction."""
        return self._reserved_tokens

    @tf.function
    def get_vocab_path(self):
        """Return the tracked vocabulary file asset."""
        return self._vocab_path

    @tf.function
    def get_vocab_size(self):
        """Return the number of entries in the vocabulary."""
        return len(self.vocab)

In [51]:
# Group both language tokenizers under one tf.Module so they can be saved together.
tokenizers=tf.Module()
tokenizers.pt = CustomTokenizer(reserved_tokens, 'vocab_pt.txt')
tokenizers.en = CustomTokenizer(reserved_tokens, 'vocab_en.txt')

In [52]:
# Export directory for the SavedModel; assets are written under <model_name>/assets.
model_name='krishna_tokenizer'
tf.saved_model.save(tokenizers, model_name)

INFO:tensorflow:Assets written to: krishna_tokenizer/assets


INFO:tensorflow:Assets written to: krishna_tokenizer/assets


In [53]:
# Reload the export from disk to check that the saved functions work after a round trip.
reconstructed_tokenizer = tf.saved_model.load(model_name)
reconstructed_en = reconstructed_tokenizer.en
reconstructed_pt = reconstructed_tokenizer.pt

In [54]:
# Number of entries in the reloaded English vocabulary.
reconstructed_en.get_vocab_size()

<tf.Tensor: shape=(), dtype=int32, numpy=7010>

In [56]:
# Tokenize a new sentence with the reloaded model; ids come back wrapped in [START]/[END].
toks = reconstructed_en.tokenize(['Hello Tensorflow!'])
toks

<tf.RaggedTensor [[2, 4006, 2358, 687, 1192, 2365, 4, 3]]>

In [57]:
# Show the word-piece string behind each id.
reconstructed_en.lookup(toks)

<tf.RaggedTensor [[b'[START]', b'hello', b'tens', b'##or', b'##f', b'##low', b'!',
  b'[END]']]>

In [58]:
# Round-trip back to text: lower-cased, pieces rejoined, markers stripped.
reconstructed_en.detokenize(toks)

<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'hello tensorflow !'], dtype=object)>