In [1]:
import collections
import os
import pathlib
import re
import string
import sys
import tempfile
import time

import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_text as text
import tensorflow_datasets as tfds

import warnings
warnings.filterwarnings('ignore')

2024-03-19 07:28:40.653452: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-19 07:28:40.746896: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-19 07:28:40.746939: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-19 07:28:40.751703: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-19 07:28:40.775465: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-19 07:28:40.778546: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [2]:
# Only emit TensorFlow Python log records at ERROR level or above.
tf.get_logger().setLevel('ERROR')
# Current working directory at the time this cell runs.
pwd=pathlib.Path.cwd()

In [3]:
# Load the TED Portuguese->English translation dataset together with its
# metadata; as_supervised=True yields each example as a (pt, en) pair.
examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True, as_supervised=True)

In [4]:
# Pick the training and validation splits out of the loaded dataset by name.
train_examples, val_examples = examples['train'], examples['validation']

In [5]:
# Show a single (Portuguese, English) sentence pair. Each element is a scalar
# string tensor, so convert it to bytes and decode as UTF-8 for display.
for pt,en in train_examples.take(1):
    print("Portuguese: ", pt.numpy().decode('utf-8'))
    print("English: ", en.numpy().decode('utf-8'))

Portugese:  e quando melhoramos a procura , tiramos a única vantagem da impressão , que é a serendipidade .
English:  and when you improve searchability , you actually take away the one advantage of print , which is serendipity .


2024-03-19 07:28:45.242759: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [6]:
# Split the paired dataset into two single-language datasets; each one is
# later fed to the vocabulary builder on its own.
train_en = train_examples.map(lambda pt,en : en)
train_pt = train_examples.map(lambda pt,en : pt)

In [7]:
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

In [8]:
# Options shared by the vocabulary builder and by every BertTokenizer
# constructed from the resulting vocabulary.
bert_tokenizer_params = {"lower_case": True}

# Special tokens placed at the front of each vocabulary; their list position
# is their token id.
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

# Keyword arguments passed to bert_vocab_from_dataset for both languages.
bert_vocab_arguments = {
    "vocab_size": 8000,
    "reserved_tokens": reserved_tokens,
    "bert_tokenizer_params": bert_tokenizer_params,
    "learn_params": {},
}

In [9]:
%%time
# Learn the Portuguese wordpiece vocabulary from the training sentences,
# streamed in batches of 1000. This is slow: the recorded run took about
# 1.5 minutes of wall time.
pt_vocab=bert_vocab.bert_vocab_from_dataset(train_pt.batch(1000).prefetch(2), **bert_vocab_arguments)

CPU times: user 1min 35s, sys: 2.15 s, total: 1min 37s
Wall time: 1min 32s


In [10]:
# Peek at four windows of the Portuguese vocabulary, one per output line:
# the reserved tokens and first symbols, two mid-vocabulary samples, and the tail.
print(
    *(
        pt_vocab[part]
        for part in (slice(None, 10), slice(100, 110), slice(1000, 1010), slice(-10, None))
    ),
    sep="\n",
)

['[PAD]', '[UNK]', '[START]', '[END]', '!', '#', '$', '%', '&', "'"]
['no', 'por', 'mais', 'na', 'eu', 'esta', 'muito', 'isso', 'isto', 'sao']
['90', 'desse', 'efeito', 'malaria', 'normalmente', 'palestra', 'recentemente', '##nca', 'bons', 'chave']
['##–', '##—', '##‘', '##’', '##“', '##”', '##⁄', '##€', '##♪', '##♫']


In [11]:
def write_vocab_file(filepath, vocab):
    """Write a vocabulary to ``filepath``, one token per line.

    The file is always written as UTF-8 with ``\\n`` line endings, regardless
    of the platform defaults. The vocabularies built in this notebook contain
    non-ASCII tokens (e.g. '##–', '##♫'), so relying on the locale encoding
    could either fail or produce a file that is not valid UTF-8.

    Args:
        filepath: Destination path; an existing file is overwritten.
        vocab: Iterable of tokens, written in iteration order.
    """
    with open(filepath, "w", encoding="utf-8", newline="\n") as f:
        for token in vocab:
            print(token, file=f)

In [12]:
# Persist the Portuguese vocabulary; later cells load it back by this file name.
write_vocab_file("pt_vocab.txt", pt_vocab)

In [13]:
# List the .txt files in the working directory, oldest first, to confirm the
# vocabulary file was written.
!ls -ltr *.txt

-rw-r--r-- 1 krsethur g680 52382 Mar 18 13:22 vocab.txt
-rw-r--r-- 1 krsethur g680 52382 Mar 19 07:22 en_vocab.txt
-rw-r--r-- 1 krsethur g680 61124 Mar 19 07:30 pt_vocab.txt


In [14]:
%%time
# Learn the English wordpiece vocabulary with the same settings as the
# Portuguese one. The recorded run took about a minute of wall time.
en_vocab = bert_vocab.bert_vocab_from_dataset(train_en.batch(1000).prefetch(2), **bert_vocab_arguments)

CPU times: user 1min, sys: 678 ms, total: 1min
Wall time: 58.9 s


In [15]:
# Peek at four windows of the English vocabulary, one per output line:
# the reserved tokens and first symbols, two mid-vocabulary samples, and the tail.
print(
    *(
        en_vocab[part]
        for part in (slice(None, 10), slice(100, 110), slice(1000, 1010), slice(-10, None))
    ),
    sep="\n",
)

['[PAD]', '[UNK]', '[START]', '[END]', '!', '#', '$', '%', '&', "'"]
['as', 'all', 'at', 'one', 'people', 're', 'like', 'if', 'our', 'from']
['choose', 'consider', 'extraordinary', 'focus', 'generation', 'killed', 'patterns', 'putting', 'scientific', 'wait']
['##_', '##`', '##ย', '##ร', '##อ', '##–', '##—', '##’', '##♪', '##♫']


In [16]:
# Persist the English vocabulary; later cells load it back by this file name.
write_vocab_file("en_vocab.txt", en_vocab)

In [17]:
# List the .txt files again, oldest first, to confirm en_vocab.txt was refreshed.
!ls -ltr *.txt

-rw-r--r-- 1 krsethur g680 52382 Mar 18 13:22 vocab.txt
-rw-r--r-- 1 krsethur g680 61124 Mar 19 07:30 pt_vocab.txt
-rw-r--r-- 1 krsethur g680 52382 Mar 19 07:31 en_vocab.txt


In [18]:
# Build one tokenizer per language from the saved vocabulary files, using the
# same parameters (lower_case=True) the vocabularies were learned with.
pt_tokenizer = text.BertTokenizer("pt_vocab.txt", **bert_tokenizer_params)
en_tokenizer = text.BertTokenizer("en_vocab.txt", **bert_tokenizer_params)

In [19]:
# Take one batch of three sentence pairs and print the English sentences.
# NOTE: pt_examples / en_examples stay bound after the loop, and the
# following cells reuse en_examples.
for pt_examples, en_examples in train_examples.batch(3).take(1):
    for ex in en_examples:
        print(ex)

tf.Tensor(b'and when you improve searchability , you actually take away the one advantage of print , which is serendipity .', shape=(), dtype=string)
tf.Tensor(b'but what if it were active ?', shape=(), dtype=string)
tf.Tensor(b"but they did n't test for curiosity .", shape=(), dtype=string)


In [20]:
# Tokenize the English batch left over from the previous cell, and in the same
# expression merge the last two axes so each sentence becomes a single flat
# list of token ids.
token_batch = en_tokenizer.tokenize(en_examples).merge_dims(-2,-1)

# One line of ids per sentence.
for ex in token_batch.to_list():
    print(ex)

[72, 117, 79, 1259, 1491, 2362, 13, 79, 150, 184, 311, 71, 103, 2308, 74, 2679, 13, 148, 80, 55, 4840, 1434, 2423, 540, 15]
[87, 90, 107, 76, 129, 1852, 30]
[87, 83, 149, 50, 9, 56, 664, 85, 2512, 15]


In [21]:
# Map every token id to its vocabulary string and join each sentence with
# spaces. Word pieces are NOT merged here, so '##' continuation tokens stay
# visible in the result.
txt_tokens = tf.gather(en_vocab,token_batch)
tf.strings.reduce_join(txt_tokens, separator=' ', axis=-1)

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'and when you improve search ##ability , you actually take away the one advantage of print , which is s ##ere ##nd ##ip ##ity .',
       b'but what if it were active ?',
       b"but they did n ' t test for curiosity ."], dtype=object)>

In [22]:
# Let the tokenizer turn the ids back into words, then join each sentence
# with spaces; in the recorded output the word pieces are merged back
# into whole words.
words = en_tokenizer.detokenize(token_batch)
tf.strings.reduce_join(words, separator=' ', axis=-1)

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'and when you improve searchability , you actually take away the one advantage of print , which is serendipity .',
       b'but what if it were active ?',
       b"but they did n ' t test for curiosity ."], dtype=object)>

In [23]:
# Token ids of the start/end markers, found as their positions within
# reserved_tokens (the reserved tokens sit at the front of each vocabulary).
START = tf.argmax(tf.constant(reserved_tokens)== "[START]")
END = tf.argmax(tf.constant(reserved_tokens)=="[END]")

def add_start_end(ragged):
    """Wrap every row of a batch of token ids in START ... END markers.

    Args:
        ragged: Batch of token-id rows (a ragged tensor in this notebook).

    Returns:
        The batch with the module-level START id prepended and the END id
        appended to each row.
    """
    # One marker per row: a [rows, 1] column for each end of the sequence.
    marker_shape = [ragged.bounding_shape()[0], 1]
    return tf.concat(
        [tf.fill(marker_shape, START), ragged, tf.fill(marker_shape, END)],
        axis=1,
    )

In [24]:
# Detokenize a batch that has the markers added, to confirm that [START] and
# [END] end up at the two ends of every sentence.
words = en_tokenizer.detokenize(add_start_end(token_batch))
tf.strings.reduce_join(words, axis=-1, separator=' ')

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'[START] and when you improve searchability , you actually take away the one advantage of print , which is serendipity . [END]',
       b'[START] but what if it were active ? [END]',
       b"[START] but they did n ' t test for curiosity . [END]"],
      dtype=object)>

In [25]:
def cleanup_text(reserved_tokens, token_txt):
    """Strip reserved marker tokens from detokenized text and join the words.

    Every reserved token except "[UNK]" is removed. "[PAD]", "[START]" and
    "[END]" are only bookkeeping and must not leak into the text; in
    particular "[PAD]" shows up when a dense, zero-padded id tensor is
    detokenized. "[UNK]" is kept on purpose so that unknown words remain
    visible instead of silently disappearing.

    Args:
        reserved_tokens: List of reserved token strings (e.g. "[PAD]").
        token_txt: String tensor of words with sentences along the last axis,
            as produced by ``BertTokenizer.detokenize``.

    Returns:
        A string tensor holding one space-joined sentence per row.
    """
    # Escape the tokens: the square brackets are regex metacharacters.
    bad_tokens = [re.escape(token) for token in reserved_tokens if token != '[UNK]']
    bad_tokens_re = '|'.join(bad_tokens)
    # Full-match so only cells that are exactly a reserved token are dropped.
    bad_cells = tf.strings.regex_full_match(token_txt, bad_tokens_re)
    result = tf.ragged.boolean_mask(token_txt, ~bad_cells)
    return tf.strings.reduce_join(result, axis=-1, separator=' ')

In [26]:
# The raw English sentences of the batch, for comparison with the round trip below.
en_examples.numpy()

array([b'and when you improve searchability , you actually take away the one advantage of print , which is serendipity .',
       b'but what if it were active ?',
       b"but they did n't test for curiosity ."], dtype=object)

In [27]:
# Tokenize into flat per-sentence id lists, detokenize straight back, and
# display the resulting ragged tensor of words.
token_batch = en_tokenizer.tokenize(en_examples).merge_dims(-2,-1)
words = en_tokenizer.detokenize(token_batch)
words

<tf.RaggedTensor [[b'and', b'when', b'you', b'improve', b'searchability', b',', b'you',
  b'actually', b'take', b'away', b'the', b'one', b'advantage', b'of',
  b'print', b',', b'which', b'is', b'serendipity', b'.']              ,
 [b'but', b'what', b'if', b'it', b'were', b'active', b'?'],
 [b'but', b'they', b'did', b'n', b"'", b't', b'test', b'for', b'curiosity',
  b'.']                                                                    ]>

In [28]:
# Join the words into sentences; this batch contains no reserved tokens, so
# nothing is filtered out.
cleanup_text(reserved_tokens, words)

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'and when you improve searchability , you actually take away the one advantage of print , which is serendipity .',
       b'but what if it were active ?',
       b"but they did n ' t test for curiosity ."], dtype=object)>

In [29]:
class CustomTokenizer(tf.Module):
    """Exportable tokenizer: a BERT wordpiece tokenizer bundled with its vocabulary.

    Wraps ``text.BertTokenizer`` so that tokenization (with start/end markers),
    detokenization (with reserved tokens cleaned up) and id-to-string lookup
    can be saved with ``tf.saved_model.save`` and used after reloading.
    """

    def __init__(self, reserved_tokens, vocab_path):
        """Build the tokenizer and trace the functions that should be exported.

        Args:
            reserved_tokens: List of reserved token strings; used by
                ``detokenize`` and returned by ``get_reserved_tokens``.
            vocab_path: Path to a vocabulary file with one token per line.
        """
        self.tokenizer = text.BertTokenizer(vocab_path, lower_case=True)
        self._reserved_tokens = reserved_tokens
        # Tracking the file as an Asset makes the SavedModel carry the
        # vocabulary file along in its assets directory.
        self._vocab_path = tf.saved_model.Asset(vocab_path)
        # Decode explicitly as UTF-8: the vocabulary contains non-ASCII tokens
        # and the platform default encoding is not guaranteed to be UTF-8.
        vocab = pathlib.Path(vocab_path).read_text(encoding="utf-8").splitlines()
        self.vocab = tf.Variable(vocab)

        # Trace each function for the input signatures that should be
        # available on the module.
        # tokenize: a batch of strings.
        self.tokenize.get_concrete_function(tf.TensorSpec(shape=[None], dtype=tf.string))
        # detokenize / lookup: dense or ragged [batch, tokens] int64 ids.
        self.detokenize.get_concrete_function(tf.TensorSpec(shape=[None,None], dtype=tf.int64))
        self.detokenize.get_concrete_function(tf.RaggedTensorSpec(shape=[None,None], dtype=tf.int64))
        self.lookup.get_concrete_function(tf.TensorSpec(shape=[None,None], dtype=tf.int64))
        self.lookup.get_concrete_function(tf.RaggedTensorSpec(shape=[None,None], dtype=tf.int64))
        # The getters take no arguments.
        self.get_vocab_size.get_concrete_function()
        self.get_vocab_path.get_concrete_function()
        self.get_reserved_tokens.get_concrete_function()

    @tf.function()
    def tokenize(self, txt):
        """Tokenize a batch of strings into id rows wrapped in START/END ids."""
        # Merge the word and word-piece axes so each sentence is one flat row.
        tokens = self.tokenizer.tokenize(txt).merge_dims(-2,-1)
        return add_start_end(tokens)

    @tf.function()
    def detokenize(self, tokens):
        """Turn id rows back into space-joined sentences without reserved markers."""
        words = self.tokenizer.detokenize(tokens)
        return cleanup_text(self._reserved_tokens, words)

    @tf.function()
    def lookup(self, tokens):
        """Map token ids to their vocabulary strings, keeping the input's shape."""
        return tf.gather(self.vocab, tokens)

    @tf.function()
    def get_vocab_size(self, ):
        """Return the number of entries in the vocabulary."""
        return self.vocab.shape[0]

    @tf.function()
    def get_reserved_tokens(self, ):
        """Return the reserved tokens as a string tensor."""
        return tf.constant(self._reserved_tokens)

    @tf.function()
    def get_vocab_path(self, ):
        """Return the tracked vocabulary file asset."""
        return self._vocab_path
    

In [30]:
# A bare tf.Module used as a container, so that both language tokenizers can
# be saved together as one object.
tokenizers = tf.Module()
tokenizers.pt = CustomTokenizer(reserved_tokens, "pt_vocab.txt")

In [31]:
# Attach the English tokenizer to the same container module.
tokenizers.en = CustomTokenizer(reserved_tokens, "en_vocab.txt")

In [33]:
# Export both tokenizers as a single SavedModel directory named model_name.
model_name = 'ted_hrlr_translate_pt_en_converter'
tf.saved_model.save(tokenizers,model_name)

In [34]:
# Load the export back from disk and sanity-check it by asking for the
# English vocabulary size.
reloaded_tokenizers = tf.saved_model.load(model_name)
reloaded_tokenizers.en.get_vocab_size()

<tf.Tensor: shape=(), dtype=int32, numpy=7010>

In [35]:
# Tokenize a sample string with the reloaded model; in the recorded output the
# ids start with 2 and end with 3, the list positions of [START] and [END].
tokens = reloaded_tokenizers.en.tokenize(['Hello TensorFlow!'])
tokens.numpy()

array([[   2, 4006, 2358,  687, 1192, 2365,    4,    3]])

In [37]:
# Show the vocabulary string behind each id, including the word pieces
# and the start/end markers.
text_tokens = reloaded_tokenizers.en.lookup(tokens)
text_tokens.numpy()

array([[b'[START]', b'hello', b'tens', b'##or', b'##f', b'##low', b'!',
        b'[END]']], dtype=object)

In [39]:
# Detokenize back to text. The round trip is lossy: in the recorded output the
# result is lower-cased and the punctuation is separated by a space.
round_trip = reloaded_tokenizers.en.detokenize(tokens)
round_trip.numpy()

array([b'hello tensorflow !'], dtype=object)

In [41]:
# Archive the exported SavedModel directory; {model_name} is interpolated from
# the Python variable of the same name.
!zip -r {model_name}.zip {model_name}

  adding: ted_hrlr_translate_pt_en_converter/ (stored 0%)
  adding: ted_hrlr_translate_pt_en_converter/fingerprint.pb (stored 0%)
  adding: ted_hrlr_translate_pt_en_converter/variables/ (stored 0%)
  adding: ted_hrlr_translate_pt_en_converter/variables/variables.data-00000-of-00001 (deflated 51%)
  adding: ted_hrlr_translate_pt_en_converter/variables/variables.index (deflated 33%)
  adding: ted_hrlr_translate_pt_en_converter/saved_model.pb (deflated 91%)
  adding: ted_hrlr_translate_pt_en_converter/assets/ (stored 0%)
  adding: ted_hrlr_translate_pt_en_converter/assets/en_vocab.txt (deflated 54%)
  adding: ted_hrlr_translate_pt_en_converter/assets/pt_vocab.txt (deflated 57%)


In [43]:
# Report the size of the archive(s) in human-readable units.
!du -h *.zip

172K	ted_hrlr_translate_pt_en_converter.zip


In [45]:
# Build the same Portuguese vocabulary as an explicit lookup table: each whole
# line of the file is a key and its line number is the token id, with one
# extra bucket for out-of-vocabulary keys.
pt_lookup = tf.lookup.StaticVocabularyTable(
    tf.lookup.TextFileInitializer(filename='pt_vocab.txt',
                                  key_dtype=tf.string,
                                  key_index=tf.lookup.TextFileIndex.WHOLE_LINE,
                                  value_dtype=tf.int64,
                                  value_index=tf.lookup.TextFileIndex.LINE_NUMBER),
    num_oov_buckets=1)
# Pass the parameters the vocabulary was learned with (lower_case=True).
# Without them this tokenizer would normalise text differently from the
# file-based pt_tokenizer built earlier, and input containing upper-case or
# accented characters would not match the vocabulary entries.
pt_tokenizer = text.BertTokenizer(pt_lookup, **bert_tokenizer_params)

In [46]:
# Query the raw table with a few Portuguese words. In the recorded output the
# accented entries ('é', 'não') both map to 7765 while the unaccented ones get
# small ids. Presumably 7765 is the single OOV bucket, and the accented forms
# are absent because the vocabulary was learned with lower_case=True (TODO confirm).
pt_lookup.lookup(tf.constant(['é', 'um', 'uma', 'para', 'não']))

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([7765,   85,   86,   87, 7765])>