In [None]:
import requests
import tensorflow as tf
import tensorflow_text as tf_text
import warnings
from transformers import BertTokenizer
import sentencepiece as spm

warnings.filterwarnings('ignore')

# SentencePiece

Downloading and creating a dictionary for the standard SentencePiece tokens.

In [37]:
# URL of the SentencePiece model file
url_sp = "https://github.com/tensorflow/text/blob/master/tensorflow_text/python/ops/test_data/test_oss_model.model?raw=true"

# Path to save the downloaded SentencePiece model file
sp_model_path = "sentencepiece.model"

# Download the SentencePiece model file
r_sp = requests.get(url_sp)
with open(sp_model_path, 'wb') as f:
    f.write(r_sp.content)

print(f"SentencePiece model has been written to {sp_model_path}")

SentencePiece model has been written to sentencepiece.model


# BertTokenizer

Downloading and creating a dictionary for the standard SentencePiece tokens.

In [33]:
url_bert = "https://github.com/tensorflow/text/blob/master/tensorflow_text/python/ops/test_data/test_wp_en_vocab.txt?raw=true"
r_bert = requests.get(url_bert)
filepath_bert = "bert_vocabulary.txt"
open(filepath_bert, 'wb').write(r_bert.content)
print(f"Bert vocabulary has been written to {filepath_bert}")

Bert vocabulary has been written to bert_vocabulary.txt


### Comparing BertTokenizer and Sentencepiece

In [34]:
# Initialize the tokenizer with the vocabulary file
bert_tokenizer = BertTokenizer(vocab_file=filepath_bert, do_lower_case=True)

# Tokenize the sentences
tokens_1 = bert_tokenizer.encode("We are the champions, my friends.", add_special_tokens=True)
tokens_2 = bert_tokenizer.encode("The banker is checking my credit application.", add_special_tokens=True)

# Print the tokens
print("Tokens 1:", tokens_1)
print("Tokens 2:", tokens_2)

# Convert tokens back to words
tokens_1_words = bert_tokenizer.convert_ids_to_tokens(tokens_1)
tokens_2_words = bert_tokenizer.convert_ids_to_tokens(tokens_2)

# Print the tokens as words
print("Tokens 1 as words:", tokens_1_words)
print("Tokens 2 as words:", tokens_2_words)

# Detokenize the tokens back to sentences
detokenized_1 = bert_tokenizer.decode(tokens_1, skip_special_tokens=False)
detokenized_2 = bert_tokenizer.decode(tokens_2, skip_special_tokens=False)

# Print the detokenized sentences
print("Detokenized 1:", detokenized_1)
print("Detokenized 2:", detokenized_2)

Tokens 1: [7011, 78, 86, 71, 39, 5499, 601, 2294, 13, 99, 412, 15, 7010]
Tokens 2: [7011, 71, 1327, 227, 80, 1512, 138, 99, 2650, 2310, 15, 7010]
Tokens 1 as words: ['[CLS]', 'we', 'are', 'the', 'c', '##ham', '##p', '##ions', ',', 'my', 'friends', '.', '[SEP]']
Tokens 2 as words: ['[CLS]', 'the', 'bank', '##er', 'is', 'check', '##ing', 'my', 'credit', 'application', '.', '[SEP]']
Detokenized 1: [CLS] we are the champions, my friends. [SEP]
Detokenized 2: [CLS] the banker is checking my credit application. [SEP]


In [39]:
# Initialize the SentencePiece tokenizer
sp_tokenizer = tf_text.SentencepieceTokenizer(model=tf.io.gfile.GFile(sp_model_path, 'rb').read(), out_type=tf.string)

# Load the SentencePiece model using sentencepiece library to get special tokens
sp = spm.SentencePieceProcessor()
sp.load(sp_model_path)

# Get the special tokens
cls_token = sp.id_to_piece(1)  # <s>
sep_token = sp.id_to_piece(2)  # </s>

# Tokenize the sentences
sp_tokens_1 = sp_tokenizer.tokenize(["We are the champions, my friends."])
sp_tokens_2 = sp_tokenizer.tokenize(["The banker is checking my credit application."])

# Add special tokens to the tokenized output
sp_tokens_1 = tf.concat([[cls_token], sp_tokens_1.flat_values, [sep_token]], axis=0)
sp_tokens_2 = tf.concat([[cls_token], sp_tokens_2.flat_values, [sep_token]], axis=0)

# Decode the tokens to UTF-8 strings for readability
sp_tokens_1 = [token.decode('utf-8') for token in sp_tokens_1.numpy()]
sp_tokens_2 = [token.decode('utf-8') for token in sp_tokens_2.numpy()]

# Print the tokens
print("Tokens 1:", sp_tokens_1)
print("Tokens 2:", sp_tokens_2)

# Get the vocabulary size
vocab_size = sp.get_piece_size()
print(f"Total number of tokens in the SentencePiece vocabulary: {vocab_size}")

# Detokenize the tokens back to words
detokenized_1 = tf.strings.reduce_join(sp_tokens_1, separator=' ', axis=-1)
detokenized_2 = tf.strings.reduce_join(sp_tokens_2, separator=' ', axis=-1)

# Decode the detokenized sentences to UTF-8 strings for readability
detokenized_1 = detokenized_1.numpy().decode('utf-8')
detokenized_2 = detokenized_2.numpy().decode('utf-8')

# Print the detokenized sentences
print("Detokenized 1:", detokenized_1)
print("Detokenized 2:", detokenized_2)

Tokens 1: ['<s>', '▁We', '▁are', '▁the', '▁', 'ch', 'amp', 'ion', 's', ',', '▁my', '▁friend', 's', '.', '</s>']
Tokens 2: ['<s>', '▁The', '▁b', 'an', 'k', 'er', '▁is', '▁c', 'he', 'c', 'k', 'ing', '▁my', '▁credit', '▁a', 'p', 'p', 'l', 'ic', 'ation', '.', '</s>']
Total number of tokens in the SentencePiece vocabulary: 1000
Detokenized 1: <s> ▁We ▁are ▁the ▁ ch amp ion s , ▁my ▁friend s . </s>
Detokenized 2: <s> ▁The ▁b an k er ▁is ▁c he c k ing ▁my ▁credit ▁a p p l ic ation . </s>


In [43]:
# Tokenize the Sentences
sentences = ["The train is coming down the railroad.", "I would like to train for the marathon."]

# BERT Tokenization
bert_tokens_1 = bert_tokenizer.encode(sentences[0], add_special_tokens=True)
bert_tokens_2 = bert_tokenizer.encode(sentences[1], add_special_tokens=True)

# SentencePiece Tokenization
sp_tokens_1 = sp_tokenizer.tokenize([sentences[0]])
sp_tokens_2 = sp_tokenizer.tokenize([sentences[1]])

# Print the token values (numbers)
print("BERT Tokens 1:", bert_tokens_1)
print("BERT Tokens 2:", bert_tokens_2)
print("SentencePiece Tokens 1:", sp_tokens_1.numpy().tolist())
print("SentencePiece Tokens 2:", sp_tokens_2.numpy().tolist())

# Print the Tokens as Words
bert_tokens_1_words = bert_tokenizer.convert_ids_to_tokens(bert_tokens_1)
bert_tokens_2_words = bert_tokenizer.convert_ids_to_tokens(bert_tokens_2)

sp_tokens_1_words = [sp.id_to_piece(token) for token in sp_tokens_1.numpy().tolist()[0]]
sp_tokens_2_words = [sp.id_to_piece(token) for token in sp_tokens_2.numpy().tolist()[0]]

print("BERT Tokens 1 as words:", bert_tokens_1_words)
print("BERT Tokens 2 as words:", bert_tokens_2_words)
print("SentencePiece Tokens 1 as words:", sp_tokens_1_words)
print("SentencePiece Tokens 2 as words:", sp_tokens_2_words)

BERT Tokens 1: [7011, 71, 1603, 80, 437, 210, 71, 54, 3715, 1548, 1966, 15, 7010]
BERT Tokens 2: [7011, 45, 127, 106, 73, 1603, 85, 71, 49, 899, 1224, 5502, 269, 15, 7010]
SentencePiece Tokens 1: [[69, 605, 47, 589, 245, 7, 390, 128, 131, 218, 6]]
SentencePiece Tokens 2: [[9, 67, 110, 10, 605, 42, 7, 463, 19, 95, 112, 6]]
BERT Tokens 1 as words: ['[CLS]', 'the', 'train', 'is', 'coming', 'down', 'the', 'r', '##ail', '##ro', '##ad', '.', '[SEP]']
BERT Tokens 2 as words: ['[CLS]', 'i', 'would', 'like', 'to', 'train', 'for', 'the', 'm', '##ar', '##at', '##ho', '##n', '.', '[SEP]']
SentencePiece Tokens 1 as words: ['▁The', '▁train', '▁is', '▁coming', '▁down', '▁the', '▁ra', 'il', 'ro', 'ad', '.']
SentencePiece Tokens 2 as words: ['▁I', '▁would', '▁like', '▁to', '▁train', '▁for', '▁the', '▁mar', 'a', 'th', 'on', '.']
