In [1]:
from gluonnlp.data.tokenizers import SentencepieceTokenizer
from transformers import T5Tokenizer

In [2]:
trans = T5Tokenizer.from_pretrained("t5-base")
gluon = SentencepieceTokenizer(model_path=trans.vocab_file)

In [3]:
extra_ids = 100
additional_special_tokens = {
    'extra_{}_token'.format(i): '<extra_id_{}>'.format(i) for i in range(extra_ids - 1, -1, -1)
} # keys must end with "token"
# https://github.com/dmlc/gluon-nlp/blob/12f6da265237a9ab32feff5e4a446a275f74e5da/src/gluonnlp/data/vocab.py#L205

In [4]:
# https://github.com/google/sentencepiece/blob/master/doc/special_symbols.md
# unable to add extra_token_{1..99} in spm.SentencePieceProcessor constructor

# https://github.com/dmlc/gluon-nlp/blob/12f6da265237a9ab32feff5e4a446a275f74e5da/src/gluonnlp/data/tokenizers/sentencepiece.py#L112-L122
# unable to add extra_token_{1..99} in SentencepieceTokenizer constructor

other_control_tokens_ids = \
            [i for i in range(len(gluon._sp_model))
             if gluon._sp_model.is_control(i)]
other_control_tokens = set([gluon._sp_model.id_to_piece(ele)
                            for ele in other_control_tokens_ids])
other_control_tokens

{'</s>', '<pad>'}

In [5]:
from gluonnlp.data.vocab import Vocab

gluon._vocab = Vocab(gluon._vocab.all_tokens, **additional_special_tokens)

In [6]:
# https://github.com/dmlc/gluon-nlp/blob/12f6da265237a9ab32feff5e4a446a275f74e5da/src/gluonnlp/data/vocab.py#L204

for i in range(10): 
    token = '<extra_id_{}>'.format(i)
    print('Token: {}, Index: {}'.format(token, gluon.vocab(token)))

Token: <extra_id_0>, Index: 32000
Token: <extra_id_1>, Index: 32011
Token: <extra_id_2>, Index: 32022
Token: <extra_id_3>, Index: 32033
Token: <extra_id_4>, Index: 32044
Token: <extra_id_5>, Index: 32055
Token: <extra_id_6>, Index: 32066
Token: <extra_id_7>, Index: 32077
Token: <extra_id_8>, Index: 32088
Token: <extra_id_9>, Index: 32099


In [7]:
gluon.encode("Hello World", int)

[8774, 1150]

In [8]:
trans.encode("Hello World") # </s>: 1

[8774, 1150, 1]

In [9]:
trans

PreTrainedTokenizer(name_or_path='t5-base', vocab_size=32100, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>', '<extra_id_44>', '<extra_id_45>',

In [10]:
gluon

SentencepieceTokenizer(
   model_path = /home/ubuntu/.cache/huggingface/transformers/684a47ca6257e4ca71f0037771464c5b323e945fbc58697d2fad8a7dd1a2f8ba.3b69006860e7b5d0a63ffdddc01ddcd6b7c318a6f4fd793596552c741734c62d
   lowercase = False, nbest = 0, alpha = 0.0
   vocab = Vocab(size=32100, unk_token="<unk>", extra_0_token="<extra_id_0>", extra_10_token="<extra_id_10>", extra_11_token="<extra_id_11>", extra_12_token="<extra_id_12>", extra_13_token="<extra_id_13>", extra_14_token="<extra_id_14>", extra_15_token="<extra_id_15>", extra_16_token="<extra_id_16>", extra_17_token="<extra_id_17>", extra_18_token="<extra_id_18>", extra_19_token="<extra_id_19>", extra_1_token="<extra_id_1>", extra_20_token="<extra_id_20>", extra_21_token="<extra_id_21>", extra_22_token="<extra_id_22>", extra_23_token="<extra_id_23>", extra_24_token="<extra_id_24>", extra_25_token="<extra_id_25>", extra_26_token="<extra_id_26>", extra_27_token="<extra_id_27>", extra_28_token="<extra_id_28>", extra_29_token="<extra_i

In [30]:
def noise_span_to_unique_sentinel(tokens, noise_mask, vocab):
  """Replace each run of consecutive noise tokens with a different sentinel.
  The idea here is to be able to align the dropped spans in the inputs
  with the markers in the targets.
  We want to generate training examples like
  "We hold X to be Y that" -> "X these truths Y self evident Z"
  Sentinels assigned in decreasing order within the sequence starting at
  vocabulary.size - 1.  That is, we appropriate the last tokens in the
  vocabulary for additional use as sentinels.
  TODO(noam): we may want to try enlarging the vocabulary and leaving room
  for the sentinels instead.  However, this requires enlarging the embedding
  tables in the model, so that is a bigger change.
  Args:
    tokens: a 1d integer Tensor
    noise_mask: a boolean Tensor with the same shape as tokens
    vocabulary: a vocabulary.Vocabulary
    seeds: an unused int32 Tensor
  Returns:
    a Tensor with the same shape and dtype as tokens
  """
  vocab_size = vocab
  prev_token_is_noise = tf.pad(noise_mask[:-1], [[1, 0]])

  first_noise_tokens = tf.logical_and(
      noise_mask, tf.logical_not(prev_token_is_noise))
  subsequent_noise_tokens = tf.logical_and(noise_mask, prev_token_is_noise)

  sentinel = vocab_size - tf.cumsum(tf.cast(first_noise_tokens, tokens.dtype))

  tokens = tf.where(first_noise_tokens, sentinel, tokens)
  return tf.boolean_mask(tokens, tf.logical_not(subsequent_noise_tokens))

In [31]:
noise_span_to_unique_sentinel(
    tf.constant([1, 2, 3, 4, 5, 6, 7, 8]), 
    tf.constant([True, False, False, False, True, True, False, True]), 
    1000
)

<tf.Tensor: shape=(7,), dtype=int32, numpy=array([999,   2,   3,   4, 998,   7, 997], dtype=int32)>

In [21]:
from transformers import AlbertTokenizer

In [22]:
altr_tk = AlbertTokenizer.from_pretrained('albert-base-v2')

In [23]:
altr_tk("Hello World")

{'input_ids': [2, 10975, 126, 3], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [24]:
algl_tk = SentencepieceTokenizer(altr_tk.vocab_file, lowercase=True)

In [25]:
algl_tk.encode("Hello World", int)

[10975, 126]

In [120]:
altr_tk

PreTrainedTokenizer(name_or_path='albert-base-v2', vocab_size=30000, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '<unk>', 'sep_token': '[SEP]', 'pad_token': '<pad>', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [121]:
algl_tk

SentencepieceTokenizer(
   model_path = /home/ubuntu/.cache/huggingface/transformers/10be6ce6d3508f1fdce98a57a574283b47c055228c1235f8686f039287ff8174.d6110e25022b713452eb83d5bfa8ae64530995a93d8e694fe52e05aa85dd3a7d
   lowercase = True, nbest = 0, alpha = 0.0
   vocab = Vocab(size=30000, unk_token="<unk>", pad_token="<pad>", other2_token="[CLS]", other0_token="[SEP]", other1_token="[MASK]")
)