# Set up PreTrainedTokenizerFast

In [3]:
# Add /workspace to the module search path for this kernel; the
# `preprocessing.text_tokenizer` import in a later cell is presumably resolved
# through this entry — confirm where that package lives.
import sys
sys.path.append("/workspace")

In [46]:
import json

In [17]:
from pathlib import Path
import os
from preprocessing.text_tokenizer import TextTokenizer
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import CharDelimiterSplit
from tokenizers.processors import BertProcessing
# https://github.com/huggingface/transformers/issues/7234#issuecomment-720092292
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast, RobertaConfig, RobertaForMaskedLM

In [18]:
# Root directory for everything read and written below (vocab file, saved tokenizers).
dataset_path = Path('/workspace/notebooks/data')

In [29]:
# Step 2: build the tokenizer.
# Load the project vocabulary from disk.
text_tokenizer = TextTokenizer(dataset_path)
text_tokenizer.load_vocab(dataset_path.joinpath('pan_tadeusz', 'vocab.json'))

# Word-level model over that vocabulary; input text is split on single spaces.
tokenizer = Tokenizer(WordLevel(text_tokenizer.vocab))
tokenizer.pre_tokenizer = CharDelimiterSplit(' ')

# BERT-style post-processing: the first (token, id) pair is passed as the
# separator and the second as the classifier token.
tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)

# Cap encoded sequences at 128 tokens (the author noted 512 as an alternative).
tokenizer.enable_truncation(max_length=128)

# Serialize the tokenizer to JSON, creating the target directory if it is missing.
tokenizer_path = dataset_path.joinpath('tokenizer1a')
tokenizer_path.mkdir(parents=True, exist_ok=True)
tokenizer.save(str(tokenizer_path.joinpath("tokenizer1a.json")))

# Wrap the saved JSON in a transformers fast tokenizer and register the two
# special tokens set in this cell (mask and pad).
tokenizer2 = PreTrainedTokenizerFast(
    tokenizer_file=str(tokenizer_path.joinpath("tokenizer1a.json"))
)
tokenizer2.mask_token = "<mask>"
tokenizer2.pad_token = "<pad>"

In [34]:
# Display the raw `tokenizers.Tokenizer` object. The original cell ended with a
# dangling "." (leftover from tab-completion), which is a SyntaxError on re-run;
# the recorded output corresponds to evaluating the bare name.
tokenizer

<tokenizers.Tokenizer at 0x5571369c7720>

In [30]:
# Display the wrapper's repr to check which special tokens are registered.
tokenizer2

PreTrainedTokenizerFast(name_or_path='', vocab_size=6473, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', special_tokens={'pad_token': '<pad>', 'mask_token': '<mask>'})

In [15]:
# Inspect the pre-tokenizer carried by the wrapper's backend tokenizer
# (recorded output: a CharDelimiterSplit instance).
tokenizer2.backend_tokenizer.pre_tokenizer

<tokenizers.pre_tokenizers.CharDelimiterSplit at 0x7fc20984dab0>

In [23]:
# Experiment with the base (non-fast) PreTrainedTokenizer class.
# NOTE(review): this points at "tokenizer1.json", while the cell that saves the
# tokenizer writes "tokenizer1a.json" — confirm which file is intended.
# NOTE(review): presumably only PreTrainedTokenizerFast consumes `tokenizer_file`;
# verify against the transformers docs before relying on `tokenizer3` from this cell.
tokenizer3 = PreTrainedTokenizer(tokenizer_file=str(tokenizer_path/"tokenizer1.json"))

In [None]:
# Scratch cell (never executed): the statement below was left truncated and is a
# SyntaxError as written, so it is kept only as a comment to let Run All proceed.
# TODO: complete the import or delete this cell.
# from transformers.tokeniz

In [54]:
# Rebuild the fast wrapper from the saved tokenizer JSON. No special tokens are
# registered on it yet, so reading mask_token emits the "not set yet" notice
# recorded below.
tokenizer2 = PreTrainedTokenizerFast(tokenizer_file=str(tokenizer_path / "tokenizer1a.json"))
tokenizer2.mask_token

Using mask_token, but it is not set yet.


In [55]:
# Special-token roles mapped to their surface forms; "<s>" and "</s>" each
# serve two roles (bos/cls and eos/sep respectively).
special_tokens_map = dict(
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    sep_token="</s>",
    pad_token="<pad>",
    cls_token="<s>",
    mask_token="<mask>",
)

In [56]:
# Register the special-token roles on the wrapper.
# The recorded return value is 0 — presumably the count of tokens newly added to
# the vocabulary, i.e. all of them were already present; confirm against the docs.
tokenizer2.add_special_tokens(special_tokens_map)

0

In [57]:
# Re-check mask_token after registering the special tokens (recorded output: '<mask>').
tokenizer2.mask_token

'<mask>'

In [62]:
# Look up the vocabulary id that the eos token maps to (recorded output: 4).
tokenizer2.eos_token_id

4

In [71]:
# Persist the configured wrapper; the recorded output lists the three files written
# (tokenizer_config.json, special_tokens_map.json, tokenizer.json).
# The explicit mkdir below was left disabled by the author — presumably
# save_pretrained creates the directory itself; confirm.
# (dataset_path/'my-pretrained-tokenizer-fast1').mkdir()
tokenizer2.save_pretrained(save_directory=dataset_path/'my-pretrained-tokenizer-fast1', legacy_format=False)

('/workspace/notebooks/data/my-pretrained-tokenizer-fast1/tokenizer_config.json',
 '/workspace/notebooks/data/my-pretrained-tokenizer-fast1/special_tokens_map.json',
 '/workspace/notebooks/data/my-pretrained-tokenizer-fast1/tokenizer.json')

In [None]:
# Bare attribute reference in a cell that was never executed: it only evaluates
# to the bound method and does not call it, so nothing is saved here.
tokenizer2.save_vocabulary

In [79]:
# Reload a saved tokenizer directory and confirm the mask token is available.
# NOTE(review): this reads 'my-pretrained-tokenizer-fast2', whereas the save cell in
# this notebook writes 'my-pretrained-tokenizer-fast1' — presumably the "fast2"
# directory was prepared separately; confirm it exists before a fresh run.
tokenizer3 = PreTrainedTokenizerFast.from_pretrained(
    dataset_path/'my-pretrained-tokenizer-fast2'
)
tokenizer3.mask_token

'<mask>'

In [80]:
# Check the Python type returned by the mask_token attribute (recorded output: str).
type(tokenizer3.mask_token)

str

In [81]:
# Display the reloaded tokenizer's repr (source path, model_max_len, special tokens).
tokenizer3

PreTrainedTokenizerFast(name_or_path='/workspace/notebooks/data/my-pretrained-tokenizer-fast2', vocab_size=6473, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})

In [82]:
# Smoke test: encode a sample string. In the recorded output the ids start with 3
# and end with 4; 4 matches the eos_token_id reported for tokenizer2 earlier, so
# the sequence appears to be wrapped in special tokens by the post-processor.
tokenizer3("_cap_ lit++ --wo !")

{'input_ids': [3, 6, 4858, 3492, 8, 4], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}