In [None]:
!pip install tokenizers

In [None]:
from tokenizers import models
from tokenizers import decoders
from tokenizers import Tokenizer
from tokenizers import trainers
from tokenizers import processors

from tokenizers import normalizers
from tokenizers import pre_tokenizers
from transformers import PreTrainedTokenizerFast

# Start from an untrained WordPiece model; '[UNK]' stands in for any piece that
# is not in the vocabulary.
tokenizer = Tokenizer(models.WordPiece(unk_token='[UNK]'))

# Normalization: apply NFKD Unicode normalization first, then lowercase.
# The order matters: some compatibility characters (e.g. U+210B SCRIPT CAPITAL H)
# have no lowercase mapping of their own but decompose to an uppercase ASCII
# letter, so lowercasing before NFKD would let uppercase letters slip through.
tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFKD(), normalizers.Lowercase()]
)

# Pre-tokenization: split the normalized text into word-level pieces before the
# WordPiece model is applied to each piece.
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Training the tokenizer.
# Trainer hyperparameters; the special tokens are registered here so that they
# receive ids in the trained vocabulary.
trainer = trainers.WordPieceTrainer(
    vocab_size=30000,
    special_tokens=['[UNK]', '[PAD]', '[CLS]', '[SEP]', '[MASK]'],
    min_frequency=2,
    continuing_subword_prefix='##',
)

# Placeholder corpus: replace with the real list (or any iterable) of sentences.
# It has to be a collection of strings rather than one bare string, because
# iterating a single string yields it one character at a time and the trainer
# would then only ever see single-character "sentences".
sentences = ['list containing language sentences']
tokenizer.train_from_iterator(sentences, trainer=trainer)


# Post-processing: look up the ids the trained vocabulary assigned to the
# special tokens used by the template below.
cls_id = tokenizer.token_to_id('[CLS]')
sep_id = tokenizer.token_to_id('[SEP]')

# Wrap encoded inputs in a BERT-style template:
#   single sequence -> [CLS] A [SEP]
#   sequence pair   -> [CLS] A [SEP] B [SEP]
# The number after each ':' is the type id attached to that part of the output.
# Plain string literals are used because the templates contain no placeholders.
tokenizer.post_processor = processors.TemplateProcessing(
    single='[CLS]:0 $A:0 [SEP]:0',
    pair='[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1',
    special_tokens=[
        ('[CLS]', cls_id),
        ('[SEP]', sep_id),
    ],
)

# Decoding: use the WordPiece decoder with the same '##' continuation prefix
# that the trainer was configured with.
tokenizer.decoder = decoders.WordPiece(prefix='##')


# Wrap the trained tokenizer in a transformers fast tokenizer, write it to
# disk, and read it back from the same directory.

# Special-token roles handed to the transformers wrapper.
special_token_kwargs = {
    'unk_token': '[UNK]',
    'pad_token': '[PAD]',
    'cls_token': '[CLS]',
    'sep_token': '[SEP]',
    'mask_token': '[MASK]',
}
tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer, **special_token_kwargs)

# Directory used for both saving and reloading.
output_dir = 'bert-base-dv'
tokenizer.save_pretrained(output_dir)

# Reload the saved tokenizer from disk.
tokenizer = PreTrainedTokenizerFast.from_pretrained(output_dir)