In [4]:
from transformers import BertTokenizer
from tokenizers import BertWordPieceTokenizer
from pathlib import Path

In [5]:
# Root directory for the artifacts written by this notebook: the trained
# WordPiece vocab goes under 'wordpiece' and the saved BertTokenizer under
# 'berttokenizer'.
out_base = Path('./polished/models/v2bert/')

In [6]:
# Plain-text corpus the WordPiece vocabulary is trained on.
# NOTE(review): presumably Georgian ('ka') text, judging by the file name and
# the sample sentences used further down — confirm.
data_file = './no_en_data/ka_nse_train.txt'

### Train sub-word tokenizer

In [7]:
# Train a WordPiece vocabulary from scratch on the raw text corpus.
# Case is preserved (lowercase=False) while accents are stripped; any tokenizer
# that later loads this vocab should be configured with the same normalization.
wb_tokenizer = BertWordPieceTokenizer(clean_text=True, handle_chinese_chars=True,
                                      strip_accents=True, lowercase=False)

# The special tokens are listed first, so they are expected to take the lowest
# vocabulary ids in this order.
wb_tokenizer.train(data_file,
                   vocab_size=30000, min_frequency=5,
                   special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"])

# save_model() expects the target folder to exist already, so create it first;
# otherwise this cell fails on a fresh checkout / Restart & Run All.
wordpiece_dir = out_base / 'wordpiece'
wordpiece_dir.mkdir(parents=True, exist_ok=True)
wb_tokenizer.save_model(str(wordpiece_dir))






['polished/models/v2bert/wordpiece/vocab.txt']

In [8]:
# Sanity check with Latin-script input: in the recorded output below both
# words come back as [UNK].
wb_tokenizer.encode('Hi broz').tokens

['[UNK]', '[UNK]']

In [9]:
# Sanity check on a single Georgian-script word: the recorded output shows it
# split into sub-word pieces, with continuation pieces prefixed by '##'.
wb_tokenizer.encode('შემომეჭამა').tokens

['შემომ', '##ეჭ', '##ამა']

In [10]:
# Sanity check on a longer input with punctuation: in the recorded output each
# punctuation mark ('-', '.', '?', '[') is emitted as its own token.
wb_tokenizer.encode('-დიალოგის მაგალითი. მაგარია? ვნახოთ აბა [ლოლ').tokens

['-', 'დიალოგის', 'მაგალითი', '.', 'მაგარია', '?', 'ვნახოთ', 'აბა', '[', 'ლოლ']

### Create a BertTokenizer that knows the role of each special token

In [11]:
# Wrap the trained vocab in a transformers BertTokenizer so that the special
# tokens are registered with their roles (padding, unknown, mask, ...).
#
# The normalization must match the one used when the vocab was trained above
# (strip_accents=True, lowercase=False). BertTokenizer derives strip_accents
# from do_lower_case when it is left unset, which with do_lower_case=False
# would keep accents and diverge from the trained vocabulary, so it is set
# explicitly here.
tokenizer = BertTokenizer(str(out_base / 'wordpiece' / "vocab.txt"),
                          do_lower_case=False, do_basic_tokenize=True,
                          strip_accents=True,
                          bos_token='[CLS]',
                          eos_token='[SEP]', sep_token='[SEP]',
                          cls_token='[CLS]', unk_token='[UNK]',
                          pad_token='[PAD]', mask_token='[MASK]',)

In [12]:
# Persist the configured BertTokenizer (vocab plus config files) under
# out_base so it can be reloaded later; the returned paths are the cell output.
tokenizer.save_pretrained(str(out_base.joinpath('berttokenizer')))

('polished/models/v2bert/berttokenizer/tokenizer_config.json',
 'polished/models/v2bert/berttokenizer/special_tokens_map.json',
 'polished/models/v2bert/berttokenizer/vocab.txt',
 'polished/models/v2bert/berttokenizer/added_tokens.json')

In [13]:
# Encode a single word end to end with the BertTokenizer. In the recorded
# output the three word-piece ids are wrapped by ids 2 and 3 — presumably
# [CLS] and [SEP], given the special-token order used at training time.
tokenizer('შემომეჭამა')

{'input_ids': [2, 6399, 1909, 2096, 3], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}