In [16]:
from tokenizers import Tokenizer, pre_tokenizers, models
from thai2transformers.tokenizers import CustomPreTokenizer, WordLevelTrainer

# PyThaiNLP tokenizers (word_tokenize uses the "newmm" engine by default)
from pythainlp.tokenize import word_tokenize, syllable_tokenize

#sefr cut
import sefr_cut
# Load the 'ws1000' SEFR CUT model once, before pingcut is ever called.
sefr_cut.load_model(engine='ws1000')


def pingcut(text):
    """Tokenize ``text`` with SEFR CUT and return the first item of the result.

    NOTE(review): presumably ``sefr_cut.tokenize`` returns one token list per
    input sentence, so index 0 is the token list for ``text`` -- confirm.
    """
    segmented = sefr_cut.tokenize(text)
    return segmented[0]

# Parameters
# Tokenize function handed to both the vocab trainer and the custom pre-tokenizer.
# Alternatives: syllable_tokenize, pingcut (very slow on CPU).
tokenize_func = word_tokenize

# Sample inputs; '<_>' is one of the special tokens registered with the trainer.
texts = [
    'โรนัลโด<_>เขาได้เล่นกับทีม',
    'โปรตุเกส<_>มีโรนัลโด',
]

from transformers import CamembertTokenizer

# Show the installed `tokenizers` version so the results can be reproduced
# (the outputs saved in this notebook were produced with 0.9.4).
import tokenizers
tokenizers.__version__

loading model.....
Success


'0.9.4'

In [50]:
%%time
#get vocab
trainer = WordLevelTrainer(pre_tokenize_func=tokenize_func, 
                           vocab_size=10, 
                           input_dir='test_folder', 
                           additional_special_tokens=['<s>','<pad>','</s>','<unk>','<mask>', '<_>'])
trainer.count_parallel()
trainer.save_vocab('test.json')

CPU times: user 19 ms, sys: 51.8 ms, total: 70.8 ms
Wall time: 184 ms


In [51]:
# Inspect the trainer's token-frequency Counter
# (presumably filled by count_parallel() -- confirm).
trainer.token_counter

Counter({'กริช': 3,
         'ตี': 3,
         'ยา': 3,
         'นู': 3,
         ' ': 306,
         'รู': 6,
         'นัล': 6,
         'ดู': 6,
         'ดุ': 3,
         'ช': 6,
         'ซัง': 3,
         'ตุ': 6,
         'อา': 6,
         'ไว': 3,
         '(': 3,
         'โปรตุเกส': 27,
         ':': 3,
         'Cristiano': 3,
         'Ronaldo': 3,
         'dos': 3,
         'Santos': 3,
         'Aveiro': 3,
         ';': 3,
         'เกิด': 3,
         '5': 6,
         'กุมภาพันธ์': 3,
         'ค.ศ.': 6,
         '1985': 3,
         ')': 3,
         'หรือ': 3,
         'ที่': 21,
         'รู้จัก': 3,
         'กัน': 3,
         'ใน': 69,
         'ชื่อ': 3,
         'ค': 6,
         'ริ': 6,
         'สเตีย': 6,
         'โน': 6,
         'โรนัลโด': 36,
         'เป็น': 27,
         'นักฟุตบอล': 6,
         'ชาว': 3,
         'ปัจจุบัน': 6,
         'เล่น': 27,
         'ตำแหน่ง': 9,
         'กองหน้า': 3,
         'ให้': 12,
         'กับ': 30,
         'ยู': 6,
     

In [52]:
# Inspect the token -> id mapping held by the trainer: the six special tokens
# take ids 0-5 and are followed by ten tokens from the corpus (vocab_size=10).
trainer.vocab

{'<s>': 0,
 '<pad>': 1,
 '</s>': 2,
 '<unk>': 3,
 '<mask>': 4,
 '<_>': 5,
 ' ': 6,
 'ใน': 7,
 'ได้': 8,
 'โรนัลโด': 9,
 'เขา': 10,
 'กับ': 11,
 'ทีม': 12,
 'โปรตุเกส': 13,
 'เป็น': 14,
 'เล่น': 15}

In [53]:
# Create the tokenizer and its custom pre-tokenizer.

# Word-level model backed by the vocab saved in 'test.json'; tokens that are
# not in the vocab are mapped to '<unk>'.
word_level = models.WordLevel.from_file('test.json', unk_token='<unk>')
tokenizer = Tokenizer(word_level)

# Wrap the Python tokenize function so `tokenizers` can use it for pre-tokenization.
c = CustomPreTokenizer(tokenize_func)
pre = pre_tokenizers.PreTokenizer.custom(c)
tokenizer.pre_tokenizer = pre

In [54]:
# Simple encoding: padding and truncation are not enabled yet, so each
# Encoding keeps its own length (see num_tokens in the output).
tokenizer.encode_batch(texts)

[Encoding(num_tokens=7, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=4, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]

In [55]:
# Padding: pad every encoding to a fixed length of 10.
# The pad id is looked up from the vocabulary instead of being hard-coded, so
# it stays correct if the special-token order (and therefore the ids) changes.
pad_token = '<pad>'
pad_id = tokenizer.token_to_id(pad_token)
assert pad_id is not None, f"{pad_token!r} is missing from the tokenizer vocabulary"
tokenizer.enable_padding(pad_id=pad_id, pad_token=pad_token, length=10)

In [56]:
# Truncation: cap every encoding at 10 tokens. This is the same value as the
# padding length, so with both enabled every encoding has exactly 10 ids.
tokenizer.enable_truncation(max_length=10)

In [57]:
# Final model inputs: encode the sample texts (now padded/truncated to 10 ids)
# and keep only the fields a model needs.
encoded_texts = tokenizer.encode_batch(texts)
res = [
    {'input_ids': enc.ids, 'attention_mask': enc.attention_mask}
    for enc in encoded_texts
]
res

[{'input_ids': [9, 5, 10, 8, 15, 11, 12, 1, 1, 1],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 0, 0, 0]},
 {'input_ids': [13, 5, 3, 9, 1, 1, 1, 1, 1, 1],
  'attention_mask': [1, 1, 1, 1, 0, 0, 0, 0, 0, 0]}]

In [58]:
# Round-trip check: decode the first example's ids back to text.
# Note that the trailing <pad> tokens show up in the decoded string.
tokenizer.decode(res[0]['input_ids'])

'โรนัลโด <_> เขา ได้ เล่น กับ ทีม <pad> <pad> <pad>'

In [59]:
# Decode the second example: the out-of-vocabulary word comes back as <unk>.
tokenizer.decode(res[1]['input_ids'])

'โปรตุเกส <_> <unk> โรนัลโด <pad> <pad> <pad> <pad> <pad> <pad>'

In [60]:
# Encode a text whose words are all out-of-vocabulary: only the '<_>' special
# token has its own id, everything else maps to <unk> (padding still applies).
oov_encodings = tokenizer.encode_batch(['UBI<_>และ'])
res = [
    {'input_ids': enc.ids, 'attention_mask': enc.attention_mask}
    for enc in oov_encodings
]

res

[{'input_ids': [3, 5, 3, 1, 1, 1, 1, 1, 1, 1],
  'attention_mask': [1, 1, 1, 0, 0, 0, 0, 0, 0, 0]}]

In [61]:
# Decode the out-of-vocabulary example: both words come back as <unk>.
tokenizer.decode(res[0]['input_ids'])

'<unk> <_> <unk> <pad> <pad> <pad> <pad> <pad> <pad> <pad>'

In [63]:
# Look up the id of every special token, in the order they were given to the trainer.
tuple(
    tokenizer.token_to_id(token)
    for token in ('<s>', '<pad>', '</s>', '<unk>', '<mask>', '<_>')
)

(0, 1, 2, 3, 4, 5)