In [30]:
from transformers import pipeline
from transformers import RobertaTokenizerFast, RobertaTokenizer

from tokenizers import SentencePieceBPETokenizer


In [31]:
# Load the fast RoBERTa tokenizer from the local vocabulary directory.
# The bare name on the last line makes the cell display the object's repr.
tokenizer = RobertaTokenizerFast.from_pretrained("../dataset/spm/th-wiki_only_20.7.2020_small")
tokenizer

<transformers.tokenization_roberta.RobertaTokenizerFast at 0x7f7b02c61f60>

In [32]:
# Build a second (slow) RoBERTa tokenizer directly from the vocab/merges files
# stored in the "_sentencepiece" directory, for comparison with `tokenizer`.
# NOTE(review): outputs later in this notebook show this tokenizer mapping Thai
# text almost entirely to id 3 / `<unk>`. Presumably RobertaTokenizer's
# pre-tokenization does not match how this vocabulary was trained -- TODO
# confirm; SentencePieceBPETokenizer (imported above, never used) may be the
# intended loader.
tokenizer_spm = RobertaTokenizer(
                    vocab_file="../dataset/spm/th-wiki_only_20.7.2020_small_sentencepiece/vocab.json",
                    merges_file="../dataset/spm/th-wiki_only_20.7.2020_small_sentencepiece/merges.txt")
tokenizer_spm

<transformers.tokenization_roberta.RobertaTokenizer at 0x7f7b02c61ef0>

In [33]:
# Display the vocabulary size reported by the fast tokenizer.
tokenizer.vocab_size

52000

In [34]:
# Keep both vocabularies around for side-by-side inspection; their keys are
# the token strings (see the spot-check prints in the next cell).
th_bpe_dict = tokenizer.get_vocab()
th_spm_dict = tokenizer_spm.get_vocab()

In [39]:
# Spot-check one arbitrary entry from each vocabulary to compare how the
# token strings are stored (iterating a mapping yields its keys in order).
print(list(th_bpe_dict)[10])
print(list(th_spm_dict)[20000])

eries
กษาปณ์


In [40]:
# Encode a short Thai phrase with the fast tokenizer; the cell output shows the
# resulting `input_ids` and `attention_mask`.
tokenizer.encode_plus('รถไฟสีแดง')

{'input_ids': [0, 1545, 284, 275, 1531, 2], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [43]:
# Encode a single Thai character with `tokenizer_spm` and display the raw ids
# to see how it is handled.
out = tokenizer_spm.encode('ส')
out

[0, 119, 3, 3, 2]

In [15]:
def viz_tok(text: str, tokenizer=None):
    """Print how ``tokenizer`` splits ``text``.

    Three lines are printed: the raw token strings, their vocabulary ids, and
    each id decoded on its own and joined with ``<SEP>`` so the token
    boundaries are visible in the original script.

    Args:
        text: The string to tokenize.
        tokenizer: Object providing ``tokenize``, ``convert_tokens_to_ids`` and
            ``decode``. When omitted, the notebook-level ``tokenizer`` is
            looked up at call time, so re-running the cell that loads it is
            picked up without having to re-define this function.

    Returns:
        None. The result is only printed.

    Raises:
        NameError: If no tokenizer is passed and no global ``tokenizer`` exists.
    """
    if tokenizer is None:
        # Late binding: a `tokenizer=tokenizer` default would freeze whichever
        # object existed when this cell was executed (stale hidden state).
        try:
            tokenizer = globals()["tokenizer"]
        except KeyError:
            raise NameError(
                "name 'tokenizer' is not defined; load a tokenizer first "
                "or pass one explicitly"
            ) from None

    tokens = tokenizer.tokenize(text)
    ids = tokenizer.convert_tokens_to_ids(tokens)
    # Decode one id at a time so every piece can be shown separately.
    out_txt = '<SEP>'.join(tokenizer.decode([_id]) for _id in ids)
    print(f'out: {tokens}')
    print(f'Ids: {ids}')
    print(f'Tokens: {out_txt}')

In [16]:
# Space-separated short Thai words (note the double space after the first one)
# run through the default tokenizer.
viz_tok('ปี  ค่ะ ครับ ใด ก็ มี ที่ นี่ นี้ บุก ทุก ทุกข์')

out: ['à¸Ľ', 'à¸µ', 'Ġ', 'Ġà¸Ħ', 'à¹Ī', 'à¸°', 'Ġà¸Ħà¸£', 'à¸±', 'à¸ļ', 'Ġ', 'à¹ĥà¸Ķ', 'Ġà¸ģ', 'à¹ĩ', 'Ġà¸¡', 'à¸µ', 'Ġà¸Ĺ', 'à¸µà¹Ī', 'Ġà¸Ļ', 'à¸µà¹Ī', 'Ġà¸Ļ', 'à¸µà¹ī', 'Ġà¸ļ', 'à¸¸', 'à¸ģ', 'Ġà¸Ĺ', 'à¸¸', 'à¸ģ', 'Ġà¸Ĺ', 'à¸¸', 'à¸ģà¸Ĥ', 'à¹Į']
Ids: [296, 275, 225, 386, 269, 283, 913, 271, 288, 225, 1266, 447, 300, 406, 275, 397, 299, 477, 299, 477, 394, 581, 302, 270, 397, 302, 270, 397, 302, 1191, 291]
Tokens: ป<SEP>ี<SEP> <SEP> ค<SEP>่<SEP>ะ<SEP> คร<SEP>ั<SEP>บ<SEP> <SEP>ใด<SEP> ก<SEP>็<SEP> ม<SEP>ี<SEP> ท<SEP>ี่<SEP> น<SEP>ี่<SEP> น<SEP>ี้<SEP> บ<SEP>ุ<SEP>ก<SEP> ท<SEP>ุ<SEP>ก<SEP> ท<SEP>ุ<SEP>กข<SEP>์


In [17]:
# An unspaced Thai phrase run through the default tokenizer.
viz_tok('อารยธรรมโบราณของแดนกลาง')

out: ['à¸Ńà¸²à¸£à¸¢à¸ĺà¸£à¸£à¸¡', 'à¹Ĥà¸ļà¸£à¸²à¸ĵ', 'à¸Ĥà¸Ńà¸ĩ', 'à¹ģà¸Ķà¸Ļ', 'à¸ģà¸¥à¸²à¸ĩ']
Ids: [9736, 1604, 340, 3611, 713]
Tokens: อารยธรรม<SEP>โบราณ<SEP>ของ<SEP>แดน<SEP>กลาง


In [44]:
# Same phrase as the previous cell, but through `tokenizer_spm` for comparison.
viz_tok('อารยธรรมโบราณของแดนกลาง', tokenizer=tokenizer_spm)

out: ['à', '¸', 'Ń', 'à', '¸', '²', 'à', '¸', '£', 'à', '¸', '¢', 'à', '¸', 'ĺ', 'à', '¸', '£', 'à', '¸', '£', 'à', '¸', '¡', 'à', '¹', 'Ĥ', 'à', '¸', 'ļ', 'à', '¸', '£', 'à', '¸', '²', 'à', '¸', 'ĵ', 'à', '¸', 'Ĥ', 'à', '¸', 'Ń', 'à', '¸', 'ĩ', 'à', '¹', 'ģ', 'à', '¸', 'Ķ', 'à', '¸', 'Ļ', 'à', '¸', 'ģ', 'à', '¸', '¥', 'à', '¸', '²', 'à', '¸', 'ĩ']
Ids: [119, 3, 3, 119, 3, 3, 119, 3, 102, 119, 3, 101, 119, 3, 3, 119, 3, 102, 119, 3, 102, 119, 3, 100, 119, 3, 3, 119, 3, 3, 119, 3, 102, 119, 3, 3, 119, 3, 3, 119, 3, 3, 119, 3, 3, 119, 3, 3, 119, 3, 3, 119, 3, 3, 119, 3, 3, 119, 3, 3, 119, 3, 3, 119, 3, 3, 119, 3, 3]
Tokens: �<SEP><unk><SEP><unk><SEP>�<SEP><unk><SEP><unk><SEP>�<SEP><unk><SEP>�<SEP>�<SEP><unk><SEP>�<SEP>�<SEP><unk><SEP><unk><SEP>�<SEP><unk><SEP>�<SEP>�<SEP><unk><SEP>�<SEP>�<SEP><unk><SEP>�<SEP>�<SEP><unk><SEP><unk><SEP>�<SEP><unk><SEP><unk><SEP>�<SEP><unk><SEP>�<SEP>�<SEP><unk><SEP><unk><SEP>�<SEP><unk><SEP><unk><SEP>�<SEP><unk><SEP><unk><SEP>�<SEP><unk><SEP><unk><SEP>�<SE

In [19]:

# Build a fill-mask pipeline from a local training checkpoint, reusing the
# fast tokenizer loaded earlier in the notebook.
# NOTE(review): `device=2` is hard-coded; presumably it selects a specific GPU
# by index -- confirm that device exists on the machine running this notebook.
fill_mask = pipeline(
    "fill-mask",
    model="../results/checkpoint-25000/",
    tokenizer=tokenizer,
    device=2
)

In [22]:
# Ask the model to fill the `<mask>` slot inside a Thai phrase; the cell output
# lists the candidate sequences with their scores.
fill_mask("อารยธรรมรถ<mask>ฟ้า")

[{'sequence': '<s>อารยธรรมรถัฟ้า</s>',
  'score': 0.06177940219640732,
  'token': 271,
  'token_str': 'à¸±'},
 {'sequence': '<s>อารยธรรมรถิฟ้า</s>',
  'score': 0.04437718912959099,
  'token': 279,
  'token_str': 'à¸´'},
 {'sequence': '<s>อารยธรรมรถ้ฟ้า</s>',
  'score': 0.04331345111131668,
  'token': 274,
  'token_str': 'à¹ī'},
 {'sequence': '<s>อารยธรรมรถ่ฟ้า</s>',
  'score': 0.04146899655461311,
  'token': 269,
  'token_str': 'à¹Ī'},
 {'sequence': '<s>อารยธรรมรถีฟ้า</s>',
  'score': 0.037992678582668304,
  'token': 275,
  'token_str': 'à¸µ'}]