# BERT WordPiece Tokenizer (한국어)

## BertWordPieceTokenizer 훈련

In [1]:
import os

from tokenizers import BertWordPieceTokenizer

# Plain-text corpora fed to the WordPiece trainer (presumably the KorQuAD
# train/validation contexts and questions, judging by the file names).
data_files = [
    "korquad_train_context.txt",
    "korquad_train_question.txt",
    "korquad_validation_context.txt",
    "korquad_validation_question.txt",
]

# One tokenizer is trained and saved per target vocabulary size so the
# resulting segmentations can be compared side by side afterwards.
vocab_sizes = [5000, 10000, 20000, 30000, 40000, 50000]
# Trainer settings shared by every run.
limit_alphabet = 6000
min_frequency = 5

# Directory that receives the serialized tokenizers. Create it up front so a
# missing directory is not discovered only when the first save is attempted,
# after training has already been paid for.
output_dir = "bert_word_piece_korquad"
os.makedirs(output_dir, exist_ok=True)

for vocab_size in vocab_sizes:
    # A fresh tokenizer per size; lowercasing is explicitly disabled.
    tokenizer = BertWordPieceTokenizer(lowercase=False)
    tokenizer.train(
        files=data_files,
        vocab_size=vocab_size,
        limit_alphabet=limit_alphabet,
        min_frequency=min_frequency,
    )

    # Persist the full tokenizer definition as a single JSON file.
    tokenizer_file = f"{output_dir}/tokenizer_{vocab_size}.json"
    tokenizer.save(tokenizer_file)
    print(f"Saved tokenizer: {tokenizer_file}")




Saved tokenizer: bert_word_piece_korquad/tokenizer_5000.json



Saved tokenizer: bert_word_piece_korquad/tokenizer_10000.json



Saved tokenizer: bert_word_piece_korquad/tokenizer_20000.json



Saved tokenizer: bert_word_piece_korquad/tokenizer_30000.json



Saved tokenizer: bert_word_piece_korquad/tokenizer_40000.json



Saved tokenizer: bert_word_piece_korquad/tokenizer_50000.json


## 훈련 결과 이용

In [2]:
from tokenizers import Tokenizer

# Sample sentences used to compare how each trained vocabulary segments text.
texts = [
    "미세먼지가 심하면 차량 2부제와 같은 비상저감조치를 시행",
    "가뜩이나 어려운 조건 속에서"
]

# Load every saved tokenizer exactly once, in vocab_sizes order, instead of
# re-reading the same JSON file for each sample text. A list of pairs (rather
# than a dict) keeps one entry per vocab_sizes element, even with duplicates.
tokenizer_paths = [
    f"bert_word_piece_korquad/tokenizer_{size}.json" for size in vocab_sizes
]
loaded_tokenizers = [(path, Tokenizer.from_file(path)) for path in tokenizer_paths]

for text in texts:
    # Visual separator between the reports of successive sample texts.
    print("#" * 80)
    print("TEXT: " + text)
    for tokenizer_file, tokenizer in loaded_tokenizers:
        print(f"  TOKENIZER: {tokenizer_file}")
        # Show the token strings (not ids) produced for this text.
        print(f"    {tokenizer.encode(text).tokens}")

################################################################################
TEXT: 미세먼지가 심하면 차량 2부제와 같은 비상저감조치를 시행
  TOKENIZER: bert_word_piece_korquad/tokenizer_5000.json
    ['미', '##세', '##먼', '##지', '##가', '심', '##하', '##면', '차', '##량', '2', '##부', '##제', '##와', '같', '##은', '비', '##상', '##저', '##감', '##조', '##치', '##를', '시', '##행']
  TOKENIZER: bert_word_piece_korquad/tokenizer_10000.json
    ['미', '##세', '##먼', '##지가', '심', '##하면', '차량', '2', '##부', '##제와', '같은', '비', '##상', '##저', '##감', '##조', '##치를', '시행']
  TOKENIZER: bert_word_piece_korquad/tokenizer_20000.json
    ['미세', '##먼', '##지가', '심', '##하면', '차량', '2부', '##제와', '같은', '비상', '##저', '##감', '##조', '##치를', '시행']
  TOKENIZER: bert_word_piece_korquad/tokenizer_30000.json
    ['미세먼지', '##가', '심', '##하면', '차량', '2부', '##제와', '같은', '비상', '##저', '##감', '##조치를', '시행']
  TOKENIZER: bert_word_piece_korquad/tokenizer_40000.json
    ['미세먼지', '##가', '심', '##하면', '차량', '2부', '##제와', '같은', '비상', '##저', '##감', '##조치를', '시행']
  TOKENI