## BPE(Byte Pair Encoding) 토크나이저 만들기

* 어휘 집합(vocab.json) 생성 필요  
* 병합 우선순위(merges.txt) 생성 필요  
* 어휘 집합과 병합 우선순위가 있으면 토큰화를 수행 가능

In [2]:
import pandas as pd
import os 

In [3]:
# Directory that holds the tokenizer training corpora and the saved tokenizer files.
data_dir = "./tokenizer_data"

In [4]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# './data' exists before the CSV copies are written.
os.makedirs('./data', exist_ok=True)

# Download the tab-separated train ratings from the e9t/nsmc repository and
# keep a local CSV copy.
train_data = pd.read_csv('https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt', sep='\t', encoding='utf-8')
train_data.to_csv('./data/ratings_train.csv', encoding='utf-8', index=False)

# Same for the test ratings.
test_data = pd.read_csv('https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt', sep='\t', encoding='utf-8')
test_data.to_csv('./data/ratings_test.csv', encoding='utf-8', index=False)


In [16]:
train_file_nm = 'bpe_ratings_train.txt'
full_data_dir = os.path.join(data_dir, train_file_nm)

# open(..., 'w') does not create parent directories, so make sure the output
# directory exists before writing the corpus file.
os.makedirs(data_dir, exist_ok=True)

# Set of whitespace-separated words seen in the corpus.
corp = set()

# One review per line. Missing reviews are dropped first; otherwise
# astype('str') would write them into the corpus as the literal text "nan".
train_str = '\n'.join(train_data['document'].dropna().astype('str').tolist())

# The context manager closes the file even if the write raises.
with open(full_data_dir, 'w', encoding='utf-8') as text_file:
    text_file.write(train_str)

# split() without an argument splits on every run of whitespace, so the
# newlines between reviews also separate words and no empty strings appear.
train_list = train_str.split()
train_set = set(train_list)
corp.update(train_set)

In [17]:
test_file_nm = 'bpe_ratings_test.txt'
full_data_dir = os.path.join(data_dir, test_file_nm)

# open(..., 'w') does not create parent directories, so make sure the output
# directory exists before writing the corpus file.
os.makedirs(data_dir, exist_ok=True)

# NOTE: `corp` is intentionally NOT re-created here. It already holds the
# words collected from the train corpus, and resetting it to an empty set
# would throw them away before the test words are added.

# One review per line. Missing reviews are dropped first; otherwise
# astype('str') would write them into the corpus as the literal text "nan".
test_str = '\n'.join(test_data['document'].dropna().astype('str').tolist())

# The context manager closes the file even if the write raises.
with open(full_data_dir, 'w', encoding='utf-8') as text_file:
    text_file.write(test_str)

# split() without an argument splits on every run of whitespace, so the
# newlines between reviews also separate words and no empty strings appear.
test_list = test_str.split()
test_set = set(test_list)
corp.update(test_set)

In [18]:
from tokenizers import CharBPETokenizer

In [19]:
# Upper bound on the number of entries in the learned vocabulary.
vocab_size = 10000

# Both corpus dumps are fed to the trainer.
train_fnm = os.path.join(data_dir, train_file_nm)
test_fnm = os.path.join(data_dir, test_file_nm)
corpus_files = [train_fnm, test_fnm]

# Learn the character-level BPE model; "<unk>" is registered as a special token.
tokenizer = CharBPETokenizer()
tokenizer.train(files=corpus_files, vocab_size=vocab_size, special_tokens=["<unk>"])

# Persist the model files into data_dir, plus a single-file JSON serialization
# that can be reloaded with Tokenizer.from_file.
tokenizer.save_model(data_dir)
tokenizer.save(f"{data_dir}/bpe_tokenizer.json")

## 훈련한 토크나이저 적용하기

In [20]:
from tokenizers import Tokenizer

In [21]:
# Load the tokenizer serialized as bpe_tokenizer.json under data_dir.
# This is a module-level name, so the function below can read it directly.
# No `global` statement is needed: it only matters inside a function, and
# declaring a name global after assigning it in the same scope is a
# SyntaxError on Python 3.6+.
bpe_tokenizer_pretrained = Tokenizer.from_file(f"{data_dir}/bpe_tokenizer.json")


def usingBPETokenizer(data: str) -> list:
    """Tokenize text with the pretrained BPE tokenizer.

    Args:
        data: Text to encode.

    Returns:
        The token strings of the resulting encoding (its ``tokens`` attribute).
    """
    return bpe_tokenizer_pretrained.encode(data).tokens

In [22]:
# Show the review text stored under row label 0 as a sample input.
train_data['document'][0]

'아 더빙.. 진짜 짜증나네요 목소리'

In [23]:
# Tokenize the review under row label 0 with the trained tokenizer and print the tokens.
first_review = train_data['document'][0]
print(usingBPETokenizer(first_review))

['아</w>', '더빙</w>', '.</w>', '.</w>', '진짜</w>', '짜증나', '네요</w>', '목소리</w>']
