In [1]:
import re
import collections
from typing import Dict, List, Tuple, Set

## 1. BPE (Byte Pair Encoding)

BPE is a data compression algorithm proposed in 1994.  
Basically, it works by finding a pair of consecutive words most frequently appeared, and merged it into one letter.

- e.g. `aaabdaaabac`
    - `Z=aa` $\rightarrow$   `ZabdZabac`
    - `Z=aa, Y=ab` $\rightarrow$  `ZYdZYac`
    - `Z=aa, Y=ab, X=ZY` $\rightarrow$  `XdXac`

In natural language preprocessing, BPE is a subword segmentation algorithm, which means it splits exsiting word.
- e.g. frequency of each word in train vocaburay
    ```python
    # dictionary (frequency of each word in train vocaburary)
    low : 5, lower : 2, newest : 6, widest : 3

    # vocabulary
    low, lower, newest, widest

        ↓

    # dictionary
    l o w : 5,  l o w e r : 2,  n e w e s t : 6,  w i d e s t : 3
    
    # vocabulary
    l, o, w, e, r, n, s, t, i, d

        ↓ (1st update, "(e, s)" is the most frequent pair)

    # dictionary update
    l o w : 5,
    l o w e r : 2,
    n e w es t : 6,
    w i d es t : 3

    # vocabulary update
    l, o, w, e, r, n, s, t, i, d, es

        ↓ (2nd update, "(es, t)" is the most frequent pair)

    # dictionary update
    l o w : 5,
    l o w e r : 2,
    n e w es t : 6,
    w i d es t : 3

    # vocabulary update
    l, o, w, e, r, n, s, t, i, d, es, est

    ↓ (3rd update, "(l, o)" is the most frequent pair)

    # dictionary update
    l o w : 5,
    l o w e r : 2,
    n e w es t : 6,
    w i d es t : 3

    # vocabulary update
    l, o, w, e, r, n, s, t, i, d, es, est, lo

    ...
    ```

References:
- [Neural Machine Translation of Rare Words with Subword Units]


[Neural Machine Translation of Rare Words with Subword Units]: https://arxiv.org/abs/1508.07909

In [2]:
def get_stats(dictionary: Dict[str, int]) -> Dict[str, int]:
    pairs = collections.defaultdict(int)
    for word, freq in dictionary.items():
        symbols = word.split()
        for i in range(len(symbols)-1):
            pairs[symbols[i], symbols[i+1]] += freq
    return pairs


def merge_dictionary(pair: Tuple[str, str], v_in: Dict[str, int]) -> Dict[str, int]:
    v_out = {}
    bigram = re.escape(" ".join(pair))
    # (?<!\S) => negative lookbehind
    #   - ?<!X: case where there's no X right in front of the current location
    #   - \S: non-white-space character
    # (?!\S) => negative lookahead
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    for word in v_in:
        w_out = p.sub("".join(pair), word)
        v_out[w_out] = v_in[word]
    return v_out

In [3]:
num_merges = 10

dictionary = {
    "l o w </w>": 5,
    "l o w e r </w>": 2,
    "n e w e s t </w>": 6,
    "w i d e s t </w>": 3,
}

In [4]:
get_stats(dictionary)

defaultdict(int,
            {('l', 'o'): 7,
             ('o', 'w'): 7,
             ('w', '</w>'): 5,
             ('w', 'e'): 8,
             ('e', 'r'): 2,
             ('r', '</w>'): 2,
             ('n', 'e'): 6,
             ('e', 'w'): 6,
             ('e', 's'): 9,
             ('s', 't'): 9,
             ('t', '</w>'): 9,
             ('w', 'i'): 3,
             ('i', 'd'): 3,
             ('d', 'e'): 3})

In [5]:
bpe_codes = {}
bpe_codes_reverse = {}
for i in range(num_merges):
    pairs = get_stats(dictionary)
    most_frequent_pair = max(pairs, key=pairs.get)
    
    dictionary = merge_dictionary(most_frequent_pair, dictionary)

    bpe_codes[most_frequent_pair] = i
    bpe_codes_reverse["".join(most_frequent_pair)] = most_frequent_pair

In [6]:
get_stats(dictionary)

defaultdict(int,
            {('low', 'e'): 2,
             ('e', 'r'): 2,
             ('r', '</w>'): 2,
             ('wi', 'd'): 3,
             ('d', 'est</w>'): 3})

In [7]:
bpe_codes

{('e', 's'): 0,
 ('es', 't'): 1,
 ('est', '</w>'): 2,
 ('l', 'o'): 3,
 ('lo', 'w'): 4,
 ('n', 'e'): 5,
 ('ne', 'w'): 6,
 ('new', 'est</w>'): 7,
 ('low', '</w>'): 8,
 ('w', 'i'): 9}

In [8]:
bpe_codes_reverse

{'es': ('e', 's'),
 'est': ('es', 't'),
 'est</w>': ('est', '</w>'),
 'lo': ('l', 'o'),
 'low': ('lo', 'w'),
 'ne': ('n', 'e'),
 'new': ('ne', 'w'),
 'newest</w>': ('new', 'est</w>'),
 'low</w>': ('low', '</w>'),
 'wi': ('w', 'i')}

In [9]:
def get_pairs(word: Tuple[str]) -> Set[str]:
    """Return set of symbol pairs in a word.
    Word is represented as a tuple of symbols (symbols being variable-length strings).
    """
    if not word:
        return set()
    
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


def encode(word: str, bpe_codes: Dict[str, int]) ->  Tuple[str]:
    """Encode word based on list of BPE merge operations, which are applied consecutively"""

    # e.g. word="loki"
    #  - chars = ("l", "o", "k", "i", "</w>")
    #  - pairs = (("l", "o"), ("o", "k"), ("k", "i"), ("i", "</w>"))
    chars = tuple(word) + ("</w>", )
    pairs = get_pairs(word)

    if not pairs:
        return word

    num_iter = 0
    while True:
        num_iter += 1        
        bigram = min(pairs, key=lambda pair: bpe_codes.get(pair, float("inf")))

        # there's no further merge
        if bigram not in bpe_codes:
            break
        
        c1, c2 = bigram
        new = []
        i = 0
        while i < len(chars):
            cur = chars[i]
            # update i until cur == c1
            if c1 != cur:
                new.append(cur)
                i += 1
                continue

            # in case of (..., c1, c2, ...), merge c1 and c2
            #                  i       next 
            if c1 == cur and i < len(chars)-1 and c2 == chars[i+1]:                
                new.append(c1+c2)
                i += 2
            # in case of (..., c1, c3, ...), where c2 != c3
            #                  i   next
            else:
                new.append(c1)
                i += 1

        chars = tuple(new)
        if len(chars) == 1:
            break
        else:
            pairs = get_pairs(chars)

    # ignore </w> token
    if chars[-1] == "</w>":
        chars = chars[:-1]
    elif chars[-1].endswith("</w>"):
        chars = chars[:-1] + (chars[-1].replace("</w>", ""),)

    return chars

In [10]:
encode("", bpe_codes)

''

In [11]:
encode("loki", bpe_codes)

('lo', 'k', 'i')

In [12]:
encode("lowest", bpe_codes)

('low', 'est')

In [13]:
encode("lowing", bpe_codes)

('low', 'i', 'n', 'g')

In [14]:
encode("highing", bpe_codes)

('h', 'i', 'g', 'h', 'i', 'n', 'g')

## 2. SentencePiece
Sentencepiece is an open-source library developed by google for splitting subword.  
Since Sentencepice does not require pre-tokenization and uses only the raw data, it can be applied to all languages.


References:
- [SentencePiece: A simple and language independent subword tokenizer and detokenizer for Neural Text Processing]
- [sentencepiece Github]

[SentencePiece: A simple and language independent subword tokenizer and detokenizer for Neural Text Processing]: https://arxiv.org/pdf/1808.06226
[sentencepiece Github]: https://github.com/google/sentencepiece

In [15]:
import os
import csv
import urllib.request

import pandas as pd
import sentencepiece as spm

### 2-1. IMDB Example

In [16]:
data_dir = "../data/"
os.makedirs(data_dir, exist_ok=True)

imdb_csv_path = os.path.join(data_dir, "imdb_review.csv")
imdb_txt_path = os.path.join(data_dir, "imdb_review.txt")

In [17]:
urllib.request.urlretrieve(
    "https://raw.githubusercontent.com/LawrenceDuan/IMDb-Review-Analysis/master/IMDb_Reviews.csv",
    filename=imdb_csv_path
)

('../data/imdb_review.csv', <http.client.HTTPMessage at 0x7feb901462e0>)

In [18]:
df_train = pd.read_csv(imdb_csv_path)

In [19]:
df_train.head()

Unnamed: 0,review,sentiment
0,My family and I normally do not watch local mo...,1
1,"Believe it or not, this was at one time the wo...",0
2,"After some internet surfing, I found the ""Home...",0
3,One of the most unheralded great works of anim...,1
4,"It was the Sixties, and anyone with long hair ...",0


In [20]:
df_train.shape

(50000, 2)

In [21]:
# save df as text file (sentencepiece requires .txt input file)

with open(imdb_txt_path, "w", encoding="utf-8") as f:
    f.write("\n".join(df_train.review))

In [22]:
"""
Args:
- input: train data
- model_prefix: model name
- vocab_size: size of the vocabulary
- model_tyep: one of unigram(default), bpe, char, word
- max_sentence_length: maximum length of senetence
- pad_id, pad_piece: pad token id, value
- unk_id, unk_piece: unknown token id, value
- bos_id, bos_piece: begin of sentence token id, value
- eos_id, eos_piece: end of sequence token id, value
- user_defined_symbols: user defined token

Returns:
- {model_prefix}.vocab
- {model_prefix}.model
"""

spm.SentencePieceTrainer.Train(f"--input={imdb_txt_path} --model_prefix=imdb --vocab_size=5000 --model_type=bpe --max_sentence_length=9999")

sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=../data/imdb_review.txt --model_prefix=imdb --vocab_size=5000 --model_type=bpe --max_sentence_length=9999
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: ../data/imdb_review.txt
  input_format: 
  model_prefix: imdb
  model_type: BPE
  vocab_size: 5000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 9999
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_i

In [23]:
vocab_list = pd.read_csv("./imdb.vocab", sep="\t", header=None, quoting=csv.QUOTE_NONE)
vocab_list.sample(10, random_state=1234)

Unnamed: 0,0,1
2706,belie,-2703
2436,set,-2433
1201,▁top,-1198
1486,▁fant,-1483
4286,▁chick,-4283
2391,▁clim,-2388
4911,▁Chan,-4908
3262,▁govern,-3259
1361,▁guess,-1358
149,▁j,-146


In [24]:
# note that the vocab_size to train SentencePiece was 5,000

vocab_list.shape

(5000, 2)

In [25]:
# load model

sp = spm.SentencePieceProcessor()
vocab_file = "imdb.model"
sp.load(vocab_file)

True

In [26]:
lines = [
    "I didn't at all think of it this way.",
    "I have waited a long time for some to film",
]

for line in lines:
    print(line)
    print(sp.encode_as_pieces(line))
    print(sp.encode_as_ids(line))
    print()

I didn't at all think of it this way.
['▁I', '▁didn', "'", 't', '▁at', '▁all', '▁think', '▁of', '▁it', '▁this', '▁way', '.']
[41, 624, 4950, 4926, 139, 170, 378, 30, 58, 73, 413, 4945]

I have waited a long time for some to film
['▁I', '▁have', '▁wa', 'ited', '▁a', '▁long', '▁time', '▁for', '▁some', '▁to', '▁film']
[41, 142, 1364, 1121, 4, 668, 285, 93, 205, 33, 91]



### 2-2. Naver Movie Review Example

In [27]:
data_dir = "../data/"
os.makedirs(data_dir, exist_ok=True)

naver_movie_csv_path = os.path.join(data_dir, "naver_movie_review.csv")
naver_movie_txt_path = os.path.join(data_dir, "naver_movie_review.txt")

In [28]:
urllib.request.urlretrieve(
    "https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt",
    filename=naver_movie_csv_path
)

('../data/naver_movie_review.csv', <http.client.HTTPMessage at 0x7feb57ea1340>)

In [29]:
df = pd.read_table(naver_movie_csv_path)

In [30]:
df.head()

Unnamed: 0,id,document,label
0,8112052,어릴때보고 지금다시봐도 재밌어요ㅋㅋ,1
1,8132799,"디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산...",1
2,4655635,폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.,1
3,9251303,와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...,1
4,10067386,안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.,1


In [31]:
df.shape

(200000, 3)

In [32]:
# NOTE: document contains null values

df.dropna(how="any", inplace=True)

In [33]:
df.shape

(199992, 3)

In [34]:
# save df as text file (sentencepiece requires .txt input file)

with open(naver_movie_txt_path, "w", encoding="utf-8") as f:
    f.write("\n".join(df.document))

In [35]:
spm.SentencePieceTrainer.Train(f"--input={naver_movie_txt_path} --model_prefix=naver --vocab_size=5000 --model_type=bpe --max_sentence_length=9999")

sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=../data/naver_movie_review.txt --model_prefix=naver --vocab_size=5000 --model_type=bpe --max_sentence_length=9999
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: ../data/naver_movie_review.txt
  input_format: 
  model_prefix: naver
  model_type: BPE
  vocab_size: 5000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 9999
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_

In [36]:
vocab_list = pd.read_csv("./naver.vocab", sep="\t", header=None, quoting=csv.QUOTE_NONE)
vocab_list.sample(10, random_state=1234)

Unnamed: 0,0,1
2706,까진,-2703
2436,▁넘어,-2433
1201,이거,-1198
1486,▁영화인데,-1483
4286,벅,-4283
2391,한건,-2388
4911,홥,-4908
3262,at,-3259
1361,감이,-1358
149,다는,-146


In [37]:
vocab_list.shape

(5000, 2)

In [38]:
sp = spm.SentencePieceProcessor()
vocab_file = "./naver.model"
sp.load(vocab_file)

True

In [39]:
lines = [
  "뭐 이딴 것도 영화냐.",
  "진짜 최고의 영화입니다 ㅋㅋ",
]

for line in lines:
    print(line)
    print(sp.encode_as_pieces(line))
    print(sp.encode_as_ids(line))
    print()

뭐 이딴 것도 영화냐.
['▁뭐', '▁이딴', '▁것도', '▁영화냐', '.']
[136, 970, 1299, 2593, 3276]

진짜 최고의 영화입니다 ㅋㅋ
['▁진짜', '▁최고의', '▁영화입니다', '▁ᄏᄏ']
[54, 204, 825, 121]



### 2-3. API guides

In [40]:
# returns vocab size

sp.GetPieceSize()

5000

In [41]:
# index to subword

sp.IdToPiece(4)

'영화'

In [42]:
# subword to index

sp.PieceToId("영화")

4

In [43]:
# returns sentence from the input integer sequence

sp.DecodeIds([54, 204, 825, 121])

'진짜 최고의 영화입니다 ᄏᄏ'

In [44]:
# returns sentence from the input subword sequence

sp.DecodePieces(['▁진짜', '▁최고의', '▁영화입니다', '▁ᄏᄏ'])

'진짜 최고의 영화입니다 ᄏᄏ'

In [45]:
# returns subword sequence from the input sentence

sp.encode('진짜 최고의 영화입니다 ᄏᄏ', out_type=str)

['▁진짜', '▁최고의', '▁영화입니다', '▁ᄏᄏ']

In [46]:
# returns integer sequence from the input sentence

sp.encode('진짜 최고의 영화입니다 ᄏᄏ', out_type=int)

[54, 204, 825, 121]

## 3. SubwodTextEncoder
SubwodTextEncoder is a subword tokenizer, which can be used through TensorFlow.  
It uses Wordpiece Model, which is similar to BPE, and can easily split words into subwords.

In [47]:
import os
import urllib.request

import pandas as pd
import tensorflow_datasets as tfds

In [48]:
# we have already downloaded imdb dataset in Section 2.
data_dir = "../data/"

imdb_csv_path = os.path.join(data_dir, "imdb_review.csv")

In [49]:
df = pd.read_csv(imdb_csv_path)
df.head()

Unnamed: 0,review,sentiment
0,My family and I normally do not watch local mo...,1
1,"Believe it or not, this was at one time the wo...",0
2,"After some internet surfing, I found the ""Home...",0
3,One of the most unheralded great works of anim...,1
4,"It was the Sixties, and anyone with long hair ...",0


In [50]:
df.shape

(50000, 2)

In [51]:
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    corpus_generator=df.review,
    target_vocab_size=2**13,
)

2024-12-18 13:34:06.357117: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [52]:
len(tokenizer.subwords)

8011

In [53]:
tokenizer.subwords[:100]

['the_',
 ', ',
 '. ',
 'a_',
 'and_',
 'of_',
 'to_',
 's_',
 'is_',
 'br',
 'in_',
 'I_',
 'that_',
 'this_',
 'it_',
 ' /><',
 ' />',
 'was_',
 'The_',
 't_',
 'as_',
 'with_',
 'for_',
 '.<',
 'on_',
 'but_',
 'movie_',
 'are_',
 ' (',
 'have_',
 'his_',
 'film_',
 'not_',
 'be_',
 'you_',
 'ing_',
 ' "',
 'ed_',
 'it',
 'd_',
 'an_',
 'at_',
 'by_',
 'he_',
 'one_',
 'who_',
 'from_',
 'y_',
 'or_',
 'e_',
 'like_',
 'all_',
 '" ',
 'they_',
 'so_',
 'just_',
 'has_',
 ') ',
 'about_',
 'her_',
 'out_',
 'This_',
 'some_',
 'movie',
 'ly_',
 'film',
 'very_',
 'more_',
 'It_',
 'what_',
 'would_',
 'when_',
 'if_',
 'good_',
 'up_',
 'which_',
 'their_',
 'only_',
 'even_',
 'my_',
 'really_',
 'had_',
 'can_',
 'no_',
 'were_',
 'see_',
 '? ',
 'she_',
 'than_',
 '! ',
 'there_',
 'been_',
 'get_',
 'into_',
 'will_',
 ' - ',
 'much_',
 'n_',
 'because_',
 'ing']

In [54]:
sample_sentence = "It's mind-blowing to me that this film was even made."
print(sample_sentence)

It's mind-blowing to me that this film was even made.


In [55]:
# returns encoded integer sequence

tokenized = tokenizer.encode(sample_sentence)
print(tokenized)

[137, 8051, 8, 910, 8057, 2169, 36, 7, 103, 13, 14, 32, 18, 79, 681, 8058]


In [56]:
# returns decoded original sentence

original_sentence = tokenizer.decode(tokenized)
print(original_sentence)

It's mind-blowing to me that this film was even made.


In [57]:
# vocabulary size

tokenizer.vocab_size

8268

In [58]:
for ts in tokenized:
    print(f"{ts} -> {tokenizer.decode([ts])}")

137 -> It
8051 -> '
8 -> s 
910 -> mind
8057 -> -
2169 -> blow
36 -> ing 
7 -> to 
103 -> me 
13 -> that 
14 -> this 
32 -> film 
18 -> was 
79 -> even 
681 -> made
8058 -> .


In [59]:
# add "xyz" right after "even"

sample_sentence = "It's mind-blowing to me that this film was evenxyz made."
print(sample_sentence)

It's mind-blowing to me that this film was evenxyz made.


In [60]:
# returns encoded integer sequence

tokenized = tokenizer.encode(sample_sentence)
print(tokenized)

[137, 8051, 8, 910, 8057, 2169, 36, 7, 103, 13, 14, 32, 18, 7974, 8132, 8133, 997, 681, 8058]


In [61]:
# returns decoded original sentence

original_sentence = tokenizer.decode(tokenized)
print(original_sentence)

It's mind-blowing to me that this film was evenxyz made.


In [62]:
# since "xyz" does not appeared in the train data, it is spliited into "x", "y", "z" separately

for ts in tokenized:
    print(f"{ts} -> {tokenizer.decode([ts])}")

137 -> It
8051 -> '
8 -> s 
910 -> mind
8057 -> -
2169 -> blow
36 -> ing 
7 -> to 
103 -> me 
13 -> that 
14 -> this 
32 -> film 
18 -> was 
7974 -> even
8132 -> x
8133 -> y
997 -> z 
681 -> made
8058 -> .


## 4. Huggingface Tokenizer
BERT uses WordPiece Tokenizer, which is implemented as `BertWordPieceTokenizer` in Huggingface.

In [63]:
import urllib.request

import pandas as pd
from tokenizers import BertWordPieceTokenizer

In [64]:
# we have already downloaded imdb dataset in Section 2.
data_dir = "../data/"

csv_path = os.path.join(data_dir, "naver_movie_review.csv")
txt_path = os.path.join(data_dir, "naver_movie_review.txt")

In [65]:
df = pd.read_table(csv_path)

In [66]:
df.head()

Unnamed: 0,id,document,label
0,8112052,어릴때보고 지금다시봐도 재밌어요ㅋㅋ,1
1,8132799,"디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산...",1
2,4655635,폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.,1
3,9251303,와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...,1
4,10067386,안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.,1


In [67]:
df.shape

(200000, 3)

In [68]:
df.dropna(how="any", inplace=True)

In [69]:
df.shape

(199992, 3)

In [70]:
"""
- lowercase: ignore case sensitive if True
- strip_accents: remove accent if True (e.g. é → e, ô → o)
"""

tokenizer = BertWordPieceTokenizer(lowercase=False, strip_accents=False)

In [71]:
"""
- files: list of paths of train data
- vocab_size: vocabulary size
- limit_alphabet: the number of tokens allowed before merge
- min_frequency: merge pair only if it appears more than this number
"""

tokenizer.train(
    files=txt_path,
    vocab_size=30000,
    limit_alphabet=6000,
    min_frequency=5,
)






In [72]:
base_model_dir = "../model/"
os.makedirs(base_model_dir, exist_ok=True)

model_dir = os.path.join(base_model_dir, "bert_naver_moview_tokenizer")
os.makedirs(model_dir, exist_ok=True)

tokenizer.save_model(model_dir)

['../model/bert_naver_moview_tokenizer/vocab.txt']

In [73]:
df_vocab = pd.read_fwf(os.path.join(model_dir, "vocab.txt"), header=None)

In [74]:
df_vocab.head(30)

Unnamed: 0,0
0,[PAD]
1,[UNK]
2,[CLS]
3,[SEP]
4,[MASK]
5,!
6,""""
7,#
8,$
9,%


In [75]:
# NOTE: vocab size was 30,000

df_vocab.shape

(30000, 1)

In [76]:
sample_sentence = "아 배고픈데 짜장면먹고싶다"
encoded = tokenizer.encode(sample_sentence)

print(f"Tokens: {encoded.tokens}")
print(f"Token Ids: {encoded.ids}")
print(f"Decoded: {tokenizer.decode(encoded.ids)}")

Tokens: ['아', '배고', '##픈', '##데', '짜장면', '##먹고', '##싶다']
Token Ids: [2111, 20632, 4184, 3283, 24680, 7873, 7379]
Decoded: 아 배고픈데 짜장면먹고싶다


In [77]:
sample_sentence = "커피 한잔의 여유를 즐기자"
encoded = tokenizer.encode(sample_sentence)

print(f"Tokens: {encoded.tokens}")
print(f"Token Ids: {encoded.ids}")
print(f"Decoded: {tokenizer.decode(encoded.ids)}")

Tokens: ['커피', '한잔', '##의', '여유', '##를', '즐기', '##자']
Token Ids: [12825, 25647, 3270, 12696, 3247, 10784, 3648]
Decoded: 커피 한잔의 여유를 즐기자
