# Google Drive Mount

In [1]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem so later cells can read and
# write files under drive/MyDrive/.
# NOTE(review): the later cells use relative "drive/..." paths, which presumably
# resolve against /content as the working directory — confirm.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


# Install


In [2]:
!pip install datasets==2.20.0

Collecting datasets==2.20.0
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/547.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m256.0/547.8 kB[0m [31m7.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets==2.20.0)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets==2.20.0)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets==2.20.0)
  Download

# 4.1.1 Tokenizers 학습

In [3]:
from datasets import load_dataset

# Load the "ynat" configuration of the KLUE dataset. In the recorded run this
# produced a train split and a validation split.
dataset = load_dataset(path="klue", name="ynat")

# Display the first training record to see which fields each example carries.
dataset["train"][0]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/22.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.17M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/847k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/45678 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/9107 [00:00<?, ? examples/s]

{'guid': 'ynat-v1_train_00000',
 'title': '유튜브 내달 2일까지 크리에이터 지원 공간 운영',
 'label': 3,
 'url': 'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=105&sid2=227&oid=001&aid=0008508947',
 'date': '2016.06.30. 오전 10:36'}

In [4]:
import os

# Column whose text is used as the tokenizer training corpus.
target_key = "title"
data_dir = "drive/MyDrive/Books/data"

# Create the output folder up front so the cell also works on a fresh Drive.
os.makedirs(data_dir, exist_ok=True)

# Write one plain-text file per dataset split, one title per line.
for key in dataset.column_names.keys():
  # Explicit UTF-8: the titles are Korean text, and without an encoding
  # argument the bytes written would depend on the runtime's locale default.
  with open(f"{data_dir}/tokenizer_data_{key}.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(dataset[key][target_key]))

In [5]:
# Base special tokens for the tokenizer.
user_defined_symbols = [
    "[PAD]",   # padding token used to bring sentences to the same length
    "[UNK]",   # stands in for anything the tokenizer cannot recognize
    "[CLS]",   # BERT-family token that stores whole-sentence information
    "[SEP]",   # BERT-family token that separates sentences
    "[MASK]",  # token used for masking in MLM models
]

unused_token_num = 100
# Empty slots reserved so that tokens missing from the vocabulary can be
# added during pre-training.
unused_list = ["[UNUSED%d]" % slot for slot in range(unused_token_num)]

# Base special tokens first, followed by the reserved placeholders.
whole_user_defined_symbols = [*user_defined_symbols, *unused_list]
print(whole_user_defined_symbols[:10])

['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', '[UNUSED0]', '[UNUSED1]', '[UNUSED2]', '[UNUSED3]', '[UNUSED4]']


In [6]:
from tokenizers import Tokenizer
from tokenizers.models import WordPiece

# Start from an untrained WordPiece model, with "[UNK]" configured as its
# unknown token, and wrap it in a Tokenizer pipeline object.
wordpiece_model = WordPiece(unk_token="[UNK]")
bert_tokenizer = Tokenizer(wordpiece_model)

In [7]:
from tokenizers import normalizers

# Use the BERT normalizer with its default settings as the tokenizer's
# normalization step.
# NOTE(review): a later decode in this notebook prints a bare jamo
# ('##ᆸ니다'), which suggests the normalizer's accent stripping also
# decomposes Hangul — confirm whether BertNormalizer(strip_accents=False)
# would be preferable for Korean text.
normalizer = normalizers.BertNormalizer()
bert_tokenizer.normalizer = normalizer

# Quick check on accented, mixed-case text; the recorded result is lowercased
# with the accents removed.
sample_text = "Héllò hôwWnare ü? "
normalizer.normalize_str(sample_text)

'hello howwnare u? '

In [8]:
from tokenizers.pre_tokenizers import Whitespace

# Register the Whitespace pre-tokenizer as the step that runs before the
# WordPiece model.
pre_tokenizer = Whitespace()
bert_tokenizer.pre_tokenizer = pre_tokenizer

# Try it on a Korean sentence; the recorded result is a list of
# (piece, (start, end)) pairs with punctuation split off as its own piece.
sample_sentence = "안녕하세요. 제대로 인코딩이 되는지 확인 중입니다."
pre_tokenizer.pre_tokenize_str(sample_sentence)

[('안녕하세요', (0, 5)),
 ('.', (5, 6)),
 ('제대로', (7, 10)),
 ('인코딩이', (11, 15)),
 ('되는지', (16, 19)),
 ('확인', (20, 22)),
 ('중입니다', (23, 27)),
 ('.', (27, 28))]

In [9]:
from tokenizers.processors import TemplateProcessing

# Pair each base special token with its position in user_defined_symbols.
# This relies on the trainer assigning those same ids; the encodings recorded
# later in this notebook start with 2 ([CLS]) and end with 3 ([SEP]).
special_token_ids = list(zip(user_defined_symbols, range(len(user_defined_symbols))))

# BERT-style templates: a single sequence is wrapped as [CLS] A [SEP]; in a
# pair, the ":1" suffix assigns type id 1 to the second segment and its [SEP].
post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=special_token_ids,
)
bert_tokenizer.post_processor = post_processor

In [10]:
from tokenizers.trainers import WordPieceTrainer

# Target vocabulary size for the trained tokenizer.
vocab_size = 24000

# The trainer receives the full special-token list: the base special tokens
# followed by the [UNUSED*] placeholders.
trainer = WordPieceTrainer(
    special_tokens=whole_user_defined_symbols,
    vocab_size=vocab_size,
)

In [11]:
from glob import glob

# Train only on the corpus files written as tokenizer_data_<split>.txt, so that
# unrelated .txt files stored in the same Drive folder are not pulled into the
# vocabulary. Sorting makes the file order independent of the directory
# listing order.
training_files = sorted(glob("drive/MyDrive/Books/data/tokenizer_data_*.txt"))
if not training_files:
  # Fail loudly instead of training on an empty file list.
  raise FileNotFoundError(
      "No tokenizer_data_*.txt files found in drive/MyDrive/Books/data; "
      "write the corpus files before training."
  )

bert_tokenizer.train(training_files, trainer)

In [12]:
# Encode a sample sentence with the trained tokenizer and look at the ids.
sample_sentence = "인코딩 및 디코딩이 제대로 이루어지는지 확인 중입니다."
output = bert_tokenizer.encode(sample_sentence)
print(output.ids)

# Decode the ids back. No decoder is configured yet, so the recorded result
# still shows the individual word pieces with their "##" prefixes.
bert_tokenizer.decode(output.ids)

[2, 675, 906, 2220, 4518, 1240, 906, 2220, 569, 6727, 12916, 10780, 586, 1881, 16618, 10191, 106, 3]


'인 ##코 ##딩 및 디 ##코 ##딩 ##이 제대로 이루 ##어지는 ##지 확인 중이 ##ᆸ니다 .'

In [14]:
from tokenizers import decoders

# Attach a WordPiece decoder. Decoding the same ids again then gives the
# recorded result below: the pieces are merged back into the sentence.
wordpiece_decoder = decoders.WordPiece()
bert_tokenizer.decoder = wordpiece_decoder
bert_tokenizer.decode(output.ids)

'인코딩 및 디코딩이 제대로 이루어지는지 확인 중입니다.'

In [15]:
from transformers import BertTokenizerFast

# Wrap the trained tokenizer in the transformers fast-tokenizer interface.
fast_tokenizer = BertTokenizerFast(tokenizer_object=bert_tokenizer)

# Round-trip the sample sentence through the wrapper and print the ids and the
# decoded text on separate lines.
sample_sentence = "인코딩 및 디코딩이 제대로 이루어지는지 확인 중입니다."
encoded = fast_tokenizer.encode(sample_sentence)
decoded = fast_tokenizer.decode(encoded)
print(encoded, decoded, sep="\n")

[2, 675, 906, 2220, 4518, 1240, 906, 2220, 569, 6727, 12916, 10780, 586, 1881, 16618, 10191, 106, 3]
[CLS] 인코딩 및 디코딩이 제대로 이루어지는지 확인 중입니다. [SEP]


# Save Tokenizer

In [16]:
# Folder on Drive where the tokenizer files are stored; it is reused when the
# tokenizer is loaded back.
output_dir = "drive/MyDrive/Books/outputs/MyTokenizer"

# Persist the tokenizer; the cell displays the paths of the written files.
fast_tokenizer.save_pretrained(save_directory=output_dir)

('drive/MyDrive/Books/outputs/MyTokenizer/tokenizer_config.json',
 'drive/MyDrive/Books/outputs/MyTokenizer/special_tokens_map.json',
 'drive/MyDrive/Books/outputs/MyTokenizer/vocab.txt',
 'drive/MyDrive/Books/outputs/MyTokenizer/added_tokens.json',
 'drive/MyDrive/Books/outputs/MyTokenizer/tokenizer.json')

In [19]:
# Load the tokenizer back from the saved folder to check that it works.
new_tokenizer = BertTokenizerFast.from_pretrained(output_dir)

# Encode two sentences as one batch.
sentences = ["인코딩 잘 되는지 확인", "안되면 다시 학습하자"]
encoded = new_tokenizer(sentences)

# Print every field of the batch encoding.
for field, values in encoded.items():
  print(field, values)

# Decode each sentence's ids back to text, in batch order.
for sentence_ids in encoded["input_ids"]:
  print(new_tokenizer.decode(sentence_ids))

input_ids [[2, 675, 906, 2220, 1675, 6464, 586, 1881, 3], [2, 18633, 1594, 6985, 3782, 3]]
token_type_ids [[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]]
attention_mask [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]
[CLS] 인코딩 잘 되는지 확인 [SEP]
[CLS] 안되면 다시 학습하자 [SEP]
