# Google Drive Mount

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Install


In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


# 4.1.1 Tokenizers 학습하기

In [3]:
from datasets import load_dataset

dataset = load_dataset("klue", "ynat")
dataset['train'][0]

Downloading builder script:   0%|          | 0.00/23.3k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/22.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/21.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.93M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/45678 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/9107 [00:00<?, ? examples/s]

{'guid': 'ynat-v1_train_00000',
 'title': '유튜브 내달 2일까지 크리에이터 지원 공간 운영',
 'label': 3,
 'url': 'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=105&sid2=227&oid=001&aid=0008508947',
 'date': '2016.06.30. 오전 10:36'}

In [None]:
target_key = "title"
for key in dataset.column_names.keys():
  with open(f"drive/MyDrive/Books/data/tokenizer_data_{key}.txt", "w") as f:
    f.write("\n".join(dataset[key][target_key]))

In [4]:
user_defined_symbols = [
    "[PAD]",  # 문장의 길이를 맞추기 위해 사용되는 토큰
    "[UNK]",  # 토크나이저가 인식할 수 없는 토큰
    "[CLS]",  # bert 계열 모델에서 문장 전체의 정보를 저장하는 토큰
    "[SEP]",  # bert 계열 모델에서 문장 구분을 위해 사용하는 토큰
    "[MASK]", # MLM 모델에서 토큰 마스킹을 위해 사용하는 토큰
]

In [7]:
unused_token_num = 100
unused_list = [f"[UNUSED{i}]" for i in range(unused_token_num)]  # 사전학습 시, 어휘에 없는 토큰을 추가하기 위한 빈 공간

whole_user_defined_symbols = user_defined_symbols + unused_list
print(whole_user_defined_symbols[:10])

['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', '[UNUSED0]', '[UNUSED1]', '[UNUSED2]', '[UNUSED3]', '[UNUSED4]']


In [8]:
from tokenizers import Tokenizer
from tokenizers.models import WordPiece

bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

In [9]:
from tokenizers import normalizers

normalizer = normalizers.BertNormalizer()
bert_tokenizer.normalizer = normalizer

normalizer.normalize_str("Héllò hôwWnare ü? ")

'hello howwnare u? '

In [10]:
from tokenizers.pre_tokenizers import Whitespace

pre_tokenizer = Whitespace()
bert_tokenizer.pre_tokenizer = pre_tokenizer

pre_tokenizer.pre_tokenize_str("안녕하세요. 제대로 인코딩이 되는지 확인 중입니다.")

[('안녕하세요', (0, 5)),
 ('.', (5, 6)),
 ('제대로', (7, 10)),
 ('인코딩이', (11, 15)),
 ('되는지', (16, 19)),
 ('확인', (20, 22)),
 ('중입니다', (23, 27)),
 ('.', (27, 28))]

In [11]:
from tokenizers.processors import TemplateProcessing

post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[(t, i) for i, t in enumerate(user_defined_symbols)]
)

bert_tokenizer.post_processor = post_processor

In [12]:
from tokenizers.trainers import WordPieceTrainer

vocab_size = 24000
trainer = WordPieceTrainer(
    vocab_size=vocab_size,
    special_tokens=whole_user_defined_symbols,
)

In [13]:
from glob import glob

bert_tokenizer.train(glob(f"drive/MyDrive/Books/data/*.txt"), trainer)

In [14]:
output = bert_tokenizer.encode("인코딩 및 디코딩이 제대로 이루어지는가?")
print(output.ids)

bert_tokenizer.decode(output.ids)

[2, 675, 906, 2220, 4518, 1240, 906, 2220, 569, 6727, 12916, 10780, 618, 1, 3]


'인 ##코 ##딩 및 디 ##코 ##딩 ##이 제대로 이루 ##어지는 ##가'

In [15]:
from tokenizers import decoders

bert_tokenizer.decoder = decoders.WordPiece()
bert_tokenizer.decode(output.ids)

'인코딩 및 디코딩이 제대로 이루어지는가'

In [16]:
from transformers import BertTokenizerFast

fast_tokenizer = BertTokenizerFast(tokenizer_object=bert_tokenizer)
encoded = fast_tokenizer.encode("인코딩 및 디코딩이 제대로 이루어지는가?")
decoded = fast_tokenizer.decode(encoded)
print(encoded)
print(decoded)

[2, 675, 906, 2220, 4518, 1240, 906, 2220, 569, 6727, 12916, 10780, 618, 1, 3]
[CLS] 인코딩 및 디코딩이 제대로 이루어지는가 [UNK] [SEP]


save tokenizer

In [None]:
fast_tokenizer.save_pretrained("drive/MyDrive/Books/outputs/MyTokenizer")

('drive/MyDrive/Books/outputs/MyTokenizer/tokenizer_config.json',
 'drive/MyDrive/Books/outputs/MyTokenizer/special_tokens_map.json',
 'drive/MyDrive/Books/outputs/MyTokenizer/vocab.txt',
 'drive/MyDrive/Books/outputs/MyTokenizer/added_tokens.json',
 'drive/MyDrive/Books/outputs/MyTokenizer/tokenizer.json')

In [18]:
new_tokenizer = BertTokenizerFast.from_pretrained("drive/MyDrive/Books/outputs/MyTokenizer")

encoded = new_tokenizer(["인코딩 잘 되는지 확인", "안되면 어떡하지?"])

for k, v in encoded.items():
  print(k, v)

print(new_tokenizer.decode(encoded["input_ids"][0]))
print(new_tokenizer.decode(encoded["input_ids"][1]))

input_ids [[2, 675, 906, 2220, 1675, 6464, 586, 1881, 3], [2, 18633, 22162, 2729, 1, 3]]
token_type_ids [[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]]
attention_mask [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]
[CLS] 인코딩 잘 되는지 확인 [SEP]
[CLS] 안되면 어떡하지 [UNK] [SEP]


# 4.2 모델 초기화 후 학습

In [None]:
from datasets import load_dataset
from transformers import BertTokenizerFast

In [None]:
dataset = load_dataset("klue", "ynat")

In [None]:
tokenizer = BertTokenizerFast.from_pretrained("drive/MyDrive/Books/outputs/MyTokenizer")

In [None]:
from transformers import BertConfig

cfg = BertConfig
cfg

transformers.models.bert.configuration_bert.BertConfig

In [None]:
mycfg = BertConfig(vocab_size=tokenizer.vocab_size)

In [None]:
from transformers import BertForMaskedLM

model = BertForMaskedLM(mycfg)
model.config

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.35.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 24000
}

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

datasets = dataset.map(
    lambda x: tokenizer(x['title']),
    batched=True,
    batch_size=1000,
    remove_columns=dataset.column_names['train'],
)

args = TrainingArguments(
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    max_steps=1000,
    evaluation_strategy="steps",
    logging_strategy="steps",
    logging_steps=100,
    logging_dir="drive/MyDrive/Books/outputs/logs",
    output_dir="drive/MyDrive/Books/outputs/ckpt",
)

collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["validation"],
    tokenizer=tokenizer,
    data_collator=collator,
)

trainer.train()

trainer.save("drive/MyDrive/Books/outputs/MyBertModel")

Map:   0%|          | 0/45678 [00:00<?, ? examples/s]

Map:   0%|          | 0/9107 [00:00<?, ? examples/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
100,9.6979,9.561581
200,9.3775,9.506992
300,9.3077,9.497546
400,9.2965,9.457968
500,9.225,9.414169
600,9.1618,9.433818
700,9.2004,9.398431
800,9.232,9.388632
900,9.1997,9.403358
1000,9.1675,9.369228


AttributeError: ignored