## 참고 사이트
https://www.ohsuz.dev/4bb36581-b0bd-49ca-acd0-530c35546009

https://colab.research.google.com/drive/1VvMBT98LVJpxonUMmfpn8LGqDfUVFPS7?usp=sharing#scrollTo=BtuDt--Yb1Bx

https://stackoverflow.com/questions/65646925/how-to-train-bert-from-scratch-on-a-new-domain-for-both-mlm-and-nsp

In [1]:
import pandas as pd
from transformers import BertTokenizer, BertForPreTraining
from tqdm.notebook import tqdm
import numpy as np
import torch
import time, datetime
from nltk.tokenize import sent_tokenize
import random

In [2]:
# Load the tokenizer that belongs to the multilingual cased BERT checkpoint;
# it is used below both to encode sentences and to add the special tokens.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

In [3]:
# Read the tab-separated NSMC ratings file into a DataFrame.
data = pd.read_csv("../../data/nsmc/ratings.txt", sep='\t')

In [4]:
# Discard rows that contain missing values before the text is processed.
data = data.dropna()

In [5]:
# Split every review into a list of sentences. `sent_tokenize` is passed
# directly; wrapping it in a lambda added nothing.
data['sent'] = data.document.apply(sent_tokenize)

In [6]:
# Keep only the reviews that contain at least two sentences; a single-sentence
# review cannot supply a genuine "next sentence" for NSP.
train_list = [sentences for sentences in data.sent if len(sentences) > 1]

In [13]:
train_list[0]

['디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산업이 부러웠는데.',
 '사실 우리나라에서도 그 어려운시절에 끝까지 열정을 지킨 노라노 같은 전통이있어 저와 같은 사람들이 꿈을 꾸고 이뤄나갈 수 있다는 것에 감사합니다.']

In [14]:
len(train_list)

48913

In [7]:
# Encode each review as one "document": a list of sentences, where every
# sentence is a list of vocabulary ids (no special tokens yet).
documents = []
for review_sentences in tqdm(train_list):
    documents.append([
        tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))
        for sentence in review_sentences
    ])
# The list ends with one empty placeholder document; a later step removes it.
documents.append([])
	

  0%|          | 0/48913 [00:00<?, ?it/s]

In [16]:
len(documents)

48914

In [17]:
documents[-1]

[]

In [8]:
# Remove empty documents (the construction loop leaves one empty placeholder
# at the end). Filtering instead of slicing off the last element keeps this
# cell safe to re-run: a second execution no longer drops a real document.
documents = [doc for doc in documents if doc]

In [9]:
def _truncate_seq_pair(tokens_a, tokens_b, max_num_tokens):
    """Shrink two token-id lists in place until their combined length fits.

    On every step one token is removed from the longer list (from `tokens_b`
    on a tie), taken at random from the front or the back so that truncation
    does not always favour the same end of a segment.
    """
    while len(tokens_a) + len(tokens_b) > max_num_tokens:
        trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
        if not trunc_tokens:
            raise ValueError("Cannot truncate an empty sequence any further")
        if random.random() < 0.5:
            del trunc_tokens[0]
        else:
            trunc_tokens.pop()


def create_examples_from_document(document, doc_index, block_size, tokenizer, short_seq_probability, nsp_probability, corpus=None, output=None):
    """Build next-sentence-prediction examples from a single document.

    Sentences of `document` are accumulated into a chunk until the target
    length is reached (or the document ends). The chunk is split at a random
    point into segment A and segment B; with probability `nsp_probability`
    (and always when the chunk holds a single sentence) segment B is replaced
    by sentences taken from a randomly chosen document.

    Args:
        document: List of sentences, each a list of token ids.
        doc_index: Position of `document` inside the corpus; used to avoid
            drawing the random segment B from the same document.
        block_size: Maximum length of an example including special tokens.
        tokenizer: Object providing `num_special_tokens_to_add`,
            `build_inputs_with_special_tokens` and
            `create_token_type_ids_from_sequences`.
        short_seq_probability: Probability of using a shorter, randomly
            chosen target length for this document.
        nsp_probability: Probability of pairing segment A with a random
            segment B instead of the actual continuation.
        corpus: All documents to sample random segments from. Defaults to the
            module-level `documents` list for backward compatibility.
        output: List that receives the generated examples. Defaults to the
            module-level `examples` list for backward compatibility.

    Returns:
        The list the examples were appended to. Each example is a dict with
        `input_ids`, `token_type_ids` and `next_sentence_label` tensors, where
        the label is 1 for a random segment B and 0 for the actual next one.

    Raises:
        ValueError: If segment A or segment B ends up empty.
    """
    if output is None:
        output = examples  # legacy behaviour: collect into the global list

    # Room left for real tokens once the special tokens of a pair are added.
    max_num_tokens = block_size - tokenizer.num_special_tokens_to_add(pair=True)
    target_seq_length = max_num_tokens
    if random.random() < short_seq_probability:
        target_seq_length = random.randint(2, max_num_tokens)

    current_chunk = []  # sentences collected for the example being built
    current_length = 0
    i = 0
    while i < len(document):
        segment = document[i]
        current_chunk.append(segment)
        current_length += len(segment)
        if i == len(document) - 1 or current_length >= target_seq_length:
            # `a_end` is how many sentences of the chunk go into segment A;
            # with two or more sentences the split point is chosen at random.
            a_end = 1
            if len(current_chunk) >= 2:
                a_end = random.randint(1, len(current_chunk) - 1)
            tokens_a = []
            for j in range(a_end):
                tokens_a.extend(current_chunk[j])

            # Segment B: either a random segment or the actual continuation.
            tokens_b = []
            if len(current_chunk) == 1 or random.random() < nsp_probability:
                is_random_next = True
                target_b_length = target_seq_length - len(tokens_a)
                # Resolved lazily so the global is only needed when used.
                pool = documents if corpus is None else corpus

                # Try a few times to pick a document other than the current
                # one; for a large corpus the first draw almost always works.
                for _ in range(10):
                    random_document_index = random.randint(0, len(pool) - 1)
                    if random_document_index != doc_index:
                        break
                random_document = pool[random_document_index]
                random_start = random.randint(0, len(random_document) - 1)
                for j in range(random_start, len(random_document)):
                    tokens_b.extend(random_document[j])
                    if len(tokens_b) >= target_b_length:
                        break
                # The sentences after `a_end` were not used, so step back and
                # let them start the next chunk instead of wasting them.
                i -= len(current_chunk) - a_end
            else:
                is_random_next = False
                for j in range(a_end, len(current_chunk)):
                    tokens_b.extend(current_chunk[j])

            # Trim the pair so that it fits into `max_num_tokens`.
            _truncate_seq_pair(tokens_a, tokens_b, max_num_tokens)

            # Explicit checks instead of `assert`, which is removed under -O.
            if len(tokens_a) < 1:
                raise ValueError(f"Length of sequence a is {len(tokens_a)} which must be no less than 1")
            if len(tokens_b) < 1:
                raise ValueError(f"Length of sequence b is {len(tokens_b)} which must be no less than 1")

            # Add the special tokens, and the segment ids (0 for A, 1 for B).
            input_ids = tokenizer.build_inputs_with_special_tokens(tokens_a, tokens_b)
            token_type_ids = tokenizer.create_token_type_ids_from_sequences(tokens_a, tokens_b)

            # Masking for MLM is not applied here; only the NSP pair is built.
            output.append({
                "input_ids": torch.tensor(input_ids, dtype=torch.long),
                "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
                "next_sentence_label": torch.tensor(1 if is_random_next else 0, dtype=torch.long),
            })

            current_chunk = []
            current_length = 0

        i += 1

    return output

In [10]:
# `create_examples_from_document` appends into this module-level list, so it
# has to exist (and be empty) before the loop starts.
examples = []
# Settings shared by every document: 256-token blocks, a 10% chance of a
# shorter target length and a 50% chance of a random segment B.
nsp_build_kwargs = dict(
    block_size=256,
    tokenizer=tokenizer,
    short_seq_probability=0.1,
    nsp_probability=0.5,
)
for doc_index, document in enumerate(documents):
    create_examples_from_document(document=document, doc_index=doc_index, **nsp_build_kwargs)

In [21]:
len(examples)

75887

In [22]:
examples[0]

{'input_ids': tensor([   101,   9122,  86150,  10622,  84703,  11018,   9953,  24017,  11467,
            117,   9597,  20479,  48446,  13764,  10739,  70162,  12638,   8924,
          20173,   9641,  17360,   9665,  43022,  10622,  25605,   9323,  16617,
          14523,  68828,   9297,  18227,  21386,  26784,  10739,   9365,  30873,
         119172,  41850,    119,    102,   9638,  22200,  45465,  75855,  12310,
         119257,  97146,  37093,  52363,  25387,  12030,   8872,  12692,  11903,
          25503,  96720,  12605,  11287,   8982,  37093,  82034,  27487,   8932,
          14423,  22096,    119,   9091,  28396,  11779,  10731,  14153,  12692,
            102]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]),
 'next_sentence_label': tensor(1)}

In [11]:
from transformers import DataCollatorForLanguageModeling
# Collator configured for masked language modeling (mlm=True) with a masking
# probability of 0.15; it is applied to the NSP examples at batch time.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

In [24]:
print(data_collator(examples[0:1]))

{'input_ids': tensor([[   101,   9122,  86150,  10622,  84703,  11018,   9953,  24017,  11467,
            103,   9597,  20479,  48446,  13764,  10739,  70162,  12638,   8924,
          20173,   9641,    103,   9665,  43022,  10622,  25605,   9323,    103,
          14523,  68828,   9297,  18227,  21386,  26784,  10739,   9365,  30873,
         119172,  41850,    103,    102,   9638,  22200,  45465,  75855,  12310,
         119257,  97146,  37093,  52363,  25387,    103,   8872,  12692,  11903,
          25503,  96720,  12605,  11287,   8982,  37093,  82034,  27487,   8932,
            103,  22096,    119,   9091,  28396,  11779,    103,  14153,  12692,
            102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]]), 'next_sentence_label': tensor([1]), 'atten

In [25]:
print(data_collator(examples[0:1])['input_ids'][0])

tensor([   101,   9122,    103,  10622,  84703,  11018,    103,    103,  11467,
           117,   9597,  20479,  61623,  13764,  10739,  70162,  12638,   8924,
         20173,   9641,  17360,   9665,  43022,  10622,  25605,   9323,  16617,
         14523,  68828,   9297,  18227, 106467,  26784,    103,   9365,  30873,
           103,  41850,    119,    102,   9638,  22200,  45465,  81397,  12310,
        119257,  97146,  37093,  52363,  25387,  12030,   8872,  12692,  11903,
         25503,  96720,  12605,  11287,   8982,  37093,  82034, 111472,   8932,
         14423,    103,    119,   9091,  28396,    103,  10731,  14153,  12692,
           102])


In [26]:
tokenizer.mask_token_id

103

In [27]:
tokenizer.decode(data_collator(examples[0:1])['input_ids'][0].tolist())

'[CLS] 디자인을 배우 [MASK] [MASK]생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 [MASK]화 [MASK]업이 부러웠는데. [SEP] 이종격투기챔피온출신인 게리다 [MASK]엘스가 나온 [MASK] 것이 기대된다. [MASK]프VS게리 [SEP]'

In [12]:
# Training continues from a locally saved checkpoint. The commented-out line
# is the alternative starting point: the stock multilingual cased weights.
#model = BertForPreTraining.from_pretrained('bert-base-multilingual-cased')
model = BertForPreTraining.from_pretrained('./nsmc_model/checkpoint-11000/')

In [13]:
from transformers import Trainer, TrainingArguments

In [14]:
# 50 epochs with a per-device batch of 32; a checkpoint is written every 1000
# steps (only the two most recent are kept) and the loss is logged every 5000.
training_args = TrainingArguments(
    output_dir='./nsmc_model/additional_epochs',
    overwrite_output_dir=True,
    num_train_epochs=50,
    per_device_train_batch_size=32,
    save_steps=1000,
    save_total_limit=2,
    logging_steps=5000,
)
# The collator applies the MLM masking to the NSP examples batch by batch.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=examples,
)

In [15]:
# Run the training loop configured above; the log captured below reports
# 118550 optimization steps for the 50 epochs.
trainer.train()

***** Running training *****
  Num examples = 75853
  Num Epochs = 50
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 118550


Step,Training Loss
5000,2.2721
10000,2.0637
15000,1.9033
20000,1.7841
25000,1.6867
30000,1.6149
35000,1.5408
40000,1.4754
45000,1.4261
50000,1.3711


Saving model checkpoint to ./nsmc_model/additional_epochs/checkpoint-1000
Configuration saved in ./nsmc_model/additional_epochs/checkpoint-1000/config.json
Model weights saved in ./nsmc_model/additional_epochs/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./nsmc_model/additional_epochs/checkpoint-2000
Configuration saved in ./nsmc_model/additional_epochs/checkpoint-2000/config.json
Model weights saved in ./nsmc_model/additional_epochs/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./nsmc_model/additional_epochs/checkpoint-3000
Configuration saved in ./nsmc_model/additional_epochs/checkpoint-3000/config.json
Model weights saved in ./nsmc_model/additional_epochs/checkpoint-3000/pytorch_model.bin
Deleting older checkpoint [nsmc_model/additional_epochs/checkpoint-1000] due to args.save_total_limit
Saving model checkpoint to ./nsmc_model/additional_epochs/checkpoint-4000
Configuration saved in ./nsmc_model/additional_epochs/checkpoint-4000/config.json
Model weig

TrainOutput(global_step=118550, training_loss=1.3611814491017766, metrics={'train_runtime': 19838.6479, 'train_samples_per_second': 191.175, 'train_steps_per_second': 5.976, 'total_flos': 2.16338389056459e+17, 'train_loss': 1.3611814491017766, 'epoch': 50.0})

In [16]:
# Store the final model next to the intermediate checkpoints; the log below
# shows the configuration and the weights being written to this directory.
trainer.save_model('./nsmc_model/additional_epochs/final')

Saving model checkpoint to ./nsmc_model/additional_epochs/final
Configuration saved in ./nsmc_model/additional_epochs/final/config.json
Model weights saved in ./nsmc_model/additional_epochs/final/pytorch_model.bin
