## 참고 사이트
https://www.ohsuz.dev/4bb36581-b0bd-49ca-acd0-530c35546009

https://colab.research.google.com/drive/1VvMBT98LVJpxonUMmfpn8LGqDfUVFPS7?usp=sharing#scrollTo=BtuDt--Yb1Bx

https://stackoverflow.com/questions/65646925/how-to-train-bert-from-scratch-on-a-new-domain-for-both-mlm-and-nsp

In [1]:
import pandas as pd
from transformers import BertTokenizer, BertForPreTraining
from tqdm.notebook import tqdm
import numpy as np
import torch
import time, datetime
from nltk.tokenize import sent_tokenize
import random

In [2]:
# Load the tokenizer that belongs to the pretrained checkpoint used throughout this notebook.
PRETRAINED_CHECKPOINT = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)

In [3]:
# How many special tokens the tokenizer adds when encoding a sentence pair;
# this amount is subtracted from the block size when examples are built below.
tokenizer.num_special_tokens_to_add(pair=True)

3

In [4]:
# Show which literal strings the tokenizer uses for its special tokens.
tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

In [5]:
# Load the review dataset; the `Review` column is the text that gets sentence-split below.
REVIEWS_PARQUET = "../data/dataset_grammar.parquet"
data = pd.read_parquet(REVIEWS_PARQUET)
# Optional filter, currently disabled: drop every row whose Rating equals 2 and re-index.
#data = data.drop(index=data.index[data.Rating==2])
#data = data.reset_index(drop=True)

In [6]:
# Split each review into a list of sentences with NLTK's `sent_tokenize`.
data['sent'] = data['Review'].apply(sent_tokenize)

In [7]:
# Keep only reviews that contain at least two sentences: a single-sentence
# review cannot supply a genuine "next sentence" pair.
train_list = [sentences for sentences in data.sent if len(sentences) > 1]

In [8]:
# Peek at the first kept review: a list of sentence strings.
train_list[0]

['숙성 돼지고기 전문점입니다.',
 '건물 모양 때문에 매장 모양도 좀 특이하지만 쾌적한 편이고 살짝 레트로 감성으로 분위기 잡아놨습니다.',
 '모든 직원분들께서 전부 가능하다고 멘트 쳐주시며, 고기는 초반 커팅까지는 구워주십니다.',
 '가격 저렴한 편 아니지만 맛은 준수합니다.',
 '등심덧살이 인상 깊었는데 구이로 별로일 줄 알았는데 육향 짙고 얇게 저며 뻑뻑하지 않았습니다.',
 '하이라이트는 된장찌개.',
 '진짜 굿입니다.',
 '버터 간장밥, 골뱅이 국수 등 나중에 더 맛봐야 할 것들은 남겨뒀습니다.']

In [9]:
# Number of reviews that have more than one sentence.
len(train_list)

366891

In [10]:
# Convert every review into a "document": a list of sentences, each sentence
# being a list of token ids. A fresh empty document is opened after each
# review, so the list ends with one trailing empty document.
documents = [[]]
for review_sentences in tqdm(train_list):
    documents[-1].extend(
        tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))
        for sentence in review_sentences
    )
    documents.append([])

  0%|          | 0/366891 [00:00<?, ?it/s]

In [11]:
# One more than len(train_list): the construction loop appends an extra empty document at the end.
len(documents)

366892

In [12]:
# Confirm that the last entry is the trailing empty document left by the construction loop.
documents[-1]

[]

In [13]:
# Remove empty documents (the construction loop leaves one trailing []).
# Filtering, rather than slicing off the last element, keeps this cell
# idempotent: re-running it no longer throws away a real document each time.
documents = [doc for doc in documents if doc]

In [14]:
def _truncate_seq_pair(tokens_a, tokens_b, max_num_tokens):
    """Trim `tokens_a` / `tokens_b` in place until their combined length fits.

    One token at a time is removed from whichever list is currently longer
    (`tokens_b` on ties). The token is taken from the front or the back with
    equal probability so truncation does not systematically favour one end.
    """
    while len(tokens_a) + len(tokens_b) > max_num_tokens:
        trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
        assert len(trunc_tokens) >= 1
        if random.random() < 0.5:
            del trunc_tokens[0]
        else:
            trunc_tokens.pop()


def create_examples_from_document(document, doc_index, block_size, tokenizer, short_seq_probability, nsp_probability, all_documents=None, output=None):
    """Build next-sentence-prediction (NSP) examples from one tokenized document.

    Consecutive sentences of `document` are accumulated into a chunk until the
    target length is reached (or the document ends). The chunk is split at a
    random sentence boundary into segment A and segment B. With probability
    `nsp_probability` (and always when the chunk holds a single sentence)
    segment B is replaced by sentences taken from another document and the
    example is labelled as "random next".

    Args:
        document: List of sentences, each a list of token ids.
        doc_index: Position of `document` inside the corpus; used to avoid
            drawing the "random" segment B from the same document.
        block_size: Maximum length of an example including special tokens.
        tokenizer: Object providing `num_special_tokens_to_add(pair=True)`,
            `build_inputs_with_special_tokens(a, b)` and
            `create_token_type_ids_from_sequences(a, b)`.
        short_seq_probability: Probability of using a random, shorter target
            length for this document.
        nsp_probability: Probability of pairing segment A with a random
            segment B instead of the actual continuation.
        all_documents: Corpus to draw random segments from. Defaults to the
            notebook-level `documents` list.
        output: List that receives the generated examples. Defaults to the
            notebook-level `examples` list.

    Returns:
        The list of examples created by this call (the same dict objects that
        were appended to `output`). Each example has `input_ids`,
        `token_type_ids` and `next_sentence_label` (1 = random next, 0 = actual
        next) as `torch.long` tensors.

    Raises:
        AssertionError: If segment A or segment B ends up empty.
    """
    # Backward-compatible fallback: earlier callers relied on the module-level
    # `documents` and `examples` names, so those are still used when the
    # explicit arguments are omitted.
    corpus = documents if all_documents is None else all_documents
    sink = examples if output is None else output
    created = []

    max_num_tokens = block_size - tokenizer.num_special_tokens_to_add(pair=True)
    target_seq_length = max_num_tokens
    if random.random() < short_seq_probability:
        target_seq_length = random.randint(2, max_num_tokens)

    current_chunk = []  # sentences collected for the example being built
    current_length = 0
    i = 0
    while i < len(document):
        segment = document[i]
        current_chunk.append(segment)
        current_length += len(segment)
        if i == len(document) - 1 or current_length >= target_seq_length:
            # `a_end` is how many sentences of `current_chunk` go into
            # segment A; the split point is random when there is a choice.
            a_end = 1
            if len(current_chunk) >= 2:
                a_end = random.randint(1, len(current_chunk) - 1)
            tokens_a = []
            for j in range(a_end):
                tokens_a.extend(current_chunk[j])

            # Segment B: either sentences from another document (random
            # next) or the remaining sentences of the chunk (actual next).
            tokens_b = []
            if len(current_chunk) == 1 or random.random() < nsp_probability:
                is_random_next = True
                target_b_length = target_seq_length - len(tokens_a)

                # Draw directly from the *other* documents so the negative
                # example can never come from the document being processed
                # (a bounded retry loop could still land on it). With a
                # single-document corpus, or an out-of-range `doc_index`,
                # there is nothing to exclude.
                num_documents = len(corpus)
                if num_documents > 1 and 0 <= doc_index < num_documents:
                    random_document_index = random.randrange(num_documents - 1)
                    if random_document_index >= doc_index:
                        random_document_index += 1
                else:
                    random_document_index = random.randrange(num_documents)

                random_document = corpus[random_document_index]
                random_start = random.randint(0, len(random_document) - 1)
                for j in range(random_start, len(random_document)):
                    tokens_b.extend(random_document[j])
                    if len(tokens_b) >= target_b_length:
                        break
                # The sentences after `a_end` were not used, so step back and
                # let them start the next chunk instead of wasting them.
                num_unused_segments = len(current_chunk) - a_end
                i -= num_unused_segments
            else:
                is_random_next = False
                for j in range(a_end, len(current_chunk)):
                    tokens_b.extend(current_chunk[j])

            # Make the pair fit into the block once special tokens are added.
            _truncate_seq_pair(tokens_a, tokens_b, max_num_tokens)

            assert len(tokens_a) >= 1
            assert len(tokens_b) >= 1

            # Add special tokens around / between the two segments.
            input_ids = tokenizer.build_inputs_with_special_tokens(tokens_a, tokens_b)
            # Token type ids: 0 for segment A, 1 for segment B.
            token_type_ids = tokenizer.create_token_type_ids_from_sequences(tokens_a, tokens_b)

            # The NSP part of the example is complete here; MLM masking is
            # applied later by the data collator.
            example = {
                "input_ids": torch.tensor(input_ids, dtype=torch.long),
                "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
                "next_sentence_label": torch.tensor(1 if is_random_next else 0, dtype=torch.long),
            }

            sink.append(example)
            created.append(example)

            current_chunk = []
            current_length = 0

        i += 1

    return created

In [15]:
# Example construction is stochastic (segment split point, random "next"
# segments, truncation side), so seed Python's RNG to make the generated
# dataset reproducible across kernel restarts.
SEED = 42
random.seed(SEED)

BLOCK_SIZE = 256             # maximum example length, special tokens included
SHORT_SEQ_PROBABILITY = 0.1  # chance of using a shorter target length for a document
NSP_PROBABILITY = 0.5        # chance of pairing segment A with a random segment B

# `create_examples_from_document` appends its results to this list.
examples = []
for doc_index, document in enumerate(documents):
    create_examples_from_document(document=document, doc_index=doc_index, block_size=BLOCK_SIZE, tokenizer=tokenizer, short_seq_probability=SHORT_SEQ_PROBABILITY, nsp_probability=NSP_PROBABILITY)

In [16]:
# Total number of sentence-pair examples generated from all documents.
len(examples)

663090

In [17]:
# Inspect the first example: input_ids, token_type_ids and next_sentence_label tensors.
examples[0]

{'input_ids': tensor([   101,   9461,  17138,   9096,  68833,  12310,   9665,  25934,  34907,
          58303,  48345,    119,   8865,  29364,   9283,  37114,  20729,   9258,
          13890,   9283,  37114,  12092,   9682,   9891,  10739,  23665,  19105,
           9821,  14801,  11102,   9924,  54355,   9408, 119236,   9186,  15184,
          11261,   8848,  17138,  11467,   9367,  19855,  12310,    100,    119,
          25701,   9707,  14279,  37712,  27023, 118683,  12424,   9665,  14646,
           8843,  96535,  11664,   9274,  15184,   9755,  87281,  21406,    117,
           8888,  46216,   9757,  30134,   9798, 100329,  18382,  11018,   8908,
          69592,  16323, 119085,  48345,    119,   8843,  45465,   9663, 118878,
          11102,   9924,   9519,  25503,  28578,   9254,  10892,   9691,  15891,
          33188,  48345,    119,   9121,  71013, 118786, 106249,  10739,   9640,
          14871,   8938,  74311,   8908,  10739,  11261,   9353,  11261,  18392,
           9692

In [18]:
from transformers import DataCollatorForLanguageModeling

# Masked-language-model collator: masking is applied on the fly to each
# batch, with 15% of the tokens selected.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [19]:
# Collate a single example to inspect what a masked batch looks like.
# Masking is random, so every call produces a different result.
print(data_collator(examples[0:1]))

{'input_ids': tensor([[   101,   9461,  17138,   9096,  68833,  12310,   9665,    103,  34907,
          58303,  48345,    103,    103,    103,   9283,  77237,  20729,   9258,
          13890,   9283,  37114,    103,    103,   9891,  10739,  23665,    103,
            103,  14801,  11102,   9924,  54355,   9408, 119236,   9186,  15184,
          11261,    103,  17138,  11467,   9367,  19855,  12310,    100,    119,
          25701,   9707,  14279,  37712,    103, 118683,  12424,   9665,  14646,
           8843,  96535,  11664,   9274,  15184,   9755,  87281,    103,    117,
           8888,  46216,    103,  30134,   9798,    103,  18382,  11018,   8908,
          69592,  16323, 119085,  48345,    119,    103,  45465,   9663, 118878,
          11102,   9924,   9519,  25503,  28578,   9254,  10892,   9691,  15891,
          33188,  48345,    103,   9121,    103,    103, 106249,  10739,   9640,
          14871,   8938,  74311,   8908,  10739,    103,   9353,  11261,  18392,
           969

In [20]:
# Same check, showing only the masked input ids of the single collated example
# (the mask positions differ from the previous cell because masking is re-sampled).
print(data_collator(examples[0:1])['input_ids'][0])

tensor([   101,   9461,  17138,   9096,  68833,  12310,    103,  25934,  34907,
         58303,  48345,    119,  82718,  29364,   9283,    103,    103,   9258,
         13890,   9283,  37114,  12092,   9682,   9891,  10739,  23665,  19105,
          9821,  14801,  11102,   9924,  54355,   9408, 119236,   9186,  15184,
         11261,   8848,  17138,    103,   9367,  19855,  12310,    100,    103,
         25701,   9707,  14279,  37712,  27023, 118683,  12424,   9665,    103,
          8843,  96535,  11664,   9274,  15184,   9755,  87281,  21406,    117,
          8888,    103,   9757,  30134,   9798, 100329,    103,  11018,   8908,
         69592,  16323, 119085,  48345,    119,   8843,  45465,   9663, 118878,
         11102,   9924,   9519,  25503,  28578,   9254,  10892,    103,  15891,
           103,  48345,    103,   9121,  71013, 118786, 106249,  10739,   9640,
         14871,   8938,  74311,   8908,  10739,  11261,   9353,  11261,  18392,
          9692,   9524,    103,  41850, 

In [21]:
# Id of the [MASK] token, i.e. the value that marks masked positions in the tensors above.
tokenizer.mask_token_id

103

In [22]:
# Decode a freshly masked example back to text to eyeball where [MASK]
# (and occasional replacement tokens) ended up.
tokenizer.decode(data_collator(examples[0:1])['input_ids'][0].tolist())

'[CLS] 숙성 돼 [MASK]기 전tott점입 驥. 건물 모양 때문에 매장 모양도 좀 특이하지만 쾌적한 편이고 살짝 레트로 [MASK]성으로 분위 [MASK] [UNK]. 모든 직원분들께서 [MASK]부 가능하다고 멘트 쳐주시며, 고기는 초반 커팅까지는 구 [MASK]주십니다. 가격 저렴한 편 아 [MASK]지만 맛은лна [MASK]합니다. 등 [MASK]덧살이 인상 깊었는데 구이로 별로일 줄 알았는데 육향 [MASK]고 얇게 저며 [UNK] 않았습니다. 하이 [MASK]이트는 Mil장찌 [MASK]. [SEP] [MASK]론 내 입맛에만 그럴 수도 있지만 여기에 [MASK] 비벼 [MASK]으면... 눈 [MASK] 난 [MASK]. 또 먹고 싶다.... [MASK] 튀긴 가지에 중국식 소스... 굿!! [MASK] 혹시ādi대에 갔는데 [MASK]엇을 먹을지 계 芸이 없 [MASK] 중국음 [MASK]이 당긴다면 가 [MASK]는 걸 추천한다. 아무 리뷰 [SEP]'

In [23]:
# Continue pre-training from the published multilingual checkpoint rather than from scratch.
PRETRAINED_CHECKPOINT = 'bert-base-multilingual-cased'
model = BertForPreTraining.from_pretrained(PRETRAINED_CHECKPOINT)

Some weights of BertForPreTraining were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
from transformers import Trainer, TrainingArguments

In [25]:
# Training configuration: 2 epochs, batch of 4 per device, a checkpoint every
# 1000 steps (only the 2 most recent are kept) and a loss log every 100 steps.
training_args = TrainingArguments(
    output_dir='./review_pre/model_output',
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=4,
    save_steps=1000,
    save_total_limit=2,
    logging_steps=100,
)

# The collator applies MLM masking per batch; `examples` already carries the NSP labels.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=examples,
)

In [26]:
# Run training and persist the final weights in the same cell. Saving from a
# separate cell is fragile: if the kernel is restarted after a long run,
# `trainer` no longer exists and only intermediate checkpoints remain.
trainer.train()
trainer.save_model(training_args.output_dir)

***** Running training *****
  Num examples = 663090
  Num Epochs = 2
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 82888


Step,Training Loss
100,3.3406
200,3.0375
300,2.8641
400,2.8722
500,2.7358
600,2.6967
700,2.6291
800,2.621
900,2.5734
1000,2.5086


Saving model checkpoint to ./review_pre/model_output/checkpoint-1000
Configuration saved in ./review_pre/model_output/checkpoint-1000/config.json
Model weights saved in ./review_pre/model_output/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./review_pre/model_output/checkpoint-2000
Configuration saved in ./review_pre/model_output/checkpoint-2000/config.json
Model weights saved in ./review_pre/model_output/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./review_pre/model_output/checkpoint-3000
Configuration saved in ./review_pre/model_output/checkpoint-3000/config.json
Model weights saved in ./review_pre/model_output/checkpoint-3000/pytorch_model.bin
Deleting older checkpoint [review_pre/model_output/checkpoint-1000] due to args.save_total_limit
Saving model checkpoint to ./review_pre/model_output/checkpoint-4000
Configuration saved in ./review_pre/model_output/checkpoint-4000/config.json
Model weights saved in ./review_pre/model_output/checkpoint-4000/pytor

In [1]:
# Write the final model to the output directory.
# NOTE: this needs the live `trainer` object from the training session; run on
# a fresh kernel (execution count 1) it fails with the NameError recorded below.
trainer.save_model('./review_pre/model_output')

NameError: name 'trainer' is not defined