In [2]:
# GPT 모델을 원하는 자연어 데이터들을 이용해 사전학습

In [1]:
!pip install transformers



In [1]:
!mkdir my_data
!curl -c ./cookie -s -L "https://drive.google.com/uc?export=download&id=1zib1GI8Q5wV08TgYBa2GagqNh4jyfXZz" > /dev/null
!curl -Lb ./cookie "https://drive.google.com/uc?export=download&confirm=`awk '/download/ {print $NF}' ./cookie`&id=1zib1GI8Q5wV08TgYBa2GagqNh4jyfXZz" -o my_data/wiki_20190620_small.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1323k  100 1323k    0     0   516k      0  0:00:02  0:00:02 --:--:-- 1246k


In [2]:
path = "/content/my_data/wiki_20190620_small.txt"

# BERT 모델

In [3]:
from tokenizers import SentencePieceBPETokenizer
from tokenizers.normalizers import BertNormalizer

tokenizer = SentencePieceBPETokenizer()

tokenizer._tokenizer.normalizer = BertNormalizer(clean_text=True,
handle_chinese_chars=False,
lowercase=False)

tokenizer.train(
    path,
    vocab_size=10000,
    special_tokens=[
        "<s>", # 문장의 시작
        "<pad>", # 패딩 토큰
        "</s>", # 문장의 끝
        "<unk>", # 사전에 없는 토큰
    ],
)

In [4]:
print(tokenizer.encode("이순신은 조선 중기의 무신이다.").ids)
print(tokenizer.encode("이순신은 조선 중기의 무신이다.").tokens)
print(tokenizer.decode(tokenizer.encode("<s>이순신은 조선 중기의 무신이다.</s>").ids, skip_special_tokens=True))
# SentencePiece를 사용하면, 나중에 decoding 과정에서 '_' 만 ' '로 replace해주면 띄어쓰기 복원이 가능해짐.

[1005, 578, 6613, 1303, 1041, 2071, 1136, 595, 1033]
['▁이', '순', '신은', '▁조선', '▁중', '기의', '▁무', '신', '이다.']
이순신은 조선 중기의 무신이다.


In [6]:
tokenizer.save_model('/content/drive/MyDrive/1자연어처리/sentencepiece_tokenizer')

In [7]:
tokenizer = SentencePieceBPETokenizer.from_file(vocab_filename='/content/drive/MyDrive/1자연어처리/sentencepiece_tokenizer/vocab.json',
                                                merges_filename='/content/drive/MyDrive/1자연어처리/sentencepiece_tokenizer/merges.txt')

In [8]:
print(tokenizer.encode("이순신은 조선 중기의 무신이다."))
print(tokenizer.encode("이순신은 조선 중기의 무신이다.").ids)
print(tokenizer.encode("<s>이순신은 조선 중기의 무신이다.</s>").tokens)
print(tokenizer.decode(tokenizer.encode("<s>이순신은 조선 중기의 무신이다.</s>").ids, skip_special_tokens=True))

Encoding(num_tokens=9, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
[1005, 579, 6613, 1303, 1041, 2071, 1136, 596, 1033]
['▁<', 's', '>', '이', '순', '신은', '▁조선', '▁중', '기의', '▁무', '신', '이다.', '<', '/s', '>']
<s>이순신은 조선 중기의 무신이다.</s>


In [9]:
tokenizer.add_special_tokens(["<s>", "</s>", "<unk>", "<pad>", "<shkim>"])
tokenizer.pad_token_id = tokenizer.token_to_id("<pad>")
tokenizer.unk_token_id = tokenizer.token_to_id("<unk>")
tokenizer.bos_token_id = tokenizer.token_to_id("<s>")
tokenizer.eos_token_id = tokenizer.token_to_id("</s>")

print(tokenizer.encode("<s>이순신은 조선 중기의 무신이다.</s>").ids)
print(tokenizer.encode("<s>이순신은 조선 중기의 무신이다.</s>").tokens)
print(tokenizer.decode(tokenizer.encode("<s>이순신은 조선 중기의 무신이다.</s>").ids, skip_special_tokens=True))

[0, 1005, 579, 6613, 1303, 1041, 2071, 1136, 596, 1033, 2]
['<s>', '▁이', '순', '신은', '▁조선', '▁중', '기의', '▁무', '신', '이다.', '</s>']
이순신은 조선 중기의 무신이다.


# GPT-2 불러오기

In [10]:
from transformers import GPT2Config, GPT2LMHeadModel

config = GPT2Config(
  vocab_size=tokenizer.get_vocab_size(),
  bos_token_id=tokenizer.token_to_id("<s>"),
  eos_token_id=tokenizer.token_to_id("</s>"),
)

model = GPT2LMHeadModel(config)

In [11]:
model.num_parameters()

93523200

In [12]:
import json
import os
import pickle
import random
import time
import warnings
from typing import Dict, List, Optional

import torch
from torch.utils.data.dataset import Dataset

from filelock import FileLock

from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.utils import logging

In [13]:
logger = logging.get_logger(__name__)

class TextDataset(Dataset):
    """
    This will be superseded by a framework-agnostic approach soon.
    """

    def __init__(
        self,
        tokenizer: PreTrainedTokenizer,
        file_path: str,
        block_size: int,
        overwrite_cache=False,
        cache_dir: Optional[str] = None,
    ):
        assert os.path.isfile(file_path), f"Input file path {file_path} not found"

        block_size = block_size - tokenizer.num_special_tokens_to_add(is_pair=False)

        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            cache_dir if cache_dir is not None else directory,
            "cached_lm_{}_{}_{}".format(
                tokenizer.__class__.__name__,
                str(block_size),
                filename,
            ),
        )

        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache.
        lock_path = cached_features_file + ".lock"
        with FileLock(lock_path):

            if os.path.exists(cached_features_file) and not overwrite_cache:
                start = time.time()
                with open(cached_features_file, "rb") as handle:
                    self.examples = pickle.load(handle)
                logger.info(
                    f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start
                )

            else:
                logger.info(f"Creating features from dataset file at {directory}")
                # 여기서부터 본격적으로 데이터셋을 만들기 시작
                self.examples = []
                text = ""
                with open(file_path, encoding="utf-8") as f:
                    lines = f.readlines()
                    for line in lines:
                        line = line.strip()
                        line = "<s>"+line+"</s>" # 학습 데이터 앞 뒤에 문장 구분 기호를 추가해
                        text += line    # 'text' 객체에 모든 학습 데이터를 다 합쳐버림
                tokenized_text = tokenizer.encode(text).ids

                # 모델의 최대 sequence length만큼 데이터를 잘라서 저장
                for i in range(0, len(tokenized_text) - block_size + 1, block_size):  # Truncate in block of block_size
                    self.examples.append(
                        tokenized_text[i : i + block_size]
                    )
                # Note that we are losing the last truncated example here for the sake of simplicity (no padding)
                # If your dataset is small, first you should look for a bigger one :-) and second you
                # can change this behavior by adding (model specific) padding.

                start = time.time()
                with open(cached_features_file, "wb") as handle:
                    pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
                logger.info(
                    "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
                )

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i) -> torch.Tensor:
        return torch.tensor(self.examples[i], dtype=torch.long)

In [14]:
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=path,
    block_size=128,
)

from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(    # GPT는 생성모델이기 때문에 [MASK] 가 필요 없음
    tokenizer=tokenizer, mlm=False,)

In [15]:
print(dataset[0])

tensor([   0, 3997, 3546, 8404,  464,    5, 5481, 9525, 1798, 1890, 2297, 1262,
        9623, 2679, 1188, 2174,    2,    0, 5709, 5481,  256, 6466,  751, 3426,
         872, 1556,  681,  895, 1627, 9220,  588, 3621, 1010, 3303,    2,    0,
        6466, 7416, 2305,  404, 2217, 1074,    2,    0, 1013, 1107, 3716,  647,
        8574, 1024,  940,   94, 7321,  372,   94,  722, 9292,  706, 1651,  454,
        3166, 1032, 1074,    2,    0, 6343, 1262, 3716, 1009, 2932, 1176,  913,
        2037, 1171, 3227,  844,   94,  440,  974, 1486, 1017,    3, 1323, 3914,
        2095, 1042,    2,    0, 1382, 2068, 2225, 1095,  327,  844, 1823,  507,
           5, 1240, 7696,    2,    0, 3897, 6466, 1053, 1077,  687, 2318, 4649,
        5204, 5672, 1013, 1759,  116, 2742, 3004,  105,  656, 2283, 9762, 1192,
        1796, 2449, 2546, 9936, 6466, 1053, 1037,  534])


# GPT-2 학습

In [16]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='model_output',
    overwrite_output_dir=True,
    num_train_epochs=50,
    per_device_train_batch_size=64, # 512:32  # 128:64
    save_steps=1000,
    save_total_limit=2,
    logging_steps=100

)

In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)


In [18]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjeongrak5[0m ([33mjeongrak5-not[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,7.7338
200,7.0871
300,6.6259
400,6.2263
500,5.8927
600,5.5934
700,5.3187
800,5.0853
900,4.8793
1000,4.6858


TrainOutput(global_step=1500, training_loss=5.382701314290364, metrics={'train_runtime': 2266.4257, 'train_samples_per_second': 41.85, 'train_steps_per_second': 0.662, 'total_flos': 6195887308800000.0, 'train_loss': 5.382701314290364, 'epoch': 50.0})

In [19]:
trainer.save_model('/content/drive/MyDrive/1자연어처리/gpt-2')

In [20]:
use_GPU = 1

device = torch.device("cuda" if (torch.cuda.is_available() and use_GPU) else "cpu")

# GPT-2 사용

허깅페이스에서 제공해주는 GPT의 경우 generate라는 함수를 제공

input_ids는 시작 토큰을 너어주고 모델에게 제공해주면 자동으로 뒷부분을 생성.

generate의 설정에 따라 생성되는 방식이 변할 수 있음

In [22]:
import torch
torch.manual_seed(42)

# Device configuration
device = torch.device('cuda' if (torch.cuda.is_available() and use_GPU) else 'cpu')

input_ids = torch.tensor(tokenizer.encode("<s>이순신", add_special_tokens=True).ids).unsqueeze(0).to('cuda')

output_sequences = model.generate(input_ids=input_ids, do_sample=True, max_length=100, num_return_sequences=3)
for generated_sequence in output_sequences:
    generated_sequence = generated_sequence.tolist()
    print("GENERATED SEQUENCE : {0}".format(tokenizer.decode(generated_sequence, skip_special_tokens=True)))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


GENERATED SEQUENCE : 이순신 아도 대위전상을 했지만 고졸에서 연영노는 그러나 "유로 모시한 동시 대작부터 농우상장 김수산운 km2경우절·북스나지만 터키시라들 중중중대하지 못변치에서는 위키가동기수전경정·국군들이 소 사건구위상사 등을 진행하면서 편시 대통령직대 총선에서 부전·파 등을 통해 일간 휴 박
GENERATED SEQUENCE : 이순신할 수 있는 나다.
GENERATED SEQUENCE : 이순신치, 김의 한국, 사회량, 영동과 기·구동 등을 멸망력이 큰 영향을 끼 점령 등과 등의 다양한 통일질들의 제안했다.
