### KoGPT2를 활용한 트윗 학습 실행
이 문서는 더 빠른 작성을 위해 다른 프로젝트의 .ipynb와 달리 Colab으로 작성되었다.

In [None]:
# Install the packages this notebook needs (shell commands run via the notebook's `!` escape).
!pip install gluonnlp
!pip install sentencepiece
!pip install transformers
!pip install mxnet

In [2]:
#라이브러리 import
import gluonnlp as nlp
from gluonnlp.data import SentencepieceTokenizer
from transformers import TFGPT2LMHeadModel
import tensorflow as tf
import numpy as np
import os

In [3]:
# Keras wrapper that builds the GPT-2 language model from a checkpoint directory.
class GPT2Model(tf.keras.Model):
    """Thin ``tf.keras.Model`` around a pretrained ``TFGPT2LMHeadModel``.

    Args:
        dir_path: Location passed straight to
            ``TFGPT2LMHeadModel.from_pretrained``.
    """

    def __init__(self, dir_path):
        super().__init__()
        self.gpt2 = TFGPT2LMHeadModel.from_pretrained(dir_path)

    def call(self, inputs):
        """Run the wrapped model and return only the first element of its output.

        The rest of the notebook treats that element as per-position
        vocabulary logits.
        """
        model_outputs = self.gpt2(inputs)
        return model_outputs[0]

In [None]:
# Download the pretrained parameters as a zip archive and unpack it.
!wget https://www.dropbox.com/s/nzfa9xpzm4edp6o/gpt_ckpt.zip -O gpt_ckpt.zip
!unzip -o gpt_ckpt.zip

In [5]:
# Set the checkpoint location and load the pretrained model from it.
BASE_MODEL_PATH = './gpt_ckpt'  # presumably the directory produced by unpacking gpt_ckpt.zip
gpt_model = GPT2Model(BASE_MODEL_PATH)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at ./gpt_ckpt.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [6]:
# Training hyperparameters plus the tokenizer and vocabulary of the pretrained model.
BATCH_SIZE = 16
NUM_EPOCHS = 5
MAX_LEN = 40
TOKENIZER_PATH = './gpt_ckpt/gpt2_kor_tokenizer.spiece'

# Special tokens handed to the vocabulary; mask/sep/cls are explicitly disabled.
_special_tokens = dict(
    mask_token=None,
    sep_token=None,
    cls_token=None,
    unknown_token='<unk>',
    padding_token='<pad>',
    bos_token='<s>',
    eos_token='</s>',
)

tokenizer = SentencepieceTokenizer(TOKENIZER_PATH, num_best=0, alpha=0)
vocab = nlp.vocab.BERTVocab.from_sentencepiece(TOKENIZER_PATH, **_special_tokens)

In [7]:
# Sampling helpers: logit filtering and sentence generation with the tokenizer and model.
def tf_top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-99999):
    """Suppress unlikely tokens in a logits vector before sampling.

    Args:
        logits: Eager tensor of vocabulary logits; expected to be 1-D
            (the caller passes ``outputs[0]``).
        top_k: When > 0, every entry smaller than the k-th largest logit is
            overwritten with ``filter_value``. Clamped to the vocabulary size.
        top_p: When > 0, tokens are ranked by descending logit and those past
            the point where the cumulative softmax probability exceeds
            ``top_p`` are overwritten; the first token crossing the threshold
            is kept. The ranking uses the original ``logits``, not the
            top-k-filtered copy.
        filter_value: Value written into suppressed positions.

    Returns:
        A constant tensor holding the filtered logits with one extra leading
        axis (``[filtered]``).
    """
    filtered = logits.numpy()
    top_k = min(top_k, logits.shape[-1])
    if top_k > 0:
        # Smallest logit that still belongs to the k best ones.
        kth_largest = tf.math.top_k(logits, top_k)[0][..., -1, None]
        below_kth = logits < kth_largest
        filtered[below_kth] = filter_value

    if top_p > 0.0:
        ranked_ids = tf.argsort(logits, direction='DESCENDING')
        ranked_logits = tf.sort(logits, direction='DESCENDING')
        cumulative = tf.math.cumsum(tf.nn.softmax(ranked_logits, axis=-1), axis=-1)

        past_budget = cumulative > top_p
        # Shift right by one so the token that first exceeds top_p survives.
        past_budget = tf.concat([[False], past_budget[..., :-1]], axis=0)
        dropped_ids = ranked_ids[past_budget].numpy().tolist()

        filtered[dropped_ids] = filter_value
    return tf.constant([filtered])

# seed_word: the text the generated sentence starts with.
def generate_sent(seed_word, model, max_step=100, greedy=False, top_k=0, top_p=0.):
    """Extend ``seed_word`` one token at a time using ``model``.

    Uses the module-level ``tokenizer`` and ``vocab``.

    Args:
        seed_word: Starting text.
        model: Callable that maps a ``(1, length)`` id tensor to logits
            indexable as ``[:, -1, :]``.
        max_step: Maximum number of tokens to append.
        greedy: If true, always take the arg-max token (tends to repeat);
            otherwise sample after top-k/top-p filtering.
        top_k: Forwarded to ``tf_top_k_top_p_filtering``.
        top_p: Forwarded to ``tf_top_k_top_p_filtering``.

    Returns:
        The seed text followed by the generated continuation.
    """
    bos_id = vocab[vocab.bos_token]
    text = seed_word
    pieces = tokenizer(text)

    for _ in range(max_step):
        # Input: BOS followed by the ids of the text so far, with a batch axis.
        input_ids = tf.constant([bos_id] + vocab[pieces])[None, :]
        # Only the logits of the final position decide the next token.
        last_logits = model(input_ids)[:, -1, :]
        if greedy:
            best_id = tf.argmax(last_logits, axis=-1).numpy().tolist()[0]
            next_piece = vocab.to_tokens(best_id)
        else:
            filtered = tf_top_k_top_p_filtering(last_logits[0], top_k=top_k, top_p=top_p)
            sampled_ids = tf.random.categorical(filtered, 1).numpy().tolist()[0]
            next_piece = vocab.to_tokens(sampled_ids)[0]
        if next_piece == '</s>':
            # End-of-sentence token: stop generating.
            break
        text += next_piece.replace('▁', ' ')
        pieces = tokenizer(text)

    return text

In [8]:
# Upload the data file into the Colab runtime.
from google.colab import files
uploaded = files.upload()
import io

Saving edited_twit.txt to edited_twit.txt


In [10]:
# Load the tweet corpus, one sentence per line.
# The file is opened in a `with` block so the handle is always closed, and only
# a trailing newline is stripped, so a final line without one keeps its last
# character.
with open('edited_twit.txt', encoding='UTF8') as tweet_file:
    sents = [line.rstrip('\n') for line in tweet_file]

In [12]:
# Encode every sentence as BOS + token ids + EOS, then split it into the model
# input (all ids but the last) and the target (all ids but the first).
bos_id = vocab[vocab.bos_token]
eos_id = vocab[vocab.eos_token]
encoded_sents = [[bos_id] + vocab[tokenizer(s)] + [eos_id] for s in sents]

input_data = [ids[:-1] for ids in encoded_sents]
output_data = [ids[1:] for ids in encoded_sents]

In [13]:
# Pad every sequence to MAX_LEN with the padding-token id.
# padding='post' puts the pad ids after the real tokens. The default ('pre')
# would place them in front, so training rows would begin with padding while
# generate_sent always starts at BOS in position 0; GPT-2 uses absolute
# position embeddings, so right-padding keeps training and generation aligned.
# Truncation is left at the pad_sequences default.
pad_id = vocab[vocab.padding_token]
input_data = tf.keras.preprocessing.sequence.pad_sequences(
    input_data, MAX_LEN, padding='post', value=pad_id)
output_data = tf.keras.preprocessing.sequence.pad_sequences(
    output_data, MAX_LEN, padding='post', value=pad_id)

input_data = np.array(input_data, dtype=np.int64)
output_data = np.array(output_data, dtype=np.int64)

In [14]:
# Loss and accuracy-metric objects used for fine-tuning.
# reduction='none' keeps the loss unreduced so loss_function can mask padding
# before averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    reduction='none',
    from_logits=True,
)

train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')

def loss_function(real, pred):
    """Sparse categorical cross-entropy with padding positions zeroed out.

    Args:
        real: Target token ids; positions equal to the padding id are masked.
        pred: Model outputs, passed to ``loss_object`` (built with
            ``from_logits=True``).

    Returns:
        Scalar tensor: the mean of the masked per-position losses. The mean is
        taken over every position, so padded positions count as zeros in it.
    """
    per_position = loss_object(real, pred)

    is_padding = tf.math.equal(real, vocab[vocab.padding_token])
    keep = tf.cast(tf.math.logical_not(is_padding), dtype=per_position.dtype)

    return tf.reduce_mean(per_position * keep)

def accuracy_function(real, pred):
    """Token-level accuracy of one batch, ignoring padding positions.

    The value is computed statelessly from the arguments: the predicted id is
    the arg-max over the last axis of ``pred``, and only positions whose
    target differs from the padding id are counted. Earlier, padded positions
    were zeroed but still scored, and a shared metric object accumulated
    results across batches and epochs.

    Args:
        real: Target token ids.
        pred: Model outputs with the vocabulary on the last axis.

    Returns:
        Scalar float tensor: correct non-padding positions divided by the
        number of non-padding positions (0 when everything is padding).
    """
    # Cast defensively in case the labels arrive in a float dtype.
    real = tf.cast(real, tf.int64)
    keep = tf.cast(tf.math.not_equal(real, vocab[vocab.padding_token]), tf.float32)

    predicted_ids = tf.argmax(pred, axis=-1)  # int64 by default
    hits = tf.cast(tf.math.equal(real, predicted_ids), tf.float32) * keep

    # divide_no_nan yields 0 instead of NaN for an all-padding batch.
    return tf.math.divide_no_nan(tf.reduce_sum(hits), tf.reduce_sum(keep))

In [15]:
# Compile the model with the masked loss, Adam (1e-4) and the accuracy metric.
optimizer = tf.keras.optimizers.Adam(1e-4)
gpt_model.compile(
    optimizer=optimizer,
    loss=loss_function,
    metrics=[accuracy_function],
)

In [16]:
# Run fine-tuning on the padded inputs and targets.
history = gpt_model.fit(
    x=input_data,
    y=output_data,
    epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [17]:
# Save the fine-tuned model, then build a second model from the saved files.
DATA_OUT_PATH = './data_out'
model_name = 'twit_model'
save_path = os.path.join(DATA_OUT_PATH, model_name)

# exist_ok=True creates the directory only when it is missing, without a
# separate check-then-create step.
os.makedirs(save_path, exist_ok=True)

gpt_model.gpt2.save_pretrained(save_path)

loaded_gpt_model = GPT2Model(save_path)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at ./data_out/twit_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [18]:
# Generate one continuation per seed text with the default (sampling) settings,
# printing a separator line after each result.
seed_texts = (
    '프로젝트',
    '마라탕 존맛탱',
    '어젯밤은 유난히 쌀쌀하여 버티기 어려웠다.',
)
for seed_text in seed_texts:
    print(generate_sent(seed_text, gpt_model))
    print('-'*32)

프로젝트팀/피겨스케이팅 국가대항전이 두 나라에만 집중되는 것 같아 개선해야 한다고 보는가 하면 두 나라 모두 올림픽 출전권을 따기 위해 메달은 꼭 따야지 한다는 식으로 접근하는...
--------------------------------
마라탕 존맛탱이 구몬같은 거 해먹어도 구럼 나 미스릴 좀 해줘 시발 릴렉합디다
--------------------------------
어젯밤은 유난히 쌀쌀하여 버티기 어려웠다.
--------------------------------
