In [None]:
!python --version

In [1]:
import tensorflow as tf

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    tf.config.experimental.set_memory_growth(gpus[0], True)
  except RuntimeError as e:
    # 프로그램 시작시에 메모리 증가가 설정되어야만 합니다
    print(e)

In [2]:
'''공통'''
import torch
import numpy as np
import pandas as pd

from tqdm import tqdm, tqdm_notebook
import tqdm

'''GPT-2'''
#import tensorflow as tf
from transformers import TFGPT2LMHeadModel, AutoTokenizer, GPT2LMHeadModel

'''KoBERT'''
from torch import nn
#import torch.nn.functional as F
#import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp

from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

## 2. 파인튜닝된 KoBERT 로드

In [3]:
# 학습 데이터 전처리 클래스
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))

In [4]:
# KoBERT 모델 클래스(분류기)
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=3,   ##클래스 수 조정##
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [5]:
# vocabulary 불러오기
_, vocab = get_pytorch_kobert_model()

using cached model. /home/lab06/세미프로젝트/GPT3 챗봇/ChatBot_GPT3 & 2/.cache/kobert_v1.zip
using cached model. /home/lab06/세미프로젝트/GPT3 챗봇/ChatBot_GPT3 & 2/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [6]:
### 모델 로드 코드
model_kobert = torch.load('./model/KoBERT_cls_model.pt')
model_kobert.eval()

BERTClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True

In [7]:
#GPU 사용
device = torch.device("cuda:0")

In [8]:
# params 설정
max_len = 64
batch_size = 64

In [9]:
# 토크나이저 로드
tokenizer_kobert = get_tokenizer()
tok_kobert = nlp.data.BERTSPTokenizer(tokenizer_kobert, vocab, lower=False)

using cached model. /home/lab06/세미프로젝트/GPT3 챗봇/ChatBot_GPT3 & 2/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [10]:
# 감정 예측 함수
def predict_emotion(predict_sentence):
    
    result_emo = None
    
    data = [predict_sentence, '0']
    dataset_another = [data]

    another_test = BERTDataset(dataset_another, 0, 1, tok_kobert, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(another_test, batch_size=batch_size, num_workers=5)
    
    #model.eval()

    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_dataloader):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)

        valid_length= valid_length
        label = label.long().to(device)

        out = model_kobert(token_ids, valid_length, segment_ids)


        test_eval=[]
        for i in out:
            logits=i
            logits = logits.detach().cpu().numpy()

            if np.argmax(logits) == 0:
                test_eval.append("분노가")
                result_emo = '분노'
            elif np.argmax(logits) == 1:
                test_eval.append("행복이")
                result_emo = '행복'
            elif np.argmax(logits) == 2:
                test_eval.append("슬픔이")
                result_emo = '슬픔'
                
        print(">> 입력하신 내용에서 " + test_eval[0] + " 느껴집니다.")
    
    return result_emo

## 3. 파인튜닝된 GPT2 (제목생성) 로드

In [11]:
# GPT2 토크나이저 로드
gpt2_tokenizer = AutoTokenizer.from_pretrained('skt/kogpt2-base-v2', bos_token='</s>', eos_token='</s>', pad_token='<pad>')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
# gpt2_title_model 로드
gpt2_title_model = TFGPT2LMHeadModel.from_pretrained('./model/Gen_title_GPT2_model.h5')

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at ./model/Gen_title_GPT2_model.h5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [103]:
# 제목 생성
def gen_title(user_emotion):
    sent = '<usr>' + user_emotion + '<sys>'
    input_ids = [gpt2_tokenizer.bos_token_id] + gpt2_tokenizer.encode(sent)
    input_ids = tf.convert_to_tensor([input_ids])
    output = gpt2_title_model.generate(input_ids, max_length=50, do_sample=True, temperature=0.85, top_p=0.80, repetition_penalty=2.0)
    sentence = gpt2_tokenizer.decode(output[0].numpy().tolist())
    gened_title = sentence.split('<sys> ')[1].replace('</s>', '')
    return gened_title

## 4. 파인튜닝된 GPT2 첫문장생성 로드

In [14]:
gpt2_sent_model = TFGPT2LMHeadModel.from_pretrained('./model/Gen_sent_GPT2_model.h5')

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at ./model/Gen_sent_GPT2_model.h5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [39]:
# 문장생성
def gen_sent(user_emotion):
    sent = '<usr>' + user_emotion + '<sys>'
    input_ids = [gpt2_tokenizer.bos_token_id] + gpt2_tokenizer.encode(sent)
    input_ids = tf.convert_to_tensor([input_ids])
    output = gpt2_sent_model.generate(input_ids, max_length=50, do_sample=True, temperature=0.85, top_p=0.80)
    sentence = gpt2_tokenizer.decode(output[0].numpy().tolist())
    gened_sent = sentence.split('<sys> ')[1].replace('</s>', '')
    return gened_sent

## 5. 파인튜닝된 GPT2 소설생성 로드

In [16]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel

In [17]:
gpt2_novel_tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2",
                                                    bos_token='</s>', eos_token='</s>', unk_token='<unk>',
                                                    pad_token='<pad>', mask_token='<mask>') 

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [18]:
gpt2_novel_model = GPT2LMHeadModel.from_pretrained('./model/Gen_novel_GPT2_model')

In [19]:
gpt2_novel_model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [47]:
def gen_novel(usr_txt):
    input_ids = gpt2_novel_tokenizer.encode(usr_txt)
    gen_ids = gpt2_novel_model.generate(torch.tensor([input_ids]),
                               max_length=128, # generate 할 개수
                               repetition_penalty=2.0, # 단어 반복시 패널티를 주어서 새로운 단어를 생성
                               temperature = 0.85,
                               top_p = 0.80,
                               do_sample = True, 
                               pad_token_id=gpt2_novel_tokenizer.pad_token_id,
                               eos_token_id=gpt2_novel_tokenizer.eos_token_id,
                               bos_token_id=gpt2_novel_tokenizer.bos_token_id,
                               use_cache=True)
    generated = gpt2_novel_tokenizer.decode(gen_ids[0,:].tolist())
    
    return generated

## 6. 소설 생성

In [41]:
# 1. 사용자 입력
user_text = "오늘 너무 행복해~~!"

In [42]:
# 2. 사용자 입력 감정 분석
user_emotion = predict_emotion(user_text)
user_emotion



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

'행복'

In [105]:
# 3. 제목 생성
user_title = gen_title(user_emotion)
user_title

'달콤한 오프 더 레코드 2'

In [102]:
# 4. 첫 문장 생성
user_sent = gen_sent(user_emotion)
user_sent

'<unk>네는’ 이렇게 한다!'

In [79]:
# 5. 소설 생성
tmp = gen_novel(user_sent)
tmp

'--- p.403나라는 인간이 얼마나 건강하게 유지될 수 있을까? 「“왜 자네는 그런 말을 했을까?”「그렇습니다 」중에서아뇨, 라고 대답한 사람은 그 사람이 아니라 그의 어머니였으니 어쩌면 이 편지를 받은 것은 그녀의 아버지가 보낸 편지에 대한 답장일지도 몰랐지요 “뭐라고요?” 내가 물었어 나는 고개를 갸웃거렸는데 어머니가 말했네 “아니에요, 당신은 내 딸이 아닙니다, 저는 제 딸이에요. 우리 어머니는...... 에미코 씨와는 전혀 관계없는 분이지 않습니까? 그렇죠 하지만 당신의 편지는 나보다 훨씬 더 훌륭합'

In [57]:
import re

tmp.replace('\n', '')
tmp.replace('<unk>', '')

'그리고 고맙다는 말과 함께 얼굴을 닦았다...... 하지만 그 표정에도 기대와 설렘이 동시에 배어 있었다 바로 그때 문득, 누군가 자신을 찾아왔을 때처럼 자신이 누구인지 알 수 없는 것이 떠올랐기 때문이리라.그것이 이 세상에는 아무것도 없어 라고......? 그는 뭔가를 발견하기 위해 노력 중이었지 그런데 우연히 발견한 물건이 아니었기에 다른 사람에게 맡겨야 했는지도 모른다고 생각했었으니까 「고양이에게」중에서“왜 그런 걸 물어봤어?”「누가 이런 짓을 한 거냐구요! 왜 이렇게까지 묻는 건데요?”그는 눈을 동그래지게 뜨고는 잠시'

In [69]:
tmp2 = re.sub('[---p.*0-9]', '', user_sent)
re.sub('^ ', ' ', tmp2)

' 나라는 인간이 얼마나 건강하게 유지될 수 있을까?'