In [1]:
import pandas as pd
import numpy as np
import re

from tqdm import tqdm_notebook

In [2]:
### 데이터 로드
title_df = pd.read_csv("./raw_data/raw_titles.csv")
title_df = title_df.drop('Unnamed: 0', axis=1)

In [9]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook

#kobert
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

In [10]:
#GPU 사용
device = torch.device("cuda:0")

In [11]:
# 학습 데이터 전처리 클래스
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))

In [12]:
# KoBERT 모델 클래스(분류기)
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=4,   ##클래스 수 조정##
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [13]:
_, vocab = get_pytorch_kobert_model()

using cached model. /home/lab06/세미프로젝트/GPT3 챗봇/ChatBot_GPT3 & 2/GPT2 학습 데이터 생성/.cache/kobert_v1.zip
using cached model. /home/lab06/세미프로젝트/GPT3 챗봇/ChatBot_GPT3 & 2/GPT2 학습 데이터 생성/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [14]:
model_kobert = torch.load('../model/KoBERT_sent_cls_model.pt')
model_kobert.eval()

BERTClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True

In [15]:
max_len = 64
batch_size = 64

In [16]:
tokenizer_kobert = get_tokenizer()
tok_kobert = nlp.data.BERTSPTokenizer(tokenizer_kobert, vocab, lower=False)

using cached model. /home/lab06/세미프로젝트/GPT3 챗봇/ChatBot_GPT3 & 2/GPT2 학습 데이터 생성/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [17]:
def predict(predict_sentence):
    
    result_emo = None
    
    data = [predict_sentence, '0']
    dataset_another = [data]

    another_test = BERTDataset(dataset_another, 0, 1, tok_kobert, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(another_test, batch_size=batch_size, num_workers=5)
    
    #model.eval()

    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_dataloader):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)

        valid_length= valid_length
        label = label.long().to(device)

        out = model_kobert(token_ids, valid_length, segment_ids)


        test_eval=[]
        for i in out:
            logits=i
            logits = logits.detach().cpu().numpy()

            if np.argmax(logits) == 0:
                test_eval.append("분노가")
                result_emo = '분노'
            elif np.argmax(logits) == 1:
                test_eval.append("행복이")
                result_emo = '행복'
            elif np.argmax(logits) == 2:
                test_eval.append("슬픔이")
                result_emo = '슬픔'
            elif np.argmax(logits) == 3:
                test_eval.append("중립이")
                result_emo = '중립' 
                
        #print(">> 입력하신 내용에서 " + test_eval[0] + " 느껴집니다.")
    
    return result_emo

In [20]:
emotion_list = []

# 문장 라벨링하기
for title in tqdm_notebook(title_df.title.values):
    emotion_list.append(predict(title))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for title in tqdm_notebook(title_df.title.values):


  0%|          | 0/10000 [00:00<?, ?it/s]



In [21]:
title_df['감정'] = emotion_list

In [None]:
for title, emotion in title_df.values:
    print(f'제목 : {title}\n감정 : {emotion}\n')

In [22]:
title_df = title_df[~title_df['감정'].str.contains("중립", na=False, case=False)]

In [23]:
title_df.감정.value_counts()

슬픔    2399
행복    1312
분노     686
Name: 감정, dtype: int64

In [24]:
title_df.to_csv("../train data/train_gpt2_title.csv")