# 감정분류 모델_(언더샘플링 X)

In [1]:
# 필요한 모듈 임포트
import pandas as pd
import tensorflow as tf
from tensorflow.keras import preprocessing
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, Conv1D, GlobalMaxPool1D, concatenate
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [2]:
# Load the labelled corpus CSV and keep only the rows whose label is not 0

train_file = "c:/2nd_project/Data/talk_data/영화주제 대화 말뭉치 라벨링.csv"
data = pd.read_csv(train_file, sep=',')
data = data.loc[data['label'] != 0]
data.head()

Unnamed: 0,text,label
6,정말요 어떤 내용이래요,3
12,애니메이션이면 개봉하면 저도 꼭 봐야겠어요,1
43,그거 말고 추천작은요,1
99,맞어 나 진짜 많이 봐 신의 탑이랑 일요 웹툰에 23보는데 꼭 봐주라 존잼,1
184,모가디슈 내용이 뭐야,3


In [3]:
# Check how many rows each remaining label has

data['label'].value_counts()

label
6    1590
1    1415
3    1137
4     239
2     184
Name: count, dtype: int64

In [4]:
# Pull the sentence column and the label column out as two parallel Python lists

queries, intents = (data[column].tolist() for column in ('text', 'label'))

In [5]:
# Preprocessing: build the project preprocessor from the word-index dictionary
# file and the user dictionary file

from Preprocess2 import Preprocess2

p = Preprocess2(
    word2index_dic='c:/2nd_project/Data/chatbot_dict_talk.bin',
    userdic='c:/2nd_project/Data/user_dic.txt',
)

In [6]:
# Convert every sentence into a sequence of word indices

def _to_index_sequence(text):
    """Run one sentence through the preprocessor and return its word-index list."""
    tagged = p.pos(str(text))  # str() guards against non-string cells from the CSV
    tokens = p.get_keywords(tagged, without_tag=True)
    return p.get_wordidx_sequence(tokens)

sequences = [_to_index_sequence(sentence) for sentence in queries]

In [7]:
# Inspect the encoded sequences (this renders the whole list; slice it to keep the output short)
sequences

[[159, 68, 133, 103, 4],
 [234, 1867, 148, 117, 178, 2, 3],
 [37, 68, 136, 437, 107],
 [29, 9, 35, 77, 2, 1, 143, 4739, 90, 1758, 2, 178, 1474, 192, 195],
 [365, 103, 45],
 [34, 1871, 13, 4, 141, 103, 959, 83],
 [5523, 191, 685, 1717, 84, 103, 4],
 [9, 129, 30, 2, 41, 84, 103, 4, 139],
 [9, 3807, 2, 87],
 [103, 177, 99, 47, 164, 52],
 [71, 1111, 1553, 20, 60],
 [806, 17, 742, 695, 49, 140, 4, 3],
 [90, 136, 94, 3, 238],
 [445, 684, 136, 1554],
 [873, 9, 640, 2],
 [7, 50, 178, 2, 3, 7, 444, 5546],
 [9, 3495, 3817, 103, 4, 54, 83, 211],
 [178, 2, 3],
 [136, 116, 988],
 [100, 259, 3, 81, 2, 7, 591, 2, 7],
 [1060, 1348, 136, 1030, 1879, 213, 865, 564, 3, 92, 355, 2858],
 [50, 9, 718, 647, 188, 4779, 7, 742, 695],
 [59, 806, 17, 8976, 49],
 [3234, 633, 1423, 103],
 [7, 113, 55, 99, 60],
 [7, 291, 58, 66, 873, 90, 2, 2],
 [1172, 103, 853, 438, 4, 174, 147, 8981, 7],
 [27, 45, 2582, 592, 103, 48],
 [7, 5, 27, 108, 81, 7, 9, 295, 47, 873, 20, 48],
 [7, 9, 2346, 7, 1274, 873],
 [50, 667, 1009, 

In [8]:
# Zero padding: bring every sequence to MAX_SEQ_LEN entries, appending zeros at the end

from GlobalParams import MAX_SEQ_LEN

padded_seqs = preprocessing.sequence.pad_sequences(
    sequences,
    maxlen=MAX_SEQ_LEN,
    padding='post',
)

In [9]:
# Inspect the padded index matrix
padded_seqs

array([[ 159,   68,  133, ...,    0,    0,    0],
       [ 234, 1867,  148, ...,    0,    0,    0],
       [  37,   68,  136, ...,    0,    0,    0],
       ...,
       [ 185,   28,  317, ...,    0,    0,    0],
       [ 185, 3471,  317, ...,    0,    0,    0],
       [1173,   28,   85, ...,    0,    0,    0]])

In [10]:
# (number of sentences, padded sequence length)
padded_seqs.shape

(4565, 15)

In [11]:
# Features are the padded index sequences; targets are the raw label column
X, y = padded_seqs, data['label']

y.value_counts()

label
6    1590
1    1415
3    1137
4     239
2     184
Name: count, dtype: int64

In [12]:
# Shape of the feature matrix
X.shape

(4565, 15)

In [13]:
# Print the features and the targets, separated by a ruler line
print(X, '=' * 40, y, sep='\n')

[[ 159   68  133 ...    0    0    0]
 [ 234 1867  148 ...    0    0    0]
 [  37   68  136 ...    0    0    0]
 ...
 [ 185   28  317 ...    0    0    0]
 [ 185 3471  317 ...    0    0    0]
 [1173   28   85 ...    0    0    0]]
6        3
12       1
43       1
99       1
184      3
        ..
74068    6
74069    6
74070    6
74071    6
74072    6
Name: label, Length: 4565, dtype: int64


In [14]:
# Re-index the raw labels to consecutive integer class ids
encoder = LabelEncoder().fit(y)
y_label = encoder.transform(y)
y_label

array([2, 0, 0, ..., 4, 4, 4], dtype=int64)

In [15]:
# Number of encoded labels, then the distinct class ids
print(y_label.shape[0])
np.unique(y_label)

4565


array([0, 1, 2, 3, 4], dtype=int64)

In [16]:
# Split into training and validation sets (8:2), stratified on the label column

from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y_label,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

In [17]:
# Distinct class ids present in the training targets
np.unique(y_train)

array([0, 1, 2, 3, 4], dtype=int64)

In [18]:
# Per-class row counts in the training targets
pd.Series(y_train).value_counts()

4    1272
0    1132
2     910
3     191
1     147
Name: count, dtype: int64

In [19]:
# Hyperparameters
EPOCH = 5
EMB_SIZE = 128
dropout_prob = 0.5
VOCAB_SIZE = 1 + len(p.word_index)  # total vocabulary size, including the padding index 0

In [20]:
# Wrap the train/validation arrays in batched tf.data pipelines.
# NOTE(review): no shuffle step is applied, so batches are served in the same order every epoch.
BATCH_SIZE = 20

train_ds, val_ds = (
    tf.data.Dataset.from_tensor_slices(split).batch(BATCH_SIZE)
    for split in ((X_train, y_train), (X_val, y_val))
)

In [21]:
# Inspect the element spec of the training dataset
train_ds

<BatchDataset element_spec=(TensorSpec(shape=(None, 15), dtype=tf.int32, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

In [22]:
# CNN model definition: embedding -> parallel Conv1D branches -> dense classifier
input_layer = Input(shape=(MAX_SEQ_LEN,))
embedding_layer = Embedding(VOCAB_SIZE, EMB_SIZE, input_length=MAX_SEQ_LEN)(input_layer)
dropout_emb = Dropout(rate=dropout_prob)(embedding_layer)

# One Conv1D + global-max-pool branch per kernel width (3, 4 and 5 tokens),
# all reading the same dropped-out embedding
pooled_branches = []
for kernel_width in (3, 4, 5):
    branch = Conv1D(
        filters=128,
        kernel_size=kernel_width,
        padding='same',
        activation=tf.nn.relu)(dropout_emb)
    pooled_branches.append(GlobalMaxPool1D()(branch))

# Merge the 3-, 4- and 5-wide branches
concat = concatenate(pooled_branches)

# Three stacked 128-unit ReLU layers
hidden = concat
for _ in range(3):
    hidden = Dense(128, activation=tf.nn.relu)(hidden)

dropout_hidden = Dropout(rate=dropout_prob)(hidden)
# NOTE(review): the layer named 'logits' feeds a second Dense layer, so its
# outputs are not the direct pre-softmax scores of `predictions`.
logits = Dense(5, name='logits')(dropout_hidden)
predictions = Dense(5, activation=tf.nn.softmax)(logits)

# Create and compile the model
model = Model(inputs=input_layer, outputs=predictions)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 15)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 15, 128)      1923968     ['input_1[0][0]']                
                                                                                                  
 dropout (Dropout)              (None, 15, 128)      0           ['embedding[0][0]']              
                                                                                                  
 conv1d (Conv1D)                (None, 15, 128)      49280       ['dropout[0][0]']                
                                                                                              

In [23]:
# Train the model, validating on val_ds after each epoch, then save it to disk
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCH,
    verbose=1,
)

model.save('c:/2nd_project/Model/intent_usx_model_0809_a.h5')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### 평가

### TEST

In [25]:
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import preprocessing

# Intent classification model wrapper
class StoryModel:
    """Pair a saved Keras classifier with the preprocessor needed to query it.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, model_name, proprocess, max_seq_len=None):
        """Load the saved model and remember the preprocessor.

        Args:
            model_name: Path of the saved model file, passed to ``load_model``.
            proprocess: Preprocessor object providing ``pos``, ``get_keywords``
                and ``get_wordidx_sequence``. The parameter spelling is kept
                because callers pass it by keyword.
            max_seq_len: Optional padding length. When omitted it is taken from
                the loaded model's input shape, so prediction does not depend
                on a notebook-level constant defined in another cell.
        """
        # Class id -> label text.
        # NOTE(review): assumes these ids line up with the LabelEncoder
        # ordering used at training time — confirm against the raw label codes.
        self.labels = {0: '추천',
                        1: '후기',
                        2: '정보',
                        3: '예매',
                        4: '욕설',
                        }
        # Load the trained classification model
        self.model = load_model(model_name)
        # Chatbot preprocessor object
        self.p = proprocess

        # Padding length must equal the width the model was built with.
        if max_seq_len is None:
            max_seq_len = self.model.input_shape[1]
        if max_seq_len is None:
            # Model accepts variable-length input: fall back to the
            # notebook-level constant, as the previous implementation did.
            max_seq_len = MAX_SEQ_LEN
        self.max_seq_len = max_seq_len

    def predict_class(self, query):
        """Return the predicted class id for a single query.

        Args:
            query: The sentence to classify; it is converted with ``str()``
                before analysis, mirroring the training-time conversion.

        Returns:
            The index of the highest-scoring class for ``query``; use it as a
            key into ``self.labels`` to obtain the label text.
        """
        # Morphological analysis
        pos = self.p.pos(str(query))

        # Keyword extraction, then mapping to word indices (batch of one)
        keywords = self.p.get_keywords(pos, without_tag=True)
        sequences = [self.p.get_wordidx_sequence(keywords)]

        # Pad to the model's input width
        padded_seqs = preprocessing.sequence.pad_sequences(
            sequences, maxlen=self.max_seq_len, padding='post')
        predict = self.model.predict(padded_seqs)
        predict_class = tf.math.argmax(predict, axis=1)

        # Single query -> single row -> first (only) entry
        return predict_class.numpy()[0]

In [26]:
# Rebuild the preprocessor, load the saved model and classify a few sample queries
from Preprocess2 import Preprocess2

p = Preprocess2(
    word2index_dic='c:/2nd_project/Data/chatbot_dict_talk.bin',
    userdic='c:/2nd_project/Data/user_dic.txt',
)

intent = StoryModel(
    model_name='c:/2nd_project/Model/intent_usx_model_0809_a.h5',
    proprocess=p,
)

items = [
    "오늘 영화 예약 해줘",
    "씨발",
    "내일 영화 추천 해봐라",
    "인터스텔라 내용이 뭐야?",
    "7번방의 선물 후기 좀",
]

for query in items:
    class_id = intent.predict_class(query)
    class_label = intent.labels[class_id]

    print(query)
    print("의도 예측 클래스 : ", class_id)
    print("의도 예측 레이블 : ", class_label)

오늘 영화 예약 해줘
의도 예측 클래스 :  3
의도 예측 레이블 :  예매
씨발
의도 예측 클래스 :  4
의도 예측 레이블 :  욕설
내일 영화 추천 해봐라
의도 예측 클래스 :  0
의도 예측 레이블 :  추천
인터스텔라 내용이 뭐야?
의도 예측 클래스 :  2
의도 예측 레이블 :  정보
7번방의 선물 후기 좀
의도 예측 클래스 :  1
의도 예측 레이블 :  후기
