In [1]:
# 필요한 모듈 임포트
import pandas as pd
import tensorflow as tf
from tensorflow.keras import preprocessing
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, Conv1D, GlobalMaxPool1D, concatenate
from sklearn.preprocessing import LabelEncoder

In [2]:
corpus_df = pd.read_csv('c:/2nd_project/Data/talk_data/영화주제 대화 말뭉치 라벨링.csv')
corpus_df = corpus_df[corpus_df['label'] != 0]
corpus_df.head(3)

Unnamed: 0,text,label
6,정말요 어떤 내용이래요,3
12,애니메이션이면 개봉하면 저도 꼭 봐야겠어요,1
43,그거 말고 추천작은요,1


In [3]:
train_file = "c:/2nd_project/Data/talk_data/영화주제 대화 말뭉치 라벨링.csv"
data = pd.read_csv(train_file, delimiter=',')
queries = data['text'].tolist()
intents = data['label'].tolist()

In [4]:
from Preprocess2 import Preprocess2
p = Preprocess2(word2index_dic='c:/2nd_project/Data/chatbot_dict_talk.bin',
               userdic = 'c:/2nd_project/Data/user_dic.txt')

In [5]:
sequences = []
for sentence in queries:
    sentence = str(sentence)
    pos = p.pos(sentence)
    keywords = p.get_keywords(pos, without_tag=True)
    seq = p.get_wordidx_sequence(keywords)
    sequences.append(seq)

In [6]:
from GlobalParams import MAX_SEQ_LEN
padded_seqs = preprocessing.sequence.pad_sequences(sequences, maxlen=MAX_SEQ_LEN, padding='post')

In [7]:
padded_seqs

array([[  19,  456,    2, ...,    0,    0,    0],
       [6713,  575, 2835, ...,    0,    0,    0],
       [ 117,  887, 1507, ...,    0,    0,    0],
       ...,
       [ 185,   28,  317, ...,    0,    0,    0],
       [ 185, 3471,  317, ...,    0,    0,    0],
       [1173,   28,   85, ...,    0,    0,    0]])

In [8]:
# X = corpus_df[['text']].astype(str)
X = padded_seqs
y = corpus_df['label']

In [9]:
encoder = LabelEncoder()
y_label = encoder.fit_transform(y)
y_label

array([2, 0, 0, ..., 4, 4, 4], dtype=int64)

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(padded_seqs, intents, stratify=intents, test_size=0.2, random_state=0)

In [11]:
# from sklearn.model_selection import train_test_split

# X_train, X_val, y_train, y_val = train_test_split(X, y_label, stratify=y_label, random_state=0)

In [12]:
pd.Series(y_train).value_counts()

0    55606
6     1272
1     1132
3      910
4      191
2      147
Name: count, dtype: int64

In [13]:
# 하이퍼파라미터 설정
dropout_prob = 0.5
EMB_SIZE = 128
EPOCH = 5
VOCAB_SIZE = len(p.word_index) + 1 # 전체 단어수 (패딩 0 포함)

In [14]:
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(20)
val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(20)

In [15]:
train_ds

<BatchDataset element_spec=(TensorSpec(shape=(None, 15), dtype=tf.int32, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>

In [16]:
# print("*"*50)
# print(padded_seqs)
# print(padded_seqs.shape)
# print('*'*50)
# CNN 모델 정의
input_layer = Input(shape=(MAX_SEQ_LEN,))
embedding_layer = Embedding(VOCAB_SIZE, EMB_SIZE, input_length=MAX_SEQ_LEN)(input_layer)
dropout_emb = Dropout(rate = dropout_prob)(embedding_layer)

conv1 = Conv1D(
    filters=128,
    kernel_size=3,
    padding='same',
    activation=tf.nn.relu)(dropout_emb)
pool1 = GlobalMaxPool1D()(conv1)

conv2 = Conv1D(
    filters=128,
    kernel_size=4,
    padding='same',
    activation=tf.nn.relu)(dropout_emb)

pool2 = GlobalMaxPool1D()(conv2)
conv3 = Conv1D(
    filters=128,
    kernel_size=5,
    padding='same',
    activation=tf.nn.relu)(dropout_emb)
pool3 = GlobalMaxPool1D()(conv3)

# 3,4,5gram 이후 합치기
concat = concatenate([pool1, pool2, pool3])

hidden1 = Dense(128, activation=tf.nn.relu)(concat)
hidden2 = Dense(128, activation=tf.nn.relu)(hidden1)
hidden3 = Dense(128, activation=tf.nn.relu)(hidden2)

dropout_hidden = Dropout(rate=dropout_prob)(hidden3)
logits = Dense(5, name='logits')(dropout_hidden)
predictions = Dense(5, activation=tf.nn.softmax)(logits)

# 모델 생성
model = Model(inputs=input_layer, outputs=predictions)
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 15)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 15, 128)      1923968     ['input_1[0][0]']                
                                                                                                  
 dropout (Dropout)              (None, 15, 128)      0           ['embedding[0][0]']              
                                                                                                  
 conv1d (Conv1D)                (None, 15, 128)      49280       ['dropout[0][0]']                
                                                                                              

In [17]:
# 모델 학습
model.fit(train_ds, validation_data=val_ds, epochs=EPOCH, verbose=1)

model.save('c:/2nd_project/Model/intent_model_0808_b.h5')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### TEST

In [18]:
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import preprocessing

# 의도 분류 모델 모듈
class StoryModel:
    def __init__(self, model_name, proprocess):
        # intent 레이블
        # self.labels = dict(zip(range(0,12),encoder.classes_.tolist()))
        self.labels = {0: '추천',
                        1: '후기',
                        2: '정보',
                        3: '예매',
                        4: '욕설',
                        }
        # intent 분류 모델 불러오기
        self.model = load_model(model_name)
        # 챗봇 Preprocess 객체
        self.p = proprocess

    # 의도 클래스 예측
    def predict_class(self, query):
        # 형태소 분석
        pos = self.p.pos(query)

        # 문장내 키워드 추출(불용어 제거)
        keywords = self.p.get_keywords(pos, without_tag=True)
        sequences = [self.p.get_wordidx_sequence(keywords)]

        # 패딩처리
        padded_seqs = preprocessing.sequence.pad_sequences(sequences, maxlen=MAX_SEQ_LEN, padding='post')
        predict = self.model.predict(padded_seqs)
        predict_class = tf.math.argmax(predict, axis=1)

        return predict_class.numpy()[0]

In [20]:
from Preprocess2 import Preprocess2
p = Preprocess2(word2index_dic='c:/2nd_project/Data/chatbot_dict_talk.bin',
               userdic = 'c:/2nd_project/Data/user_dic.txt')

intent = StoryModel(model_name='c:/2nd_project/Model/intent_model_0808_b.h5', proprocess=p)

items=["오늘 영화 예매 해줘"]

for item in items:

    predict = intent.predict_class(item)

    predict_label = intent.labels[predict]

    print(item)

    print("의도 예측 클래스 : ", predict)

    print("의도 예측 레이블 : ", predict_label)

오늘 영화 예매 해줘
의도 예측 클래스 :  0
의도 예측 레이블 :  추천
