In [6]:
import numpy as np
import pandas as pd
from konlpy.tag import Okt, Kkma, Komoran
from tensorflow.keras.utils import pad_sequences, to_categorical

In [7]:
# Load the raw dataset; the CSV is decoded as CP949 (a Korean Windows code page).
df = pd.read_csv('kedi.csv',encoding='cp949')

In [8]:
# Independent variable (program name) and dependent variable (sub-category label).
X = df["프로그램명"]
y = df["소분류"]

In [9]:
# Morphological analyzer (KoNLPy's Okt tagger)
okt = Okt()

In [10]:
# Run Okt noun extraction on every program name, so each entry of X becomes a
# sequence of noun tokens instead of a raw string.
# NOTE(review): X is overwritten from itself, so re-running this cell feeds the
# token sequences back into okt.nouns; re-run the cell that defines X first.
X = X.apply(okt.nouns)

In [12]:
## Convert text to numbers
# 1. Collect the unique tokens. dict.fromkeys de-duplicates while keeping
#    first-appearance order, so the token -> index mapping is identical on
#    every run. list(set(...)) ordering changes between kernel restarts because
#    str hashing is randomised per process, which would make saved weights or
#    previously encoded arrays inconsistent with a rebuilt vocabulary.
word_list_x = list(dict.fromkeys(word for words in X for word in words))
# 2. index -> word dictionary (dict comprehension). Indices start at 1, so
#    index 0 is not assigned to any word.
index_word_x = {idx: word for idx, word in enumerate(word_list_x, start=1)}
# 3. word -> index dictionary (inverse of the mapping above)
word_index_x = {word: idx for idx, word in index_word_x.items()}

In [23]:
# Convert words to integers
def trans_word_index(x):
    """Map each token in `x` to its integer id via `word_index_x`.

    Returns a new list of ids in the same order as the tokens. A token that is
    missing from `word_index_x` raises KeyError.
    """
    return [word_index_x[token] for token in x]
def trans_index_word(x):
    """Map each integer id in `x` back to its token via `index_word_x`.

    Returns a new list of tokens in the same order as the ids (an empty list
    for empty input). An id that is missing from `index_word_x` raises
    KeyError.

    The previous version had `return` inside the loop, so it decoded only the
    first id and returned None for an empty sequence.
    """
    return [index_word_x[idx] for idx in x]

# Encode every tokenised program name as a list of integer ids.
X_ = X.apply(trans_word_index)

In [24]:
# Length of the longest encoded sequence.
maxlen = X_.apply(len).max()
# Keep only the underlying array of id lists.
# NOTE(review): after this line X_ is no longer a Series, so re-running this
# cell alone fails on `.apply`; re-run the cell that creates X_ first.
X_ = X_.values

In [26]:
# Pad every sequence to length maxlen so X_ becomes a rectangular 2-D array.
# (Per the Keras docs, pad_sequences pads at the front with 0 by default.)
X_ = pad_sequences(X_,maxlen=maxlen)

In [27]:
# Unique target labels. dict.fromkeys de-duplicates while preserving
# first-appearance order, so the label -> class id assignment is the same on
# every run. list(set(...)) ordering varies between kernel restarts whenever
# the labels are strings, which silently reshuffles the class ids.
word_list_y = list(dict.fromkeys(y))

In [28]:
# Class id -> label (ids start at 0), and the inverse label -> class id.
index_word_y = dict(enumerate(word_list_y))
word_index_y = {label: class_id for class_id, label in index_word_y.items()}

In [29]:
def trans_y(x):
    """Return the integer class id stored for label `x` in `word_index_y`."""
    return word_index_y[x]
# Encode every label as its class id.
y_ = y.apply(trans_y)

In [30]:
# One-hot encode the integer class ids.
# NOTE(review): y_ is overwritten from itself, so re-running this cell alone
# would apply to_categorical to the already one-hot matrix.
y_ = to_categorical(y_)

In [31]:
# Sanity check: X_ is (samples, maxlen) and y_ is (samples, number of classes).
X_.shape, y_.shape

((125170, 21), (125170, 117))

In [32]:
### Imports needed to build the model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten
from tensorflow.keras.layers import Dense, LSTM, BatchNormalization, Dropout

In [33]:
## Compute the Embedding input dimension: largest word index + 1, so every
## index from 0 (not assigned to any word) up to the maximum is in range.
input_dim = max(word_index_x.values())+1
## Sequence (vector) length
maxlen

21

In [34]:
## Build the model
model = Sequential(
    [
        # One trainable 200-dimensional vector per token id.
        Embedding(input_dim,200,input_length=maxlen),
        # Both LSTMs return the full sequence so the output stays
        # (batch, maxlen, units) and can be flattened below.
        LSTM(32,return_sequences=True),
        BatchNormalization(),
        LSTM(64,return_sequences=True),
        BatchNormalization(),
        Flatten(),
        Dense(256,activation='relu'),
        # One unit per class (taken from the one-hot target width; 117 for this
        # dataset) with softmax so the outputs are class probabilities.
        # The model is compiled with loss='categorical_crossentropy', which by
        # default expects probabilities, not raw logits. The previous linear
        # Dense(117) therefore produced a meaningless loss.
        Dense(y_.shape[1],activation='softmax')
    ]
)

In [35]:
# Print each layer's output shape and parameter count.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 21, 200)           3341200   
                                                                 
 lstm (LSTM)                 (None, 21, 32)            29824     
                                                                 
 batch_normalization (BatchN  (None, 21, 32)           128       
 ormalization)                                                   
                                                                 
 lstm_1 (LSTM)               (None, 21, 64)            24832     
                                                                 
 batch_normalization_1 (Batc  (None, 21, 64)           256       
 hNormalization)                                                 
                                                                 
 flatten (Flatten)           (None, 1344)              0

In [36]:
from sklearn.model_selection import train_test_split

In [37]:
# Hold out 20% of the samples. A fixed random_state makes the split, and so
# the reported metrics, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X_,y_,test_size=.2,random_state=42)

In [38]:
# Configure training: RMSprop optimiser, categorical cross-entropy loss against
# the one-hot targets, and accuracy as the reported metric.
model.compile(optimizer='rmsprop',
             loss='categorical_crossentropy',
             metrics=['accuracy'])

In [39]:
# Train for 10 epochs, evaluating on (X_test, y_test) after each epoch.
# NOTE(review): the held-out test split doubles as the validation set here, so
# no untouched data is left for a final evaluation.
model.fit(X_train,
          y_train,
          epochs=10,
         validation_data=(X_test,y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x21cb305c100>