In [1]:
import numpy as np
import pandas as pd
from konlpy.tag import Okt, Kkma, Komoran
from tensorflow.keras.utils import pad_sequences, to_categorical

df = pd.read_csv('kedi.csv',encoding='cp949')

#독립변수, 종속변수
X = df.프로그램명
y = df.소분류

# 형태소 분석기 
okt = Okt()

X = X.apply(okt.nouns)

## 문자를 숫자로 바꾸는 작업
# 1. 유니크한 값을 정리
word_list_x = []
for x in X:
    word_list_x.extend(x)
word_list_x = list(set(word_list_x))
# 2. index_word 구성된 딕셔너리 작성 (dictionary comprehention)
index_word_x = { i+1:v for i,v in enumerate(word_list_x)}
# 3. word_index 구성된 딕셔너리 작성
word_index_x = { v:i for i,v in index_word_x.items()}

# word들을 정수로 변환
def trans_word_index(x):
    tmp = []
    for i in x:
        tmp.append(word_index_x[i])
    return tmp
def trans_index_word(x):
    tmp = []
    for i in x:
        tmp.append(index_word_x[i])
        return tmp

X_ = X.apply(trans_word_index)

maxlen = X_.apply(len).max()
X_ = X_.values

X_ = pad_sequences(X_,maxlen=maxlen)

word_list_y = []
for i in y:
    word_list_y.extend([i])
word_list_y = list(set(word_list_y))

index_word_y = { i:v for i, v in enumerate(word_list_y)}
word_index_y = { v:i for i,v in index_word_y.items()}

def trans_y(x):
    return word_index_y[x]
y_ = y.apply(trans_y)

y_ = to_categorical(y_)

X_.shape, y_.shape

### 모델 생성을 위한 모듈 import
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten
from tensorflow.keras.layers import Dense, LSTM, BatchNormalization, Dropout

## input dimension 계산
input_dim = max(word_index_x.values())+1
## 벡터의 길이 계산
maxlen

## 모델 생성
model = Sequential(
    [
        Embedding(input_dim,200,input_length=maxlen),
        LSTM(32,return_sequences=True),
        BatchNormalization(),
        LSTM(64,return_sequences=True),
        BatchNormalization(),
        Flatten(),
        Dense(256,activation='relu'),
        Dense(117,activation='softmax'),
    ]
)

model.summary()

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_,y_,test_size=.2)

model.compile(optimizer='rmsprop',
             loss='categorical_crossentropy',
             metrics=['accuracy'])

model.fit(X_train,
          y_train,
          epochs=10,
         validation_data=(X_test,y_test))

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 21, 200)           3341200   
                                                                 
 lstm (LSTM)                 (None, 21, 32)            29824     
                                                                 
 batch_normalization (BatchN  (None, 21, 32)           128       
 ormalization)                                                   
                                                                 
 lstm_1 (LSTM)               (None, 21, 64)            24832     
                                                                 
 batch_normalization_1 (Batc  (None, 21, 64)           256       
 hNormalization)                                                 
                                                                 
 flatten (Flatten)           (None, 1344)              0

<keras.callbacks.History at 0x17315dba130>

In [5]:
model.fit(X_,
          y_,
          epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x17355b56490>

In [9]:
from tensorflow.keras.models import load_model

In [10]:
model.save('kedi_model.h6')



INFO:tensorflow:Assets written to: kedi_model.h6\assets


INFO:tensorflow:Assets written to: kedi_model.h6\assets


In [14]:
np.argmax(kedi_model.predict(X_test[[0]]))
index_word_y[37]



'역사'

In [13]:
kedi_model = load_model('kedi_model.h6')

In [23]:
def subject_predict(subject):
    word = okt.nouns(subject)
    sub = []
    for w in word:
        if word_index_x.get(w) == None:
            pass
        else:
            sub.append(word_index_x.get(w))
    sub = np.array(sub).reshape(1,-1)
    sub = pad_sequences(sub,21)
    result = kedi_model.predict(sub)
    result = np.argmax(result)
    return index_word_y[result]

In [24]:
subject_predict('엄준식의 약탈교실')



'서예·서화'

In [12]:
np.argmax(kedi_model.predict(X_test[[17]]))



53

In [2]:
index_word_y[np.argmax(model.predict(X_test[[17]]))]



'영어'

In [3]:
index_word_y[np.argmax(y_test,axis=1)[17]]

'영어'

In [4]:
index_word_y[np.argmax(y_test,axis=1)[17]]

'영어'