In [1]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding
from keras.optimizers import Adam

from sklearn.model_selection import train_test_split
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

import re
import numpy as np
import pandas as pd

Using TensorFlow backend.


In [2]:
def make_word_id(keywords):
    text = ''.join(keywords)
    text= text.replace(' ','').strip()

    word_to_id ={}
    for idx,i in enumerate(set(text)):
        word_to_id[i] = idx + 1    
    
    id_to_word = dict((j,i) for i,j in word_to_id.items())    
    return word_to_id, id_to_word

In [3]:
df = pd.read_csv('search_log.csv')
df['keyword']=df['keyword'].astype(str)

In [4]:
# 리스트로 만들기
keywords = df['keyword'].str.lower().tolist()

#한글자 짜리 제거
keywords = [i for i in keywords if len(i)!=1]
keywords_end = [i+'$' for i in keywords]

# 딕셔너리 제작
word_to_id, id_to_word = make_word_id(keywords_end)

# 인코딩
keywords_encoded = [np.array([word_to_id[i] for i in word.replace(' ','').strip()]) for word in keywords_end]

#### shft 하여 X, y 만들기

In [5]:
xs, ts =[], []
for word in keywords_encoded:
    for i in range(len(word)-1):
        xs.append(word[:i+1])
        ts.append(word[i+1:])

In [6]:
maxlen=5
vocab_size = len(word_to_id)
batch_size = 64

In [7]:
# 패딩
xs_pad = pad_sequences(xs, maxlen=maxlen, padding='post')
ts_pad = pad_sequences(ts, maxlen=maxlen, padding='post')

# train test split
X_train, X_test, y_train, y_test = train_test_split(xs_pad, ts_pad, test_size=.9, random_state=0)

In [8]:
def train_generator(X_train, y_train, vocab_size, batch_size=32):    
    while True:        
        idx = np.random.choice(np.arange(len(X_train)), size=batch_size, replace=False)
        yield X_train[idx], to_categorical(y_train[idx], num_classes=vocab_size)        
        
def build_callbacks():
    checkpointer = ModelCheckpoint(filepath='unet.h5', verbose=0, save_best_only=True, save_weights_only=True)
    callbacks = [checkpointer]
    return callbacks

### MODEL

In [9]:
model = Sequential()
model.add(Embedding(vocab_size, 128, input_length=maxlen, mask_zero=True))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(TimeDistributed(Dense(vocab_size, activation=('softmax'))))
model.compile(loss='categorical_crossentropy', optimizer=Adam(0.001), metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 5, 128)            268416    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 5, 512)            788480    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 5, 2097)           1075761   
Total params: 2,132,657
Trainable params: 2,132,657
Non-trainable params: 0
_________________________________________________________________


In [10]:
train_steps = len(X_train) //batch_size
test_steps = len(y_train) //batch_size
model.fit_generator(train_generator(X_train, y_train, vocab_size), epochs=20, steps_per_epoch = train_steps, callbacks = build_callbacks())

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/20
Epoch 2/20
  13/3082 [..............................] - ETA: 39s - loss: 0.9654 - accuracy: 0.6130



Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x295c03f14a8>

### generate test

In [11]:
def generate(model, word, word_to_id, id_to_word):      
    
    while True:
        X_input = [word_to_id[i] for i in word]     
        X_input = pad_sequences([X_input], maxlen=maxlen, padding='post')
        score = model.predict(X_input)
        idx = np.argmax(score[0, 0 ,:])
        pred_word = id_to_word[idx]
        
        if pred_word =='$' or len(word) == maxlen:
            break
            
        word += pred_word
    return word

In [12]:
print(generate(model, '레', word_to_id, id_to_word))
print(generate(model, '레', word_to_id, id_to_word))
print(generate(model, '레', word_to_id, id_to_word))

레몬
레몬
레몬
