In [1]:
# 앞에서 만든 class를 활용하여 자연어 처리 기법으로 다음으로 나올 class를 예측 해봄.

In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv('data/class.csv')
train = np.array(data['a5930'][:-20]) # 20개는 테스트 셋으로 활용 예정
test = data['a5930'][-20:-3]

In [4]:
# 데이터 길이 = 그 안에 있는 class의 수
print ('데이터의 총 길이: {}'.format(len(train)))

# 데이터의 처음 10개의 class
print(train[:10])

데이터의 총 길이: 236
['c25' 'c23' 'c23' 'c03' 'c25' 'c13' 'c05' 'c13' 'c03' 'c01']


In [5]:
# class_set 생성
class_set = sorted(set(train)) # 텍스트에 들어간 각 단어가 중복되지 않는 리스트
class_set.append('NO') # 리스트에 존재하지 않는 토큰을 나타내는 'NO' 를 첨가
print ('{} unique class'.format(len(class_set)))

# class_set을 숫자로 맵핑
word_idx = {u:i for i, u in enumerate(class_set)}
idx_word = np.array(class_set)

text_as_int = np.array([word_idx[c] for c in train])

# word_idx 확인
for word,_ in zip(word_idx, range(len(class_set))):
    print('  {:4s}: {:3d},'.format(repr(word), word_idx[word]))
print('index of NO: {}'.format(word_idx['NO']))

16 unique class
  'c01':   0,
  'c03':   1,
  'c05':   2,
  'c06':   3,
  'c08':   4,
  'c10':   5,
  'c11':   6,
  'c13':   7,
  'c15':   8,
  'c16':   9,
  'c18':  10,
  'c20':  11,
  'c21':  12,
  'c23':  13,
  'c25':  14,
  'NO':  15,
index of NO: 15


In [6]:
# 토큰화 확인
print(train[:10])
print(text_as_int[:10])

['c25' 'c23' 'c23' 'c03' 'c25' 'c13' 'c05' 'c13' 'c03' 'c01']
[14 13 13  1 14  7  2  7  1  0]


In [7]:
#학습을 위한 데이터 세트 만들기
seq_length = 10 # 10개의 단어가 주어졌을 때 다음 단어를 예측하도록 데이터를 만듬
examples_per_epoch = len(text_as_int) // seq_length
sentence_dataset = tf.data.Dataset.from_tensor_slices(text_as_int) # dataset를 생성하는 코드

sentence_dataset = sentence_dataset.batch(seq_length+2, drop_remainder=True) # +2은 정답이 될 2개의 클래스를 합쳐서 반환하기 위함.
for item in sentence_dataset.take(1):
    print(idx_word[item.numpy()], len(idx_word[item.numpy()]),'개') # 10개를 활용해 12개가 나옴
    print(item.numpy(), len(item.numpy()),'개') # 마찬가지로 12개 출력

['c25' 'c23' 'c23' 'c03' 'c25' 'c13' 'c05' 'c13' 'c03' 'c01' 'c20' 'c18'] 12 개
[14 13 13  1 14  7  2  7  1  0 11 10] 12 개


In [8]:
def split(data): # 2개의 단어를 잘라줌(2개를 예측하였기 때문에)
    return [data[:-2], data[-2]]

train_dataset = sentence_dataset.map(split) # 새로운 데이터셋 생성
for x,y in train_dataset.take(2):
    print(idx_word[x.numpy()])
    print(x.numpy())
    print(idx_word[y.numpy()])
    print(y.numpy())

['c25' 'c23' 'c23' 'c03' 'c25' 'c13' 'c05' 'c13' 'c03' 'c01']
[14 13 13  1 14  7  2  7  1  0]
c20
11
['c05' 'c23' 'c13' 'c13' 'c13' 'c03' 'c05' 'c18' 'c03' 'c15']
[ 2 13  7  7  7  1  2 10  1  8]
c20
11


In [9]:
BATCH_SIZE = 10 #한번에 10개의 데이터를 사용 
steps_per_epoch = examples_per_epoch // BATCH_SIZE
BUFFER_SIZE = 10000

train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [19]:
examples_per_epoch

23

In [18]:
seq_length

10

In [10]:
total_words = len(class_set)
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(total_words,20, input_length=seq_length),
    tf.keras.layers.LSTM(units=64, return_sequences=True),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.LSTM(units=32),
    tf.keras.layers.Dense(total_words, activation='softmax')])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 10, 20)            320       
_________________________________________________________________
lstm (LSTM)                  (None, 10, 64)            21760     
_________________________________________________________________
dropout (Dropout)            (None, 10, 64)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                12416     
_________________________________________________________________
dense (Dense)                (None, 16)                528       
Total params: 35,024
Trainable params: 35,024
Non-trainable params: 0
_________________________________________________________________


In [11]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def testmodel(epoch, logs):
    if epoch % 5 != 0 and epoch != 49:
        return
    test_sentence = data['a28260'][-20:]

    next_words = 5
    for _ in range(next_words):
        test_text_X = test_sentence[-seq_length:]
        test_text_X = np.array([word_idx[c] if c in word_idx else word_idx['NO'] for c in test_text_X])
        test_text_X = pad_sequences([test_text_X], maxlen=seq_length, padding='pre', value=word_idx['NO'])

        output_idx = model.predict_classes(test_text_X)
        test_sentence += ' ' + idx_word[output_idx[0]]
        
testmodelcb = tf.keras.callbacks.LambdaCallback(on_epoch_end=testmodel)

In [12]:
history = model.fit(train_dataset.repeat(), epochs=100, steps_per_epoch=steps_per_epoch, callbacks=[testmodelcb], verbose=2)

Train for 2 steps
Epoch 1/100
2/2 - 3s - loss: 2.7709 - accuracy: 0.0500
Epoch 2/100
2/2 - 0s - loss: 2.7641 - accuracy: 0.3000
Epoch 3/100
2/2 - 0s - loss: 2.7588 - accuracy: 0.2000
Epoch 4/100
2/2 - 0s - loss: 2.7525 - accuracy: 0.2000
Epoch 5/100
2/2 - 0s - loss: 2.7478 - accuracy: 0.2000
Epoch 6/100
2/2 - 0s - loss: 2.7352 - accuracy: 0.2000
Epoch 7/100
2/2 - 0s - loss: 2.7275 - accuracy: 0.1500
Epoch 8/100
2/2 - 0s - loss: 2.7308 - accuracy: 0.1000
Epoch 9/100
2/2 - 0s - loss: 2.7122 - accuracy: 0.0500
Epoch 10/100
2/2 - 0s - loss: 2.6744 - accuracy: 0.2500
Epoch 11/100
2/2 - 0s - loss: 2.6977 - accuracy: 0.1500
Epoch 12/100
2/2 - 0s - loss: 2.6068 - accuracy: 0.2000
Epoch 13/100
2/2 - 0s - loss: 2.6167 - accuracy: 0.2000
Epoch 14/100
2/2 - 0s - loss: 2.6302 - accuracy: 0.1500
Epoch 15/100
2/2 - 0s - loss: 2.5472 - accuracy: 0.1500
Epoch 16/100
2/2 - 0s - loss: 2.5821 - accuracy: 0.1500
Epoch 17/100
2/2 - 0s - loss: 2.3817 - accuracy: 0.2000
Epoch 18/100
2/2 - 0s - loss: 2.3359 - 

In [16]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
test_sentence = 'c13 c03 c08 c23 c23 c25 c15 c18 c03 c18 c23 c03 c21 c01 c01 c23 c01 c23'

next_words = 2
for _ in range(next_words):
    test_text_X = test_sentence.split(' ')[-seq_length:]
    test_text_X = np.array([word_idx[c] if c in word_idx else word_idx['NO'] for c in test_text_X])
    test_text_X = pad_sequences([test_text_X], maxlen=seq_length, padding='pre', value=word_idx['NO'])
    
    output_idx = model.predict_classes(test_text_X)
    test_sentence += ' ' + idx_word[output_idx[0]]

print(test_sentence)

c13 c03 c08 c23 c23 c25 c15 c18 c03 c18 c23 c03 c21 c01 c01 c23 c01 c23 c16 c08


In [17]:
np.array(data['a5930'][-20:])

array(['c13', 'c03', 'c08', 'c23', 'c23', 'c25', 'c15', 'c18', 'c03',
       'c18', 'c23', 'c03', 'c21', 'c01', 'c01', 'c23', 'c01', 'c23',
       'c06', 'c06'], dtype=object)