# 문서 임베딩 : 워드 임베딩의 평균(Average Word Embedding)

In [1]:
import numpy as np
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Vocabulary cap passed to the loader as num_words: only the most frequent
# words keep their own index; rarer ones come back as the out-of-vocabulary
# index.
vocab_size = 2000

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)
print('훈련용 리뷰 개수 :', len(x_train))
# Report the size of the *test* split; this line previously printed
# len(y_train), i.e. the number of training labels.
print('테스트용 리뷰 개수 :', len(x_test))

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz




훈련용 리뷰 개수 : 25000
테스트용 리뷰 개수 : 25000


  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [3]:
# Peek at the first five training reviews (each is a list of integer word indices).
x_train[:5]

array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 2, 66, 2, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 2, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2, 19, 14, 22, 4, 1920, 2, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2, 2, 16, 480, 66, 2, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 2, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 2, 113, 103, 32, 15, 16, 2, 19, 178, 32]),
       list([1, 194, 1153, 194, 2, 78, 228, 5, 6, 1463, 2, 2, 134, 26, 4, 715, 8, 118, 1634

In [4]:
# Peek at the labels that belong to the first five training reviews.
y_train[:5]

array([1, 0, 0, 1, 0], dtype=int64)

리뷰의 각 단어가 이미 정수 인덱스로 인코딩되어 있고, 레이블은 0 또는 1의 정수임을 볼 수 있음

In [5]:
# Show the very first training review together with its label.
first_review, first_label = x_train[0], y_train[0]
print('훈련 데이터의 첫번째 샘플 :', first_review)
print('훈련 데이터의 첫번째 샘플의 레이블 :', first_label)

훈련 데이터의 첫번째 샘플 : [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 2, 66, 2, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 2, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2, 19, 14, 22, 4, 1920, 2, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2, 2, 16, 480, 66, 2, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 2, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 2, 113, 103, 32, 15, 16, 2, 19, 178, 32]
훈련 데이터의 첫번째 샘플의 레이블 : 1


In [7]:
# Average review length per split. dtype=int is passed to np.mean so the
# result is reported as an integer.
train_lengths = [len(review) for review in x_train]
test_lengths = [len(review) for review in x_test]
print('훈련 용 리뷰의 평균 길이: ', np.mean(train_lengths, dtype=int))
print('테스트 용 리뷰의 평균 길이: ', np.mean(test_lengths, dtype=int))

훈련 용 리뷰의 평균 길이:  238
테스트 용 리뷰의 평균 길이:  230


평균 길이(약 230~240)보다 넉넉하게, 모든 리뷰의 길이를 400으로 맞춰 패딩 (400보다 짧은 리뷰는 0으로 채워지고, 더 긴 리뷰는 잘림)

In [8]:
# Bring every review in both splits to the same fixed length so they can be
# stacked into 2-D arrays.
max_len = 400
x_train, x_test = (
    pad_sequences(split, maxlen=max_len) for split in (x_train, x_test)
)
for caption, split in (
    ('x_train의 크기(shape) :', x_train),
    ('x_test 의 크기(shape) :', x_test),
):
    print(caption, split.shape)

x_train의 크기(shape) : (25000, 400)
x_test 의 크기(shape) : (25000, 400)


# 모델 설계하기
임베딩 벡터의 평균을 문서 벡터로 사용하는 모델을 설계해 보자.
GlobalAveragePooling1D()는 입력으로 들어오는 모든 벡터들의 평균을 구하는 역할을 함.
Embedding() 다음에 GlobalAveragePooling1D()를 추가하면 해당 문장의 모든 단어 벡터들의 평균 벡터를 구함.

이진 분류를 수행해야 하므로 그 후에는 시그모이드 함수를 활성화 함수로 사용하는 뉴런 1개를 배치.
훈련 데이터의 20%를 검증 데이터로 사용하고 최대 10 에포크 학습 (검증 손실이 4 에포크 동안 개선되지 않으면 조기 종료하고, 검증 정확도가 가장 높은 모델을 파일로 저장).

In [9]:
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, GlobalAveragePooling1D, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

embedding_dim = 64

# Embedding -> average of the embedded vectors -> one sigmoid unit for binary
# classification.
# NOTE(review): mask_zero is not set on the Embedding layer, so padded
# positions presumably take part in the average — confirm this is intended.
model = Sequential([
    Embedding(vocab_size, embedding_dim),
    GlobalAveragePooling1D(),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# Stop early when val_loss has not improved for 4 epochs, and write the model
# to disk only when val_acc improves.
es = EarlyStopping(monitor='val_loss', mode='min', patience=4, verbose=1)
mc = ModelCheckpoint(
    'embedding_average_model.h5',
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)

# 20% of the training data is held out for validation during training.
model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    callbacks=[es, mc],
)

Epoch 1/10
Epoch 00001: val_acc improved from -inf to 0.78900, saving model to embedding_average_model.h5
Epoch 2/10
Epoch 00002: val_acc improved from 0.78900 to 0.85080, saving model to embedding_average_model.h5
Epoch 3/10
Epoch 00003: val_acc improved from 0.85080 to 0.85980, saving model to embedding_average_model.h5
Epoch 4/10
Epoch 00004: val_acc improved from 0.85980 to 0.86940, saving model to embedding_average_model.h5
Epoch 5/10
Epoch 00005: val_acc improved from 0.86940 to 0.87580, saving model to embedding_average_model.h5
Epoch 6/10
Epoch 00006: val_acc improved from 0.87580 to 0.87960, saving model to embedding_average_model.h5
Epoch 7/10
Epoch 00007: val_acc did not improve from 0.87960
Epoch 8/10
Epoch 00008: val_acc did not improve from 0.87960
Epoch 9/10
Epoch 00009: val_acc did not improve from 0.87960
Epoch 10/10
Epoch 00010: val_acc did not improve from 0.87960


<tensorflow.python.keras.callbacks.History at 0x19f92702dc8>

In [10]:
# Reload the checkpoint written during training and score it on the test set.
# evaluate() returns the loss followed by the compiled metrics, so index 1 is
# the accuracy.
loaded_model = load_model('embedding_average_model.h5')
test_scores = loaded_model.evaluate(x_test, y_test)
print("\n 테스트 정확도: %.4f" % (test_scores[1]))


 테스트 정확도: 0.8756
