# 1. Keras 임베딩 층
> **1) 임베딩 층은 룩업 테이블**   
> 입력 시퀀스 단어들은 정수 인코딩을 거침      
> ![Embedding Vector](https://wikidocs.net/images/page/33793/lookup_table.PNG "Embedding Vector")

In [1]:
# Keras names the vocabulary-size argument `input_dim` (there is no `vocab_size` keyword).
# v = Embedding(input_dim=20000, output_dim=128, input_length=500)

> **2) 임베딩 층 사용**   
> 문장의 긍, 부정을 판단하는 감성 분류 모델

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [3]:
# Toy sentiment corpus: each sentence is paired by position with a label
# (1 = positive, 0 = negative).
sentences = [
    'nice great best amazing',
    'stop lies',
    'pitiful nerd',
    'excellent work',
    'supreme quality',
    'bad',
    'highly respectable',
]
y_train = [1, 0, 0, 1, 1, 0, 1]

In [4]:
# Tokenization: fit the tokenizer on the corpus to build the word -> integer index.
t = Tokenizer()
t.fit_on_texts(sentences)

# +1 leaves room for index 0, which no word uses (word indices start at 1)
# and which the padding step later fills in.
vocab_size = len(t.word_index) + 1
print(vocab_size)

16


In [5]:
# Integer encoding: replace every word in each sentence with its index from the tokenizer.
X_encoded = t.texts_to_sequences(sentences)
print(X_encoded)

[[1, 2, 3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13], [14, 15]]


In [6]:
# Longest encoded sentence, in tokens; used as the padding length.
max_len = max(map(len, X_encoded))
print(max_len)

4


In [7]:
# Pad every sentence to max_len by appending zeros at the end (padding='post').
X_train = pad_sequences(X_encoded, maxlen=max_len, padding='post')
# Convert the label list to a NumPy array before passing it to model.fit.
y_train = np.array(y_train)
print(X_train)

[[ 1  2  3  4]
 [ 5  6  0  0]
 [ 7  8  0  0]
 [ 9 10  0  0]
 [11 12  0  0]
 [13  0  0  0]
 [14 15  0  0]]


In [8]:
# 모델링
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

# Embedding (learned from scratch, 4-dim vectors) -> Flatten -> single sigmoid unit.
model = Sequential([
    Embedding(vocab_size, 4, input_length=max_len),
    Flatten(),
    Dense(1, activation='sigmoid'),
])

# Binary classification setup: cross-entropy loss, accuracy reported as 'acc'.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
model.fit(X_train, y_train, epochs=100, verbose=2)

Epoch 1/100
1/1 - 1s - loss: 0.6831 - acc: 0.7143
Epoch 2/100
1/1 - 0s - loss: 0.6816 - acc: 0.8571
Epoch 3/100
1/1 - 0s - loss: 0.6801 - acc: 1.0000
Epoch 4/100
1/1 - 0s - loss: 0.6786 - acc: 1.0000
Epoch 5/100
1/1 - 0s - loss: 0.6771 - acc: 1.0000
Epoch 6/100
1/1 - 0s - loss: 0.6756 - acc: 1.0000
Epoch 7/100
1/1 - 0s - loss: 0.6741 - acc: 1.0000
Epoch 8/100
1/1 - 0s - loss: 0.6726 - acc: 1.0000
Epoch 9/100
1/1 - 0s - loss: 0.6711 - acc: 1.0000
Epoch 10/100
1/1 - 0s - loss: 0.6696 - acc: 1.0000
Epoch 11/100
1/1 - 0s - loss: 0.6681 - acc: 1.0000
Epoch 12/100
1/1 - 0s - loss: 0.6665 - acc: 1.0000
Epoch 13/100
1/1 - 0s - loss: 0.6650 - acc: 1.0000
Epoch 14/100
1/1 - 0s - loss: 0.6634 - acc: 1.0000
Epoch 15/100
1/1 - 0s - loss: 0.6619 - acc: 1.0000
Epoch 16/100
1/1 - 0s - loss: 0.6603 - acc: 1.0000
Epoch 17/100
1/1 - 0s - loss: 0.6588 - acc: 1.0000
Epoch 18/100
1/1 - 0s - loss: 0.6572 - acc: 1.0000
Epoch 19/100
1/1 - 0s - loss: 0.6556 - acc: 1.0000
Epoch 20/100
1/1 - 0s - loss: 0.6541 - a

<tensorflow.python.keras.callbacks.History at 0x16509ab5d60>

# 2. 사전 훈련된 워드 임베딩
> **1) 사전 훈련된 Word2Vec**

In [9]:
import numpy as np
import gensim



In [10]:
# Load the pretrained Google News Word2Vec vectors from the gzipped binary
# file (binary=True); the relative path means the file must be in the working directory.
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

In [11]:
# Shape of the pretrained vector table: (number of vectors, dimensions per vector).
print(word2vec_model.vectors.shape)

(3000000, 300)


In [12]:
# One row per vocabulary index, 300 columns; rows start as zeros and are
# filled in from the pretrained vectors afterwards.
embedding_matrix = np.zeros(shape=(vocab_size, 300))
embedding_matrix.shape

(16, 300)

In [13]:
def get_vector(word):
    """Look up the pretrained vector for ``word``.

    Returns ``word2vec_model[word]`` when the word is in ``word2vec_model``,
    otherwise ``None``.
    """
    return word2vec_model[word] if word in word2vec_model else None

In [14]:
# Copy the pretrained vector of every vocabulary word into its row of
# embedding_matrix; words without a pretrained vector keep their zero row.
for word, i in t.word_index.items():
    temp = get_vector(word)
    if temp is None:
        continue
    embedding_matrix[i] = temp

In [15]:
# Pretrained vector for 'nice', printed for comparison with its embedding_matrix row.
print(word2vec_model['nice'])

[ 0.15820312  0.10595703 -0.18945312  0.38671875  0.08349609 -0.26757812
  0.08349609  0.11328125 -0.10400391  0.17871094 -0.12353516 -0.22265625
 -0.01806641 -0.25390625  0.13183594  0.0859375   0.16113281  0.11083984
 -0.11083984 -0.0859375   0.0267334   0.34570312  0.15136719 -0.00415039
  0.10498047  0.04907227 -0.06982422  0.08642578  0.03198242 -0.02844238
 -0.15722656  0.11865234  0.36132812  0.00173187  0.05297852 -0.234375
  0.11767578  0.08642578 -0.01123047  0.25976562  0.28515625 -0.11669922
  0.38476562  0.07275391  0.01147461  0.03466797  0.18164062 -0.03955078
  0.04199219  0.01013184 -0.06054688  0.09765625  0.06689453  0.14648438
 -0.12011719  0.08447266 -0.06152344  0.06347656  0.3046875  -0.35546875
 -0.2890625   0.19628906 -0.33203125 -0.07128906  0.12792969  0.09619141
 -0.12158203 -0.08691406 -0.12890625  0.27734375  0.265625    0.1796875
  0.12695312  0.06298828 -0.34375    -0.05908203  0.0456543   0.171875
  0.08935547  0.14648438 -0.04638672 -0.00842285 -0.0279

In [16]:
# Look up which integer index the tokenizer assigned to 'nice'.
print('단어 nice의 정수 인덱스: ', t.word_index['nice'])

단어 nice의 정수 인덱스:  1


In [17]:
# Row 1 is the row for 'nice' (its tokenizer index), so this should match word2vec_model['nice'].
print(embedding_matrix[1])

[ 0.15820312  0.10595703 -0.18945312  0.38671875  0.08349609 -0.26757812
  0.08349609  0.11328125 -0.10400391  0.17871094 -0.12353516 -0.22265625
 -0.01806641 -0.25390625  0.13183594  0.0859375   0.16113281  0.11083984
 -0.11083984 -0.0859375   0.0267334   0.34570312  0.15136719 -0.00415039
  0.10498047  0.04907227 -0.06982422  0.08642578  0.03198242 -0.02844238
 -0.15722656  0.11865234  0.36132812  0.00173187  0.05297852 -0.234375
  0.11767578  0.08642578 -0.01123047  0.25976562  0.28515625 -0.11669922
  0.38476562  0.07275391  0.01147461  0.03466797  0.18164062 -0.03955078
  0.04199219  0.01013184 -0.06054688  0.09765625  0.06689453  0.14648438
 -0.12011719  0.08447266 -0.06152344  0.06347656  0.3046875  -0.35546875
 -0.2890625   0.19628906 -0.33203125 -0.07128906  0.12792969  0.09619141
 -0.12158203 -0.08691406 -0.12890625  0.27734375  0.265625    0.1796875
  0.12695312  0.06298828 -0.34375    -0.05908203  0.0456543   0.171875
  0.08935547  0.14648438 -0.04638672 -0.00842285 -0.0279

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten, Input

# Same classifier, but the embedding layer is initialised from the pretrained
# embedding_matrix and frozen (trainable=False), so only the Dense layer is trained.
e = Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=max_len, trainable=False)

model = Sequential([
    Input(shape=(max_len,), dtype='int32'),
    e,
    Flatten(),
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
model.fit(X_train, y_train, epochs=100, verbose=2)

Epoch 1/100
1/1 - 1s - loss: 0.7250 - acc: 0.2857
Epoch 2/100
1/1 - 0s - loss: 0.7056 - acc: 0.4286
Epoch 3/100
1/1 - 0s - loss: 0.6867 - acc: 0.7143
Epoch 4/100
1/1 - 0s - loss: 0.6684 - acc: 0.7143
Epoch 5/100
1/1 - 0s - loss: 0.6506 - acc: 0.8571
Epoch 6/100
1/1 - 0s - loss: 0.6334 - acc: 0.8571
Epoch 7/100
1/1 - 0s - loss: 0.6167 - acc: 0.8571
Epoch 8/100
1/1 - 0s - loss: 0.6006 - acc: 1.0000
Epoch 9/100
1/1 - 0s - loss: 0.5851 - acc: 1.0000
Epoch 10/100
1/1 - 0s - loss: 0.5700 - acc: 1.0000
Epoch 11/100
1/1 - 0s - loss: 0.5555 - acc: 1.0000
Epoch 12/100
1/1 - 0s - loss: 0.5415 - acc: 1.0000
Epoch 13/100
1/1 - 0s - loss: 0.5280 - acc: 1.0000
Epoch 14/100
1/1 - 0s - loss: 0.5150 - acc: 1.0000
Epoch 15/100
1/1 - 0s - loss: 0.5023 - acc: 1.0000
Epoch 16/100
1/1 - 0s - loss: 0.4902 - acc: 1.0000
Epoch 17/100
1/1 - 0s - loss: 0.4784 - acc: 1.0000
Epoch 18/100
1/1 - 0s - loss: 0.4671 - acc: 1.0000
Epoch 19/100
1/1 - 0s - loss: 0.4561 - acc: 1.0000
Epoch 20/100
1/1 - 0s - loss: 0.4455 - a

<tensorflow.python.keras.callbacks.History at 0x1651ae87190>