<a href="https://colab.research.google.com/github/rtajeong/Kyungnam_univ_M4/blob/main/lab_63_embedding_rev1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 단어 임베딩 (words -> continuous vector space 로 projection)
- words are represented by dense vectors where a vector represents the projection of the word into a continuous vector space.
- The position of a word within the vector space is learned from text and is based on the words that surround the word when it is used.
- The position of a word in the learned vector space is referred to as its embedding.

### Embedding 계층을 사용하여 쉽게 만들 수 있다
- 정수 인덱스를 벡터로 매핑하는 딕셔너리 구조 (인덱스 크기, 벡터 크기)
- 학습 시키는 데이터에 따라 다른 임베딩이 만들어진다.

### IMDB 영화 리뷰 데이터를 사용한 임베딩 예제
- IMDB: (internet movie database) the world's most popular and authoritative source for movie, TV and celebrity content



In [1]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense, Embedding
import os, os.path
import zipfile
from tensorflow.keras.datasets import imdb
from tensorflow.keras import preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

- 10000 개의 단어만 사용하고, 각 문장에서는 뒤에서부터 20 개의 단어만 사용하겠음.

In [2]:
max_features = 10000
maxlen = 20
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [3]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((25000,), (25000,), (25000,), (25000,))

In [4]:
y_train[:5]

array([1, 0, 0, 1, 0])

In [5]:
# 각 문장이 몇개의 단어로 구성되어 있는지 확인
[len(x_train[i]) for i in range(10)]

[218, 189, 141, 550, 147, 43, 123, 562, 233, 130]

In [6]:
x_train[0:2]   # words tokenized and expressed by (word) numbers

array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]),
       list([1, 194, 1153, 194, 8255, 78, 228,

In [7]:
# 마지막 20개의 단어들만 사용한다. -> 20개보다 적으면 똑같은 길이로 만들어 준다. padding position is 'post'
x_train_p=preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen, padding='post')
x_test_p=preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen, padding='post')

In [8]:
x_train_p[0:2]

array([[  65,   16,   38, 1334,   88,   12,   16,  283,    5,   16, 4472,
         113,  103,   32,   15,   16, 5345,   19,  178,   32],
       [  23,    4, 1690,   15,   16,    4, 1355,    5,   28,    6,   52,
         154,  462,   33,   89,   78,  285,   16,  145,   95]],
      dtype=int32)

- Embedding()은 (number of samples, input_length)인 2D 정수 텐서를 입력받습니다. 이 때 각 sample은 정수 인코딩이 된 결과로, 정수의 시퀀스입니다. Embedding()은 워드 임베딩 작업을 수행하고 (number of samples, input_length, embedding word dimensionality)인 3D 텐서를 리턴합니다.

In [9]:
model = Sequential()
model.add(Embedding(10000, 8, input_length=maxlen)) # input 각 단어에 대해 8-vector 로 임베딩
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 8)             80000     
_________________________________________________________________
flatten (Flatten)            (None, 160)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 161       
Total params: 80,161
Trainable params: 80,161
Non-trainable params: 0
_________________________________________________________________


In [10]:
history = model.fit(x_train_p, y_train,
                    epochs=10, batch_size=32,
                    validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## 위의 결과는 20개의 단어만 고려한 것임
### 성능이 75% 정도 됨
- 각 단어를 독립적으로 다루었으며, 문장의 구성 정보를 고려하지 않음
- 문장의 구조 정보를 고려하려면 임베딩 층 위에 합성곱이나 순환신경망 층을 추가한다

## RNN 추가

In [None]:
x_train_p.shape

(25000, 20)

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense

model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=8))
model.add(SimpleRNN(128))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 8)           80000     
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 128)               17536     
_________________________________________________________________
flatten_1 (Flatten)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 97,665
Trainable params: 97,665
Non-trainable params: 0
_________________________________________________________________


In [12]:
history = model.fit(x_train_p, y_train,
                    epochs=20, batch_size=32,
                    validation_split=0.2)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


# 연습

In [None]:
import tensorflow as tf
# 문장 토큰화와 단어 토큰화
text=[['Hope', 'to', 'see', 'you', 'soon'],['Nice', 'to', 'see', 'you', 'again']]

# 각 단어에 대한 정수 인코딩
text=[[0, 1, 2, 3, 4],[5, 1, 2, 3, 6]]

# 위 데이터가 아래의 임베딩 층의 입력이 된다.
embedding_layer = Embedding(7, 2, input_length=5)
result = embedding_layer(tf.constant([0, 1, 2, 3, 4, 5, 6]))
print(result.numpy())

# 7은 단어의 개수. 즉, 단어 집합(vocabulary)의 크기이다.
# 2는 임베딩한 후의 벡터의 크기이다.
# 5는 각 입력 시퀀스의 길이. 즉, input_length이다. 아래와 같은 형태가 됨.


<tensorflow.python.keras.layers.embeddings.Embedding object at 0x7f4b325fe208>
[[-0.04526671 -0.00644044]
 [-0.03116806 -0.01851275]
 [-0.03980669  0.01063291]
 [-0.04599432 -0.02542104]
 [-0.04872879 -0.01556901]
 [-0.0287668  -0.04047495]
 [ 0.03014007 -0.02833869]]


In [None]:
model = Sequential()
model.add(Embedding(7, 2, input_length=5))
model.add(Flatten())
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 5, 2)              14        
_________________________________________________________________
flatten (Flatten)            (None, 10)                0         
Total params: 14
Trainable params: 14
Non-trainable params: 0
_________________________________________________________________


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = ['Hope to see you soon',
         'Nice to see you again']
cv = CountVectorizer()
cv.fit_transform(corpus).toarray() , cv.get_feature_names(), cv.vocabulary_

(array([[0, 1, 0, 1, 1, 1, 1],
        [1, 0, 1, 1, 0, 1, 1]]),
 ['again', 'hope', 'nice', 'see', 'soon', 'to', 'you'],
 {'again': 0, 'hope': 1, 'nice': 2, 'see': 3, 'soon': 4, 'to': 5, 'you': 6})

In [None]:
from gensim.models.word2vec import Word2Vec

text=[['Hope', 'to', 'see', 'you', 'soon'],
      ['Nice', 'to', 'see', 'you', 'again']]

model = Word2Vec(text, min_count=1, size=2)

In [None]:
for i in range(len(text)):
    print(model[text[i]])

[[ 0.23644601  0.13282813]
 [-0.10490499  0.02300035]
 [ 0.09623462  0.12984623]
 [ 0.2290124  -0.01340628]
 [ 0.07774924  0.02938988]]
[[-0.00383358 -0.0236154 ]
 [-0.10490499  0.02300035]
 [ 0.09623462  0.12984623]
 [ 0.2290124  -0.01340628]
 [ 0.24975368  0.22249663]]


  
