# 문장 내의 단어들을 임베딩하기
- `keras.layers.Embedding` 레이어 사용

In [7]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding

# Sample data: a small collection of short sentences used as the toy corpus.
sentences = [
    "I love machine learning",
    "I love coding in Python",
    "Deep learning is fun",
]

In [8]:
# Build the vocabulary: split every sentence on whitespace and give each
# previously unseen word the next free integer index. Numbering starts at 1,
# so index 0 is never assigned to a word.
word_index = {}

for sentence in sentences:
    for word in sentence.split():
        # setdefault only inserts when the word is new, so an existing
        # word keeps the index it received on first sight.
        word_index.setdefault(word, len(word_index) + 1)

word_index

{'I': 1,
 'love': 2,
 'machine': 3,
 'learning': 4,
 'coding': 5,
 'in': 6,
 'Python': 7,
 'Deep': 8,
 'is': 9,
 'fun': 10}

In [9]:
# Encode each sentence as the list of vocabulary indices of its words,
# keeping the sentences in their original order.
sequences = []
for sentence in sentences:
    sequences.append([word_index[word] for word in sentence.split()])
sequences

[[1, 2, 3, 4], [1, 2, 5, 6, 7], [8, 4, 9, 10]]

In [10]:
# Length of the longest sequence; every row is padded out to this width.
max_length = max(map(len, sequences))

# Pad after the last word (padding='post') so that all sequences end up
# with exactly max_length entries.
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    sequences,
    maxlen=max_length,
    padding='post',
)
padded_sequences

array([[ 1,  2,  3,  4,  0],
       [ 1,  2,  5,  6,  7],
       [ 8,  4,  9, 10,  0]], dtype=int32)

In [11]:
# Create the Embedding layer.
# input_dim is the vocabulary size plus one: word indices start at 1, and
# index 0 (used by the padding) needs its own row in the embedding matrix.
# input_length is intentionally not passed: it is deprecated in Keras 3 and
# is not needed here, because the layer is called directly on the padded
# batch and takes the sequence length from that input.
embedding_dim = 8
embedding_layer = Embedding(input_dim=len(word_index) + 1, output_dim=embedding_dim)

# Pass the padded index sequences through the layer; each integer index is
# replaced by its embedding_dim-sized vector.
embedded_sequences = embedding_layer(padded_sequences)

# Resulting shape is (number of sentences, max_length, embedding_dim).
print(embedded_sequences.shape)
print(embedded_sequences)

(3, 5, 8)
tf.Tensor(
[[[ 0.02292763  0.01898408  0.03058434  0.03767885 -0.00078635
   -0.04131095  0.01049863 -0.01070913]
  [-0.03095946 -0.01208802 -0.01741817  0.01822146  0.01489382
   -0.04860253 -0.02816772  0.04655448]
  [-0.03956745 -0.02778288 -0.01271453 -0.0489408   0.01927609
   -0.03345201 -0.01254299 -0.03338441]
  [-0.00717854  0.01874379  0.04830558  0.03144133 -0.02392714
    0.02617493  0.03784785  0.03108868]
  [ 0.0118425   0.0198478  -0.04260274  0.04898245  0.03648787
    0.04960028 -0.0072081  -0.00678673]]

 [[ 0.02292763  0.01898408  0.03058434  0.03767885 -0.00078635
   -0.04131095  0.01049863 -0.01070913]
  [-0.03095946 -0.01208802 -0.01741817  0.01822146  0.01489382
   -0.04860253 -0.02816772  0.04655448]
  [ 0.01120577 -0.00184532  0.01643792  0.00330768  0.02645927
    0.02163352 -0.03643812  0.04222197]
  [-0.01672084  0.01964724 -0.02386712 -0.03663919  0.04117819
   -0.01197306 -0.02973223 -0.04192952]
  [-0.00805389 -0.00713674  0.01546675  0.01619751

In [12]:
# Fetch the layer's weight matrix (the word-embedding table): one row per
# index, one column per embedding dimension.
embeddings = embedding_layer.get_weights()[0]
print("Embedding Layer Shape :", embeddings.shape)
print("Embedding Layer Weights (Word Embeddings):\n", embeddings)
print()

# Example: look up the row that belongs to the word 'love' and print it.
love_index = word_index['love']
love_vector = embeddings[love_index]
print("\nEmbedding for 'love':\n", love_vector)

Embedding Layer Shape : (11, 8)
Embedding Layer Weights (Word Embeddings):
 [[ 0.0118425   0.0198478  -0.04260274  0.04898245  0.03648787  0.04960028
  -0.0072081  -0.00678673]
 [ 0.02292763  0.01898408  0.03058434  0.03767885 -0.00078635 -0.04131095
   0.01049863 -0.01070913]
 [-0.03095946 -0.01208802 -0.01741817  0.01822146  0.01489382 -0.04860253
  -0.02816772  0.04655448]
 [-0.03956745 -0.02778288 -0.01271453 -0.0489408   0.01927609 -0.03345201
  -0.01254299 -0.03338441]
 [-0.00717854  0.01874379  0.04830558  0.03144133 -0.02392714  0.02617493
   0.03784785  0.03108868]
 [ 0.01120577 -0.00184532  0.01643792  0.00330768  0.02645927  0.02163352
  -0.03643812  0.04222197]
 [-0.01672084  0.01964724 -0.02386712 -0.03663919  0.04117819 -0.01197306
  -0.02973223 -0.04192952]
 [-0.00805389 -0.00713674  0.01546675  0.01619751 -0.0189808  -0.00284543
   0.04655034  0.04955767]
 [-0.01536431 -0.00179927 -0.0039075   0.04382097  0.04367017 -0.02264318
   0.02916554 -0.03942636]
 [ 0.04830645 -

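단어 인덱스는 1부터 부여했기 때문에 0은 어떤 단어에도 할당되지 않으며, 보통 패딩을 나타내는 인덱스로 사용됩니다. 따라서 Embedding 레이어는 패딩 인덱스 0까지 포함해 (고유한 단어 수 + 1)개의 행을 가져야 합니다. 결과적으로, Embedding 레이어의 가중치 행렬의 크기는 (고유한 단어 수 + 1, 임베딩 벡터의 차원수)가 되므로, 이 예제에서는 (10 + 1, 8) = (11, 8)이 됩니다.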