In [6]:
"""

Word-level one-hot encoding (without using the Keras Tokenizer)

"""

import numpy as np

text_samples = ["The cat sat on the mat.", "The dog ate my homework."]
token_attached_with_index = {}

# Tokenize every sample on whitespace and give each previously unseen token
# the next free integer index. Numbering starts at 1, so index 0 is never
# assigned to a token. Tokens are kept verbatim: case and trailing
# punctuation are preserved ("The" and "the" are distinct, as is "mat.").
for sample in text_samples:
    for word in sample.split():
        # The default is computed from the size *before* insertion and is
        # only stored when the token is not present yet.
        token_attached_with_index.setdefault(
            word, len(token_attached_with_index) + 1
        )

print("* * token_attached_with_index :", token_attached_with_index, "\n")

# Represent the samples as one-hot vectors based on the mapping built above.
# Resulting shape: (number of samples, token slots per sample, highest index + 1).
# Note that the vocabulary size is also used as the number of token slots
# per sample (second axis); samples with more tokens than that are cut off.
all_tokens_count = len(token_attached_with_index)
result = np.zeros(
    shape=(
        len(text_samples),
        all_tokens_count,
        max(token_attached_with_index.values()) + 1,
    )
)

for i, sample in enumerate(text_samples):
    # (position, token) pairs of this sample, limited to the available slots.
    indexed_tokens = list(enumerate(sample.split()))[:all_tokens_count]

    for j, word in indexed_tokens:
        # Direct lookup on purpose: a missing token raises KeyError. With
        # dict.get() the index would be None, which NumPy interprets as a new
        # axis, silently setting the entire row to 1.
        result[i, j, token_attached_with_index[word]] = 1.0

    print("* number", i + 1, "text sample's index and token :", indexed_tokens)

print("\n")
print("* tokenized result :", "\n")
print(result)
        

* * token_attached_with_index : {'The': 1, 'cat': 2, 'sat': 3, 'on': 4, 'the': 5, 'mat.': 6, 'dog': 7, 'ate': 8, 'my': 9, 'homework.': 10} 

* number 1 text sample's index and token : [(0, 'The'), (1, 'cat'), (2, 'sat'), (3, 'on'), (4, 'the'), (5, 'mat.')]
* number 2 text sample's index and token : [(0, 'The'), (1, 'dog'), (2, 'ate'), (3, 'my'), (4, 'homework.')]


* tokenized result : 

[[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

 [[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 

In [1]:
"""

Tokenization using the Keras Tokenizer object

"""

from tensorflow.keras.preprocessing.text import Tokenizer

text_samples = ["The cat sat on the mat.", "The dog ate my homework."]

# --- Build -----------------------------------------------------------------
# Create the tokenizer (configured with num_words=1000) and fit its
# vocabulary on the text samples.
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(text_samples)

# --- Encode ----------------------------------------------------------------
# Using the fitted vocabulary, derive both representations of the samples:
# the integer-encoded token sequences and the binary-mode matrix.
sequences = tokenizer.texts_to_sequences(text_samples)
one_hot_result = tokenizer.texts_to_matrix(text_samples, mode="binary")

# --- Report ----------------------------------------------------------------
# Word dictionary (token -> index) learned by the tokenizer.
print("* word dictionary :", tokenizer.word_index, "\n")

# Text samples expressed as lists of token indices.
print("* text samples expressed with token's index :", sequences, "\n")

# Text samples expressed as one-hot (binary) vectors.
print("* text samples expressed with one-hot vector :", "\n")
print(one_hot_result, "\n")

* word dictionary : {'the': 1, 'cat': 2, 'sat': 3, 'on': 4, 'mat': 5, 'dog': 6, 'ate': 7, 'my': 8, 'homework': 9} 

* text samples expressed with token's index : [[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]] 

* text samples expressed with one-hot vector : 

[[0. 1. 1. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]] 



In [8]:
# Bring the integer-encoded text sequences to one common length so that they
# can be used for embedding.

from tensorflow.keras import preprocessing

# Every sequence ends up with exactly 10 entries; sequences longer than that
# are truncated at the end (truncating="post").
pad_sequences = preprocessing.sequence.pad_sequences
padding_result = pad_sequences(
    sequences,
    maxlen=10,
    truncating="post",
)

print("* padding_result :", "\n")
print(padding_result)

* padding_result : 

[[0 0 0 0 1 2 3 4 1 5]
 [0 0 0 0 0 1 6 7 8 9]]
