In [1]:
print(1)

1


### 정수 인코딩

In [2]:
sentences = [
    "나는 오늘 기분이 좋다",
    "오늘 날씨가 좋다",
    "나는 기분이 나쁘다"
]

# tokens라는 빈 리스트를 만든다
tokens = []

# 각 문장마다 스페이스를 기준으로 쪼갠 후 tokens에 넣는다 (.split())
for s in sentences:
    tokens.extend(s.split())
print(tokens)

# tokens를 집합으로 바꾼다.
token_set = set(tokens)
print(token_set)

# tokens = []
# for s in sentences:
#     tokens.append(s.split())
# print(tokens)
# print(sum(tokens, []))


['나는', '오늘', '기분이', '좋다', '오늘', '날씨가', '좋다', '나는', '기분이', '나쁘다']
{'기분이', '오늘', '나는', '날씨가', '나쁘다', '좋다'}


In [4]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
encoded = label_encoder.fit_transform(list(token_set))
print(encoded)
print(label_encoder.classes_)
print(type(encoded))

word_info = {}
for word, index in zip(label_encoder.classes_, encoded):
    word_info[int(index)] = str(word)

print(word_info)


[0 4 1 3 2 5]
['기분이' '나는' '나쁘다' '날씨가' '오늘' '좋다']
<class 'numpy.ndarray'>
{0: '기분이', 4: '나는', 1: '나쁘다', 3: '날씨가', 2: '오늘', 5: '좋다'}


### One Hot encoding

In [5]:
# onehot encoding
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# 집합을 배열로 바꾸기 
token_arr_tmp = np.array(list(token_set))
print(token_arr_tmp)

# 배열 형태 바꾸기
token_arr = token_arr_tmp.reshape(-1, 1)
print(token_arr)

oh_encoder = OneHotEncoder()
encoded = oh_encoder.fit_transform(token_arr)
print(encoded.toarray())

# oh_encoder2 = OneHotEncoder()
# encoded2 = oh_encoder2.fit_transform(np.array([sentences]))
# print(encoded2.toarray())


['기분이' '오늘' '나는' '날씨가' '나쁘다' '좋다']
[['기분이']
 ['오늘']
 ['나는']
 ['날씨가']
 ['나쁘다']
 ['좋다']]
[[1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1.]]


### BOW (Bag of Words)

In [6]:
sentences = [
    "나는 오늘 기분이 좋다",
    "오늘 날씨가 좋다",
    "나는 기분이 나쁘다"
]

# sentences에서 나올 수 있는 단어들에 대해 몇번 등장하는지 count를 확인
words_s = [s.split() for s in sentences]
print(words_s)

words_a = sum(words_s, [])
print(words_a)

words_set = set(words_a)
print(words_set)

# 딕셔너리로 출력
words_count_dict = {w: words_a.count(w) for w in words_set}

print(words_count_dict)


[['나는', '오늘', '기분이', '좋다'], ['오늘', '날씨가', '좋다'], ['나는', '기분이', '나쁘다']]
['나는', '오늘', '기분이', '좋다', '오늘', '날씨가', '좋다', '나는', '기분이', '나쁘다']
{'기분이', '오늘', '나는', '날씨가', '나쁘다', '좋다'}
{'기분이': 2, '오늘': 2, '나는': 2, '날씨가': 1, '나쁘다': 1, '좋다': 2}


In [7]:
from sklearn.feature_extraction.text import CountVectorizer

sentences = [
    "나는 오늘 기분이 좋다",
    "오늘 날씨가 좋다",
    "나는 기분이 나쁘다"
]

vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(sentences)
print(vectorizer.vocabulary_)
print(bow_matrix.toarray())


{'나는': 1, '오늘': 4, '기분이': 0, '좋다': 5, '날씨가': 3, '나쁘다': 2}
[[1 1 0 0 1 1]
 [0 0 0 1 1 1]
 [1 1 1 0 0 0]]


### TF-IDF

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

sentences = [
    "나는 오늘 기분이 좋다 나는 겨울이 좋다",
    "오늘 날씨가 좋다",
    "나는 기분이 나쁘다",
    "집에 가고 싶다 집 좋다"
]

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(sentences)
print(vectorizer.vocabulary_)
print(tfidf_matrix.toarray())
print(tfidf_matrix)



{'나는': 3, '오늘': 7, '기분이': 2, '좋다': 8, '겨울이': 1, '날씨가': 5, '나쁘다': 4, '집에': 9, '가고': 0, '싶다': 6}
[[0.         0.39655081 0.31264522 0.62529043 0.         0.
  0.         0.31264522 0.5062265  0.        ]
 [0.         0.         0.         0.         0.         0.70203482
  0.         0.55349232 0.44809973 0.        ]
 [0.         0.         0.52640543 0.52640543 0.66767854 0.
  0.         0.         0.         0.        ]
 [0.5417361  0.         0.         0.         0.         0.
  0.5417361  0.         0.34578314 0.5417361 ]]
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 15 stored elements and shape (4, 10)>
  Coords	Values
  (0, 3)	0.6252904326448119
  (0, 7)	0.31264521632240594
  (0, 2)	0.31264521632240594
  (0, 8)	0.5062264952069911
  (0, 1)	0.39655081366042105
  (1, 7)	0.5534923152870045
  (1, 8)	0.4480997313625986
  (1, 5)	0.7020348194149619
  (2, 3)	0.5264054336099155
  (2, 2)	0.5264054336099155
  (2, 4)	0.6676785446095399
  (3, 8)	0.3457831381910465
  (3, 9)	0.54

In [32]:
# TF-IDF 유사도 계산

from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(tfidf_matrix)
print(similarity)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 15 stored elements and shape (4, 10)>
  Coords	Values
  (0, 3)	0.6252904326448119
  (0, 7)	0.31264521632240594
  (0, 2)	0.31264521632240594
  (0, 8)	0.5062264952069911
  (0, 1)	0.39655081366042105
  (1, 7)	0.5534923152870045
  (1, 8)	0.4480997313625986
  (1, 5)	0.7020348194149619
  (2, 3)	0.5264054336099155
  (2, 2)	0.5264054336099155
  (2, 4)	0.6676785446095399
  (3, 8)	0.3457831381910465
  (3, 9)	0.5417361046803605
  (3, 0)	0.5417361046803605
  (3, 6)	0.5417361046803605
[[1.         0.39988668 0.49373442 0.17504459]
 [0.39988668 1.         0.         0.15494533]
 [0.49373442 0.         1.         0.        ]
 [0.17504459 0.15494533 0.         1.        ]]


### 문장 임베딩

In [29]:
from tensorflow.keras.preprocessing.text import Tokenizer

sentences = [
    "나는 오늘 기분이 좋다 나는 겨울이 좋다",
    "오늘 날씨가 좋다",
    "나는 기분이 나쁘다",
    "집에 가고 싶다 집 좋다"
]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
sequence = tokenizer.texts_to_sequences(sentences)
print(tokenizer.word_index)
print(sequence)

{'좋다': 1, '나는': 2, '오늘': 3, '기분이': 4, '겨울이': 5, '날씨가': 6, '나쁘다': 7, '집에': 8, '가고': 9, '싶다': 10, '집': 11}
[[2, 3, 4, 1, 2, 5, 1], [3, 6, 1], [2, 4, 7], [8, 9, 10, 11, 1]]


In [30]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

print(sequence)
padded_sequences = pad_sequences(sequence, maxlen=5)
print(padded_sequences)

[[2, 3, 4, 1, 2, 5, 1], [3, 6, 1], [2, 4, 7], [8, 9, 10, 11, 1]]
[[ 4  1  2  5  1]
 [ 0  0  3  6  1]
 [ 0  0  2  4  7]
 [ 8  9 10 11  1]]


In [31]:
similarity = cosine_similarity(padded_sequences)
print(padded_sequences)
print(similarity)

[[ 4  1  2  5  1]
 [ 0  0  3  6  1]
 [ 0  0  2  4  7]
 [ 8  9 10 11  1]]
[[1.         0.79574493 0.54436252 0.8908486 ]
 [0.79574493 1.         0.65674725 0.74655152]
 [0.54436252 0.65674725 1.         0.44617042]
 [0.8908486  0.74655152 0.44617042 1.        ]]
