# 문장의 유사도 - CountVectorizer

In [1]:
from konlpy.tag import Okt

# Create one shared Okt morphological analyzer; build_bag_of_words() below
# reads this module-level `okt` to tokenize its input.
okt = Okt()

In [None]:
def build_bag_of_words(document):
    """Build a vocabulary and a bag-of-words count vector for one document.

    Periods are stripped from the text, the result is split into morphemes
    with the module-level ``okt`` analyzer, and every distinct morpheme is
    given an index in order of first appearance.

    Args:
        document: Text to analyze (must support ``.replace``).

    Returns:
        A ``(word_to_index, bow)`` tuple. ``word_to_index`` maps each morpheme
        to its index, and ``bow[i]`` is the number of times the morpheme with
        index ``i`` occurs in the document.
    """
    morphemes = okt.morphs(document.replace('.', ''))   # morphological analysis

    word_to_index = {}
    bow = []

    for morpheme in morphemes:
        position = word_to_index.get(morpheme)
        if position is None:
            # First occurrence: the next free index equals the current
            # vector length, and the count starts at 1.
            word_to_index[morpheme] = len(bow)
            bow.append(1)
        else:
            # Repeated morpheme: bump the count stored at its index.
            bow[position] += 1

    return word_to_index, bow

In [3]:
# Sample sentence containing repeated morphemes, so some counts exceed 1.
doc = "정부가 발표하는 물가상승률과 소비자가 느끼는 물가상승률은 다르다."

# Build the vocabulary and the matching count vector for the sample sentence.
vocab, bow = build_bag_of_words(doc)

print("vocabulary : ", vocab)
print("bag of words vector : ", bow)

vocabulary :  {'정부': 0, '가': 1, '발표': 2, '하는': 3, '물가상승률': 4, '과': 5, '소비자': 6, '느끼는': 7, '은': 8, '다르다': 9}
bag of words vector :  [1, 2, 1, 1, 2, 1, 1, 1, 1, 1]


## sklearn CountVectorizer

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer; min_df=1 keeps every term that occurs in at least
# one document (see the scikit-learn CountVectorizer documentation).
vectorizer = CountVectorizer(min_df=1)

In [None]:
# train data: four short Korean sentences that form the training corpus
# the test post is later compared against.
contents = ['상처받은 아이들은 너무 일찍 커버려',
            '내가 상처받은 거 아는 사람 불편해',
            '잘 사는 사람들은 좋은 사람 되기 쉬워',
            '아무 일도 아니야 괜찮아']

In [6]:
from konlpy.tag import Okt

# NOTE(review): this repeats the `okt = Okt()` setup from the first cell;
# it looks redundant unless this section is meant to run on its own.
okt = Okt()

In [7]:
# Split every training sentence into its morphemes with the Okt analyzer.
contents_tokens = list(map(okt.morphs, contents))

contents_tokens

[['상처', '받은', '아이', '들', '은', '너무', '일찍', '커버', '려'],
 ['내', '가', '상처', '받은', '거', '아는', '사람', '불편해'],
 ['잘', '사는', '사람', '들', '은', '좋은', '사람', '되기', '쉬워'],
 ['아무', '일도', '아니야', '괜찮아']]

In [9]:
# Re-join each token list into a single string, putting a space in front of
# every token (so each string starts with a space), ready for the vectorizer.
contents_for_vectorize = [
    ''.join(' ' + token for token in tokens)
    for tokens in contents_tokens
]

contents_for_vectorize

[' 상처 받은 아이 들 은 너무 일찍 커버 려',
 ' 내 가 상처 받은 거 아는 사람 불편해',
 ' 잘 사는 사람 들 은 좋은 사람 되기 쉬워',
 ' 아무 일도 아니야 괜찮아']

In [11]:
# Learn the vocabulary from the space-joined morpheme strings and build the
# document-term count matrix in one step.
x = vectorizer.fit_transform(contents_for_vectorize)

# Rows are documents (samples); columns are vocabulary terms (features).
num_samples, num_features = x.shape
num_samples, num_features

(4, 17)

In [14]:
# Show the learned vocabulary terms together with how many there are.
# Single-character morphemes are absent because CountVectorizer's default
# token_pattern only keeps tokens of two or more characters.
vectorizer.get_feature_names_out(), len(vectorizer.get_feature_names_out())

(array(['괜찮아', '너무', '되기', '받은', '불편해', '사는', '사람', '상처', '쉬워', '아는',
        '아니야', '아무', '아이', '일도', '일찍', '좋은', '커버'], dtype=object),
 17)

In [15]:
# Convert the count matrix to a dense array so every cell, zeros included,
# is displayed.
x.toarray()

array([[0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1],
       [0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 1, 2, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0]])

In [21]:
import pandas as pd

# Document-term matrix as a DataFrame: one row per training sentence and one
# labelled column per vocabulary term.
dtm = pd.DataFrame(data=x.toarray(), columns=vectorizer.get_feature_names_out())

dtm

Unnamed: 0,괜찮아,너무,되기,받은,불편해,사는,사람,상처,쉬워,아는,아니야,아무,아이,일도,일찍,좋은,커버
0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,1,0,1
1,0,0,0,1,1,0,1,1,0,1,0,0,0,0,0,0,0
2,0,0,1,0,0,1,2,0,1,0,0,0,0,0,0,1,0
3,1,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0


In [22]:
# test data
new_post = ['상처받기 싫어 괜찮아']

# Tokenize the test post the same way as the training sentences.
new_post_tokens = list(map(okt.morphs, new_post))

# Re-join the tokens with a space in front of each one (leading space kept).
new_post_for_vectorize = [
    ''.join(' ' + token for token in tokens)
    for tokens in new_post_tokens
]

new_post_for_vectorize

[' 상처 받기 싫어 괜찮아']

In [23]:
# Encode the test post with the vocabulary already learned from the training
# sentences (transform only, no re-fitting); words outside that vocabulary
# contribute nothing to the vector.
new_post_vec = vectorizer.transform(new_post_for_vectorize)
new_post_vec.toarray()

array([[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [24]:
# 벡터간 기하학적 거리 측정
import scipy as sp

def dist_raw(v1, v2):
    """Return the norm of the difference between two vectors.

    Args:
        v1: First vector. ``v1 - v2`` must produce an object that offers
            ``toarray()`` (for example SciPy sparse rows).
        v2: Second vector, expected to be shape-compatible with ``v1``.

    Returns:
        The value of ``sp.linalg.norm`` applied to the densified difference.
    """
    dense_delta = (v1 - v2).toarray()
    return sp.linalg.norm(dense_delta)

In [25]:
# Distance from each of the four training sentence vectors (rows of `x`,
# built from `contents_for_vectorize`) to the test sentence vector
# (`new_post_vec`, built from `new_post_for_vectorize`).
dist = [dist_raw(row_vec, new_post_vec) for row_vec in x]

dist

[2.449489742783178, 2.23606797749979, 3.1622776601683795, 2.0]

In [26]:
# Find the closest training sentence once, instead of rescanning `dist`
# with min()/index() inside every print call.
best_dist = min(dist)
best_index = dist.index(best_dist)   # first position holding the minimum

print('Best post is ', best_index, ', dist = ', best_dist)
print('Test post is --> ', new_post)
print('Best dist post is --> ', contents[best_index])

Best post is  3 , dist =  2.0
Test post is -->  ['상처받기 싫어 괜찮아']
Best dist post is -->  아무 일도 아니야 괜찮아


In [27]:
# Print every training document vector, one row per line of output.
for i in range(len(contents)):
    print(x.getrow(i).toarray())

# Separator line, then the test post vector for side-by-side comparison.
print('-' * 40)
print(new_post_vec.toarray())

[[0 1 0 1 0 0 0 1 0 0 0 0 1 0 1 0 1]]
[[0 0 0 1 1 0 1 1 0 1 0 0 0 0 0 0 0]]
[[0 0 1 0 0 1 2 0 1 0 0 0 0 0 0 1 0]]
[[1 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0]]
----------------------------------------
[[1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]]
