# 문장의 유사도 - TF-IDF

In [1]:
import pandas as pd
from math import log

# Toy corpus: each entry is one whitespace-tokenized document.
docs = [
    '먹고 싶은 사과',
    '먹고 싶은 바나나',
    '길고 노란 바나나 바나나',
    '저는 과일이 좋아요',
]

# Vocabulary: every distinct whitespace-separated token, in sorted order.
vocab = sorted({token for sentence in docs for token in sentence.split()})

vocab

['과일이', '길고', '노란', '먹고', '바나나', '사과', '싶은', '저는', '좋아요']

In [2]:
# Total number of documents in the corpus
n = len(docs)

# tf
def tf(t, d):
    """Return the term frequency of token ``t`` in document ``d``.

    The document is split on whitespace and only whole tokens are counted,
    which matches how the vocabulary is built.  Counting with ``str.count``
    would match substrings instead (e.g. ``'a'`` inside ``'banana'``).

    Args:
        t: The term to count.
        d: The document, either a whitespace-delimited string or an
            already-tokenized sequence of terms.

    Returns:
        The number of tokens in ``d`` that are exactly equal to ``t``.
    """
    tokens = d.split() if isinstance(d, str) else d
    return tokens.count(t)

# idf
def idf(t, documents=None):
    """Return the smoothed inverse document frequency of token ``t``.

    Computes ``log(N / (1 + df))`` where ``N`` is the number of documents and
    ``df`` is the number of documents containing ``t`` as a whole
    whitespace-separated token.  A plain ``t in doc`` test on a string would
    also match substrings and inflate ``df``.

    Args:
        t: The term to score.
        documents: Optional corpus to score against (strings or token
            sequences).  Defaults to the module-level ``docs``.

    Returns:
        The IDF value as a float.  It is negative when ``t`` occurs in every
        document, because of the ``1 + df`` smoothing.

    Raises:
        ValueError: If the corpus is empty (``log(0)``).
    """
    corpus = docs if documents is None else documents
    df = sum(
        t in (doc.split() if isinstance(doc, str) else doc)
        for doc in corpus
    )
    return log(len(corpus) / (1 + df))

# tf-idf
def tfidf(t, d):
    """Return the TF-IDF weight of term ``t`` in document ``d``.

    The weight is the product of the term frequency of ``t`` in ``d`` and the
    inverse document frequency of ``t``.
    """
    term_frequency = tf(t, d)
    inverse_document_frequency = idf(t)
    return term_frequency * inverse_document_frequency

In [None]:
# One row per document, one column per vocabulary term, holding raw counts.
result = [[tf(term, doc) for term in vocab] for doc in docs]

# DTM (document-term matrix)
tf_ = pd.DataFrame(result, columns=vocab)

tf_

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0,0,0,1,0,1,1,0,0
1,0,0,0,1,1,0,1,0,0
2,0,1,1,0,2,0,0,0,0
3,1,0,0,0,0,0,0,1,1


In [6]:
# IDF value of every vocabulary term, in vocabulary order.
result = [idf(term) for term in vocab]

idf_ = pd.DataFrame(result, index=vocab, columns=["IDF"])
idf_

Unnamed: 0,IDF
과일이,0.693147
길고,0.693147
노란,0.693147
먹고,0.287682
바나나,0.287682
사과,0.693147
싶은,0.287682
저는,0.693147
좋아요,0.693147


In [7]:
# One row per document, one column per vocabulary term, holding TF-IDF weights.
result = [[tfidf(term, doc) for term in vocab] for doc in docs]

tfidf_ = pd.DataFrame(result, columns=vocab)

tfidf_

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0.0,0.0,0.0,0.287682,0.0,0.693147,0.287682,0.0,0.0
1,0.0,0.0,0.0,0.287682,0.287682,0.0,0.287682,0.0,0.0
2,0.0,0.693147,0.693147,0.0,0.575364,0.0,0.0,0.0,0.0
3,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.693147


## sklearn TfidfVectorizer

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer. min_df=1 keeps every term that appears in at least one
# document; decode_error='ignore' only matters for byte input that cannot be
# decoded (the inputs in this notebook are already str).
vectorizer = TfidfVectorizer(min_df=1, decode_error='ignore')

In [9]:
# Corpus of four short Korean sentences to be vectorized.
contents = ['상처받은 아이들은 너무 일찍 커버려',
            '내가 상처받은 거 아는 사람 불편해',
            '잘 사는 사람들은 좋은 사람 되기 쉬워',
            '아무 일도 아니야 괜찮아']

In [10]:
from konlpy.tag import Okt

# KoNLPy Okt tagger, used to split sentences into morphemes.
okt = Okt()

# Morphological analysis: one list of morphemes per sentence.
contents_tokens = list(map(okt.morphs, contents))

contents_tokens

[['상처', '받은', '아이', '들', '은', '너무', '일찍', '커버', '려'],
 ['내', '가', '상처', '받은', '거', '아는', '사람', '불편해'],
 ['잘', '사는', '사람', '들', '은', '좋은', '사람', '되기', '쉬워'],
 ['아무', '일도', '아니야', '괜찮아']]

In [11]:
# Re-join each morpheme list into one space-delimited string for the
# vectorizer. Every morpheme is prefixed with a space, so each resulting
# sentence starts with a leading space.
contents_for_vectorize = [
    ''.join(' ' + word for word in content)
    for content in contents_tokens
]

contents_for_vectorize

[' 상처 받은 아이 들 은 너무 일찍 커버 려',
 ' 내 가 상처 받은 거 아는 사람 불편해',
 ' 잘 사는 사람 들 은 좋은 사람 되기 쉬워',
 ' 아무 일도 아니야 괜찮아']

In [12]:
# Fit the vectorizer on the corpus and get its TF-IDF matrix.
x = vectorizer.fit_transform(contents_for_vectorize)

# Matrix shape: (number of documents, number of vocabulary terms).
num_samples, num_features = x.shape
num_samples, num_features

(4, 17)

In [13]:
# Show the learned vocabulary terms (matrix column labels) and how many there are.
vectorizer.get_feature_names_out(), len(vectorizer.get_feature_names_out())

(array(['괜찮아', '너무', '되기', '받은', '불편해', '사는', '사람', '상처', '쉬워', '아는',
        '아니야', '아무', '아이', '일도', '일찍', '좋은', '커버'], dtype=object),
 17)

In [15]:
# Densify the sparse TF-IDF matrix so every weight is printed.
print(x.toarray())

[[0.         0.43671931 0.         0.34431452 0.         0.
  0.         0.34431452 0.         0.         0.         0.
  0.43671931 0.         0.43671931 0.         0.43671931]
 [0.         0.         0.         0.40104275 0.50867187 0.
  0.40104275 0.40104275 0.         0.50867187 0.         0.
  0.         0.         0.         0.         0.        ]
 [0.         0.         0.39264414 0.         0.         0.39264414
  0.6191303  0.         0.39264414 0.         0.         0.
  0.         0.         0.         0.39264414 0.        ]
 [0.5        0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.5        0.5
  0.         0.5        0.         0.         0.        ]]


In [17]:
import pandas as pd

# Wrap the dense TF-IDF matrix in a DataFrame whose columns are the vocabulary terms.
# NOTE(review): this rebinds the name `tfidf`, which earlier in this notebook is
# the tfidf(t, d) function. Re-running the earlier cell that calls tfidf(t, d)
# after this one would fail; consider a distinct name for the DataFrame.
tfidf = pd.DataFrame(data=x.toarray(), columns=vectorizer.get_feature_names_out())

tfidf

Unnamed: 0,괜찮아,너무,되기,받은,불편해,사는,사람,상처,쉬워,아는,아니야,아무,아이,일도,일찍,좋은,커버
0,0.0,0.436719,0.0,0.344315,0.0,0.0,0.0,0.344315,0.0,0.0,0.0,0.0,0.436719,0.0,0.436719,0.0,0.436719
1,0.0,0.0,0.0,0.401043,0.508672,0.0,0.401043,0.401043,0.0,0.508672,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.392644,0.0,0.0,0.392644,0.61913,0.0,0.392644,0.0,0.0,0.0,0.0,0.0,0.0,0.392644,0.0
3,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.5,0.0,0.0,0.0


In [18]:
# Test data: the post to compare against the corpus.
new_post = ['상처받기 싫어 괜찮아']
# Morphological analysis
new_post_tokens = list(map(okt.morphs, new_post))

# Re-join the morphemes into space-delimited sentences; each morpheme is
# prefixed with a space, so every sentence starts with a leading space.
new_post_for_vectorize = [
    ''.join(' ' + word for word in content)
    for content in new_post_tokens
]

new_post_for_vectorize

[' 상처 받기 싫어 괜찮아']

In [20]:
# Vectorize the new post with the already-fitted vectorizer (no re-fitting),
# then print the dense form of the result.
new_post_vec = vectorizer.transform(new_post_for_vectorize)
print(new_post_vec.toarray())

[[0.78528828 0.         0.         0.         0.         0.
  0.         0.6191303  0.         0.         0.         0.
  0.         0.         0.         0.         0.        ]]


In [28]:
# 벡터간 기하학적 거리 측정
# vector의 크기를 1로 normalize
import scipy as sp

def dist_norm(v1, v2):
    """Return the Euclidean distance between the unit-normalized vectors.

    Both inputs are scaled to length 1 before the difference is taken, so the
    result depends only on direction, not on magnitude.

    Args:
        v1: A sparse row vector (anything with ``toarray()`` that supports
            scalar division and subtraction, e.g. a row of a TF-IDF matrix).
        v2: A sparse row vector of the same shape as ``v1``.

    Returns:
        The norm of ``v1/|v1| - v2/|v2|``.  An all-zero vector is left
        unscaled instead of being divided by zero.
    """
    # Import the submodule explicitly: a bare ``import scipy`` is not
    # guaranteed to expose ``scipy.linalg`` on every SciPy version.
    from scipy.linalg import norm

    def _unit(vec):
        # Scale vec to unit length; a zero vector has no direction, so keep it.
        length = norm(vec.toarray())
        return vec / length if length > 0 else vec

    delta = _unit(v1) - _unit(v2)

    return norm(delta.toarray())

In [29]:
# Normalized distance from every corpus document vector to the new post.
dist = [dist_norm(doc_vec, new_post_vec) for doc_vec in x]

dist

[1.2544516324460195, 1.2261339938790285, 1.4142135623730951, 1.102139611977359]

In [30]:
# Find the closest document once and reuse it in every message.
best_dist = min(dist)
best_idx = dist.index(best_dist)

print('Best post is ', best_idx, ', dist = ', best_dist)
print('Test post is --> ', new_post)
print('Best dist post is --> ', contents[best_idx])

Best post is  3 , dist =  1.102139611977359
Test post is -->  ['상처받기 싫어 괜찮아']
Best dist post is -->  아무 일도 아니야 괜찮아
