# 1. 파이썬을 이용한 TF-IDF 행렬 구현

In [1]:
import pandas as pd
from math import log

In [3]:
# Toy corpus: each string is one document.
docs = [
  '먹고 싶은 사과',
  '먹고 싶은 바나나',
  '길고 노란 바나나 바나나',
  '저는 과일이 좋아요'
]

# Vocabulary: the unique whitespace-separated tokens of all documents, sorted.
vocab = sorted({token for text in docs for token in text.split()})

In [4]:
# Show the sorted vocabulary built from the corpus.
print(vocab)

['과일이', '길고', '노란', '먹고', '바나나', '사과', '싶은', '저는', '좋아요']


In [5]:
# N is the total number of documents; idf() reads it as the numerator of its ratio.
N = len(docs)
print('총 문서의 수', N)

총 문서의 수 4


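TF, IDF, 그리고 TF-IDF 값을 구하는 함수를 구현합니다. 여기서 IDF는 $idf(t) = \ln\left(\frac{N}{1 + df(t)}\right)$ 로 계산하며, $N$은 전체 문서의 수, $df(t)$는 단어 $t$가 등장한 문서의 수입니다.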



In [6]:
# Function that computes TF (term frequency)
def tf(t, d):
  """Return how many times term ``t`` occurs in document ``d``.

  The document is split on whitespace and only whole tokens equal to ``t``
  are counted. A plain ``d.count(t)`` would be a substring count and would
  also match ``t`` embedded inside a longer token.

  Args:
    t: The term (a single whitespace-free token).
    d: The document as a whitespace-separated string.

  Returns:
    The number of tokens in ``d`` that are exactly equal to ``t``.
  """
  return d.split().count(t)

In [7]:
# Function that computes IDF (inverse document frequency)
def idf(t, documents=None):
  """Return ``log(N / (df + 1))`` for term ``t``.

  ``N`` is the number of documents and ``df`` is the number of documents
  that contain ``t`` as a whole whitespace-separated token. Substring
  matches inside longer tokens are not counted.

  Args:
    t: The term (a single whitespace-free token).
    documents: Optional list of document strings. When omitted, the
      notebook-level ``docs`` list is used.

  Returns:
    The natural-log IDF value. It is negative when ``t`` appears in every
    document, because the ratio is then ``N / (N + 1)``.

  Raises:
    ValueError: If the corpus is empty, since ``log(0)`` is undefined.
  """
  if documents is None:
    documents = docs
  # Document frequency: number of documents whose token list contains t.
  df = sum(1 for doc in documents if t in doc.split())
  # The +1 in the denominator avoids division by zero for unseen terms.
  return log(len(documents) / (df + 1))

In [8]:
# Function that multiplies the TF and IDF values
def tfidf(t, d):
  """Return the product of ``tf(t, d)`` and ``idf(t)`` for term ``t`` in document ``d``."""
  term_frequency = tf(t, d)
  inverse_document_frequency = idf(t)
  return term_frequency * inverse_document_frequency

In [9]:
# Build the TF table by calling tf() for every (document, term) pair:
# one row per document, one column per vocabulary term.
result = [[tf(term, text) for term in vocab] for text in docs]

tf_ = pd.DataFrame(result, columns = vocab)

In [10]:
# Display the TF table, i.e. the document-term matrix (DTM)
tf_

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0,0,0,1,0,1,1,0,0
1,0,0,0,1,1,0,1,0,0
2,0,1,1,0,2,0,0,0,0
3,1,0,0,0,0,0,0,1,1


In [11]:
# Call idf() once per vocabulary term, in vocabulary order.
result = [idf(term) for term in vocab]

# Display the IDF values, indexed by term
idf_ = pd.DataFrame(result, index=vocab, columns=["IDF"])
idf_

Unnamed: 0,IDF
과일이,0.693147
길고,0.693147
노란,0.693147
먹고,0.287682
바나나,0.287682
사과,0.693147
싶은,0.287682
저는,0.693147
좋아요,0.693147


TF-IDF 행렬을 출력해봅시다.

In [12]:
# Call tfidf() for every (document, term) pair:
# one row per document, one column per vocabulary term.
result = [[tfidf(term, text) for term in vocab] for text in docs]

# TF-IDF matrix
tfidf_ = pd.DataFrame(result, columns = vocab)
tfidf_

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0.0,0.0,0.0,0.287682,0.0,0.693147,0.287682,0.0,0.0
1,0.0,0.0,0.0,0.287682,0.287682,0.0,0.287682,0.0,0.0
2,0.0,0.693147,0.693147,0.0,0.575364,0.0,0.0,0.0,0.0
3,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.693147


# 2. 사이킷런을 이용한 DTM과 TF-IDF 실습

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    'you know I want your love',
    'I like you',
    'what should I do ',
]

vector = CountVectorizer()

# Record the frequency of each word in the corpus
dtm = vector.fit_transform(corpus)
print(dtm.toarray())

# Print the index that each word is mapped to
print(vector.vocabulary_)

[[0 1 0 1 0 1 0 1 1]
 [0 0 1 0 0 0 0 1 0]
 [1 0 0 0 1 0 1 0 0]]
{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    'you know I want your love',
    'I like you',
    'what should I do ',
]

# Fit the vectorizer on the corpus, then transform that same corpus.
tfidfv = TfidfVectorizer()
tfidfv.fit(corpus)
tfidf_matrix = tfidfv.transform(corpus)
print(tfidf_matrix.toarray())
print(tfidfv.vocabulary_)

[[0.         0.46735098 0.         0.46735098 0.         0.46735098
  0.         0.35543247 0.46735098]
 [0.         0.         0.79596054 0.         0.         0.
  0.         0.60534851 0.        ]
 [0.57735027 0.         0.         0.         0.57735027 0.
  0.57735027 0.         0.        ]]
{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}


# 4. 코사인 유사도

In [15]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
doc1 = np.array([[0,1,1,1]])
doc2 = np.array([[1,0,1,1]])
doc3 = np.array([[2,0,2,2]])

# Print the cosine similarity of each document pair, in a fixed order.
for label, left, right in (
    ('문서 1과 문서2의 유사도 :', doc1, doc2),
    ('문서 1과 문서3의 유사도 :', doc1, doc3),
    ('문서 2와 문서3의 유사도 :', doc2, doc3),
):
    print(label, cosine_similarity(left, right))

문서 1과 문서2의 유사도 : [[0.66666667]]
문서 1과 문서3의 유사도 : [[0.66666667]]
문서 2와 문서3의 유사도 : [[1.]]
