# TF-IDF Algorithm

## CountVectorizer 사용해 추천

In [1]:
# Toy corpus: four short Korean sentences, one per document.
docs = [
    "먹고 싶은 사과",  # document 0
    "먹고 싶은 바나나",  # document 1
    "길고 노란 바나나 바나나",  # document 2
    "저는 과일이 좋아요",  # document 3
]

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
vect= CountVectorizer() # create a CountVectorizer with its default settings

In [4]:
# Fit the vectorizer on the corpus and turn the sentences into a count matrix
countvect= vect.fit_transform(docs)
countvect # displays a 4x9 sparse matrix: 4 documents by 9 words

<4x9 sparse matrix of type '<class 'numpy.int64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [5]:
# toarray() exposes each sentence as a vector of word counts:
# it converts the sparse matrix into a NumPy array
countvect.toarray()

array([[0, 0, 0, 1, 0, 1, 1, 0, 0],
       [0, 0, 0, 1, 1, 0, 1, 0, 0],
       [0, 1, 1, 0, 2, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 1, 1]])

In [6]:
# The bare array does not say what each column index stands for;
# vocabulary_ holds the word -> column index mapping
vect.vocabulary_

{'과일이': 0,
 '길고': 1,
 '노란': 2,
 '먹고': 3,
 '바나나': 4,
 '사과': 5,
 '싶은': 6,
 '저는': 7,
 '좋아요': 8}

In [7]:
# Sort the vocabulary words (iterating the dict yields its keys)
sorted(vect.vocabulary_)

['과일이', '길고', '노란', '먹고', '바나나', '사과', '싶은', '저는', '좋아요']

In [8]:
# Build a labelled DataFrame from the count matrix
import pandas as pd

# Order the words by the column index stored in vocabulary_, so that every
# label is guaranteed to sit on the column it describes. Plain alphabetical
# sorting only matches while the indices happen to follow alphabetical order.
countvect_df = pd.DataFrame(
    countvect.toarray(),
    columns=sorted(vect.vocabulary_, key=vect.vocabulary_.get),
)
# One 0-based label per matrix row, so the labels track the number of documents
countvect_df.index = [f'문서{i}' for i in range(countvect.shape[0])]
countvect_df

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
문서0,0,0,0,1,0,1,1,0,0
문서1,0,0,0,1,1,0,1,0,0
문서2,0,1,1,0,2,0,0,0,0
문서3,1,0,0,0,0,0,0,1,1


In [9]:
# Pairwise cosine similarity between the document rows of the DataFrame above.
# Called with a single argument, the rows are compared against each other
# (per the scikit-learn docs for Y=None).
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(countvect_df)

array([[1.        , 0.66666667, 0.        , 0.        ],
       [0.66666667, 1.        , 0.47140452, 0.        ],
       [0.        , 0.47140452, 1.        , 0.        ],
       [0.        , 0.        , 0.        , 1.        ]])

## Result
- 문서 0 & 문서 1 유사함 (코사인 유사도 ≈ 0.67)
- 문서 1 & 문서 2 유사함 (코사인 유사도 ≈ 0.47)

- 추천의 경우 문서 0을 본 사람에게 문서 1을 추천해줌 (문서 0과 가장 유사한 문서이므로)

## TF-IDF 사용해 추천


In [10]:
# TF-IDF: swap the count-based vectorizer for a TF-IDF one
from sklearn.feature_extraction.text import TfidfVectorizer
vect=TfidfVectorizer() # rebinds vect, which held the CountVectorizer until now
tfvect=vect.fit(docs) # fit on the same corpus; fit() presumably returns the fitted vectorizer itself -- confirm in the scikit-learn docs

In [11]:
# Build a labelled DataFrame from the TF-IDF matrix.
# Columns are ordered by the column index stored in vocabulary_, so every
# label is guaranteed to sit on the column it describes.
tfidv_df = pd.DataFrame(
    tfvect.transform(docs).toarray(),
    columns=sorted(tfvect.vocabulary_, key=tfvect.vocabulary_.get),
)
# 0-based row labels, matching the document numbering used for the docs list
# and the CountVectorizer table.
# NOTE: output recorded before this change shows 문서1..문서4; re-run to refresh.
tfidv_df.index = [f'문서{i}' for i in range(len(docs))]
tfidv_df

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
문서1,0.0,0.0,0.0,0.526405,0.0,0.667679,0.526405,0.0,0.0
문서2,0.0,0.0,0.0,0.57735,0.57735,0.0,0.57735,0.0,0.0
문서3,0.0,0.47212,0.47212,0.0,0.74445,0.0,0.0,0.0,0.0
문서4,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.57735


In [12]:
# Pairwise cosine similarity between the TF-IDF document rows.
# Called with a single argument, the rows are compared against each other
# (per the scikit-learn docs for Y=None).
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(tfidv_df)

array([[1.        , 0.60784064, 0.        , 0.        ],
       [0.60784064, 1.        , 0.42980824, 0.        ],
       [0.        , 0.42980824, 1.        , 0.        ],
       [0.        , 0.        , 0.        , 1.        ]])