# Tf-Idf
#### TF（Term Frequency）表示某個關鍵詞在整篇文章中出現的頻率
#### IDF（Inverse Document Frequency）表示逆向文本頻率。文本頻率（Document Frequency）是指整個語料的"所有文章"中，包含某關鍵詞的文章篇數；包含該詞的文章越多，IDF 越小

# 計算詞頻

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
# CountVectorizer converts a collection of text documents to a matrix of
# token counts. Frequently used constructor options (documented defaults):
# CountVectorizer(input="content", encoding="utf-8", lowercase=True)

# Corpus: one string per document.
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]
# Vectorizer that turns the documents into a term-count matrix.
vectorizer = CountVectorizer()

# Count how often each vocabulary term occurs in each document.
# fit_transform(raw_documents[, y]) learns the vocabulary dictionary and
# returns the document-term matrix in sparse form.
# Printed entries read "(row, column)  count". For this corpus the first
# document yields (0,1)→1  (0,2)→1  (0,6)→1  (0,3)→1  (0,8)→1, i.e. within
# ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
# the terms 'document', 'first', 'is', 'the', 'this' each occur once
# (zero counts are not stored, so they are not printed).
X = vectorizer.fit_transform(corpus)

# Vocabulary of the bag-of-words model: position k holds the term that
# column k of X counts.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns an ndarray, so it is
# converted to a list to keep `word` (and its printed form) a plain list.
# The fallback keeps the cell working on releases older than 1.0.
if hasattr(vectorizer, "get_feature_names_out"):
    word = vectorizer.get_feature_names_out().tolist()
else:
    word = vectorizer.get_feature_names()

print(X)            # sparse view: only the non-zero counts
print(word)         # vocabulary, in column order
print(X.toarray())  # dense view: one row per document, one column per term

  (0, 1)	1
  (0, 2)	1
  (0, 6)	1
  (0, 3)	1
  (0, 8)	1
  (1, 5)	2
  (1, 1)	1
  (1, 6)	1
  (1, 3)	1
  (1, 8)	1
  (2, 4)	1
  (2, 7)	1
  (2, 0)	1
  (2, 6)	1
  (3, 1)	1
  (3, 2)	1
  (3, 6)	1
  (3, 3)	1
  (3, 8)	1
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
[[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]]


# 計算Tf-Idf

In [2]:
from sklearn.feature_extraction.text import TfidfTransformer

# TfidfTransformer turns a count matrix into a normalized tf or tf-idf
# representation.
transformer = TfidfTransformer()

# Learn the idf weights from the term-count matrix X, then apply them to the
# same matrix to obtain the TF-IDF values.
transformer.fit(X)
tfidf = transformer.transform(X)

# Inspect the result: tfidf[i, j] is the tf-idf weight of vocabulary term j
# in document i.
print(tfidf)            # sparse view: only the non-zero weights
print(tfidf.toarray())  # dense view: one row per document

  (0, 8)	0.438776742859
  (0, 3)	0.438776742859
  (0, 6)	0.358728738248
  (0, 2)	0.541976569726
  (0, 1)	0.438776742859
  (1, 8)	0.272301467523
  (1, 3)	0.272301467523
  (1, 6)	0.222624292325
  (1, 1)	0.272301467523
  (1, 5)	0.853225736145
  (2, 6)	0.28847674875
  (2, 0)	0.552805319991
  (2, 7)	0.552805319991
  (2, 4)	0.552805319991
  (3, 8)	0.438776742859
  (3, 3)	0.438776742859
  (3, 6)	0.358728738248
  (3, 2)	0.541976569726
  (3, 1)	0.438776742859
[[ 0.          0.43877674  0.54197657  0.43877674  0.          0.
   0.35872874  0.          0.43877674]
 [ 0.          0.27230147  0.          0.27230147  0.          0.85322574
   0.22262429  0.          0.27230147]
 [ 0.55280532  0.          0.          0.          0.55280532  0.
   0.28847675  0.55280532  0.        ]
 [ 0.          0.43877674  0.54197657  0.43877674  0.          0.
   0.35872874  0.          0.43877674]]


# 計算文件相似度

In [4]:
print(tfidf[:1])      # tfidf[:1] is the first row: the tf-idf weights of the first document's terms

  (0, 8)	0.438776742859
  (0, 3)	0.438776742859
  (0, 6)	0.358728738248
  (0, 2)	0.541976569726
  (0, 1)	0.438776742859


In [5]:
print(tfidf)          # tfidf holds the tf-idf weights of every document in the corpus

  (0, 8)	0.438776742859
  (0, 3)	0.438776742859
  (0, 6)	0.358728738248
  (0, 2)	0.541976569726
  (0, 1)	0.438776742859
  (1, 8)	0.272301467523
  (1, 3)	0.272301467523
  (1, 6)	0.222624292325
  (1, 1)	0.272301467523
  (1, 5)	0.853225736145
  (2, 6)	0.28847674875
  (2, 0)	0.552805319991
  (2, 7)	0.552805319991
  (2, 4)	0.552805319991
  (3, 8)	0.438776742859
  (3, 3)	0.438776742859
  (3, 6)	0.358728738248
  (3, 2)	0.541976569726
  (3, 1)	0.438776742859


In [6]:
from sklearn.metrics.pairwise import cosine_similarity

# Signature for reference: cosine_similarity(X, Y=None, dense_output=True).
# Compare the first document's tf-idf row against every row of the matrix;
# the result has one similarity score per document.
first_doc_row = tfidf[0:1]
cosine_similarity(first_doc_row, tfidf)

array([[ 1.        ,  0.43830038,  0.1034849 ,  1.        ]])

In [None]:
# 因此可知第一篇文件（矩陣第一列）和第四篇文件的相似度高達1（兩者的詞頻向量完全相同），和第二篇文件的相似度約為0.438，和第三篇文件只有0.103的相似度