In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
# Toy corpus: each document is a sentence that has already been word-segmented,
# with words separated by single spaces.
corpus = [
    "我 爱 北京 天安门",
    "北京 欢迎 您",
    "欢迎 到 北京 来",
    "同一个 世界 同一个 梦想",
]

# Default settings. Note that the default token pattern only keeps tokens of two
# or more word characters, so one-character words in the corpus (e.g. 我, 爱, 您)
# do not end up in the learned vocabulary.
vectorizer = TfidfVectorizer()

In [2]:
# Learn the vocabulary and idf weights from the corpus, then return the
# TF-IDF document-term matrix as a sparse matrix (one row per document).
X = vectorizer.fit_transform(corpus)
X

<4x6 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [3]:
# Printing the sparse matrix lists one stored entry per line in the form
# "(document index, feature/term index)  weight".
print(type(X))
print(X)

<class 'scipy.sparse.csr.csr_matrix'>
  (0, 1)	0.538028969103
  (0, 3)	0.84292634815
  (1, 1)	0.62922751467
  (1, 5)	0.777221162079
  (2, 1)	0.62922751467
  (2, 5)	0.777221162079
  (3, 2)	0.816496580928
  (3, 0)	0.408248290464
  (3, 4)	0.408248290464


In [4]:
# Map each row of X back to the terms that have a nonzero weight in that document.
vectorizer.inverse_transform(X)

[array(['北京', '天安门'],
       dtype='<U3'), array(['北京', '欢迎'],
       dtype='<U3'), array(['北京', '欢迎'],
       dtype='<U3'), array(['同一个', '世界', '梦想'],
       dtype='<U3')]

In [5]:
# Return the feature terms learned from the whole corpus as a plain Python list;
# position i in the list is the term for column i of the TF-IDF matrix.
#
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() when it exists and only fall back to the old
# method on older releases. .tolist() converts the returned ndarray into a list
# of str, so code that indexes or prints feature_names behaves the same either way.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
feature_names

['世界', '北京', '同一个', '天安门', '梦想', '欢迎']

In [6]:
# Dense document-term matrix: rows are documents, columns are feature terms,
# and each value is the weight of that term in that document.
print(X.toarray())

[[ 0.          0.53802897  0.          0.84292635  0.          0.        ]
 [ 0.          0.62922751  0.          0.          0.          0.77722116]
 [ 0.          0.62922751  0.          0.          0.          0.77722116]
 [ 0.40824829  0.          0.81649658  0.          0.40824829  0.        ]]


In [7]:
# Densify the sparse TF-IDF matrix and report its dimensions
# (number of documents, number of feature terms).
doc_item_mat = X.toarray()
rows, cols = doc_item_mat.shape
print(rows, cols)

4 6


In [8]:
# For every document, show its two highest-weighted terms, strongest first.
for row_index, row in enumerate(doc_item_mat):
    # Column indices ordered from largest to smallest weight.
    ranked_cols = np.argsort(row)[::-1]
    print('No.', row_index)
    print([feature_names[col] for col in ranked_cols[:2]])

No. 0
['天安门', '北京']
No. 1
['欢迎', '北京']
No. 2
['欢迎', '北京']
No. 3
['同一个', '梦想']


In [9]:
print(feature_names)
# Vectorize a new sentence using the feature terms fitted on the corpus.
# The object returned by fit() is what performs the transform; words that are
# not among the fitted feature terms contribute nothing to the resulting row.
new_docs = ['北京 天安门 欢迎 您']
fit_opt = vectorizer.fit(corpus)
fit_opt.transform(new_docs).toarray()

['世界', '北京', '同一个', '天安门', '梦想', '欢迎']


array([[ 0.        ,  0.44809973,  0.        ,  0.70203482,  0.        ,
         0.55349232]])