In [1]:
# Extract TF-IDF features.
# Toy corpus: four short sentences, one string per document.
corpus = [
    "this is the first document",
    "this is the second second document",
    "and the third one",
    "is this the first document",
]

In [2]:
# Tokenize: split every sentence of the corpus on single spaces,
# producing one list of tokens per document.
word_list = [sentence.split(' ') for sentence in corpus]
print(word_list)

[['this', 'is', 'the', 'first', 'document'], ['this', 'is', 'the', 'second', 'second', 'document'], ['and', 'the', 'third', 'one'], ['is', 'this', 'the', 'first', 'document']]


In [3]:
# Build the gensim inputs needed for TF-IDF.
from gensim import corpora

# The Dictionary assigns an integer id to every distinct token in the corpus.
dictionary = corpora.Dictionary(word_list)

# Convert each tokenized document to bag-of-words form: a list of tuples whose
# first element is the token's dictionary id and whose second element is the
# number of times that token occurs in the document.
new_corpus = list(map(dictionary.doc2bow, word_list))
print(new_corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)], [(0, 1), (2, 1), (3, 1), (4, 1), (5, 2)], [(3, 1), (6, 1), (7, 1), (8, 1)], [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]]


In [4]:
 # Show the id assigned to every token of the corpus (token -> integer id mapping).
 print(dictionary.token2id)

{'document': 0, 'first': 1, 'is': 2, 'the': 3, 'this': 4, 'second': 5, 'and': 6, 'one': 7, 'third': 8}


In [None]:
# Train the TF-IDF model on the bag-of-words corpus and save it to disk.
from gensim import models
tfidf = models.TfidfModel(new_corpus)
tfidf.save("my_model.tfidf")

# Load the model back from disk.
tfidf = models.TfidfModel.load("my_model.tfidf")

# Apply the trained model to every document to get its TF-IDF vector.
# The bag-of-words vectors in `new_corpus` are reused rather than re-tokenizing
# the raw strings, so the tokenization is guaranteed to be the same one the
# dictionary was built from.
tfidf_vec = [tfidf[doc_bow] for doc_bow in new_corpus]
print(tfidf_vec)

结论

gensim训练出来的tf-idf值左边是词的id，右边是词的tfidf值  
gensim的结果里不会出现the：the出现在全部4个文档中，idf = log(4/4) = 0，而TfidfModel会丢弃权重为0的词，这并不是gensim自带的停用词过滤    
像i这样的词不会出现在结果里，是因为它不在训练得到的词典中，doc2bow会忽略词典之外的词，而不是因为gensim会自动去除单个字母  
gensim会去除没有被训练到的词，比如name  
所以通过gensim并不能计算每个单词的tfidf值  

In [6]:
# TF-IDF with scikit-learn.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vec = TfidfVectorizer()
tfidf_matrix = tfidf_vec.fit_transform(corpus)

# All distinct terms of the corpus.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns an array, so
# .tolist() keeps the printed output a plain Python list.
print(tfidf_vec.get_feature_names_out().tolist())

# The id assigned to every term.
print(tfidf_vec.vocabulary_)

# The vector of every sentence.
# The values inside a vector are ordered by term id.
print(tfidf_matrix.toarray())


['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
{'this': 8, 'is': 3, 'the': 6, 'first': 2, 'document': 1, 'second': 5, 'and': 0, 'third': 7, 'one': 4}
[[0.         0.43877674 0.54197657 0.43877674 0.         0.
  0.35872874 0.         0.43877674]
 [0.         0.27230147 0.         0.27230147 0.         0.85322574
  0.22262429 0.         0.27230147]
 [0.55280532 0.         0.         0.         0.55280532 0.
  0.28847675 0.55280532 0.        ]
 [0.         0.43877674 0.54197657 0.43877674 0.         0.
  0.35872874 0.         0.43877674]]


In [9]:
# Plain-Python implementation.
# Term counts: one Counter per tokenized sentence.
from collections import Counter
countlist = [Counter(tokens) for tokens in word_list]
countlist


[Counter({'this': 1, 'is': 1, 'the': 1, 'first': 1, 'document': 1}),
 Counter({'this': 1, 'is': 1, 'the': 1, 'second': 2, 'document': 1}),
 Counter({'and': 1, 'the': 1, 'third': 1, 'one': 1}),
 Counter({'is': 1, 'this': 1, 'the': 1, 'first': 1, 'document': 1})]

In [10]:
# `word` is a key of a per-sentence Counter (`count`), and every `count` is an
# element of the list of Counters (`count_list`).
import math  # imported here so idf() works as soon as this cell has run


def tf(word, count):
    """Return the term frequency of ``word`` in one sentence.

    Args:
        word: The token to look up.
        count: Counter mapping each token of the sentence to its occurrences.

    Returns:
        ``count[word]`` divided by the total number of tokens in the sentence,
        or 0.0 when the sentence has no tokens at all.
    """
    total = sum(count.values())
    if total == 0:
        # An empty sentence would otherwise raise ZeroDivisionError.
        return 0.0
    return count[word] / total


def n_containing(word, count_list):
    """Return how many sentences (Counters) in ``count_list`` contain ``word``."""
    return sum(1 for count in count_list if word in count)


def idf(word, count_list):
    """Return the inverse document frequency of ``word``.

    Computed as ``log(N / (1 + n))`` where ``N`` is the number of sentences
    and ``n`` is the number of sentences containing ``word``. The ``+ 1``
    keeps the denominator non-zero for a word that occurs in no sentence.

    Because of the ``+ 1``, the result is 0 for a word found in all but one
    sentence and negative for a word found in every sentence.

    Raises:
        ValueError: If ``count_list`` is empty (``log(0)``).
    """
    return math.log(len(count_list) / (1 + n_containing(word, count_list)))


def tfidf(word, count, count_list):
    """Return the TF-IDF score of ``word``: ``tf(word, count) * idf(word, count_list)``."""
    return tf(word, count) * idf(word, count_list)

In [11]:
import math
# For every sentence, print its words ranked from highest to lowest TF-IDF.
for doc_number, count in enumerate(countlist, start=1):
    print("Top words in document {}".format(doc_number))
    scores = {word: tfidf(word, count, countlist) for word in count}
    # Stable sort on the score, so tied words keep their insertion order.
    ranked_words = sorted(scores, key=scores.get, reverse=True)
    for word in ranked_words:
        print("\tWord: {}, TF-IDF: {}".format(word, round(scores[word], 5)))

Top words in document 1
	Word: first, TF-IDF: 0.05754
	Word: this, TF-IDF: 0.0
	Word: is, TF-IDF: 0.0
	Word: document, TF-IDF: 0.0
	Word: the, TF-IDF: -0.04463
Top words in document 2
	Word: second, TF-IDF: 0.23105
	Word: this, TF-IDF: 0.0
	Word: is, TF-IDF: 0.0
	Word: document, TF-IDF: 0.0
	Word: the, TF-IDF: -0.03719
Top words in document 3
	Word: and, TF-IDF: 0.17329
	Word: third, TF-IDF: 0.17329
	Word: one, TF-IDF: 0.17329
	Word: the, TF-IDF: -0.05579
Top words in document 4
	Word: first, TF-IDF: 0.05754
	Word: is, TF-IDF: 0.0
	Word: this, TF-IDF: 0.0
	Word: document, TF-IDF: 0.0
	Word: the, TF-IDF: -0.04463


In [None]:
# Compute mutual information.
# x is the feature; the original note calls the prediction label "y", while
# the call below passes it as `label`.
# NOTE(review): neither `label` nor `x` is defined in the visible part of this
# notebook - they must be assigned before this cell can run; confirm where
# they are meant to come from.
from sklearn import metrics as mr
mr.mutual_info_score(label,x)