In [None]:
# coding: utf-8
"""采用TF-IDF"""
import sys, codecs
import numpy as np
import pandas as pd
import jieba.posseg
import jieba.analyse
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
"""
    TF-IDF:
    1、CountVectorizer 构建词频矩阵
    2、TfidfTransformer构建tf-idf权值计算
    3、文本的关键字
    4、对应的tfidf矩阵
"""

# 数据预处理操作：分词、去停用词、词性筛选
def dataPrepos(text, stopkey):
    result = []
    # 定义选取的词性
    pos = ['n', 'nz', 'v', 'vd', 'vn', 'l', 'a', 'd']
    # 分词
    wordsList = jieba.posseg.cut(text)
    # 去停用词和词性筛选
    for i in wordsList:
        if i.word not in stopkey and i.flag in pos:
            result.append(i.word)
    # 返回结果
    return result


# tf-idf获取文本top10关键词
def getKeyWords_tfidf(data, stopkey, topK):
    idList, titleList, abstractList = data[u'id'], data[u'title'], data[u'abstract']
    corpus = []
    for i in range(len(idList)):
        # 拼接标题和摘要
        text = '%s %s' % (titleList[i], abstractList[i])
        text = dataPrepos(text, stopkey)
        text = " ".join(text)
        corpus.append(text)
        
    # 构建词频矩阵，将文本中的词语转化为词频矩阵
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(corpus)
    # 统计每个词的tf-idf值
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(X)
    # 获取词袋模型中的关键字
    word = vectorizer.get_feature_names()
    # 获取tf-idf矩阵，a[i][j]表示j词在i篇文本中的tf-idf权重
    weight = tfidf.toarray()
    # 打印词语权重
    ids, titles, keys = [], [], []
    for i in range(len(weight)):
        print(u'--这里输出第', i+1, u'篇文本的词语tf-idf--')
        ids.append(idList[i])
        titles.append(titleList[i])
        df_word, df_weight = [], [] # 当前文章的所有词汇列表，词汇对应权重列表
        for j in range(len(word)):
            df_word.append(word[j])
            df_weight.append(weight[i][j])
        
        df_word = pd.DataFrame(df_word, columns=['word'])
        df_weight = pd.DataFrame(df_weight, columns=['weight'])
        
        # 拼接词汇列表和权重列表
        word_weight = pd.concat([df_word, df_weight], axis=1)
        # 拼接词汇列表和权重列表
        word_weight = word_weight.sort_values(by='weight', ascending=False)
        
        # 选择词汇列并转成数组形式
        keyword = np.array(word_weight['word'])
        # 抽取前topK个词汇作为关键词
        word_split = [keyword[k] for k in range(0, topK)]
        word_split = " ".join(word_split)
        keys.append(word_split.encode('utf-8'))  
    result = pd.DataFrame({"id": ids, "title": titles, "key": keys}, columns=['id', 'title', 'key'])
    
    return result

# 主函数
def main():
    # 读取数据集
    dataFile = "./data/sample_data.csv"
    data = pd.read_csv(dataFile)
    # 停用词表
    stopkey = [w.strip() for w in codecs.open('./data/stopWord.txt', 'r', 'utf-8').readlines()]
    # tf-idf关键词提取
    result = getKeyWords_tfidf(data, stopkey, 10)
    result.to_csv("./result/keys_tfidf.csv", encoding='utf-8', index=False)
    
if __name__ == '__main__':
    main()

--这里输出第 1 篇文本的词语tf-idf--
--这里输出第 2 篇文本的词语tf-idf--
--这里输出第 3 篇文本的词语tf-idf--
--这里输出第 4 篇文本的词语tf-idf--
--这里输出第 5 篇文本的词语tf-idf--
--这里输出第 6 篇文本的词语tf-idf--
--这里输出第 7 篇文本的词语tf-idf--
--这里输出第 8 篇文本的词语tf-idf--
--这里输出第 9 篇文本的词语tf-idf--
--这里输出第 10 篇文本的词语tf-idf--
[b'\xe7\x94\xb5\xe6\x9c\xba \xe9\x98\xb2\xe6\xba\x9c \xe6\xb0\xb8\xe7\xa3\x81 \xe6\x8e\xa7\xe5\x88\xb6 \xe7\xad\x96\xe7\x95\xa5 \xe8\xb8\x8f\xe6\x9d\xbf \xe5\x8d\x95\xe5\x85\x83 \xe5\x8a\x9b\xe7\x9f\xa9 \xe6\x95\xb4\xe8\xbd\xa6 \xe8\xbd\xac\xe9\x80\x9f', b'\xe6\xba\x83\xe7\xbc\xa9 \xe8\xbd\xa6\xe9\x97\xa8 \xe6\x9c\xba\xe5\x8a\xa8\xe8\xbd\xa6\xe8\xbe\x86 \xe7\xbb\x93\xe6\x9e\x84 \xe5\xae\x89\xe6\x8e\x92 \xe7\x89\xb9\xe5\x88\xab \xe8\xa6\x86\xe7\x9b\x96 \xe8\xae\xbe\xe8\xae\xa1 \xe8\xb4\x9f\xe8\x8d\xb7 \xe5\xa2\x9e\xe5\x8a\xa0', b'\xe6\x94\xaf\xe6\x9e\xb6 \xe6\x94\xaf\xe6\x92\x91 \xe6\xa8\xaa\xe5\x90\x91 \xe7\xab\xaf\xe9\x83\xa8 \xe5\x81\x8f\xe5\x8e\x8b \xe4\xbb\xaa\xe8\xa1\xa8\xe6\x9d\xbf \xe5\xaf\xbc\xe5\xbc\x95 \xe9\x9d\xa2\xe6\x9d\xbf \x

In [3]:
%debug

> [0;32m<ipython-input-2-98d8494c5177>[0m(57)[0;36mmain[0;34m()[0m
[0;32m     55 [0;31m    [0;31m# tf-idf关键词提取[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     56 [0;31m    [0mresult[0m [0;34m=[0m [0mgetKeyWords_tfidf[0m[0;34m([0m[0mdata[0m[0;34m,[0m [0mstopkey[0m[0;34m,[0m [0;36m10[0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m---> 57 [0;31m    [0mresult[0m[0;34m.[0m[0mto_csv[0m[0;34m([0m[0;34m"./data/result/keys_tfidf.csv"[0m[0;34m,[0m [0mindex[0m[0;34m=[0m[0;32mFalse[0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m     58 [0;31m[0;34m[0m[0m
[0m[0;32m     59 [0;31m[0;32mif[0m [0m__name__[0m [0;34m==[0m [0;34m'__main__'[0m[0;34m:[0m[0;34m[0m[0m
[0m
ipdb> corpus
*** NameError: name 'corpus' is not defined
ipdb> result
ipdb> main()
> [0;32m<ipython-input-2-98d8494c5177>[0m(57)[0;36mmain[0;34m()[0m
[0;32m     55 [0;31m    [0;31m# tf-idf关键词提取[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     56 [0;31m    [0mresult[0m [0;34m=[0m 