# In [None]:
# coding: utf-8
"""采用TF-IDF"""
import sys, codecs
import numpy as np
import pandas as pd
import jieba.posseg
import jieba.analyse
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
"""
    TF-IDF:
    1、CountVectorizer 构建词频矩阵
    2、TfidfTransformer构建tf-idf权值计算
    3、文本的关键字
    4、对应的tfidf矩阵
"""

# Data preprocessing: word segmentation, stopword removal, part-of-speech filtering
def dataPrepos(text, stopkey):
    """Segment ``text`` and keep only the non-stopword tokens with a selected tag.

    The text is segmented with ``jieba.posseg.cut``; each yielded token exposes
    a ``word`` and a part-of-speech ``flag``. A token is kept when its word is
    not in ``stopkey`` and its flag is one of the selected tags.

    Args:
        text: The string to segment.
        stopkey: Container of stopwords to drop. A list or tuple is converted
            to a set once per call so each membership test is O(1) instead of
            a linear scan; any other container is used as given.

    Returns:
        list: The kept words, in their order of appearance in ``text``.
    """
    # Part-of-speech flags to keep.
    # NOTE(review): per jieba's tag set these look like noun / verb /
    # adjective / adverb variants -- confirm against the jieba documentation.
    pos = frozenset(['n', 'nz', 'v', 'vd', 'vn', 'l', 'a', 'd'])
    # Only sequences are converted: sets are already fast, and other container
    # types keep whatever ``in`` semantics the caller relied on.
    stopwords = set(stopkey) if isinstance(stopkey, (list, tuple)) else stopkey
    return [
        token.word
        for token in jieba.posseg.cut(text)
        if token.word not in stopwords and token.flag in pos
    ]


# Get the top-K keywords of each text via tf-idf
def getKeywords_tfidf(data, stopkey, topK):
    """Extract up to ``topK`` TF-IDF keywords for every record in ``data``.

    For each record the title and abstract are joined, preprocessed with
    ``dataPrepos`` and turned into one space-separated document. The documents
    are then vectorized with ``CountVectorizer``, weighted with
    ``TfidfTransformer``, and the highest-weighted terms of each document are
    reported.

    Args:
        data: Table with ``id``, ``title`` and ``abstract`` columns
            (e.g. a ``pandas.DataFrame``).
        stopkey: Container of stopwords, forwarded to ``dataPrepos``.
        topK: Maximum number of keywords to report per record. Values below
            one yield no keywords.

    Returns:
        pandas.DataFrame: Columns ``id``, ``title`` and ``key``, one row per
        input record. ``key`` holds the keywords joined by single spaces,
        highest weight first; terms with zero weight are never reported, so a
        record may get fewer than ``topK`` keywords (or an empty string).
    """
    idList, titleList, abstractList = data[u'id'], data[u'title'], data[u'abstract']

    # One document per record. Iterating the columns in lockstep is positional,
    # so it does not depend on the table having a 0..n-1 index.
    corpus = []
    for title, abstract in zip(titleList, abstractList):
        # Join title and abstract before preprocessing.
        text = '%s %s' % (title, abstract)
        corpus.append(" ".join(dataPrepos(text, stopkey)))

    limit = max(int(topK), 0)
    keys = [""] * len(corpus)

    # 1. Term-frequency matrix. fit_transform raises ValueError when the
    #    vocabulary is empty (no documents, or no token survives tokenization);
    #    in that case every record simply keeps an empty keyword string.
    try:
        vectorizer = CountVectorizer()
        counts = vectorizer.fit_transform(corpus)
    except ValueError:
        counts = None

    if counts is not None:
        # 2. tf-idf weight of every vocabulary term in every document.
        tfidf = TfidfTransformer().fit_transform(counts)
        # 3. Vocabulary terms, aligned with the matrix columns. Older
        #    scikit-learn releases only provide get_feature_names().
        if hasattr(vectorizer, 'get_feature_names_out'):
            words = vectorizer.get_feature_names_out()
        else:
            words = vectorizer.get_feature_names()
        # 4. Rank the terms of each document by descending weight. Rows are
        #    densified one at a time to avoid materializing the whole matrix;
        #    the stable sort keeps ties in vocabulary order.
        for row in range(tfidf.shape[0]):
            weights = tfidf[row].toarray().ravel()
            ranked = np.argsort(-weights, kind='stable')[:limit]
            keys[row] = " ".join(words[col] for col in ranked if weights[col] > 0)

    return pd.DataFrame(
        {"id": list(idList), "title": list(titleList), "key": keys},
        columns=['id', 'title', 'key'],
    )

# Main function
def main():
    """Run TF-IDF keyword extraction on the sample data and save the result.

    Reads ``./data/sample_data.csv`` and the stopword list
    ``./data/stopWord.txt`` (UTF-8, one entry per line), extracts the top 10
    keywords per record with ``getKeywords_tfidf`` and writes the result to
    ``./data/result/keys_tfidf.csv`` without the index column.
    """
    import os  # local import: only needed to prepare the output directory

    # Load the data set.
    dataFile = "./data/sample_data.csv"
    data = pd.read_csv(dataFile)

    # Load the stopword list; the context manager closes the file handle, and
    # a set gives constant-time membership tests during preprocessing.
    with codecs.open('./data/stopWord.txt', 'r', 'utf-8') as stopFile:
        stopkey = {line.strip() for line in stopFile}

    # TF-IDF keyword extraction.
    result = getKeywords_tfidf(data, stopkey, 10)

    # to_csv does not create missing directories, so make sure it exists.
    outFile = "./data/result/keys_tfidf.csv"
    outDir = os.path.dirname(outFile)
    if not os.path.isdir(outDir):
        os.makedirs(outDir)
    result.to_csv(outFile, index=False)
    
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()