### 基于 textRank 的关键词抽取探索
本文使用一个简易版本的textRank，以此来了解其机理，并使用nips数据集，进行充分的数据探索

In [1]:
import pandas as pd
# load the dataset
dataset = pd.read_csv('data/papers2.csv')
dataset.head()

Unnamed: 0,id,year,abstract1
0,1861,2000,Algorithms for Non-negative Matrix Factorizati...
1,1975,2001,Characterizing Neural Gain Control using Spike...
2,3163,2007,Competition Adds Complexity It is known that d...
3,3164,2007,Efficient Principled Learning of Thin Junction...
4,3167,2007,Regularized Boost for Semi-Supervised Learning...


### textRank 相关内容

In [6]:
import numpy as np
# 英文预处理相关模块
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

class TextRank(object):
    
    def __init__(self, sentence, window, alpha, iternum):
        """
        sentence: 原始文本
        window: 创建相邻节点所使用的窗口数
        alpha: textRank中使用到的参数
        iternum: 迭代次数
        """
        self.sentence = sentence
        self.window = window
        self.alpha = alpha
        self.edge_dict = {}  # 记录节点的边连接字典
        self.iternum = iternum
        
    # 获取英文预处理的停用词表
    def getStopWords(self):
        # Creating a list of stop words and adding custom stopwords
        stop_words = set(stopwords.words("english"))
        # Creating a list of custom stopwords
        news_words = ["using", "show", "result", "large", "also", "iv", "one", "two", "new", "previously", "shown"]
        self.stop_words = stop_words.union(news_words)
        
    # 进行英文文本预处理
    def dealSentence(self):
        # 对self.sentence进行处理
        
        # 去除标点
        text = re.sub('[^a-zA-Z]', ' ', self.sentence)
        
        # 转化成小写
        text = text.lower()
        
        # 去除符号
        text = re.sub("</?.*?>"," <> ",text)
        
        # 去除特殊字符和数字
        text = re.sub("(\d|\W)+"," ",text)
        
        # 将列表转换成string
        text = text.split()
        
        # 词干处理
        ps = PorterStemmer()
        
        # 词性
        lem = WordNetLemmatizer()
        text = [lem.lemmatize(word) for word in text if not word in self.stop_words]
        
        self.dealedSen = " ".join(text)
        print(self.dealedSen)
        
        self.dealedSentence = text
        
    
    # 根据窗口，构建每个节点的相邻节点，返回边的集合
    def createNodes(self):
        tmp_list = []
        word_list_len = len(self.dealedSentence)
        for index, word in enumerate(self.dealedSentence):
            if word not in self.edge_dict.keys():
                tmp_list.append(word)
                tmp_set = set()
                left = index - self.window + 1  # 窗口左边界
                right = index + self.window  # 窗口右边界
                # 越界处理
                if left < 0:
                    left = 0
                if right >= word_list_len:
                    right = word_list_len
                # 取滑动窗口内的不同单词，即建立邻接过程
                for i in range(left, right):
                    if i == index:
                        continue
                    tmp_set.add(self.dealedSentence[i])
                self.edge_dict[word] = tmp_set
    
    # 根据边的关系，构建矩阵
    def createMatrix(self):
        self.matrix = np.zeros([len(set(self.dealedSentence)), len(set(self.dealedSentence))])
        self.word_index = {}  # 记录词的index
        self.index_dict = {}  # 记录节点index对应的词
        
        for i, v in enumerate(set(self.dealedSentence)):
            self.word_index[v] = i
            self.index_dict[i] = v
        
        for key in self.edge_dict.keys():
            for w in self.edge_dict[key]:
                self.matrix[self.word_index[key]][self.word_index[w]] = 1
                self.matrix[self.word_index[w]][self.word_index[key]] = 1
        
        # 归一化
        for j in range(self.matrix.shape[1]):
            summ = 0
            for i in range(self.matrix.shape[0]):
                summ += self.matrix[i][j]
            for i in range(self.matrix.shape[0]):
                self.matrix[i][j] /= summ
        
    # 根据textRank公式计算权重
    def calPR(self):
        self.PR = np.ones([len(set(self.dealedSentence)), 1])
        for i in range(self.iternum):
            self.PR = (1 - self.alpha) + self.alpha * np.dot(self.matrix, self.PR)
                
        
    # 输出词和相应的权重
    def printResult(self):
        word_pr = {}
        for i in range(len(self.PR)):
            word_pr[self.index_dict[i]] = self.PR[i][0]
        res = sorted(word_pr.items(), key=lambda x: x[1], reverse=True)
        self.res = res
        print(self.res)

In [7]:
for i in range(10):
    s = dataset['abstract1'][i]
    tr = TextRank(s, 3, 0.85, 700)
    tr.getStopWords()
    tr.dealSentence()
    tr.createNodes()
    tr.createMatrix()
    tr.calPR()
    tr.printResult()

algorithm non negative matrix factorization non negative matrix factorization nmf useful decomposition multivariate data different multi plicative algorithm nmf analyzed differ slightly multiplicative factor used update rule algorithm minimize conventional least square error minimizes generalized kullback leibler divergence monotonic convergence algorithm proven auxiliary func tion analogous used proving convergence expectation maximization algorithm algorithm interpreted diag onally rescaled gradient descent rescaling factor optimally chosen ensure convergence
[('algorithm', 3.3388344271912227), ('convergence', 1.833289867906681), ('factor', 1.6538696136510604), ('nmf', 1.4701989678811866), ('used', 1.4411182860038187), ('factorization', 1.0799689216685375), ('minimizes', 0.9623411657148838), ('error', 0.960089597880959), ('generalized', 0.960024338714584), ('kullback', 0.9516558744814465), ('square', 0.951012145228346), ('onally', 0.9489005556679782), ('rescaled', 0.9433463856538848)