Link: https://pypi.org/project/pytextrank/

In [1]:
import spacy
import pytextrank

# example text
# (only used below as a one-off check that the pipeline runs end to end)
text = "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types systems and systems of mixed types."

# load a spaCy model, depending on language, scale, etc.
# `nlp` is a module-level object: extractKeyword() further down calls it directly.
nlp = spacy.load("en_core_web_sm")

# add PyTextRank to the spaCy pipeline
# ("positionrank" selects the PositionRank component; presumably it is registered
#  by the `import pytextrank` above -- confirm against the installed version)
nlp.add_pipe("positionrank")
doc = nlp(text)

# examine the top-ranked phrases in the document
# (left disabled; uncomment to print each phrase with its rank score)
# for phrase in doc._.phrases:
#     print(phrase.text)
#     print(phrase.rank)

e:\software\Python\Python311\Lib\site-packages


词干提取（Porter stemmer）：将预测关键词与标注关键词统一还原为词干，方便结果的比对

In [2]:
import nltk

# Single shared Porter stemmer instance; stemmer() below calls porter.stem on every token.
porter = nltk.PorterStemmer()
def stemmer(raw_sequences):
    """Stem every phrase in a batch of phrase lists.

    Args:
        raw_sequences: iterable of phrase collections (one per document).
            Each entry is either a string, or an indexable object whose
            first element is the phrase string (e.g. a ``(phrase, score)``
            pair).

    Returns:
        A list of lists with the same shape as the input. Every phrase is
        split on whitespace, each token is stemmed with the module-level
        ``porter`` stemmer, and the stems are re-joined with single spaces.
    """
    stemmed_sequences = []

    for words in raw_sequences:
        new_words = []
        for word in words:
            # isinstance() rather than an exact type() comparison: a str
            # subclass (e.g. numpy.str_) is still a phrase and must not be
            # indexed down to its first character.
            if not isinstance(word, str):
                # for h_candidates and a_candidates
                # (presumably tuples whose first element is the phrase)
                word = word[0]
            new_words.append(' '.join(porter.stem(item) for item in word.split()))
        stemmed_sequences.append(new_words)

    return stemmed_sequences

指标计算

In [3]:
def calPRF(num_c, num_e, num_s):
    """Compute precision, recall and F1 from raw counts.

    Args:
        num_c: number of predictions that matched a reference keyword.
        num_e: number of predictions that were evaluated.
        num_s: number of reference keywords.

    Returns:
        ``(P, R, F1)`` as floats. P is 0.0 when ``num_e`` is 0, R is 0.0 when
        ``num_s`` is 0, and F1 is 0.0 when ``P + R`` is 0.
    """
    P = float(num_c) / float(num_e) if num_e != 0 else 0.0
    R = float(num_c) / float(num_s) if num_s != 0 else 0.0
    # Always a float, so logged values have one consistent format ("0.0").
    F1 = 2 * P * R / (P + R) if (P + R) != 0.0 else 0.0
    return P, R, F1


def getPRF(references, predictions, log, cutoffs=(5, 10, 15)):
    """Log corpus-level P/R/F1 of ranked predictions at several cut-offs.

    Counts are accumulated over all documents before the ratios are taken.
    For every cut-off ``k`` only the first ``k`` predictions of a document
    are considered; a prediction is correct when it is contained in that
    document's reference collection.

    Args:
        references: sequence of reference keyword collections, one per document.
        predictions: sequence of ranked prediction lists, indexed in parallel
            with ``references`` (an IndexError is raised if it is shorter).
        log: object exposing ``log.logger.info(message)``.
        cutoffs: positive cut-off ranks to evaluate, in the order they should
            be logged. Duplicates are ignored. Defaults to ``(5, 10, 15)``.

    Returns:
        dict mapping each cut-off ``k`` to its ``(P, R, F1)`` tuple. The same
        values are written to ``log`` as ``"P@k:.. R@k:.. F1@k:.."``.
    """
    ks = list(dict.fromkeys(cutoffs))  # de-duplicate, keep caller's order
    deepest = max(ks, default=0)
    num_c = dict.fromkeys(ks, 0)  # correct predictions within top-k
    num_e = dict.fromkeys(ks, 0)  # predictions evaluated within top-k
    num_s = 0                     # reference keywords overall

    for i in range(len(references)):
        reference = references[i]
        prediction = predictions[i]

        for rank, candidate in enumerate(prediction[:deepest]):
            if candidate in reference:
                # A hit at this rank counts for every cut-off that reaches it.
                for k in ks:
                    if rank < k:
                        num_c[k] += 1

        # Documents with fewer than k predictions contribute only what they have.
        for k in ks:
            num_e[k] += len(prediction[:k])

        num_s += len(reference)

    metrics = {}
    for k in ks:
        P, R, F1 = calPRF(num_c[k], num_e[k], num_s)
        log.logger.info("P@{k}:{} R@{k}:{} F1@{k}:{}".format(P, R, F1, k=k))
        metrics[k] = (P, R, F1)
    return metrics

将结果存在日志文件中

In [4]:
import logging

class Logger(object):
    """Expose a file-backed ``logging.Logger`` as ``self.logger``.

    The underlying logger is looked up by ``filename``, does not propagate to
    the root logger, and appends bare messages (no formatter) to the file.
    """

    def __init__(self, filename, level='info'):
        """Create or reuse the logger that appends to ``filename``.

        Args:
            filename: path of the log file; missing parent directories are
                created.
            level: ``'info'`` selects ``logging.INFO``; any other value
                selects ``logging.DEBUG``.
        """
        import os  # local import so the class does not depend on other cells

        level = logging.INFO if level == 'info' else logging.DEBUG
        self.logger = logging.getLogger(filename)
        self.logger.propagate = False
        self.logger.setLevel(level)

        log_path = os.path.abspath(filename)
        os.makedirs(os.path.dirname(log_path), exist_ok=True)

        # logging.getLogger() returns the same object for the same name, so
        # re-running the cell (or building a second Logger for the same file)
        # must not attach another handler -- that would duplicate every record.
        already_attached = any(
            isinstance(handler, logging.FileHandler) and handler.baseFilename == log_path
            for handler in self.logger.handlers
        )
        if not already_attached:
            self.logger.addHandler(logging.FileHandler(filename, 'a'))

# Shared run log: every evaluation below writes its variant name and P/R/F1 lines here
# (Logger opens the file in append mode, so earlier runs are kept).
log = Logger('PositionRank_Pred/Elsevier-LIS.log')

关键词抽取函数

In [5]:
def extractKeyword(texts):
    """Collect the ranked phrase strings for every text.

    Each text is lower-cased and passed through the module-level ``nlp``
    pipeline; the ``.text`` of every entry in ``doc._.phrases`` is kept in
    the order the pipeline yields them.

    Args:
        texts: iterable of strings.

    Returns:
        A list with one list of phrase strings per input text.
    """
    return [
        [phrase.text for phrase in nlp(text.lower())._.phrases]
        for text in texts
    ]

数据加载

In [6]:
import pandas as pd
import json

# One row per article: identifier (Pii), abstract and highlights.
df = pd.read_excel('../data/Elsevier-LIS/Texts-lite-abstract.xlsx')
links = df['Pii'].tolist()
# NOTE(review): `abs` shadows the builtin abs(); the name is kept because
# other cells read it.
abs = df['Abstract'].tolist()
hts = df['Highlights'].tolist()

# Mapping from article identifier to its gold keywords.
# The encoding is explicit because JSON is UTF-8 by specification, while
# open() would otherwise use the platform default (not UTF-8 on many Windows
# setups). The name is deliberately not pre-bound to {}: if loading fails,
# later cells must not run silently against an empty label set.
with open('../data/Elsevier-LIS/Keywords.json', 'r', encoding='utf-8') as f:
    link_to_keywords = json.load(f)

In [7]:
# Two concatenation orders per article, joined by a single space:
# abstract followed by highlights, and highlights followed by abstract.
ahts = [abs[idx] + ' ' + hts[idx] for idx in range(len(links))]
hats = [hts[idx] + ' ' + abs[idx] for idx in range(len(links))]

开始抽取

In [8]:
# Gold keywords aligned with `links`; articles without an entry get an empty
# list. dict.get replaces the former bare `except:`, which would also have
# hidden unrelated errors (and KeyboardInterrupt).
keywords = [link_to_keywords.get(link, []) for link in links]

# Labels and texts must stay index-aligned for the evaluation. Fail loudly
# instead of leaving `labels_stem` undefined when they are not.
if len(keywords) != len(abs):
    raise ValueError(
        "keywords ({}) and abstracts ({}) are not aligned".format(len(keywords), len(abs))
    )
print("True")
labels_stem = stemmer(keywords)

True


1. 抽取未经过滤文本上的关键词集合

In [9]:
# Extract keywords from each input variant: abstracts, highlights, and the two
# concatenation orders (abstract + highlights, highlights + abstract).
keywords_from_ab = extractKeyword(abs)
keywords_from_ht = extractKeyword(hts)
keywords_from_ah = extractKeyword(ahts)
keywords_from_ha = extractKeyword(hats)

# Stem the predictions with the same stemmer() that produced labels_stem,
# so both sides of the comparison are normalised the same way.
keywords_from_ab_stem = stemmer(keywords_from_ab)
keywords_from_ht_stem = stemmer(keywords_from_ht)
keywords_from_ah_stem = stemmer(keywords_from_ah)
keywords_from_ha_stem = stemmer(keywords_from_ha)

# Variant name -> stemmed predictions; the name is written to the log
# immediately before that variant's scores.
results = {
    'ab': keywords_from_ab_stem,
    'ht': keywords_from_ht_stem,
    'ah': keywords_from_ah_stem,
    'ha': keywords_from_ha_stem
}

# Score every variant against the stemmed gold keywords and log the result.
for key in results.keys():
    log.logger.info(key)
    getPRF(labels_stem, results[key],log)

In [10]:
from tqdm import tqdm

# Evaluate the 'reserve_1' .. 'reserve_8' columns of `df`
# (presumably abstracts filtered down to k retained units -- confirm against
# the script that produced the spreadsheet).
# Names are chosen so that this cell neither shadows the builtins filter()/abs()
# nor overwrites `abs`, `ahts` and `hats` from the earlier cells; those cells
# therefore give the same result when re-run after this one.
reserve_levels = range(1, 9)

for k in tqdm(reserve_levels):
    # get data
    column = 'reserve_' + str(k)
    log.logger.info(column)

    reserved_texts = df[column].tolist()
    reserved_ahts = [text + ' ' + ht for text, ht in zip(reserved_texts, hts)]
    reserved_hats = [ht + ' ' + text for text, ht in zip(reserved_texts, hts)]

    # extract keywords
    keywords_from_ab = extractKeyword(reserved_texts)
    keywords_from_aht = extractKeyword(reserved_ahts)
    keywords_from_hat = extractKeyword(reserved_hats)

    # stem the predictions so they are comparable with labels_stem
    keywords_from_ab_stem = stemmer(keywords_from_ab)
    keywords_from_aht_stem = stemmer(keywords_from_aht)
    keywords_from_hat_stem = stemmer(keywords_from_hat)

    # variant name -> stemmed predictions
    results = {
        'ab': keywords_from_ab_stem,
        'aht': keywords_from_aht_stem,
        'hat': keywords_from_hat_stem
    }

    # evaluation (a separate loop variable, so `column` is not overwritten)
    for variant, predicted in results.items():
        log.logger.info(variant)
        getPRF(labels_stem, predicted, log)

100%|██████████| 8/8 [19:46<00:00, 148.28s/it]
