-
Notifications
You must be signed in to change notification settings - Fork 9
/
extract_keywords_cal_summary_by_textrank4zh.py
executable file
·104 lines (86 loc) · 2.54 KB
/
extract_keywords_cal_summary_by_textrank4zh.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#-*- coding:utf-8 -*-
from gensim import models,corpora
from textrank4zh import TextRank4Keyword
import get_data
import re
import heapq
def get_max_k_tuple_list(tuple_list, k):
    """Return the k tuples with the largest second element, largest first.

    Equivalent to a stable descending sort on item[1] followed by taking
    the first k entries (the documented behaviour of heapq.nlargest).
    """
    ranked = sorted(tuple_list, key=lambda pair: pair[1], reverse=True)
    return ranked[:k]
def get_stopwords_list():
    """Load the stopword list from ./stopword.txt.

    Returns:
        list[str]: one entry per line of the file, trailing newline
        characters kept (matching the original per-line behaviour).
    """
    filepath = "./stopword.txt"
    # Context manager guarantees the handle is closed even if reading fails.
    # Decoding is done once at the I/O boundary via the encoding argument
    # instead of the old per-line .decode("utf8"), which only worked on
    # Python 2 byte strings.
    with open(filepath, encoding="utf8") as f:
        return list(f)
def get_index_of_summary(article, k):
    """Build an extractive summary (about 60 chars) for one segmented article.

    Args:
        article: list of already-segmented word tokens for one article.
        k: divisor controlling keyword count — len(article) // k keywords
           are requested from TextRank.

    Returns:
        str: the selected sentences, restored to original order, separated
        by "," and terminated with "。"; empty string if nothing scored > 0.
    """
    word_list = article
    # Number of keywords scales with article length. Integer division:
    # the old "/" produced a float count under Python 3.
    num_keywords = len(article) // k
    full_text = ''.join(article)

    # Rank keywords over the raw (unsegmented) text.
    tr4w = TextRank4Keyword()
    tr4w.analyze(text=full_text, lower=True, window=2)
    # Set for O(1) membership tests in the scoring loop below.
    keywords = set()
    for item in tr4w.get_keywords(num_keywords, word_min_len=1):
        keywords.add(''.join(item.word))

    # Space-join tokens so each sentence can be re-split into words later.
    joined = " ".join(word_list).replace("\n", "")

    scored = []  # (original index, keyword-density score, cleaned sentence)
    for idx, sentence in enumerate(re.split(',|。|:|;|?|!', joined)):
        words = sentence.split(' ')
        hits = sum(1 for w in words if w in keywords)
        # Density score: (#keyword hits)^2 / sentence length in words.
        # str.split(' ') never yields an empty list, so len(words) >= 1 and
        # no division-by-zero guard (the old bare except:) is needed.
        score = hits * hits * 1.0 / len(words)
        cleaned = ''.join(sentence.split())
        # Drop long digit runs (ids, dates) from the display text.
        cleaned = re.sub(r"\d{4,}", '', cleaned)
        scored.append((idx, score, cleaned))

    # Highest score first; ties broken by earlier position in the article.
    scored.sort(key=lambda t: (-t[1], t[0]))

    # Greedily take sentences while the 60-character budget holds.
    # len() on str counts characters directly (the old .decode("utf8")
    # was a Python-2 byte-string idiom).
    total_size = 0
    picked = []
    for entry in scored:
        if total_size + len(entry[2]) + 1 <= 60 and entry[1] > 0:
            picked.append(entry)
            total_size += len(entry[2]) + 1

    # Restore original article order, then punctuate: "," between
    # sentences, "。" at the end — identical to the original concatenation.
    picked.sort(key=lambda t: t[0])
    parts = [entry[2] for entry in picked]
    if not parts:
        return ""
    return ",".join(parts) + "。"
def use_textrank4zh_cal_summary(test_filepath, result_filepath, k):
    """Summarize every article in test_filepath, one summary per output line.

    Args:
        test_filepath: path to the segmented test articles (read via get_data).
        result_filepath: output path; receives one summary line per article.
        k: keyword divisor forwarded to get_index_of_summary.
    """
    articles = get_data.get_cut_data_list_list(test_filepath)
    # "w" instead of "w+": the file is only written, never read back.
    # The context manager guarantees flush/close even if summarization raises.
    with open(result_filepath, "w") as result_f:
        for i, article in enumerate(articles):
            ans = get_index_of_summary(article, k)
            # Progress trace; print() call replaces the Python-2-only
            # "print i,ans" statement, which is a syntax error on Python 3.
            print(i, ans)
            result_f.write(ans + "\n")
if __name__ == "__main__":
    test_filepath = "./data/cut_article_test.txt"
    # Earlier TF-IDF sweep, kept for reference:
    # for k in range(5, 16):
    #     result_filepath = "./result/EK_tfidf_result/0504_k=%d.txt" % (k)
    #     use_tfidf_cal_summary(test_filepath, result_filepath, k)
    #
    # Sweep the keyword divisor and write one result file per setting.
    for k in range(10, 20):
        result_filepath = "./result/EK_textrank4zh_result/0520_k=%d.txt" % (k)
        use_textrank4zh_cal_summary(test_filepath, result_filepath, k)