-
Notifications
You must be signed in to change notification settings - Fork 1
/
triple_extraction.py
118 lines (104 loc) · 5.81 KB
/
triple_extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
from sentence_parser import LtpParser
class TripleExtractor:
def __init__(self):
self.parser = LtpParser()
'''利用语义角色标注,直接获取主谓宾三元组,基于A0,A1,A2'''
def ruler1(self, words, postags, roles_dict, role_index):
"""
words:['中国', '是', '一个', '和平', '、', '自由', '的', '国家']
postags:['ns', 'v', 'm', 'a', 'wp', 'a', 'u', 'n']
roles_dict:{1: {'A0': ('A0', 0, 0), 'A1': ('A1', 2, 7)}, 3: {'A0': ('A0', 7, 7)}, 5: {'A0': ('A0', 7, 7)}}
:param words:
:param postags:
:param roles_dict:
:param role_index:
:return:
"""
v = words[role_index] #
role_info = roles_dict[role_index] # 获取和主体词相关的词字典
if 'A0' in role_info.keys() and 'A1' in role_info.keys(): # 同时存在
s = ''.join([words[word_index] for word_index in range(role_info['A0'][1], role_info['A0'][2] + 1) if
postags[word_index][0] not in ['w', 'u', 'x'] and words[word_index]])
o = ''.join([words[word_index] for word_index in range(role_info['A1'][1], role_info['A1'][2] + 1) if
postags[word_index][0] not in ['w', 'u', 'x'] and words[word_index]])
if s and o:
return '1', [s, v, o]
return '4', []
'''三元组抽取主函数'''
def ruler2(self, words, postags, child_dict_list, arcs, roles_dict):
svos = []
for index in range(len(postags)):
tmp = 1
# 先借助语义角色标注的结果,进行三元组抽取
if index in roles_dict:
flag, triple = self.ruler1(words, postags, roles_dict, index)
if flag == '1':
svos.append(triple)
tmp = 0
if tmp == 1:
# 如果语义角色标记为空,则使用依存句法进行抽取
# if postags[index] == 'v':
if postags[index]:
# 抽取以谓词为中心的事实三元组
child_dict = child_dict_list[index]
# 主谓宾
if 'SBV' in child_dict and 'VOB' in child_dict:
r = words[index]
e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
svos.append([e1, r, e2])
# 定语后置,动宾关系
relation = arcs[index][0]
head = arcs[index][2]
if relation == 'ATT':
if 'VOB' in child_dict:
e1 = self.complete_e(words, postags, child_dict_list, head - 1)
r = words[index]
e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
temp_string = r + e2
if temp_string == e1[:len(temp_string)]:
e1 = e1[len(temp_string):]
if temp_string not in e1:
svos.append([e1, r, e2])
# 含有介宾关系的主谓动补关系
if 'SBV' in child_dict and 'CMP' in child_dict:
e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
cmp_index = child_dict['CMP'][0]
r = words[index] + words[cmp_index]
if 'POB' in child_dict_list[cmp_index]:
e2 = self.complete_e(words, postags, child_dict_list, child_dict_list[cmp_index]['POB'][0])
svos.append([e1, r, e2])
return svos
'''对找出的主语或者宾语进行扩展'''
def complete_e(self, words, postags, child_dict_list, word_index):
child_dict = child_dict_list[word_index]
prefix = ''
if 'ATT' in child_dict:
for i in range(len(child_dict['ATT'])):
prefix += self.complete_e(words, postags, child_dict_list, child_dict['ATT'][i])
postfix = ''
if postags[word_index] == 'v':
if 'VOB' in child_dict:
postfix += self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
if 'SBV' in child_dict:
prefix = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0]) + prefix
return prefix + words[word_index] + postfix
'''程序主控函数'''
def triples_main(self, sentence):
svos = []
words, postags, child_dict_list, roles_dict, arcs = self.parser.parser_main(sentence)
svo = self.ruler2(words, postags, child_dict_list, arcs, roles_dict)
svos += svo
return svos
'''测试'''
def test():
extractor = TripleExtractor()
sentence = '李克强总理今天来我家了,我感到非常荣幸'
svos = extractor.triples_main(sentence)
print('svos', svos)
sentence = """以色列国防军20日对加沙地带实施轰炸,造成3名巴勒斯坦武装人员死亡。此外,巴勒斯坦人与以色列士兵当天在加沙地带与以交界地区发生冲突,一名巴勒斯坦人被打死。当天的冲突还造成210名巴勒斯坦人受伤。
当天,数千名巴勒斯坦人在加沙地带边境地区继续“回归大游行”抗议活动。部分示威者燃烧轮胎,并向以军投掷石块、燃烧瓶等,驻守边境的以军士兵向示威人群发射催泪瓦斯并开枪射击。"""
svos = extractor.triples_main(sentence)
print('svos', svos)
if __name__ == '__main__':
test()