# 问答机器人-从问题中提取相关实体和关系

[同济子豪兄](https://space.bilibili.com/1900783) 2022-6-25

## 导入工具包

In [1]:
import os
import ahocorasick
from tqdm import tqdm

## 进入主目录

In [2]:
# Switch to the project root so the relative 'dict/...' paths used later resolve.
# A raw string keeps the Windows backslashes literal; in a normal string
# sequences such as '\Q' are invalid escapes and trigger a warning.
os.chdir(r'Z:\课题研究\刘焕勇\QASystemOnMedicalKG-master')

In [4]:
# Remember the current working directory; the dictionary file paths are built from it.
cur_dir = os.getcwd()
cur_dir

'Z:\\课题研究\\刘焕勇\\QASystemOnMedicalKG-master'

## 载入所有特征词

In [4]:
# Paths of the feature-word txt files: one file per entity type, plus the negation words.
disease_path = os.path.join(cur_dir, 'dict/disease.txt')
department_path = os.path.join(cur_dir, 'dict/department.txt')
check_path = os.path.join(cur_dir, 'dict/check.txt')
drug_path = os.path.join(cur_dir, 'dict/drug.txt')
food_path = os.path.join(cur_dir, 'dict/food.txt')
producer_path = os.path.join(cur_dir, 'dict/producer.txt')
symptom_path = os.path.join(cur_dir, 'dict/symptom.txt')
deny_path = os.path.join(cur_dir, 'dict/deny.txt')

In [7]:
# Load the feature words: one word per non-blank line, surrounding whitespace stripped.
def _load_words(path):
    """Return the stripped, non-blank lines of the UTF-8 text file at *path*."""
    # The context manager closes the file handle deterministically; a bare
    # open() inside a comprehension leaves that to the garbage collector.
    with open(path, encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]


disease_wds = _load_words(disease_path)
department_wds = _load_words(department_path)
check_wds = _load_words(check_path)
drug_wds = _load_words(drug_path)
food_wds = _load_words(food_path)
producer_wds = _load_words(producer_path)
symptom_wds = _load_words(symptom_path)
# Union of all entity words (duplicates across dictionaries collapse in the set).
region_words = set(disease_wds + department_wds + check_wds + drug_wds + food_wds + producer_wds + symptom_wds)
# Negation words are kept separate: they are not entities.
deny_words = _load_words(deny_path)

## 所有实体对应的类型

In [8]:
# Map every entity word to the list of entity types whose dictionary contains it.
# Membership is tested against sets built once up front; testing `word in <list>`
# for every word scans each dictionary list again and again.
# The pair order below fixes the order of the type labels in each value list.
_typed_wordsets = [
    ('disease', set(disease_wds)),
    ('department', set(department_wds)),
    ('check', set(check_wds)),
    ('drug', set(drug_wds)),
    ('food', set(food_wds)),
    ('symptom', set(symptom_wds)),
    ('producer', set(producer_wds)),
]
wdtype_dict = dict()
for word in tqdm(region_words):
    wdtype_dict[word] = [wd_type for wd_type, wordset in _typed_wordsets if word in wordset]

100%|██████████| 43430/43430 [00:19<00:00, 2280.73it/s]


In [9]:
wdtype_dict

{'宜昌人福复方磺胺甲噁唑片': ['producer'],
 '鱼香青豆': ['food'],
 '卡铂注射液': ['drug'],
 '拔牙后出血不止': ['symptom'],
 '吉林海外狼疮丸': ['producer'],
 '大连美罗小儿盐酸赖氨酸颗粒': ['producer'],
 '肺毛霉病': ['disease'],
 '小儿遗传性果糖不耐受': ['disease'],
 '小儿止咳糖浆': ['drug'],
 '五果冰糖羹': ['food'],
 '大同星宇星火右旋糖酐20葡': ['producer'],
 '先强药业注射用盐酸甲氯芬酯': ['producer'],
 '小儿严重急性呼吸综合征': ['disease'],
 '淋病性关节炎': ['disease'],
 '白大衣高血压': ['disease'],
 '武汉普生注射用盐酸克林霉素': ['producer'],
 '哥台': ['producer'],
 '右下腹可触及柔...': ['symptom'],
 '四川绿叶宝光利肝隆片': ['producer'],
 '协一力利鲁唑片': ['producer'],
 '卫算苏': ['producer'],
 '红烧牛肚': ['food'],
 '江西赣南海欣转移因子口服溶': ['producer'],
 '牛黄解毒丸': ['drug'],
 '溃疡分枝杆菌感染': ['disease'],
 '胺试验': ['check'],
 '胃内异物': ['disease'],
 '拔丝土豆': ['food'],
 '禾丰盐酸洛贝林注射液': ['producer'],
 '重酒石酸卡巴拉汀胶囊': ['drug'],
 '台城制药西咪替丁片': ['producer'],
 '可谱妥': ['producer'],
 '厌氧菌肺炎': ['disease'],
 '行为无计划性': ['symptom'],
 '华青红霉素眼膏': ['producer'],
 '西南药业盐酸苯海拉明注射液': ['producer'],
 '循环血浆量': ['check'],
 '吉贝尔药业醋酸甲萘氢醌片': ['producer'],
 '南京白敬宇醋酸甲萘氢醌片': ['producer'],
 '康圣堂藿香正…（输出过长，此处已截断）

## 构造AC自动机，加速实体提取

当知识图谱规模较大时，使用 AC 自动机（Aho-Corasick）算法一次扫描即可从问题中提取出知识图谱中的所有实体，而不必对每个实体词逐一使用原生 Python 的字符串方法进行匹配。

扩展阅读：https://zhuanlan.zhihu.com/p/158767004

In [10]:
def build_actree(wordlist):
    """Build an Aho-Corasick automaton over *wordlist* to speed up matching.

    Each word is stored with the payload ``(index, word)``, where ``index``
    is the word's position while iterating *wordlist*.
    Returns the automaton after ``make_automaton()`` has been called on it.
    """
    automaton = ahocorasick.Automaton()
    for position, entry in enumerate(wordlist):
        automaton.add_word(entry, (position, entry))
    automaton.make_automaton()
    return automaton

In [11]:
# Build the Aho-Corasick automaton over all entity words
region_tree = build_actree(list(region_words))

In [12]:
region_tree

<ahocorasick.Automaton at 0x7f900bcf59d0>

In [13]:
'血常规' in region_tree

True

In [14]:
region_tree.get('血常规')

(12026, '血常规')

In [15]:
'学习' in region_tree

False

In [16]:
region_tree.get('学习')

KeyError: 

## 任务一：提取问题相关实体及其类别

In [19]:
# Sample question: "Do emphysema and whooping cough require a routine blood test?"
question = '肺气肿和百日咳要做血常规吗'

In [20]:
# Scan the question with the automaton, echo every raw match, and collect
# the matched entity words in order of appearance.
question_entity = []
for match in region_tree.iter(question):
    print(match)
    # A match is (end_index, (word_index, word)); only the word is kept.
    _, (_, entity) = match
    question_entity.append(entity)
print(question_entity)

(2, (16220, '肺气肿'))
(6, (21279, '百日咳'))
(11, (12026, '血常规'))
['肺气肿', '百日咳', '血常规']


In [45]:
# stop_wds = []
# # 排除字符串子串
# for wd1 in region_wds:
#     for wd2 in region_wds:
#         if wd1 in wd2 and wd1 != wd2:
#             stop_wds.append(wd1)
# print(stop_wds)
# final_wds = [each for each in region_wds if each not in stop_wds]
# final_wds

In [21]:
# Map each entity found in the question to its list of entity types.
question_entity_dict = {each:wdtype_dict[each] for each in question_entity}

In [22]:
question_entity_dict

{'肺气肿': ['disease'], '百日咳': ['disease'], '血常规': ['check']}

In [37]:
# Result dictionary; 'args' holds the entities found in the question with their types.
output = {'args': question_entity_dict}

## 任务一完成

## 任务二：提取问题相关的待查询关系

### 问题中涉及的实体类别

In [24]:
# Flatten the per-entity type lists into a single list (duplicates are kept).
types = [entity_type for type_list in question_entity_dict.values() for entity_type in type_list]

In [25]:
types

['disease', 'disease', 'check']

### 不同提问意图的疑问词

In [26]:
# Trigger words for each kind of question intent. A list "fires" when any of
# its words occurs as a substring of the question.

# Asking about symptoms / manifestations
symptom_qwds = ['症状', '表征', '现象', '症候', '表现']
# Asking about causes
cause_qwds = ['原因','成因', '为什么', '怎么会', '怎样才', '咋样才', '怎样会', '如何会', '为啥', '为何', '如何才会', '怎么才会', '会导致', '会造成']
# Asking about complications / co-occurring conditions
acompany_qwds = ['并发症', '并发', '一起发生', '一并发生', '一起出现', '一并出现', '一同发生', '一同出现', '伴随发生', '伴随', '共现']
# Asking about food and diet
food_qwds = ['饮食', '饮用', '吃', '食', '伙食', '膳食', '喝', '菜' ,'忌口', '补品', '保健品', '食谱', '菜谱', '食用', '食物','补品']
# Asking about drugs / medication
drug_qwds = ['药', '药品', '用药', '胶囊', '口服液', '炎片']
# Asking how to prevent or avoid something
prevent_qwds = ['预防', '防范', '抵制', '抵御', '防止','躲避','逃避','避开','免得','逃开','避开','避掉','躲开','躲掉','绕开',
                     '怎样才能不', '怎么才能不', '咋样才能不','咋才能不', '如何才能不',
                     '怎样才不', '怎么才不', '咋样才不','咋才不', '如何才不',
                     '怎样才可以不', '怎么才可以不', '咋样才可以不', '咋才可以不', '如何可以不',
                     '怎样才可不', '怎么才可不', '咋样才可不', '咋才可不', '如何可不']
# Asking how long something lasts
lasttime_qwds = ['周期', '多久', '多长时间', '多少时间', '几天', '几年', '多少天', '多少小时', '几个小时', '多少年']
# Asking how to treat something
cureway_qwds = ['怎么治疗', '如何医治', '怎么医治', '怎么治', '怎么医', '如何治', '医治方式', '疗法', '咋治', '怎么办', '咋办', '咋治']
# Asking about the chance of being cured
cureprob_qwds = ['多大概率能治好', '多大几率能治好', '治好希望大么', '几率', '几成', '比例', '可能性', '能治', '可治', '可以治', '可以医']
# Asking who is susceptible
easyget_qwds = ['易感人群', '容易感染', '易发人群', '什么人', '哪些人', '感染', '染上', '得上']
# Asking about medical checks / examinations
check_qwds = ['检查', '检查项目', '查出', '检查', '测出', '试出']
# Asking which department something belongs to
belong_qwds = ['属于什么科', '属于', '什么科', '科室']
# Asking what something treats or is used for.
# NOTE(review): contains the very generic '需要' and '要', so this list matches many questions.
cure_qwds = ['治疗什么', '治啥', '治疗啥', '医治啥', '治愈啥', '主治啥', '主治什么', '有什么用', '有何用', '用处', '用途', '有什么好处', '有什么益处', '有何益处', '用来', '用来做啥', '用来作甚', '需要', '要']

In [38]:
def check_words(words, question):
    """Return True if at least one entry of *words* is a substring of *question*.

    Returns False when *words* is empty or none of its entries occurs.
    """
    return any(candidate in question for candidate in words)

In [39]:
# Collect every query intent whose trigger words occur in the question and
# whose required entity type was found among the question's entities.
question_types = []

# One rule per intent:
#   (trigger words, required entity type, intent label,
#    label used instead when a negation word is present, or None)
# Rules are checked in the listed order, which fixes the order of question_types.
_intent_rules = [
    (symptom_qwds, 'disease', 'disease_symptom', None),                       # disease -> symptoms
    (symptom_qwds, 'symptom', 'symptom_disease', None),                       # symptom -> diseases
    (cause_qwds, 'disease', 'disease_cause', None),                           # disease -> causes
    (acompany_qwds, 'disease', 'disease_acompany', None),                     # disease -> complications
    (food_qwds, 'disease', 'disease_do_food', 'disease_not_food'),            # disease -> food to eat / avoid
    (food_qwds + cure_qwds, 'food', 'food_do_disease', 'food_not_disease'),   # food -> diseases
    (drug_qwds, 'disease', 'disease_drug', None),                             # disease -> drugs
    (cure_qwds, 'drug', 'drug_disease', None),                                # drug -> diseases
    (check_qwds, 'disease', 'disease_check', None),                           # disease -> checks
    (check_qwds + cure_qwds, 'check', 'check_disease', None),                 # check -> diseases
    (prevent_qwds, 'disease', 'disease_prevent', None),                       # disease -> prevention
    (lasttime_qwds, 'disease', 'disease_lasttime', None),                     # disease -> duration
    (cureway_qwds, 'disease', 'disease_cureway', None),                       # disease -> treatment
    (cureprob_qwds, 'disease', 'disease_cureprob', None),                     # disease -> cure probability
    (easyget_qwds, 'disease', 'disease_easyget', None),                       # disease -> susceptible people
]

for _trigger_words, _entity_type, _label, _negated_label in _intent_rules:
    if _entity_type not in types or not check_words(_trigger_words, question):
        continue
    if _negated_label is None:
        question_type = _label
    else:
        # Food rules: a negation word in the question flips the intent.
        deny_status = check_words(deny_words, question)
        question_type = _negated_label if deny_status else _label
    question_types.append(question_type)

In [40]:
question_types

['check_disease']

In [41]:
# Fallbacks when no intent matched: a question that mentions a disease gets
# the disease description; otherwise one that mentions a symptom gets the
# symptom -> disease lookup.
if not question_types:
    if 'disease' in types:
        question_types = ['disease_desc']
    elif 'symptom' in types:
        question_types = ['symptom_disease']

# Store the intents next to the extracted entities in the result dictionary.
output['question_types'] = question_types

In [42]:
output

{'args': {'肺气肿': ['disease'], '百日咳': ['disease'], '血常规': ['check']},
 'question_types': ['check_disease']}