In [None]:
!pip install py2neo
!pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# 加载医疗数据

In [3]:
# encoding:utf8
import os
import re
import json
import codecs
import threading
from py2neo import Graph
import pandas as pd
import numpy as np
from tqdm import tqdm

In [5]:
from datasets import load_dataset
dataset = load_dataset("nlp-guild/medical-data")

ModuleNotFoundError: No module named 'datasets'

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['_id', 'name', 'desc', 'category', 'prevent', 'cause', 'symptom', 'yibao_status', 'get_prob', 'easy_get', 'get_way', 'acompany', 'cure_department', 'cure_way', 'cure_lasttime', 'cured_prob', 'common_drug', 'cost_money', 'check', 'do_eat', 'not_eat', 'recommand_eat', 'recommand_drug', 'drug_detail'],
        num_rows: 8808
    })
})

In [None]:
features = dataset['train'].column_names
features

['_id',
 'name',
 'desc',
 'category',
 'prevent',
 'cause',
 'symptom',
 'yibao_status',
 'get_prob',
 'easy_get',
 'get_way',
 'acompany',
 'cure_department',
 'cure_way',
 'cure_lasttime',
 'cured_prob',
 'common_drug',
 'cost_money',
 'check',
 'do_eat',
 'not_eat',
 'recommand_eat',
 'recommand_drug',
 'drug_detail']

In [None]:
for i in range(3):
    sample = dataset['train'][i] # dict
    name = sample['name']
    print(f'{name}的信息如下:')
    for feat in features:
        print(f'{feat}: {sample[feat]}')
    
    print('---------------------------------------')
    print()

肺泡蛋白质沉积症的信息如下:
_id: {'$oid': '5bb578b6831b973a137e3ee6'}
name: 肺泡蛋白质沉积症
desc: 肺泡蛋白质沉积症(简称PAP)，又称Rosen-Castle-man-Liebow综合征，是一种罕见疾病。该病以肺泡和细支气管腔内充满PAS染色阳性，来自肺的富磷脂蛋白质物质为其特征，好发于青中年，男性发病约3倍于女性。
category: ['疾病百科', '内科', '呼吸内科']
prevent: 1、避免感染分支杆菌病，卡氏肺囊肿肺炎，巨细胞病毒等。
2、注意锻炼身体，提高免疫力。
cause: 病因未明，推测与几方面因素有关：如大量粉尘吸入（铝，二氧化硅等），机体免疫功能下降（尤其婴幼儿），遗传因素，酗酒，微生物感染等，而对于感染，有时很难确认是原发致病因素还是继发于肺泡蛋白沉着症，例如巨细胞病毒，卡氏肺孢子虫，组织胞浆菌感染等均发现有肺泡内高蛋白沉着。
虽然启动因素尚不明确，但基本上同意发病过程为脂质代谢障碍所致，即由于机体内，外因素作用引起肺泡表面活性物质的代谢异常，到目前为止，研究较多的有肺泡巨噬细胞活力，动物实验证明巨噬细胞吞噬粉尘后其活力明显下降，而病员灌洗液中的巨噬细胞内颗粒可使正常细胞活力下降，经支气管肺泡灌洗治疗后，其肺泡巨噬细胞活力可上升，而研究未发现Ⅱ型细胞生成蛋白增加，全身脂代谢也无异常，因此目前一般认为本病与清除能力下降有关。
symptom: ['紫绀', '胸痛', '呼吸困难', '乏力', '毓卓']
yibao_status: 否
get_prob: 0.00002%
easy_get: None
get_way: 无传染性
acompany: ['多重肺部感染']
cure_department: ['内科', '呼吸内科']
cure_way: ['支气管肺泡灌洗']
cure_lasttime: 约3个月
cured_prob: 约40%
common_drug: None
cost_money: 根据不同医院，收费标准不一致，省市三甲医院约（ 8000——15000 元）
check: ['胸部CT检查', '肺活检', '支气管镜检查']
do_eat: None
not_eat: None
recommand_eat: None
recommand_drug

# 连接云知识图谱

In [None]:
password = None # change password here

In [None]:
from py2neo import Graph
graph = Graph("neo4j+s://f54cadff.databases.neo4j.io:7687", auth=("neo4j", password))

In [None]:
graph.run("UNWIND range(1, 3) AS n RETURN n, n * n as n_sq")

n,n_sq
1,1
2,4
3,9


# Build KG

In [None]:
class MedicalExtractor(object):
    def __init__(self):
        super(MedicalExtractor, self).__init__()
        self.graph = Graph("neo4j+s://f54cadff.databases.neo4j.io:7687", auth=("neo4j", password))

        # 共8类节点
        self.drugs = []  # 药品
        self.recipes = []  # 菜谱
        self.foods = []  # 食物
        self.checks = []  # 检查
        self.departments = []  # 科室
        self.producers = []  # 药企
        self.diseases = []  # 疾病
        self.symptoms = []  # 症状

        self.disease_infos = []  # 疾病信息

        # 构建节点实体关系
        self.rels_department = []  # 科室－科室关系
        self.rels_noteat = []  # 疾病－忌吃食物关系
        self.rels_doeat = []  # 疾病－宜吃食物关系
        self.rels_recommandeat = []  # 疾病－推荐吃食物关系
        self.rels_commonddrug = []  # 疾病－通用药品关系
        self.rels_recommanddrug = []  # 疾病－热门药品关系
        self.rels_check = []  # 疾病－检查关系
        self.rels_drug_producer = []  # 厂商－药物关系

        self.rels_symptom = []  # 疾病症状关系
        self.rels_acompany = []  # 疾病并发关系
        self.rels_category = []  # 疾病与科室之间的关系

    def extract_triples(self):
        print("从json文件中转换抽取三元组")
        for data_json in dataset['train']:
            disease_dict = {}
            disease = data_json['name']
            disease_dict['name'] = disease
            self.diseases.append(disease)
            disease_dict['desc'] = ''
            disease_dict['prevent'] = ''
            disease_dict['cause'] = ''
            disease_dict['easy_get'] = ''
            disease_dict['cure_department'] = ''
            disease_dict['cure_way'] = ''
            disease_dict['cure_lasttime'] = ''
            disease_dict['symptom'] = ''
            disease_dict['cured_prob'] = ''

            if data_json['symptom'] != None:
                self.symptoms += data_json['symptom']
                for symptom in data_json['symptom']:
                    self.rels_symptom.append([disease, 'has_symptom', symptom])

            if data_json['acompany'] != None:
                for acompany in data_json['acompany']:
                    self.rels_acompany.append([disease, 'acompany_with', acompany])
                    self.diseases.append(acompany)

            if data_json['desc'] != None:
                disease_dict['desc'] = data_json['desc']

            if data_json['prevent'] != None:
                disease_dict['prevent'] = data_json['prevent']

            if data_json['cause'] != None:
                disease_dict['cause'] = data_json['cause']

            if data_json['get_prob'] != None:
                disease_dict['get_prob'] = data_json['get_prob']

            if data_json['easy_get'] != None:
                disease_dict['easy_get'] = data_json['easy_get']

            if data_json['cure_department'] != None:
                cure_department = data_json['cure_department']
                if len(cure_department) == 1:
                    self.rels_category.append([disease, 'cure_department', cure_department[0]])
                if len(cure_department) == 2:
                    big = cure_department[0]
                    small = cure_department[1]
                    self.rels_department.append([small, 'belongs_to', big])
                    self.rels_category.append([disease, 'cure_department', small])

                disease_dict['cure_department'] = cure_department
                self.departments += cure_department

            if data_json['cure_way'] != None:
                disease_dict['cure_way'] = data_json['cure_way']

            if data_json['cure_lasttime'] != None:
                disease_dict['cure_lasttime'] = data_json['cure_lasttime']

            if data_json['cured_prob'] != None:
                disease_dict['cured_prob'] = data_json['cured_prob']

            if data_json['common_drug'] != None:
                common_drug = data_json['common_drug']
                for drug in common_drug:
                    self.rels_commonddrug.append([disease, 'has_common_drug', drug])
                self.drugs += common_drug

            if data_json['recommand_drug'] != None:
                recommand_drug = data_json['recommand_drug']
                self.drugs += recommand_drug
                for drug in recommand_drug:
                    self.rels_recommanddrug.append([disease, 'recommand_drug', drug])

            if data_json['not_eat'] != None:
                not_eat = data_json['not_eat']
                for _not in not_eat:
                    self.rels_noteat.append([disease, 'not_eat', _not])

                self.foods += not_eat
                do_eat = data_json['do_eat']
                for _do in do_eat:
                    self.rels_doeat.append([disease, 'do_eat', _do])

                self.foods += do_eat

            if data_json['recommand_eat'] != None:
                recommand_eat = data_json['recommand_eat']
                for _recommand in recommand_eat:
                    self.rels_recommandeat.append([disease, 'recommand_recipes', _recommand])
                self.recipes += recommand_eat

            if data_json['check'] != None:
                check = data_json['check']
                for _check in check:
                    self.rels_check.append([disease, 'need_check', _check])
                self.checks += check

            if data_json['drug_detail'] != None:
                for det in data_json['drug_detail']:
                    det_spilt = det.split('(')
                    if len(det_spilt) == 2:
                        p, d = det_spilt
                        d = d.rstrip(')')
                        if p.find(d) > 0:
                            p = p.rstrip(d)
                        self.producers.append(p)
                        self.drugs.append(d)
                        self.rels_drug_producer.append([p, 'production', d])
                    else:
                        d = det_spilt[0]
                        self.drugs.append(d)

            self.disease_infos.append(disease_dict)

    def write_nodes(self, entitys, entity_type):
        print("写入 {0} 实体".format(entity_type))
        for node in tqdm(set(entitys), ncols=80):
            cql = """MERGE(n:{label}{{name:'{entity_name}'}})""".format(
                label=entity_type, entity_name=node.replace("'", ""))
            try:
                self.graph.run(cql)
            except Exception as e:
                print(e)
                print(cql)

    def write_edges(self, triples, head_type, tail_type):
        print("写入 {0} 关系".format(triples[0][1]))
        for head, relation, tail in tqdm(triples, ncols=80):
            cql = """MATCH(p:{head_type}),(q:{tail_type})
                    WHERE p.name='{head}' AND q.name='{tail}'
                    MERGE (p)-[r:{relation}]->(q)""".format(
                head_type=head_type, tail_type=tail_type, head=head.replace("'", ""),
                tail=tail.replace("'", ""), relation=relation)
            try:
                self.graph.run(cql)
            except Exception as e:
                print(e)
                print(cql)

    def set_attributes(self, entity_infos, etype):
        print("写入 {0} 实体的属性".format(etype))
        for e_dict in tqdm(entity_infos, ncols=80):
            name = e_dict['name']
            del e_dict['name']
            for k, v in e_dict.items():
                if k in ['cure_department', 'cure_way']:
                    cql = """MATCH (n:{label})
                        WHERE n.name='{name}'
                        set n.{k}={v}""".format(label=etype, name=name.replace("'", ""), k=k, v=v)
                else:
                    cql = """MATCH (n:{label})
                        WHERE n.name='{name}'
                        set n.{k}='{v}'""".format(label=etype, name=name.replace("'", ""), k=k,
                                                  v=v.replace("'", "").replace("\n", ""))
                try:
                    self.graph.run(cql)
                except Exception as e:
                    print(e)
                    print(cql)

    def create_entitys(self):
        self.write_nodes(self.drugs, '药品')
        self.write_nodes(self.recipes, '菜谱')
        self.write_nodes(self.foods, '食物')
        self.write_nodes(self.checks, '检查')
        self.write_nodes(self.departments, '科室')
        self.write_nodes(self.producers, '药企')
        self.write_nodes(self.diseases, '疾病')
        self.write_nodes(self.symptoms, '症状')

    def create_relations(self):
        self.write_edges(self.rels_department, '科室', '科室')
        self.write_edges(self.rels_noteat, '疾病', '食物')
        self.write_edges(self.rels_doeat, '疾病', '食物')
        self.write_edges(self.rels_recommandeat, '疾病', '菜谱')
        self.write_edges(self.rels_commonddrug, '疾病', '药品')
        self.write_edges(self.rels_recommanddrug, '疾病', '药品')
        self.write_edges(self.rels_check, '疾病', '检查')
        self.write_edges(self.rels_drug_producer, '药企', '药品')
        self.write_edges(self.rels_symptom, '疾病', '症状')
        self.write_edges(self.rels_acompany, '疾病', '疾病')
        self.write_edges(self.rels_category, '疾病', '科室')

    def set_diseases_attributes(self):
        self.set_attributes(self.disease_infos,"疾病")
        # t = threading.Thread(target=self.set_attributes, args=(self.disease_infos, "疾病"))
        # # t.setDaemon(False)
        # t.start()
        # t.join()

In [None]:
extractor = MedicalExtractor()
extractor.extract_triples()
extractor.create_entitys()
extractor.create_relations()
extractor.set_diseases_attributes()

从json文件中转换抽取三元组
写入 药品 实体


100%|███████████████████████████████████████| 3975/3975 [01:02<00:00, 63.55it/s]


写入 菜谱 实体


100%|███████████████████████████████████████| 4506/4506 [01:10<00:00, 64.10it/s]


写入 食物 实体


100%|█████████████████████████████████████████| 366/366 [00:05<00:00, 67.31it/s]


写入 检查 实体


100%|███████████████████████████████████████| 3353/3353 [00:52<00:00, 63.50it/s]


写入 科室 实体


100%|███████████████████████████████████████████| 54/54 [00:00<00:00, 63.13it/s]


写入 药企 实体


100%|███████████████████████████████████████| 7919/7919 [02:14<00:00, 58.91it/s]


写入 疾病 实体


100%|███████████████████████████████████████| 8807/8807 [02:32<00:00, 57.79it/s]


写入 症状 实体


100%|███████████████████████████████████████| 5998/5998 [01:39<00:00, 60.43it/s]


写入 belongs_to 关系


100%|███████████████████████████████████████| 7976/7976 [01:38<00:00, 81.34it/s]


写入 not_eat 关系


100%|█████████████████████████████████████| 22278/22278 [08:47<00:00, 42.21it/s]


写入 do_eat 关系


100%|█████████████████████████████████████| 22258/22258 [08:46<00:00, 42.28it/s]


写入 recommand_recipes 关系


100%|█████████████████████████████████████| 40267/40267 [16:59<00:00, 39.51it/s]


写入 has_common_drug 关系


100%|█████████████████████████████████████| 14656/14656 [06:00<00:00, 40.69it/s]


写入 recommand_drug 关系


100%|█████████████████████████████████████| 59467/59467 [24:32<00:00, 40.38it/s]


写入 need_check 关系


100%|█████████████████████████████████████| 39531/39531 [16:11<00:00, 40.70it/s]


写入 production 关系


100%|█████████████████████████████████| 161664/161664 [1:01:25<00:00, 43.86it/s]


写入 has_symptom 关系


100%|█████████████████████████████████████| 54710/54710 [23:29<00:00, 38.83it/s]


写入 acompany_with 关系


100%|█████████████████████████████████████| 12052/12052 [05:32<00:00, 36.21it/s]


写入 cure_department 关系


100%|███████████████████████████████████████| 8807/8807 [03:21<00:00, 43.70it/s]


写入 疾病 实体的属性


 22%|████████▋                              | 1767/7916 [06:35<23:15,  4.41it/s]

[Statement.SyntaxError] Invalid input '': expected "NOT" or an expression (line 3, column 40 (offset: 96))
"                        set n.cure_way="
                                        ^
MATCH (n:疾病)
                        WHERE n.name='肺纤维化'
                        set n.cure_way=


100%|███████████████████████████████████████| 7916/7916 [32:02<00:00,  4.12it/s]
