# CSV2Neo4j 

In [1]:
import os
import json

import pandas as pd

In [2]:
df = pd.read_csv('data/medical_data.csv')

In [3]:
df.shape

(8808, 23)

In [4]:
def print_list(my_list,top=20):
    for i, item in enumerate(my_list[:top]):
        print(f"{i + 1}. {item}")

## 实体（节点）

### 所有症状

In [5]:
def df2list(key):
    data = []
    for each in df[key]:
        if not isinstance(each, str):  # 如果 each 不是字符串类型
            each = str(each)           # 转换为字符串类型
        data.extend(each.split(','))
    data = set(data)
    return data

In [6]:
symptoms = df2list('症状')

### 所有科室

In [7]:
departments= df2list('科室')

### 实体：所有检查

In [8]:
checks = df2list('检查')

### 实体：所有药物

In [9]:
drugs = []
for each in df['推荐药物']:
    try:
        drugs.extend(each.split(','))
    except:
        pass
for each in df['常用药物']:
    try:
        drugs.extend(each.split(','))
    except:
        pass
drugs = set(drugs)

### 实体：所有食物

In [10]:
foods = []
for each in df['可以吃']:
    try:
        foods.extend(each.split(','))
    except:
        pass
for each in df['不可以吃']:
    try:
        foods.extend(each.split(','))
    except:
        pass
for each in df['推荐吃']:
    try:
        foods.extend(each.split(','))
    except:
        pass
foods = set(foods)

### 实体：所有药物厂商

In [11]:
producers = []

for each in df['具体药物']:
    try:
        for each_drug in each.split(','):
            producer = each_drug.split('(')[0]
            producers.append(producer)
    except:
        pass
producers = set(producers)

### 疾病字典信息

In [12]:
disease_infos = [] # 疾病信息
for idx, row in df.iterrows():
    disease_infos.append(dict(row))

In [13]:
dict(row).keys()

dict_keys(['疾病名称', '疾病描述', '疾病种类', '科室', '病因', '症状', '检查', '并发症', '花费', '疗程', '疗法', '治愈率', '易感人群', '感染概率', '感染途径', '预防措施', '推荐药物', '常用药物', '具体药物', '可以吃', '不可以吃', '推荐吃', '是否纳入医保'])

## 关系（边）

In [14]:
def deduplicate(rels_old):
    '''关系去重函数'''
    rels_new = []
    for each in rels_old:
        if each not in rels_new:
            rels_new.append(each)
    return rels_new

### 关系：疾病-检查

In [15]:
rels_check = []
for idx, row in df.iterrows():
    checks = row['检查']
    if not isinstance(checks, str):  # 如果 checks 不是字符串类型
        checks = str(checks)         # 转换为字符串类型
    for each in checks.split(','):
        rels_check.append([row['疾病名称'], each])
rels_check = deduplicate(rels_check)

In [16]:
print_list(rels_check)

1. ['肺泡蛋白质沉积症', '胸部CT检查']
2. ['肺泡蛋白质沉积症', '肺活检']
3. ['肺泡蛋白质沉积症', '支气管镜检查']
4. ['百日咳', '耳、鼻、咽拭子细菌培养']
5. ['百日咳', '周围血白细胞计数及分类检验']
6. ['百日咳', '血常规']
7. ['百日咳', '酶联免疫吸附试验']
8. ['百日咳', '白细胞分类计数']
9. ['苯中毒', '血常规']
10. ['苯中毒', '骨髓象分析']
11. ['苯中毒', '先令氏指数']
12. ['喘息样支气管炎', '肺部检查']
13. ['喘息样支气管炎', '肺和胸膜听诊']
14. ['喘息样支气管炎', '抗链球菌型M蛋白抗体']
15. ['喘息样支气管炎', '抗链球菌壁多糖抗体']
16. ['喘息样支气管炎', '酶联免疫吸附试验']
17. ['成人呼吸窘迫综合征', '胸部CT检查']
18. ['成人呼吸窘迫综合征', '呼吸肌功能测定']
19. ['成人呼吸窘迫综合征', '血浆蛋白C抗原']
20. ['成人呼吸窘迫综合征', '肺泡气-动脉血氧分压差']


### 关系：疾病-症状

In [17]:
rels_symptom = []
for idx, row in df.iterrows():
    symptoms = row['症状']
    if not isinstance(symptoms, str):  # 如果 symptoms 不是字符串类型
        symptoms = str(symptoms)       # 转换为字符串类型
    for each in symptoms.split(','):
        rels_symptom.append([row['疾病名称'], each])
rels_symptom = deduplicate(rels_symptom)

In [18]:
print_list(rels_symptom)

1. ['肺泡蛋白质沉积症', '紫绀']
2. ['肺泡蛋白质沉积症', '胸痛']
3. ['肺泡蛋白质沉积症', '呼吸困难']
4. ['肺泡蛋白质沉积症', '乏力']
5. ['肺泡蛋白质沉积症', '毓卓']
6. ['百日咳', '吸气时有蝉鸣音']
7. ['百日咳', '痉挛性咳嗽']
8. ['百日咳', '胸闷']
9. ['百日咳', '肺阴虚']
10. ['百日咳', '抽搐']
11. ['百日咳', '低热']
12. ['百日咳', '闫鹏辉']
13. ['百日咳', '惊厥']
14. ['苯中毒', '恶心']
15. ['苯中毒', '抽搐']
16. ['苯中毒', '感觉障碍']
17. ['喘息样支气管炎', '耸肩喘息']
18. ['喘息样支气管炎', '哮鸣音']
19. ['喘息样支气管炎', '纤毛上皮细胞损伤脱落']
20. ['喘息样支气管炎', '变应性咳嗽']


### 关系：疾病-疾病（并发症）

In [19]:
rels_acompany = []
for idx, row in df.iterrows():
    acompany = row['并发症']
    if not isinstance(acompany, str):  # 如果 acompany 不是字符串类型
        acompany = str(acompany)       # 转换为字符串类型
    for each in acompany.split(','):
        rels_acompany.append([row['疾病名称'], each])
rels_acompany = deduplicate(rels_acompany)


In [20]:
print_list(rels_acompany)

1. ['肺泡蛋白质沉积症', '多重肺部感染']
2. ['百日咳', '肺不张']
3. ['苯中毒', '贫血']
4. ['喘息样支气管炎', '支气管哮喘']
5. ['成人呼吸窘迫综合征', '细菌性肺炎']
6. ['大量羊水吸入', '呼吸衰竭']
7. ['单纯性肺嗜酸粒细胞浸润症', '胆道蛔虫病']
8. ['大叶性肺炎', '脓胸']
9. ['大楼病综合征', '抑郁症']
10. ['二硫化碳中毒', '昏迷']
11. ['肺-胸膜阿米巴病', '阿米巴肝脓肿']
12. ['肺出血－肾炎综合征', '便血']
13. ['肺放线菌病', '膈下脓肿']
14. ['肺泡蛋白沉着症', '呼吸衰竭']
15. ['肺曲菌病', '过敏性鼻炎']
16. ['放射性肺炎', '肺气肿']
17. ['肺念珠菌病', '菌血症']
18. ['肺大疱', '张力性气胸']
19. ['肺炎球菌肺炎', '败血症']
20. ['肺气肿', '呼吸衰竭']


### 关系：疾病-推荐药物

In [21]:
rels_recommanddrug = []
for idx, row in df.iterrows():
    try:
        for each in row['推荐药物'].split(','):
            rels_recommanddrug.append([row['疾病名称'], each])
    except:
        pass
rels_recommanddrug = deduplicate(rels_recommanddrug)

### 关系：疾病-常用药物

In [22]:
rels_commonddrug = []
for idx, row in df.iterrows():
    try:
        for each in row['常用药物'].split(','):
            rels_commonddrug.append([row['疾病名称'], each])
    except:
        pass
rels_commonddrug = deduplicate(rels_commonddrug)

### 关系：疾病-不可以吃

In [23]:
rels_noteat = []
for idx, row in df.iterrows():
    try:
        for each in row['不可以吃'].split(','):
            rels_noteat.append([row['疾病名称'], each])
    except:
        pass
rels_noteat = deduplicate(rels_noteat)

### 关系：疾病-可以吃

In [24]:
rels_doeat = []
for idx, row in df.iterrows():
    try:
        for each in row['可以吃'].split(','):
            rels_doeat.append([row['疾病名称'], each])
    except:
        pass
rels_doeat = deduplicate(rels_doeat)

### 关系：疾病-推荐吃

In [25]:
rels_recommandeat = []
for idx, row in df.iterrows():
    try:
        for each in row['推荐吃'].split(','):
            rels_recommandeat.append([row['疾病名称'], each])
    except:
        pass
rels_recommandeat = deduplicate(rels_recommandeat)

### 关系：药物厂商-具体药物

In [26]:

rels_drug_producer = []
for each in df['具体药物']:
    try:
        for each_drug in each.split(','):
            producer = each_drug.split('(')[0]
            drug = each_drug.split('(')[1][:-1]
            rels_drug_producer.append([producer, drug])
    except:
        pass
rels_drug_producer = deduplicate(rels_drug_producer)

### 关系：疾病-科室、小科室-大科室

In [27]:
rels_category = []  # 关系：疾病-科室
rels_department = []  # 关系：小科室-大科室
for idx, row in df.iterrows():
    department = row['科室']
    if not isinstance(department, str):  # 如果 department 不是字符串类型
        department = str(department)     # 转换为字符串类型
    if len(department.split(',')) == 1:
        rels_category.append([row['疾病名称'], department])
    else:
        big = department.split(',')[0]  # 大科室
        small = department.split(',')[1]  # 小科室
        rels_category.append([row['疾病名称'], small])
        rels_department.append([small, big])
rels_category = deduplicate(rels_category)
rels_department = deduplicate(rels_department)


In [28]:
print_list(rels_category)

1. ['肺泡蛋白质沉积症', '呼吸内科']
2. ['百日咳', '小儿内科']
3. ['苯中毒', '急诊科']
4. ['喘息样支气管炎', '呼吸内科']
5. ['成人呼吸窘迫综合征', '呼吸内科']
6. ['大量羊水吸入', '小儿内科']
7. ['单纯性肺嗜酸粒细胞浸润症', '呼吸内科']
8. ['大叶性肺炎', '呼吸内科']
9. ['大楼病综合征', '其他综合']
10. ['二硫化碳中毒', '急诊科']
11. ['肺-胸膜阿米巴病', '呼吸内科']
12. ['肺出血－肾炎综合征', '呼吸内科']
13. ['肺放线菌病', '呼吸内科']
14. ['肺泡蛋白沉着症', '呼吸内科']
15. ['肺曲菌病', '呼吸内科']
16. ['放射性肺炎', '呼吸内科']
17. ['肺念珠菌病', '呼吸内科']
18. ['肺大疱', '呼吸内科']
19. ['肺炎球菌肺炎', '呼吸内科']
20. ['肺气肿', '呼吸内科']


In [29]:
print_list(rels_department)

1. ['呼吸内科', '内科']
2. ['小儿内科', '儿科']
3. ['其他综合', '其他科室']
4. ['肿瘤内科', '肿瘤科']
5. ['心胸外科', '外科']
6. ['感染科', '外科']
7. ['儿科综合', '儿科']
8. ['产科', '妇产科']
9. ['普外科', '外科']
10. ['心内科', '内科']
11. ['肿瘤外科', '肿瘤科']
12. ['神经内科', '内科']
13. ['风湿免疫科', '内科']
14. ['眼科', '五官科']
15. ['内分泌科', '内科']
16. ['小儿外科', '儿科']
17. ['耳鼻喉科', '五官科']
18. ['妇科', '妇产科']
19. ['康复科', '其他科室']
20. ['消化内科', '内科']


## 链接图数据库

In [30]:
from neo4j_driver import Neo4jConnection, Node

In [31]:
g = Neo4jConnection('neo4j://localhost:7687/', 'neo4j', 'Lorne@2022')

### 创建疾病实体

In [32]:
count = 0
for disease_dict in disease_infos:
    try:
        node = Node("Disease",
                    name=disease_dict['疾病名称'],
                    desc=disease_dict['疾病描述'],
                    prevent=disease_dict['预防措施'],
                    cause=disease_dict['病因'],
                    easy_get=disease_dict['易感人群'],
                    cure_lasttime=disease_dict['疗程'],
                    cure_department=disease_dict['科室'],
                    cure_way=disease_dict['疗法'], 
                    cured_prob=disease_dict['治愈率'])
        g.create(node)
        count += 1
        # print('创建疾病实体：', disease_dict['疾病名称'])
    except:
        pass
print('共创建 {} 个疾病实体'.format(count))

共创建 8808 个疾病实体


### 创建药物实体

In [33]:
count = 0
for each in drugs:
    node = Node('Drug', name=each)
    g.create(node)
    count += 1
    # print('创建实体 {}'.format(each))
print('共创建 {} 个药物实体'.format(count))

共创建 3829 个药物实体


### 创建食物实体

In [34]:
count = 0
for each in foods:
    node = Node('Food', name=each)
    g.create(node)
    count += 1
    # print('创建实体 {}'.format(each))
print('共创建 {} 个食物实体'.format(count))

共创建 4870 个食物实体


### 创建检查实体

In [35]:
count = 0
for each in checks:
    node = Node('Check', name=each)
    g.create(node)
    count += 1
    # print('创建实体 {}'.format(each))
print('共创建 {} 个检查实体'.format(count))

共创建 57 个检查实体


### 创建科室实体

In [36]:
count = 0
for each in departments:
    node = Node('Department', name=each)
    g.create(node)
    count += 1
    # print('创建实体 {}'.format(each))
print('共创建 {} 个科室实体'.format(count))

共创建 55 个科室实体


### 创建 药物厂商 实体

In [37]:
count = 0
for each in producers:
    node = Node('Producer', name=each)
    g.create(node)
    count += 1
    # print('创建实体 {}'.format(each))
print('共创建 {} 个厂商实体'.format(count))

共创建 17202 个厂商实体


### 创建 症状 实体

In [38]:
count = 0
for each in symptoms:
    node = Node('Symptom', name=each)
    g.create(node)
    count += 1
    # print('创建实体 {}'.format(each))
print('共创建 {} 个症状实体'.format(count))

共创建 23 个症状实体


## 创建知识图谱关系（连接、边）

友情提示：这一步需要花费很长的时间，不过不同的电脑配置需要的时间也会有所不同，差不多需要执行40多分钟的时间

In [39]:
g.relationship('Disease', 'Food', rels_recommandeat, 'recommand_eat', '推荐食谱')
g.relationship('Disease', 'Food', rels_noteat, 'no_eat', '忌吃')
g.relationship('Disease', 'Food', rels_doeat, 'do_eat', '宜吃')
g.relationship('Department', 'Department', rels_department, 'belongs_to', '属于')
g.relationship('Disease', 'Drug', rels_commonddrug, 'common_drug', '常用药品')
g.relationship('Producer', 'Drug', rels_drug_producer, 'drugs_of', '生产药品')
g.relationship('Disease', 'Drug', rels_recommanddrug, 'recommand_drug', '好评药品')
g.relationship('Disease', 'Check', rels_check, 'need_check', '诊断检查')
g.relationship('Disease', 'Symptom', rels_symptom, 'has_symptom', '症状')
g.relationship('Disease', 'Disease', rels_acompany, 'acompany_with', '并发症')
g.relationship('Disease', 'Department', rels_category, 'belongs_to', '所属科室')

In [40]:
print("数据库中的节点总数：", g.counts())

数据库中的节点总数： 34844
