# 将 GraphRAG 生成的数据导入 Neo4j 以便可视化

### 加载数据

In [6]:
import pandas as pd

import os

# Directory that holds the GraphRAG parquet outputs. It can be overridden with
# the GRAPHRAG_DATA_DIR environment variable so the notebook is not tied to one
# machine; when the variable is unset the original location is used.
data_path = os.environ.get("GRAPHRAG_DATA_DIR", "/home/yhchen/CodeLibrary/ChatGLM/GraphRAG/data")

# Load each GraphRAG output table into its own DataFrame.
communities = pd.read_parquet(f"{data_path}/create_final_communities.parquet")
community_reports = pd.read_parquet(f"{data_path}/create_final_community_reports.parquet")
entities = pd.read_parquet(f"{data_path}/create_final_entities.parquet")
text_units = pd.read_parquet(f"{data_path}/create_final_text_units.parquet")
relationships = pd.read_parquet(f"{data_path}/create_final_relationships.parquet")

In [7]:
# Preview the first two rows of the communities table.
communities.head(2)

Unnamed: 0,id,title,level,raw_community,relationship_ids,text_unit_ids
0,2,Community 2,0,2,"[4d183e7007624fcd98af96b9d752c16d, 718c507cb8a...","[338c9e40c185682b2ad81ccc004ed48b,767a93d53f3f..."
1,6,Community 6,0,6,"[3c4062de44d64870a3cc5913d5769244, 24652fab20d...","[338c9e40c185682b2ad81ccc004ed48b,91437fcf6c97..."


In [8]:
# Preview the first two rows of the community reports table.
community_reports.head(2)

Unnamed: 0,community,full_content,level,rank,title,rank_explanation,summary,findings,full_content_json,id
0,0,"# 5G Network, Autonomous Systems, and Urban Pl...",0,7.0,"5G Network, Autonomous Systems, and Urban Plan...",The impact severity rating is high due to the ...,The community is centered around the integrati...,"[{'explanation': '5G network, represented by e...","{\n ""title"": ""5G Network, Autonomous System...",23b5ebae-f574-4b4a-bd0a-4dcb4d0a3d81
1,1,# Autonomous Driving Ecosystem\n\nThe Autonomo...,0,7.0,Autonomous Driving Ecosystem,The ecosystem's impact severity rating is mode...,The Autonomous Driving Ecosystem revolves arou...,[{'explanation': 'The ecosystem is centered ar...,"{\n ""title"": ""Autonomous Driving Ecosystem""...",f0db4e83-7bb2-4dcf-88cb-108ffe7c7647


In [9]:
# Preview the first two rows of the entities table.
entities.head(2)

Unnamed: 0,id,name,type,description,human_readable_id,graph_embedding,text_unit_ids,description_embedding
0,b45241d70f0e43fca764df95b2b81f77,"""自动驾驶""","""CONCEPT""","""\""\""\""\""\""\""\""\""\""\""\""\""\""\""\""\""\""\""\""\""\""\""\...",0,,"[338c9e40c185682b2ad81ccc004ed48b, 767a93d53f3...","[0.027606548741459846, -0.006643487606197596, ..."
1,4119fd06010c494caa07f439b333f4c5,"""网约车""","""ORGANIZATION""","""网约车是数字化改造的产业之一，用以替代巡游出租车，其改造过程通常是渐进的。""",1,,[338c9e40c185682b2ad81ccc004ed48b],"[-0.0335652120411396, 0.0033339483197778463, 0..."


In [10]:
# Preview the first two rows of the text units table.
text_units.head(2)

Unnamed: 0,id,text,n_tokens,document_ids,entity_ids,relationship_ids
0,0034f8aab40570315e906cea6777fb75,发展。\n\n \n\n\n\n \n\n而与红旗法案同时代，美国和德国在鼓励汽车产业的同时...,1200,[ee561ee109854a929c8af43657d70d28],"[d91a266f766b4737a06b0fda588ba40b, 4a67211867e...","[05913bee89a94bca88449249e35ba74d, 838c4498bc3..."
1,e706feacef2f4e39de21d52d8fc0c532,。那是不是只需要在现有道路上安装路侧设备，实现车路云协同呢？这是必要的，而且是符合我国治理特...,1200,[ee561ee109854a929c8af43657d70d28],"[b45241d70f0e43fca764df95b2b81f77, 3671ea0dd4e...","[3c4062de44d64870a3cc5913d5769244, df40ad480a3..."


In [11]:
# Preview the first two rows of the relationships table.
relationships.head(2)

Unnamed: 0,source,target,weight,description,text_unit_ids,id,human_readable_id,source_degree,target_degree,rank
0,"""自动驾驶""","""社会变革""",1.0,"""自动驾驶技术将引发社会变革，影响多个领域的发展。""",[767a93d53f3f98f27b2b11bee3d716fc],4d183e7007624fcd98af96b9d752c16d,0,7,1,8
1,"""自动驾驶""","""环境保护""",1.0,"""自动驾驶技术通过节能减排，对环境保护产生积极影响。""",[767a93d53f3f98f27b2b11bee3d716fc],718c507cb8ac49e6a35c251ac951b5ca,1,7,1,8


### 创建 neo4j 连接

In [16]:
from neo4j import GraphDatabase

In [17]:
import os

# Connection settings for the target Neo4j instance. Every value can be supplied
# through an environment variable of the same name so that credentials do not
# have to be edited into (or committed with) the notebook.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
# The fallback password keeps the previous local-development behaviour when
# NEO4J_PASSWORD is unset; set the variable for any non-local database.
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "12345678")
NEO4J_DATABASE = os.environ.get("NEO4J_DATABASE", "neo4j")

# Driver shared by the import cells below.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

In [18]:
# Helper that sends a DataFrame to Neo4j in batches
def import_data(cypher, df, batch_size=1000):
    """Execute ``cypher`` against Neo4j once per batch of rows from ``df``.

    Every batch is passed as the ``$rows`` query parameter and unwound under
    the alias ``value``, so ``cypher`` is expected to refer to the current row
    as ``value``. The update counters reported for each batch are printed.

    Args:
        cypher: Cypher text appended after ``UNWIND $rows AS value ``.
        df: pandas DataFrame whose rows are converted to dicts and sent.
        batch_size: Maximum number of rows sent in a single query.

    Returns:
        None.
    """
    total_rows = len(df)
    for start in range(0, total_rows, batch_size):
        # Positional slicing stops at the end of the frame on its own, so the
        # last batch may simply be shorter than ``batch_size``.
        records = df.iloc[start:start + batch_size].to_dict('records')
        query = "UNWIND $rows AS value " + cypher
        outcome = driver.execute_query(
            query,
            rows=records,
            database_=NEO4J_DATABASE,
        )
        print(outcome.summary.counters)

### 导入 GraphRAG 生成的知识图谱到 neo4j

In [19]:
# Import text units as __Chunk__ nodes.
# For each row the query MERGEs a __Chunk__ keyed by `id`, copies `text` and
# `n_tokens` onto it, and then links it with PART_OF to every __Document__ whose
# id appears in `document_ids`.
# NOTE(review): no cell visible in this notebook creates __Document__ nodes. If
# none exist, the MATCH yields no rows and no PART_OF relationships are created
# (the chunk nodes are still written, because MERGE/SET run before the MATCH) —
# confirm that documents are imported elsewhere.
cypher_text_units = """
MERGE (c:__Chunk__ {id:value.id})
SET c += value {.text, .n_tokens}
WITH c, value
UNWIND value.document_ids AS document
MATCH (d:__Document__ {id:document})
MERGE (c)-[:PART_OF]->(d)
"""

import_data(cypher_text_units, text_units)

{'_contains_updates': True, 'labels_added': 13, 'nodes_created': 13, 'properties_set': 39}


In [20]:
# Import entities as __Entity__ nodes.
# For each row the query:
#   - MERGEs an __Entity__ keyed by `id`;
#   - copies `human_readable_id` and `description`, and stores `name` with the
#     double quotes removed;
#   - writes `description_embedding` through db.create.setNodeVectorProperty;
#   - adds an extra label built from `type` (quotes removed, converted with
#     apoc.text.upperCamelCase), or no label when `type` is null or empty;
#   - links every __Chunk__ listed in `text_unit_ids` to the entity via HAS_ENTITY.
# The query calls apoc.* procedures/functions, so APOC must be available on the
# Neo4j server.
cypher_entities= """
MERGE (e:__Entity__ {id:value.id})
SET e += value {.human_readable_id, .description, name:replace(value.name,'"','')}
WITH e, value
CALL db.create.setNodeVectorProperty(e, "description_embedding", value.description_embedding)
CALL apoc.create.addLabels(e, case when coalesce(value.type,"") = "" then [] else [apoc.text.upperCamelCase(replace(value.type,'"',''))] end) yield node
UNWIND value.text_unit_ids AS text_unit
MATCH (c:__Chunk__ {id:text_unit})
MERGE (c)-[:HAS_ENTITY]->(e)
"""

import_data(cypher_entities, entities)

{'_contains_updates': True, 'labels_added': 347, 'relationships_created': 460, 'nodes_created': 347, 'properties_set': 1388}


In [21]:
# Import relationships as RELATED edges between __Entity__ nodes.
# Source and target entities are looked up by `name`, using the row's `source` /
# `target` values with double quotes removed (the entity import stores names the
# same way). Rows whose source or target entity is not found produce no edge,
# because both lookups are plain MATCH clauses.
# The edge is MERGEd on its `id` and then receives `rank`, `weight`,
# `human_readable_id`, `description` and `text_unit_ids`.
# NOTE(review): the inline Cypher comment says merging on id is unnecessary, yet
# the MERGE pattern does include `id` — the comment and the query disagree.
cypher_relationships = """
    MATCH (source:__Entity__ {name:replace(value.source,'"','')})
    MATCH (target:__Entity__ {name:replace(value.target,'"','')})
    // not necessary to merge on id as there is only one relationship per pair
    MERGE (source)-[rel:RELATED {id: value.id}]->(target)
    SET rel += value {.rank, .weight, .human_readable_id, .description, .text_unit_ids}
    RETURN count(*) as createdRels
"""

import_data(cypher_relationships, relationships)

{'_contains_updates': True, 'relationships_created': 173, 'properties_set': 1038}


In [22]:
# Import communities as __Community__ nodes.
# For each row the query MERGEs a __Community__ whose `community` property is the
# row's `id`, sets `level` and `title`, and then, for every id in
# `relationship_ids`, finds the RELATED edge carrying that id and connects both
# of its endpoint entities to the community with IN_COMMUNITY.
# The /* ... */ section inside the query is a Cypher block comment, so the
# HAS_CHUNK linking to text units is currently disabled.
cypher_communities = """
MERGE (c:__Community__ {community:value.id})
SET c += value {.level, .title}
/*
UNWIND value.text_unit_ids as text_unit_id
MATCH (t:__Chunk__ {id:text_unit_id})
MERGE (c)-[:HAS_CHUNK]->(t)
WITH distinct c, value
*/
WITH *
UNWIND value.relationship_ids as rel_id
MATCH (start:__Entity__)-[:RELATED {id:rel_id}]->(end:__Entity__)
MERGE (start)-[:IN_COMMUNITY]->(c)
MERGE (end)-[:IN_COMMUNITY]->(c)
RETURn count(distinct c) as createdCommunities
"""

import_data(cypher_communities, communities)

{'_contains_updates': True, 'labels_added': 8, 'relationships_created': 87, 'nodes_created': 8, 'properties_set': 24}


In [23]:
# Import community reports onto existing __Community__ nodes.
# The community is located with MATCH (not MERGE) on `community`, so a report
# whose community node does not exist is skipped rather than created.
# The matched node receives `level`, `title`, `rank`, `rank_explanation`,
# `full_content` and `summary`. Then, for every position in `findings`, a Finding
# node with `id` equal to that position is MERGEd under the community via
# HAS_FINDING, and the finding's entries are copied onto it with `SET f += finding`
# (presumably each finding is a map with keys such as `explanation` — verify
# against the parquet schema).
cypher_community_reports = """MATCH (c:__Community__ {community: value.community})
SET c += value {.level, .title, .rank, .rank_explanation, .full_content, .summary}
WITH c, value
UNWIND range(0, size(value.findings)-1) AS finding_idx
WITH c, value, finding_idx, value.findings[finding_idx] as finding
MERGE (c)-[:HAS_FINDING]->(f:Finding {id: finding_idx})
SET f += finding"""
import_data(cypher_community_reports, community_reports)

{'_contains_updates': True, 'labels_added': 40, 'relationships_created': 40, 'nodes_created': 40, 'properties_set': 168}
