# 将 GraphRAG 生成的数据 导入 neo4j 可视化

### 加载数据

In [5]:
import pandas as pd

data_path = "/home/yhchen/CodeLibrary/LLM-Application/GraphRAG/data/farewellmyconcubine"

communities = pd.read_parquet(f"{data_path}/create_final_communities.parquet")
community_reports = pd.read_parquet(f"{data_path}/create_final_community_reports.parquet")
entities = pd.read_parquet(f"{data_path}/create_final_entities.parquet")
text_units = pd.read_parquet(f"{data_path}/create_final_text_units.parquet")
relationships = pd.read_parquet(f"{data_path}/create_final_relationships.parquet")

In [6]:
communities.head(2)

Unnamed: 0,id,title,level,raw_community,relationship_ids,text_unit_ids
0,8,Community 8,0,8,"[d4467d7e8a404c45a5d3477b35cdd6df, 807b66ed814...","[1d652ff93acafdb2868576cf4f723cb6,25a79abe1571..."
1,3,Community 3,0,3,"[823112b882f54270a10fc0495ddc3370, 8475c141252...","[1fbaeff0f46119ca7fe5d7e2396a990d,5936a809b59a..."


In [7]:
community_reports.head(2)

Unnamed: 0,community,full_content,level,rank,title,rank_explanation,summary,findings,full_content_json,id
0,206,# Cultural Revolution and Red Guards\n\nThe co...,4,7.5,Cultural Revolution and Red Guards,The impact severity rating is high due to the ...,"The community revolves around the Red Guards, ...",[{'explanation': 'The Red Guards were involved...,"{\n ""title"": ""Cultural Revolution and Red G...",6b0c4695-178f-4bc6-9c80-0ce7155dc21b
1,205,# 小楼与程蝶衣的京剧世界\n\n文本描述了一个围绕小楼和程蝶衣的京剧世界，其中小楼与程蝶衣...,4,6.5,小楼与程蝶衣的京剧世界,小楼与程蝶衣的京剧世界在文本中占据重要地位，涉及多个角色和事件，但整体上并未展现出对社会或文...,文本描述了一个围绕小楼和程蝶衣的京剧世界，其中小楼与程蝶衣、菊仙等角色有着密切的联系，共同参...,[{'explanation': '文本中多次提到小楼与程蝶衣在京剧表演中的合作，两人共同演...,"{\n ""title"": ""小楼与程蝶衣的京剧世界"",\n ""summary"":...",a0284f19-60db-47a5-b764-a241bbe99271


In [8]:
entities.head(2)

Unnamed: 0,id,name,type,description,human_readable_id,graph_embedding,text_unit_ids,description_embedding
0,b45241d70f0e43fca764df95b2b81f77,霸王别姬,ORGANIZATION,"The entity ""\u9738\u738b\u522b\u59ec"" refers t...",0,,"[1d652ff93acafdb2868576cf4f723cb6, 25a79abe157...","[1.0037897349126279e-15, -1.1491861276425238e-..."
1,4119fd06010c494caa07f439b333f4c5,李碧华,PERSON,李碧华是一位著名的中国作家，以其创作的文学作品《霸王别姬》而广为人知。,1,,"[70752709f4ff8105981635123d9d3dd7, a7e3f833a8c...","[6.768991590255298e-31, -3.2895517229234936e-1..."


In [9]:
text_units.head(2)

Unnamed: 0,id,text,n_tokens,document_ids,entity_ids,relationship_ids
0,a7e3f833a8c75a77de5ef1ce9abca9df,\n　霸王别姬\n\n 李碧华 著\n\n　第一章 暑去寒来春复秋 \n\n 婊子无情，...,1200,[d338ec6941b6216661320e79f45032df],"[b45241d70f0e43fca764df95b2b81f77, 4119fd06010...","[d4467d7e8a404c45a5d3477b35cdd6df, 807b66ed814..."
1,ede2a6d531d678f4d8327d4706cfc54a,。桥西有鸟市，对过有各种小食摊子，还有摞地抠饼的卖艺人。热热闹闹，兴兴旺旺。\n 小叫花爱...,1200,[d338ec6941b6216661320e79f45032df],"[9646481f66ce4fd2b08c2eddda42fc82, 254770028d7...","[da245f24b8a041f6ac96ef214862cabf, f2db8d8c674..."


In [10]:
relationships.head(2)

Unnamed: 0,source,target,weight,description,text_unit_ids,id,human_readable_id,source_degree,target_degree,rank
0,霸王别姬,李碧华,2.0,《霸王别姬》是李碧华所著的一部小说。,"[70752709f4ff8105981635123d9d3dd7, a7e3f833a8c...",d4467d7e8a404c45a5d3477b35cdd6df,0,22,1,23
1,霸王别姬,民国十八年,1.0,《霸王别姬》的故事发生在民国十八年（1929年）,[a7e3f833a8c75a77de5ef1ce9abca9df],807b66ed814e4b25a499ccb80afd56c5,1,22,1,23


### 创建 neo4j 连接

In [11]:
from neo4j import GraphDatabase

In [12]:
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USERNAME = "neo4j"
NEO4J_PASSWORD = "12345678" 
NEO4J_DATABASE = "neo4j"
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

In [13]:
# 封装 neo4j数据查询 函数
def import_data(cypher, df, batch_size=1000):

    for i in range(0,len(df), batch_size):
        batch = df.iloc[i: min(i+batch_size, len(df))]
        result = driver.execute_query("UNWIND $rows AS value " + cypher, 
                                      rows=batch.to_dict('records'),
                                      database_=NEO4J_DATABASE)
        print(result.summary.counters)
    return 

### 导入 GraphRAG 生成的知识图谱到 neo4j

In [14]:
#导入text units
cypher_text_units = """
MERGE (c:__Chunk__ {id:value.id})
SET c += value {.text, .n_tokens}
WITH c, value
UNWIND value.document_ids AS document
MATCH (d:__Document__ {id:document})
MERGE (c)-[:PART_OF]->(d)
"""

import_data(cypher_text_units, text_units)

{'_contains_updates': True, 'labels_added': 232, 'nodes_created': 232, 'properties_set': 696}


In [15]:
#加载entities
cypher_entities= """
MERGE (e:__Entity__ {id:value.id})
SET e += value {.human_readable_id, .description, name:replace(value.name,'"','')}
WITH e, value
CALL db.create.setNodeVectorProperty(e, "description_embedding", value.description_embedding)
CALL apoc.create.addLabels(e, case when coalesce(value.type,"") = "" then [] else [apoc.text.upperCamelCase(replace(value.type,'"',''))] end) yield node
UNWIND value.text_unit_ids AS text_unit
MATCH (c:__Chunk__ {id:text_unit})
MERGE (c)-[:HAS_ENTITY]->(e)
"""

import_data(cypher_entities, entities)

{'_contains_updates': True, 'labels_added': 1000, 'relationships_created': 2643, 'nodes_created': 1000, 'properties_set': 4000}
{'_contains_updates': True, 'labels_added': 1000, 'relationships_created': 1591, 'nodes_created': 1000, 'properties_set': 4000}
{'_contains_updates': True, 'labels_added': 1000, 'relationships_created': 1026, 'nodes_created': 1000, 'properties_set': 4000}
{'_contains_updates': True, 'labels_added': 36, 'relationships_created': 36, 'nodes_created': 36, 'properties_set': 144}


In [16]:
#导入relationships
cypher_relationships = """
    MATCH (source:__Entity__ {name:replace(value.source,'"','')})
    MATCH (target:__Entity__ {name:replace(value.target,'"','')})
    // not necessary to merge on id as there is only one relationship per pair
    MERGE (source)-[rel:RELATED {id: value.id}]->(target)
    SET rel += value {.rank, .weight, .human_readable_id, .description, .text_unit_ids}
    RETURN count(*) as createdRels
"""

import_data(cypher_relationships, relationships)

{'_contains_updates': True, 'relationships_created': 1000, 'properties_set': 6000}
{'_contains_updates': True, 'relationships_created': 960, 'properties_set': 5760}


In [17]:
#导入communities
cypher_communities = """
MERGE (c:__Community__ {community:value.id})
SET c += value {.level, .title}
/*
UNWIND value.text_unit_ids as text_unit_id
MATCH (t:__Chunk__ {id:text_unit_id})
MERGE (c)-[:HAS_CHUNK]->(t)
WITH distinct c, value
*/
WITH *
UNWIND value.relationship_ids as rel_id
MATCH (start:__Entity__)-[:RELATED {id:rel_id}]->(end:__Entity__)
MERGE (start)-[:IN_COMMUNITY]->(c)
MERGE (end)-[:IN_COMMUNITY]->(c)
RETURn count(distinct c) as createdCommunities
"""

import_data(cypher_communities, communities)

{'_contains_updates': True, 'labels_added': 207, 'relationships_created': 6243, 'nodes_created': 207, 'properties_set': 621}


In [18]:
#导入community_reports
cypher_community_reports = """MATCH (c:__Community__ {community: value.community})
SET c += value {.level, .title, .rank, .rank_explanation, .full_content, .summary}
WITH c, value
UNWIND range(0, size(value.findings)-1) AS finding_idx
WITH c, value, finding_idx, value.findings[finding_idx] as finding
MERGE (c)-[:HAS_FINDING]->(f:Finding {id: finding_idx})
SET f += finding"""
import_data(cypher_community_reports, community_reports)

{'_contains_updates': True, 'labels_added': 872, 'relationships_created': 872, 'nodes_created': 872, 'properties_set': 14882}
