In [None]:
import os
import sys

# Work out where this code is running from: inside IPython/Jupyter the
# kernel's working directory is used, otherwise the folder holding this file.
try:
    get_ipython
except NameError:
    current_dir = os.path.dirname(os.path.abspath(__file__))
else:
    current_dir = os.getcwd()

# Make the project root (two levels up) importable for this session only.
project_dir = os.path.abspath(os.path.join(current_dir, "../../"))
if sys.path.count(project_dir) == 0:
    sys.path.append(project_dir)


from neo4j import GraphDatabase
import numpy as np
from sentence_transformers import SentenceTransformer, util
from typing import List, Dict, Tuple, Optional
import os
import hashlib
# Neo4j credentials. They are read from the environment so that secrets do not
# have to be committed with the notebook; the previous literal values are kept
# as fallbacks for a local development database.
USER = os.getenv("NEO4J_USER", "neo4j")
PWD = os.getenv("NEO4J_PASSWORD", "neo4j123")




#### 安装java
sudo apt update
sudo apt install openjdk-21-jdk


#### 解压
tar -xzf neo4j-community-2025.05.0-unix.tar.gz
cd neo4j-community-2025.05.0

#### 设置初始密码
./bin/neo4j-admin dbms set-initial-password your_password

#### 启动服务
./bin/neo4j start


In [None]:
import neo4j
neo4j.__version__

In [None]:

driver = GraphDatabase.driver("bolt://localhost:7687", auth=(USER, PWD))
def test_connection():
    """
    Check that the Neo4j server is reachable by running a trivial query.

    Prints the greeting returned by the server, or the error if anything
    fails. The module-level ``driver`` is closed afterwards in both cases.
    """
    try:
        with driver.session() as session:
            record = session.run("RETURN 'Neo4j connection successful!' AS message").single()
            print(record["message"])
    except Exception as exc:
        print("连接失败：", exc)
    finally:
        driver.close()
    
test_connection()

In [None]:


class Neo4jGraphRAG:
    def __init__(self, uri: str, user: str, password: str, 
                 model_name: str = os.path.join(project_dir, "model/BAAI/bge-small-zh")):
        """
        Initialise the Neo4j-backed Graph RAG system.

        Args:
            uri: Neo4j connection URI.
            user: Database user name.
            password: Database password.
            model_name: Name or local path of the SentenceTransformer model
                used to embed text.

        Raises:
            Exception: Whatever the model loader or the index creation raises;
                the driver is closed before the error is re-raised.
        """
        # Open the Neo4j connection first; everything below depends on it.
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

        try:
            # Load the embedding model.
            self.embedding_model = SentenceTransformer(model_name)

            # Make sure the vector index used for similarity search exists.
            self._create_vector_index()
        except Exception:
            # Setup failed half-way: release the driver instead of leaking it,
            # because the caller never receives an instance it could close.
            self.driver.close()
            raise
    
    def close(self):
        """关闭数据库连接"""
        if self.driver:
            self.driver.close()
    
    def _create_vector_index(self):
        """创建向量索引以加速相似性搜索"""
        with self.driver.session() as session:
            # 检查索引是否已存在
            result = session.run("""
                SHOW INDEXES WHERE name = 'node_embeddings_index'
            """)
            
            # if  list(result):
            #     result = session.run("""
            #         DROP INDEX `node_embeddings_index`
            # """)
           
            if not list(result):
                # 创建向量索引 
                session.run("""
                        CREATE VECTOR INDEX `node_embeddings_index`
                        FOR (n:Entity) ON (n.embedding)
                        OPTIONS {
                        indexConfig: {
                            `vector.dimensions`: 512,
                            `vector.similarity_function`: 'cosine'
                        }
                        }
                    """)

    def _normalize_entity_id(self, entity_text: str) -> str:
        """将字符串转换为固定长度的哈希值"""
        processed = entity_text.lower().strip()
        # 使用SHA-256哈希算法，也可以换成md5、sha1等
        hash_object = hashlib.sha256(processed.encode('utf-8'))
        # 转为16进制字符串（32个字符长度）
        return hash_object.hexdigest()
    
    def add_entity(self, entity_id: str, entity_type: str, properties: Dict, embedding: Optional[np.ndarray] = None):
        """
        向知识图谱添加实体
        
        Args:
            entity_id: 实体唯一标识符
            entity_type: 实体类型（标签）
            properties: 实体属性字典
            embedding: 实体的预计算嵌入（可选）
        """
        with self.driver.session() as session:
            # 如果没有提供嵌入，则从实体描述生成
            if embedding is None and "description" in properties:
                embedding = self.embedding_model.encode(properties["description"])


            # 将嵌入向量转换为列表以便存储
            embedding_list = embedding.tolist() if embedding is not None else None
            if entity_type==""or entity_type==None: 
                print(entity_type)
            # 添加实体节点 
            query = f"""
                MERGE (e:{entity_type} {{id: $entity_id}})
                SET e += $properties
                { 'SET e.embedding = $embedding' if embedding_list else '' }
            """
            session.run(query, {
                "entity_id": entity_id,
                "properties": properties,
                "embedding": embedding_list
            })

    
    def add_relationship(self, source_id: str, target_id: str, relationship_type: str, properties: Dict = None):
        """
        向知识图谱添加实体间的关系
        
        Args:
            source_id: 源实体ID
            target_id: 目标实体ID
            relationship_type: 关系类型
            properties: 关系属性（可选）
        """
        if properties is None:
            properties = {}
            
        with self.driver.session() as session:  
            # result= session.run("""
            #     MATCH (s) WHERE s.id = $source_id
            #     MATCH (t) WHERE t.id = $target_id
            #     MERGE (s)-[r:%s]->(t)
            #     SET r += $properties
            # """ % relationship_type, {
            #     "source_id": source_id,
            #     "target_id": target_id,
            #     "properties": properties
            # })
            # print(result)
            cypher = f"""
            MERGE (s {{id: $source_id}})  
            MERGE (t {{id: $target_id}})
            MERGE (s)-[r:{relationship_type}]->(t)
            SET r += $properties
            """
            session.run(cypher, {
                "source_id": source_id,
                "target_id": target_id,
                "properties": properties
            })
        
    def add_text_chunk(self, chunk_name: str, text: str, related_entities: List[str] = None):
        """
        添加文本块并与相关实体关联
        
        Args:
            chunk_name: 文本块唯一标识符
            text: 文本内容
            related_entities: 相关实体列表
        """
        # 生成文本嵌入
        embedding = self.embedding_model.encode(text)
        chunk_id = self._normalize_entity_id(chunk_name)
        # print("chunk_id ",chunk_id)
        with self.driver.session() as session:
            # 添加文本块节点并确保其有Entity标签，以便被向量索引包含
            session.run("""
                MERGE (c:TextChunk:Entity {id: $chunk_id})
                SET c.name = $name,
                    c.content = $text,
                    c.embedding = $embedding
            """, {
                "name":chunk_name,
                "chunk_id": chunk_id,
                "text": text,
                "embedding": embedding.tolist()
            })
            
            # 与相关实体建立连接
            if related_entities:
                for entity in related_entities:
                    ent_id =self._normalize_entity_id(entity)
                    # print(ent_id)
                    self.add_relationship(
                        chunk_id, 
                        ent_id, 
                        "MENTIONS",
                        {"s":chunk_name,"t":entity}
                    )
    
    def retrieve_relevant_nodes(self, query: str, top_k: int = 5) -> List[Dict]:
        """
        根据查询检索相关节点
        
        Args:
            query: 查询文本
            top_k: 返回的相关节点数量
            
        Returns:
            相关节点信息的列表
        """
        # 生成查询嵌入
        query_embedding = self.embedding_model.encode(query)
        
        with self.driver.session() as session:
            result = session.run("""
                CALL db.index.vector.queryNodes('node_embeddings_index', $top_k, $query_embedding)
                YIELD node, score
                RETURN node.id AS id, labels(node) AS labels, properties(node) AS properties, score
            """, {
                "top_k": top_k,
                "query_embedding": query_embedding.tolist()
            })
            
            # print("生成查询嵌入")
            return [record.data() for record in result]
    
    def get_connected_nodes(self, node_id: str, depth: int = 1) -> List[Dict]:
        """
        获取与指定节点连接的节点
        
        Args:
            node_id: 起始节点ID
            depth: 探索深度
            
        Returns:
            连接的节点信息列表
        """
        with self.driver.session() as session:
            # 构建Cypher查询，根据深度获取连接节点
            match_pattern = ""
            for i in range(1, depth + 1):
                match_pattern += f"-[r{i}]->(n{i})"
            
            nodes_list = [f"n{i}" for i in range(1, depth + 1)]
            nodes_str = ", ".join(nodes_list)

            query = f"""
                MATCH (n0) WHERE n0.id = $node_id
                MATCH (n0){match_pattern}
                UNWIND [{nodes_str}] AS connected_node
                WITH DISTINCT connected_node
                RETURN connected_node.id AS id, labels(connected_node) AS labels, properties(connected_node) AS properties
            """
            
            result = session.run(query, {"node_id": node_id})
            return [record.data() for record in result]
   
        
    def build_context(self, query: str, top_k: int = 5, context_depth: int = 1) -> str:
        """
        构建回答查询的上下文
        
        Args:
            query: 查询文本
            top_k: 检索的相关节点数量
            context_depth: 上下文扩展深度
            
        Returns:
            构建的上下文文本
        """
        # 检索相关节点
        relevant_nodes = self.retrieve_relevant_nodes(query, top_k)
        # print("relevant_nodes",relevant_nodes)
        # 收集所有相关节点及其连接节点
        context_nodes = {}
        
        # 添加相关节点
        for node in relevant_nodes:
            print("context_nodes",node["id"])
            context_nodes[node["id"]] = node
        
        # 添加连接节点
        for node in relevant_nodes:
            connected_nodes = self.get_connected_nodes(node["id"], context_depth)
            # print(connected_nodes)
            for conn_node in connected_nodes:
                # print("conn_node",conn_node)
                if conn_node["id"] not in context_nodes:
                    context_nodes[conn_node["id"]] = conn_node
                    
        
        # 构建上下文文本
        context_parts = []
        
        for node_id, node_data in context_nodes.items():
            labels = [label for label in node_data["labels"] if label != "Entity"]  # 排除基础标签
            node_type = labels[0] if labels else "Node"
            
            if node_type == "TextChunk":
                context_parts.append(f"Find Text Chunk: {node_data['properties'].get('content', '')}")
            else:
                entity_info = [f"Find Entity:{node_type}: {node_data['properties']['name']}"]
                for key, value in node_data["properties"].items():
                    if key not in ["id", "embedding"]:  # 排除不需要的属性
                        entity_info.append(f"  {key}: {value}")
                
                # 添加关系信息
                with self.driver.session() as session:
                    rel_result = session.run("""
                        MATCH (n)-[r]->(m) WHERE n.id = $node_id
                        RETURN type(r) AS rel_type, m.id AS target_id, labels(m) AS target_labels , properties(r) AS r_properties
                    """, {"node_id": node_id})
                    
                    for rel in rel_result:
                        # print("----------",rel['r_properties'])
                        target_type = rel["target_labels"][0] if rel["target_labels"] else "节点"
                        entity_info.append(f"{rel['r_properties']['s']}  与 {target_type} -{rel['r_properties']['t']} 存在 {rel['rel_type']} 关系")
                
                context_parts.append("\n".join(entity_info))
        
        return "\n\n".join(context_parts)
    
    def generate_response(self, query: str, top_k: int = 5, context_depth: int = 1) -> str:
        """
        生成基于图谱上下文的回答
        
        Args:
            query: 查询文本
            top_k: 检索的相关节点数量
            context_depth: 上下文扩展深度
            
        Returns:
            生成的回答
        """
        # 构建上下文
        context = self.build_context(query, top_k, context_depth)
        
        # 构建提示词（实际应用中应替换为真实LLM调用）
        prompt = f"""基于以下上下文信息回答问题:
        
        上下文:{context}
        
        问题: {query}
        
        回答:"""
        
        # 模拟LLM输出
        simulated_response = f"关于'{query}'的回答如下：\n"
        simulated_response += "在实际应用中，这里会是大型语言模型生成的详细回答。"
        
        return simulated_response


In [None]:
# # 初始化Neo4j Graph RAG系统
def manual_Neo4j():
    """
    Build a small hand-written knowledge graph and run a sample query on it.

    The database is wiped first. Entities are stored under hashed ids because
    ``add_text_chunk`` links chunks to entities by the hash of their name;
    storing them under raw ids left the MENTIONS relationships pointing at
    empty placeholder nodes. Relationships carry the display names ``s`` and
    ``t`` that ``build_context`` reads when it renders them.
    """
    neo4j_rag = Neo4jGraphRAG(
        uri="bolt://localhost:7687",
        user=USER,
        password=PWD
    )

    try:
        # Start from an empty database.
        with neo4j_rag.driver.session() as session:
            session.run("MATCH (n) DETACH DELETE n")

        # Ids derived exactly the way add_text_chunk derives them.
        einstein_id = neo4j_rag._normalize_entity_id("einstein")
        relativity_id = neo4j_rag._normalize_entity_id("relativity")
        newton_id = neo4j_rag._normalize_entity_id("newton")

        # Entities
        neo4j_rag.add_entity(
            einstein_id,
            "Scientist",
            {"name": "阿尔伯特·爱因斯坦", "birth_year": 1879,
             "description": "著名物理学家，提出了相对论"}
        )
        neo4j_rag.add_entity(
            relativity_id,
            "Theory",
            {"name": "相对论", "description": "关于时空和引力的物理理论"}
        )
        neo4j_rag.add_entity(
            newton_id,
            "Scientist",
            {"name": "艾萨克·牛顿", "birth_year": 1643,
             "description": "物理学家，提出了万有引力定律和三大运动定律"}
        )

        # Relationships between entities
        neo4j_rag.add_relationship(
            einstein_id,
            relativity_id,
            "DEVELOPED",
            {"year": 1905, "s": "阿尔伯特·爱因斯坦", "t": "相对论"}
        )
        neo4j_rag.add_relationship(
            einstein_id,
            newton_id,
            "WAS_INFLUENCED_BY",
            {"s": "阿尔伯特·爱因斯坦", "t": "艾萨克·牛顿"}
        )

        # Text chunks, linked to the entities they mention
        neo4j_rag.add_text_chunk(
            "chunk1",
            "爱因斯坦在1905年发表了狭义相对论，后来在1915年提出了广义相对论。",
            ["einstein", "relativity"]
        )
        neo4j_rag.add_text_chunk(
            "chunk2",
            "牛顿的力学理论在低速宏观情况下非常有效，但在高速或强引力场中需要相对论来解释。",
            ["newton", "relativity"]
        )

        # Sample query
        query = "相对论是谁提出的？它与牛顿的理论有什么关系？"
        print(f"查询: {query}")

        # Show the context that would be handed to the LLM.
        print("\n构建的上下文:")
        print(neo4j_rag.build_context(query))

        # Exercise the answer path as well; the simulated answer is not printed.
        response = neo4j_rag.generate_response(query, top_k=3, context_depth=2)

    finally:
        # Always release the connection.
        neo4j_rag.close()

# manual_Neo4j()

## Auto Graph RAG

### 三种匹配方式

**re（正则表达式）**
re 的核心优势在于高效精准与低门槛：基于明确字符规则匹配，处理结构化文本（如固定格式的手机号、日期）时速度极快，且无需依赖复杂模型，仅需编写规则即可快速落地，成本极低。但它的局限性也很明显，缺乏语义理解且适配性弱—— 只能识别字符形式，无法区分 “苹果（水果）” 和 “苹果（品牌）” 这类语义差异，面对非结构化文本（如自由对话）完全失效，且文本格式稍有变化就需重新调整规则，维护成本高。

**spaCy（NLP 工具库）**
spaCy 的核心价值是平衡语义能力与实用效率：作为预训练 NLP 工具，它自带实体识别（NER）、词性标注等功能，能理解文本语义（如自动识别 “北京” 是地名、“张三” 是人名），且无需从零训练模型，开箱即用，推理速度比 LLM 快得多，适合中等规模的结构化 NLP 任务。不过它的泛化性和灵活性不足—— 对医疗、法律等小众领域的专业术语识别准确率低，需额外微调才能适配，且无法处理 “提取用户抱怨的问题” 这类模糊需求，只能应对明确的实体或语法层面的匹配任务。

**LLM（大语言模型）**
LLM 的核心优势是超强语义理解与泛化能力：依托大规模语料训练，它能处理复杂、模糊的需求（如 “提取用户提到的、需要优先解决的产品故障”），不仅能识别多义、小众术语，还能结合上下文做逻辑判断，甚至排除无效信息。同时，它无需针对特定领域微调，就能适配多数场景，对非结构化文本的处理能力远超 re 和 spaCy。但它的短板也很突出—— 推理依赖大模型，处理大量文本时速度慢、算力 / API 成本高，且可能出现 “幻觉”（虚构匹配结果），结果可控性差，还会受输入文本质量影响，上下文不清晰时准确率大幅下降。


In [None]:
import re
import spacy
spacy.__version__


In [None]:
# Load the small Chinese spaCy pipeline ("_sm") used for entity recognition
# and dependency parsing.
spacy_model ="zh_core_web_sm"
try:
    nlp = spacy.load(spacy_model)
except OSError:
    # spacy.load raises OSError when the model package is not installed.
    # Catching only that keeps unrelated failures (and Ctrl-C) from silently
    # triggering a download, as the former bare `except:` did.
    import spacy.cli
    spacy.cli.download(spacy_model)
    nlp = spacy.load(spacy_model)

In [None]:
from openai import OpenAI
import json


BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"
MODEL_NAME = "qwen-plus"
api_key =  os.getenv("QWEN_API_KEY") or os.getenv("DASHSCOPE_API_KEY")

def build_prompt(text):
    """
    Build the structured extraction prompt sent to the LLM.

    The prompt states the task (extract entities and the relationships
    between them), the extraction rules and the exact JSON output format,
    followed by the text to process.

    Args:
        text: The text to extract entities and relationships from.

    Returns:
        The complete prompt string.
    """
    # NOTE: the backslash in `\\W` is escaped because this is a non-raw
    # f-string; a bare `\W` is an invalid escape sequence (SyntaxWarning on
    # Python 3.12+). The resulting prompt text is unchanged.
    prompt = f"""
    请处理以下中文文本，完成实体与关系提取任务：

    # 任务目标
    从文本中提取两类关键信息：
    1）**实体**：现实世界中存在或概念化的具体/抽象事物，包括但不限于人物、组织、地点、事件、概念、产品、时间、机构等
    2）**实体间的关系**：描述实体间的语义关联，如"任职于"、"位于"、"发布于"、"属于"等

    # 提取规则
    ## 实体提取：
    - 保留完整名称（如"阿里巴巴集团"而非"阿里巴巴"，"2024年世界人工智能大会"而非"AI大会"）
    - 同一实体的不同表述需合并（如"苹果公司"与"Apple Inc."统一标注为"苹果公司（Apple Inc.）"）
    - 过滤无意义的虚词、副词（如"的"、"非常"、"可能"），仅保留具有实际指代意义的名词/名词短语
    - 实体类型建议：人物、组织、公司、机构、地点、城市、国家、事件、产品、技术、概念、时间、日期、指标等

    ## 关系提取：
    - 关系需明确、无歧义（如"小明在腾讯工作"应提取为"小明-任职于-腾讯"，而非"小明-在-腾讯"）
    - 若文本中未直接说明但可逻辑推导明确的关系，需补充完整（如"张三是华为的CEO"→"张三-担任CEO-华为"）
    - 排除冗余关系（同一实体对的相同关系仅保留1次）
    - 关系必须基于文本内容，不可虚构
    - 代词需替换为对应的实体（如"他提出了狭义相对论"→主语为"阿尔伯特·爱因斯坦"）
    
    # 输出格式（严格遵循）
    - 仅输出纯 JSON 内容，不要包含任何多余文本
    - 禁止使用任何代码块标记（如 ```json、``` 等）
    - 确保 JSON 格式正确，可被直接解析
    - id = re.sub(r'\\W+', '_', name.lower().strip())
    {{
    "entities": [
        {{
        "name": "示例实体1",
        "label": "人物" 
        }},
        {{
        "name": "示例实体2",
        "label": "组织"
        }}
    ],
    "relations": [
        {{
        "source": "示例实体1",
        "type": "任职于",
        "target": "示例实体2"
        }}
    ]
    }}

    文本内容：
    {text}
    """
    return prompt

def add_id_to_single_item(item):
    """
    Return a copy of a dict item with an ``id`` derived from its ``name``.

    The id is the SHA-256 hex digest (64 characters) of the lower-cased,
    stripped name. An item that already has an ``id`` is copied unchanged,
    and the input dict is never modified. Anything that is not a dict yields
    ``None``.
    """
    if not isinstance(item, dict):
        return None

    enriched = item.copy()
    if "id" in enriched:
        # Keep an id that is already present.
        return enriched

    canonical_name = item['name'].lower().strip()
    enriched["id"] = hashlib.sha256(canonical_name.encode('utf-8')).hexdigest()
    return enriched


        
def extract_entities_relations_llm(text):
    """
    Ask the LLM to extract entities and relationships and parse its JSON reply.

    Args:
        text: The text to analyse.

    Returns:
        A tuple ``(entities, relations)`` holding the two lists from the
        model's JSON object; a missing key yields an empty list.

    Raises:
        ValueError: If the reply does not contain a valid JSON object.
    """
    # Local import so this cell does not depend on another cell importing re.
    import re

    prompt = build_prompt(text)

    client = OpenAI(
        api_key=api_key,
        base_url=BASE_URL,
    )
    completion = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=MODEL_NAME,
        stream=False,
    )
    # The content can be None (e.g. a filtered reply); treat that as empty.
    output = completion.choices[0].message.content or ""

    # Models sometimes wrap the payload in a Markdown fence, with or without
    # the "json" tag, despite being told not to.
    json_str = re.sub(r'^```(?:json)?\s*|\s*```$', '', output.strip(), flags=re.MULTILINE)

    # If extra prose surrounds the payload, keep only the outermost {...}.
    start, end = json_str.find("{"), json_str.rfind("}")
    if start != -1 and end > start:
        json_str = json_str[start:end + 1]

    try:
        data = json.loads(json_str)
    except json.JSONDecodeError as exc:
        # Chain the parser error so the offending position is not lost.
        raise ValueError("大模型输出的 JSON 格式错误") from exc

    if not isinstance(data, dict):
        # Valid JSON, but not the object the prompt asked for.
        raise ValueError("大模型输出的 JSON 格式错误")

    entities = data.get('entities', [])
    relations = data.get('relations', [])
    return entities, relations


In [None]:
class AutoGraphRAG(Neo4jGraphRAG):
    """
    Graph RAG that populates the knowledge graph automatically from raw text.

    Entities and relationships are extracted either with spaCy plus
    regular-expression rules, or by an LLM (see ``add_document``).
    """

    # One character of an entity mention in the rule-based patterns: anything
    # except whitespace and sentence/clause punctuation.
    _MENTION = r"[^\s，。；！？、,.;!?]"

    def __init__(self, uri, user, password, model_name = os.path.join(project_dir, "model/BAAI/bge-small-zh"),spacy_model ="zh_core_web_sm"):
        """
        Connect to Neo4j, load the embedding model and the spaCy pipeline.

        Args:
            uri: Neo4j connection URI.
            user: Database user name.
            password: Database password.
            model_name: Name or local path of the embedding model.
            spacy_model: Name of the spaCy pipeline to load.
        """
        super().__init__(uri, user, password, model_name)
        self.nlp  = spacy.load(spacy_model)

        # Rule-based relationship patterns as (regex, relation template) pairs.
        # Every regex has three groups: source, relation word, target; the
        # template is expanded against the match, so r"\2" means "use the
        # matched relation word". Chinese text has no spaces between words, so
        # the separators are optional (\s*) instead of literal spaces, and
        # mentions stop at punctuation so a match cannot swallow a sentence.
        source = rf"({self._MENTION}+?)"
        target = rf"({self._MENTION}+)"
        self.relationship_patterns = [
            (rf"{source}\s*(发明|发现|提出)了?\s*{target}", r"\2"),   # e.g. 爱因斯坦提出了相对论
            (rf"{source}\s*(影响|启发)了?\s*{target}", r"\2"),        # e.g. 牛顿影响了爱因斯坦
            (rf"{source}\s*(出生于|生于)\s*{target}", r"\2"),         # e.g. 爱因斯坦出生于1879年
            (rf"{source}\s*(是)\s*{target}", r"\2"),                  # e.g. 爱因斯坦是物理学家
            (rf"{source}\s*(发表|撰写)了?\s*{target}", r"\2")         # e.g. 爱因斯坦发表了论文
        ]

    def _extract_entities(self, text: str) -> List[Dict]:
        """
        Extract named entities from the text with spaCy.

        Returns:
            One ``{"name", "label"}`` dict per distinct entity text, in order
            of first appearance; ``label`` is spaCy's entity label.
        """
        doc = self.nlp(text)
        entities = []
        seen_names = set()

        for ent in doc.ents:
            # Keep only the first occurrence of each entity text.
            if ent.text in seen_names:
                continue
            seen_names.add(ent.text)
            entities.append({
                "name": ent.text,
                "label": ent.label_,
            })

        return entities

    def _extract_relationships(self, text: str) -> List[Dict]:
        """
        Extract relationships from the text with rules and dependency parsing.

        Rule matches come first, followed by subject-verb-object triples found
        in spaCy's dependency parse. Duplicate triples are dropped.

        Returns:
            A list of ``{"source", "target", "type"}`` dicts.
        """
        relationships = []
        seen = set()

        def _remember(source: str, target: str, rel_type: str) -> None:
            """Append the triple unless an identical one was already recorded."""
            key = (source, target, rel_type)
            if key not in seen:
                seen.add(key)
                relationships.append({
                    "source": source,
                    "target": target,
                    "type": rel_type
                })

        # 1) Rule-based extraction.
        for pattern, rel_template in self.relationship_patterns:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                s_obj = match.group(1).strip()
                # The target is the last group, so a two-group pattern no
                # longer raises IndexError.
                t_obj = match.group(match.re.groups).strip()
                # Expand the template (e.g. r"\2") against the match; using it
                # verbatim produced the literal type "\2".
                rel_type = match.expand(rel_template).strip().upper()

                # Keep only meaningful relationships.
                if s_obj and t_obj and rel_type and s_obj != t_obj:
                    _remember(s_obj, t_obj, rel_type)

        # 2) Dependency-based extraction. Coreference resolution is skipped:
        # the coreferee component does not support Chinese.
        # Use the pipeline owned by this instance, not a notebook global.
        doc_resolved = self.nlp(text)
        for token in doc_resolved:
            # Subjects and objects normally attach to a verb.
            if token.dep_ in ["nsubj", "dobj", "pobj"] and token.head.pos_ == "VERB":
                s_obj = None
                t_obj = None

                # Collect the subject and object of that verb.
                for child in token.head.children:
                    if child.dep_ == "nsubj":
                        s_obj = child.text
                    if child.dep_ in ["dobj", "pobj"]:
                        t_obj = child.text

                if s_obj and t_obj:
                    # The verb's surface form is the relation type, because
                    # the lemma is empty for Chinese.
                    _remember(s_obj, t_obj, token.head.text)

        return relationships

    def add_document(self, doc_id: str, text: str,isLLm=False):
        """
        Add a document, extracting its entities and relationships automatically.

        Args:
            doc_id: Unique name of the document; used as the text chunk name.
            text: Document text.
            isLLm: Extract with the LLM when true, otherwise with spaCy and
                the rule patterns.
        """
        if isLLm:
            entities, relationships = extract_entities_relations_llm(text)
        else:
            entities = self._extract_entities(text)
            relationships = self._extract_relationships(text)

        # Entities: the entity type becomes the node label.
        known_ids = set()
        for ent in entities:
            # LLM output may omit the label; fall back to the base label.
            label = ent.get("label") or "Entity"
            ent_id = self._normalize_entity_id(ent["name"])
            self.add_entity(
                ent_id,
                label,
                {"name": ent["name"], "description": f"从文本中提取的{label}实体"}
            )
            known_ids.add(ent_id)

        # Relationships
        for rel in relationships:
            source_id = self._normalize_entity_id(rel["source"])
            target_id = self._normalize_entity_id(rel["target"])

            if not isLLm:
                # spaCy/rule triples can mention things NER did not find, so
                # make sure both endpoints exist. Endpoints already added above
                # are skipped: re-adding them under the generic label created a
                # second node with the same id.
                for endpoint_id, endpoint_name in ((source_id, rel["source"]), (target_id, rel["target"])):
                    if endpoint_id not in known_ids:
                        self.add_entity(endpoint_id, "Entity", {"name": endpoint_name})
                        known_ids.add(endpoint_id)

            self.add_relationship(source_id, target_id, rel["type"], {"s": rel["source"], "t": rel["target"]})

        # Text chunk, linked to every extracted entity.
        self.add_text_chunk(doc_id, text, [e["name"] for e in entities])

In [None]:
def create_graph():
    """
    Rebuild the knowledge graph from two sample documents.

    Wipes the database, prints the spaCy/rule-based and the LLM extraction
    results for the first document side by side, ingests both documents
    through the LLM path, and finally deletes label-less nodes that carry an
    id. The driver is closed when the function exits.
    """
    auto_rag = AutoGraphRAG(
            uri="bolt://localhost:7687",
            user=USER,
            password=PWD
        )
    driver = auto_rag.driver
    try:
        with driver.session() as session:
            # Start from an empty database.
            session.run("MATCH (n) DETACH DELETE n")
        
            # Sample documents from which the graph is built automatically.
            # The continuation lines of these literals keep their leading
            # indentation as part of the string.
            doc1 = """爱因斯坦是一位著名的物理学家，1879年出生于德国。
            爱因斯坦在1905年提出了狭义相对论，1915年发表了广义相对论。
            爱因斯坦受到了牛顿的影响，但他的理论扩展了牛顿力学的适用范围。"""
            
            doc2 = """牛顿是17世纪的英国物理学家，他提出了万有引力定律和三大运动定律。
            这些理论为经典力学奠定了基础，影响了后来包括爱因斯坦在内的许多科学家。"""
            
            # Compare the spaCy/rule-based extraction with the LLM extraction.
            entities =auto_rag._extract_entities(doc1)
            print("e ",entities)
            relations =auto_rag._extract_relationships(doc1)
            print("r ",relations)

            entities,relations = extract_entities_relations_llm(doc1)
            print("e ",entities)
            print("r ",relations)

            # Ingest the documents (True selects the LLM extraction path).
            auto_rag.add_document("doc1", doc1,True)
            auto_rag.add_document("doc2", doc2,True)
            # Remove bare nodes: nodes that have an id but no label, together
            # with their relationships.
            session.run(
                '''
                MATCH (n)
                WHERE size(labels(n)) = 0 AND n.id IS NOT NULL
                DETACH DELETE n
                '''
            )
    finally:
        driver.close()

# create_graph()

In [None]:
#检查节点和关系
# Inspect the nodes and relationships currently stored in the graph.
auto_rag = AutoGraphRAG(
        uri="bolt://localhost:7687",
        user=USER,
        password=PWD
    )
driver = auto_rag.driver
try:
    with driver.session() as session:

        # Ids shared by more than one node (should print an empty list).
        result = session.run("""
            MATCH (n)
            WHERE n.id IS NOT NULL
            WITH n.id AS id, collect(n) AS nodes, count(*) AS cnt
            WHERE cnt > 1
            UNWIND nodes AS node
            RETURN id,cnt, labels(node) AS labels, properties(node) AS props
        """)
        print([(record["id"],record['labels']) for record in result]) 

        # Every node with its labels.
        result = session.run("""
            MATCH (n)
            RETURN n.id AS id, labels(n) AS labels, properties(n) AS props
        """)
        print([(record["id"],record['labels']) for record in result]) 

        # Distinct label combinations.
        result = session.run("""
            MATCH (n)
            RETURN DISTINCT labels(n) AS labels
        """)
        print([record["labels"] for record in result]) 

        # Distinct relationship types.
        result = session.run("""
            MATCH ()-[r]->()
            RETURN DISTINCT type(r) AS relation_type
        """)
        print([record["relation_type"] for record in result]) 
finally:
    # Release the driver; it was previously left open when the cell finished.
    auto_rag.close()

In [None]:
# 测试相关节点查询
# Smoke-test retrieval: build and print the context for a sample question.
auto_rag = AutoGraphRAG(
        uri="bolt://localhost:7687",
        user=USER,
        password=PWD
    )

driver = auto_rag.driver
try:
    # build_context opens the sessions it needs, so no session is opened here
    # (the previous one was never used).
    query = "爱因斯坦提出了什么理论？他受到了谁的影响？"
    print(f"查询: {query}")

    # Show the context that would be handed to the LLM.
    print("\n构建的上下文:")
    print(auto_rag.build_context(query))

    # To see the (simulated) answer as well:
    # response = auto_rag.generate_response(query)
    # print("\n回答:")
    # print(response)
finally:
    driver.close()

In [None]:
from typing import Any, List, Mapping, Optional, Dict
from langchain.llms.base import LLM
from langchain.callbacks.manager import CallbackManagerForLLMRun
from openai import OpenAI
from pydantic import Field

BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"
ONLINE_MODEL_NAME = "qwen-plus"


class OpenAILLM(LLM):
    """
    LangChain ``LLM`` wrapper around an OpenAI-compatible chat completions API.

    A client is created per call from ``api_key`` and ``base_url``. No
    constructor override is needed: the former one only assigned
    ``OpenAI.api_key``, a class-wide mutation the per-call client never reads.
    """

    # API key passed to the client on every call.
    api_key: Optional[str] = Field(None)
    # Base URL of the OpenAI-compatible endpoint.
    base_url: Optional[str] = Field(BASE_URL)
    # Name of the chat model to call.
    model_name: str = Field(ONLINE_MODEL_NAME)
    # Sampling temperature forwarded to the API.
    temperature: float = Field(0.7)
    # Maximum number of tokens to generate.
    max_tokens: int = Field(1024)

    @property
    def _llm_type(self) -> str:
        """Identifier of this LLM type (the model name)."""
        return self.model_name

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Parameters that identify this LLM configuration."""
        return {
            "model_name": self.model_name,
            "temperature": self.temperature,
            "max_tokens": self.max_tokens,
        }

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """
        Send the prompt to the chat completions endpoint and return the reply.

        Args:
            prompt: User prompt.
            stop: Optional stop sequences; forwarded to the API when given.
            run_manager: LangChain callback manager (unused).
            **kwargs: Accepted for interface compatibility (unused).

        Returns:
            The text of the first completion choice.

        Raises:
            ValueError: If the API call fails; the original error is chained.
        """
        request: Dict[str, Any] = {
            "model": self.model_name,
            "messages": [
                {
                    "role": "system",
                    "content": "你是一个智能超级助手，请用[中文]专业的词语回答问题，整体上下文带有逻辑性，并以markdown格式输出",
                },
                {"role": "user", "content": prompt},
            ],
            "temperature": self.temperature,
            "max_tokens": self.max_tokens,
        }
        if stop:
            # Honour the stop sequences LangChain passes in; they used to be
            # dropped silently.
            request["stop"] = stop

        try:
            client = OpenAI(
                api_key=self.api_key,
                base_url=self.base_url,
            )
            completion = client.chat.completions.create(**request)
            # Return the generated text.
            return completion.choices[0].message.content

        except Exception as e:
            # Chain the cause so the underlying API error stays visible.
            raise ValueError(f"调用OpenAI API时发生错误: {str(e)}") from e

    def predict(self, text: str, **kwargs: Any) -> str:
        """Convenience entry point that forwards to ``_call``."""
        return self._call(text, **kwargs)

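APOC 插件下载地址（LangChain 的 `Neo4jGraph` 读取图谱 schema 时依赖 APOC；请选择与 Neo4j 版本匹配的发行版，将 jar 放入 Neo4j 的 `plugins/` 目录后重启服务）：https://github.com/neo4j/apoc/releases/tag/2025.05.1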

In [None]:
from langchain.chains import GraphCypherQAChain
from langchain_community.graphs import Neo4jGraph
from langchain.prompts import PromptTemplate

# Connect LangChain to the same Neo4j database and print the schema it
# derives from the graph; this schema is what the Cypher-generation prompt
# later receives.
# NOTE(review): schema refresh presumably relies on the APOC plugin (see the
# release link in the preceding markdown cell) — confirm it is installed.
graph = Neo4jGraph(
    url="bolt://localhost:7687",
    username=USER,
    password=PWD,
    refresh_schema=True
)
print(graph.schema)


In [46]:
api_key =  os.getenv("QWEN_API_KEY") or os.getenv("DASHSCOPE_API_KEY")
BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"
ONLINE_MODEL_NAME = "qwen-plus"
prompt="爱因斯坦提出了什么理论？他受到了谁的影响？"
llm = OpenAILLM(model_name=ONLINE_MODEL_NAME, base_url=BASE_URL, api_key=api_key)

# Prompt that asks the LLM to turn a question into Cypher, given the graph
# schema. It insists on the Chinese labels/relationship types the graph uses.
cypher_template = """
你需要根据以下中文图结构生成Cypher查询，解决用户问题：

图结构：
{schema}

用户问题：{question}

生成规则：
1. 必须使用中文标签（如:人物）、中文关系（如-[提出]->）、中文属性（如姓名）
2. 不要使用英文标签/关系/属性
3. 确保查询语法正确

生成的Cypher：
"""
cypher_prompt = PromptTemplate(
    template=cypher_template,
    # input_variables lists the placeholder *names* used in the template; the
    # schema text itself was passed here before, which is not a variable name.
    input_variables=["schema", "question"]
)
qa_prompt = PromptTemplate.from_template("根据查询结果回答问题：\n{context}\n问题：{question}")

cypher_generation_chain = cypher_prompt | llm
qa_chain = qa_prompt | llm

# Build the question -> Cypher -> answer chain.
chain = GraphCypherQAChain.from_llm(
    llm=llm,
    graph=graph,
    # graph_schema=graph.schema,
    cypher_prompt=cypher_prompt,
    qa_prompt =qa_prompt,
    verbose=True,
    allow_dangerous_requests=True
)

# Run the question through the chain.
response = chain.run({"query": prompt})



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (p:人物 {name: "爱因斯坦"})-[:提出]->(t:理论)
WITH p, collect(t.name) AS 提出的理论
MATCH (p)-[:受到影响于]->(influencer:人物)
RETURN 
  提出的理论 AS 爱因斯坦提出的理论,
  collect(influencer.name) AS 受到影响的人物
[0m
Full Context:
[32;1m[1;3m[{'爱因斯坦提出的理论': ['狭义相对论'], '受到影响的人物': ['牛顿']}][0m

[1m> Finished chain.[0m
