In [4]:
from GraphReasoning.graph_generation import *
from GraphReasoning.graph_analysis import *
from GraphReasoning.graph_tools import *
from GraphReasoning.llm_providers import *

选择输入文件：支持 PDF、Markdown（.md / .markdown）和 TXT 格式。

In [None]:
import os
from pathlib import Path

# Select the input document and validate it.
#
# Flow: try a native file-picker dialog first; if no file was picked (dialog
# cancelled, tkinter missing, or no display available) fall back to typing the
# path by hand. On success `source_file` holds the validated path; on failure
# an exception is raised so later cells cannot run against a missing or stale
# `source_file`.
valid_extensions = {'.pdf', '.md', '.markdown', '.txt'}

selected_path = ""
try:
    # Imported lazily: tkinter may be absent or unusable (e.g. headless server).
    import tkinter as tk
    from tkinter import filedialog

    root = tk.Tk()
    try:
        root.withdraw()  # hide the empty root window, show only the dialog
        selected_path = filedialog.askopenfilename(
            title="选择文件",
            filetypes=[
                ("PDF", "*.pdf"),
                ("Markdown", "*.md *.markdown"),
                ("Text", "*.txt"),
                ("All files", "*.*"),
            ],
        )
    finally:
        # Always release the Tk root, even when the dialog itself raised.
        root.destroy()
except Exception as e:
    print(f"文件选择窗口打开失败，改用手动输入路径: {e}")

if selected_path:
    file_path = selected_path
else:
    # Fallback: manual entry. Surrounding quotes are stripped because paths
    # pasted from a file manager or shell are often quoted.
    file_path = input("请输入文件路径 (PDF、Markdown 或 TXT 文件): ").strip().strip('"\'')

# Validate that the path points to an existing regular file with a supported
# extension. is_file() (rather than exists()) also rejects directories.
file_extension = Path(file_path).suffix.lower()

if not Path(file_path).is_file():
    raise FileNotFoundError(f"错误: 文件不存在，请检查路径: {file_path!r}")
if file_extension not in valid_extensions:
    # sorted() keeps the message stable; set iteration order is arbitrary.
    raise ValueError(
        f"错误: 不支持的文件格式。请使用以下格式之一: {', '.join(sorted(valid_extensions))}"
    )

source_file = file_path
print(f"✓ 文件已加载: {file_path}")

调用 graph_generation.py 中的函数，从输入文件中提取文本，并生成 chunks

In [None]:
# Read the selected document and split its text into overlapping chunks.
# NOTE(review): extract_text_from_file / generate_chunks are expected to be
# provided by the GraphReasoning star-imports; adjust the names here if
# graph_generation.py spells them differently.
text = extract_text_from_file(source_file)
chunks = generate_chunks(
    text,
    chunk_size=500,
    overlap=50,
)

# Report how many chunks were produced.
print(f"✓ 已生成 {len(chunks)} 个 chunks")

In [None]:


# Make the project importable and build the LLM `generate` callable that the
# graph-extraction cells use.
import os
import sys
import warnings

# The notebook lives in Notebooks/, so the project root is one level up.
# The membership check keeps the cell idempotent: re-running it does not pile
# up duplicate sys.path entries.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

from GraphReasoning import graph_generation

# Build the generate function through the provider factory (recommended; the
# original note points to the usage in graph_generation.py's main).
from GraphReasoning.llm_providers import get_generate_fn

provider = "openai"  # alternatives: deepseek / qwen / llama_cpp / transformers
provider_config = {
    "api_key": os.getenv("OPENAI_API_KEY", ""),
    "model": os.getenv("OPENAI_MODEL", "gpt-4-turbo"),
    # deepseek/qwen additionally need base_url; llama_cpp needs model_path;
    # transformers needs a model name.
}

# Surface a missing key here instead of as an opaque authentication failure on
# the first LLM call. Only a warning, because other providers may not need it.
if provider == "openai" and not provider_config["api_key"]:
    warnings.warn("未设置 OPENAI_API_KEY 环境变量，后续调用 LLM 时可能会认证失败。")

generate = get_generate_fn(provider, provider_config)


In [None]:

# Approach 1: extract a knowledge graph from every chunk independently, then
# merge the per-chunk graphs into a single graph.
#
# Requires `chunks`, `generate` and `graph_generation` from the earlier cells.
import networkx as nx

if not chunks:
    # Fail with a clear message instead of a NameError in the summary below.
    raise ValueError("chunks 为空，请先运行生成 chunks 的单元格")

knowledge_graphs = []
for i, chunk in enumerate(chunks, start=1):
    # Core call: build a graph from this chunk's text.
    graph_html, graph_graphml, G, net, output_pdf = graph_generation.make_graph_from_text(
        txt=chunk,
        generate=generate,
        include_contextual_proximity=True,
        # Unique root per chunk. With one shared root every iteration would
        # presumably write to the same output files and overwrite the previous
        # chunk's result -- TODO confirm against make_graph_from_text.
        graph_root=f"notebook_test_{i}",
        chunk_size=500,
        chunk_overlap=0,
        repeat_refine=0,
        verbatim=True,
        data_dir="./test_output/",
        save_PDF=False,
        save_HTML=True,
    )
    knowledge_graphs.append((graph_html, graph_graphml, G, net, output_pdf))
    print(f"✓ 已处理 chunk {i}/{len(chunks)}")
    print("HTML:", graph_html)
    print("GraphML:", graph_graphml)

# Merge all per-chunk graphs so the summary covers the whole document rather
# than only the last chunk.
# NOTE(review): assumes each G is a networkx graph of the same kind (the code
# already relies on number_of_nodes/number_of_edges) -- confirm.
chunk_graphs = [kg[2] for kg in knowledge_graphs if kg[2] is not None]
if not chunk_graphs:
    raise ValueError("没有成功生成任何图，无法合并")
G_merged = nx.compose_all(chunk_graphs)

print("Nodes:", G_merged.number_of_nodes())
print("Edges:", G_merged.number_of_edges())



In [5]:
# 方案2：逐块增量合并（add_new_subgraph_from_text）
from transformers import AutoTokenizer, AutoModel
from GraphReasoning.graph_tools import update_node_embeddings
import networkx as nx
import shutil
import os
from pathlib import Path
import time

# Approach 2 body: build a base graph from the first chunk, then merge every
# following chunk into it with add_new_subgraph_from_text.
#
# Requires `chunks`, `generate` and `graph_generation` from the earlier cells;
# running this cell on a fresh kernel without them raises NameError.

# Lightweight embedding model (swap in a larger one if needed).
embed_model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(embed_model_name)
model = AutoModel.from_pretrained(embed_model_name)

# Start with no node embeddings (load existing ones here if available).
node_embeddings = {}

# Output directory for intermediate and final graph files.
data_dir_output = "./test_output/"
Path(data_dir_output).mkdir(exist_ok=True, parents=True)

graph_graphml_agg = None  # path of the current aggregated GraphML file
G_agg = None              # current aggregated graph object

for i, chunk in enumerate(chunks, start=1):
    # perf_counter is monotonic, so the elapsed time cannot go negative or
    # jump if the wall clock is adjusted mid-run.
    t0 = time.perf_counter()
    print(f"\n=== 增量处理 chunk {i}/{len(chunks)} ===")

    if G_agg is None:
        # No aggregate yet: generate the base graph directly from this chunk.
        graph_html, graph_graphml, G_agg, net, output_pdf = graph_generation.make_graph_from_text(
            txt=chunk,
            generate=generate,
            include_contextual_proximity=True,
            graph_root=f"notebook_incremental_base_{i}",
            chunk_size=500,
            chunk_overlap=0,
            repeat_refine=0,
            verbatim=True,
            data_dir=data_dir_output,
            save_PDF=False,
            save_HTML=True,
        )
        if G_agg is None:
            # Defensive: no graph came back, so try the next chunk as the base.
            print(f"警告: chunk {i} 未能生成基图，已跳过")
            continue
        graph_graphml_agg = graph_graphml
        print(f"基图完成: {graph_graphml_agg}")
    else:
        # Aggregate exists: merge the sub-graph extracted from this chunk.
        (
            graph_GraphML_new,
            G_new,
            G_loaded,
            G_orig,
            updated_embeddings,
            res,
        ) = graph_generation.add_new_subgraph_from_text(
            txt=chunk,
            generate=generate,
            node_embeddings=node_embeddings,
            tokenizer=tokenizer,
            model=model,
            original_graph_path_and_fname=graph_graphml_agg,
            data_dir_output=data_dir_output,
            verbatim=True,
            size_threshold=0,
            chunk_size=500,
            do_Louvain_on_new_graph=False,
            include_contextual_proximity=False,
            repeat_refine=0,
            similarity_threshold=0.9,
            do_simplify_graph=False,
            return_only_giant_component=False,
            save_common_graph=False,
        )
        # NOTE(review): the helper may hand back None when generation fails --
        # confirm. Without this guard a None would reset G_agg, and the next
        # iteration would silently rebuild a base graph and drop everything
        # accumulated so far.
        if G_new is None or graph_GraphML_new is None:
            print(f"警告: chunk {i} 增量合并失败，保留之前的聚合图")
            continue
        if updated_embeddings is not None:
            node_embeddings = updated_embeddings
        # Promote the merged result to be the new aggregate.
        G_agg = G_new
        graph_graphml_agg = graph_GraphML_new
        print(f"增量图完成: {graph_graphml_agg}")

    print(f"当前节点/边: {G_agg.number_of_nodes()} / {G_agg.number_of_edges()}")
    print(f"耗时: {time.perf_counter() - t0:.1f}s")

if G_agg is None:
    # Empty `chunks` (or every chunk failed): nothing to summarise.
    print("\n未生成任何图：chunks 为空或全部处理失败")
else:
    print("\n最终聚合图:", graph_graphml_agg)
    print("最终节点/边:", G_agg.number_of_nodes(), G_agg.number_of_edges())

NameError: name 'chunks' is not defined