# 下载模型与数据集

In [None]:
import os

# Point Hugging Face downloads at the hf-mirror.com mirror endpoint.
# NOTE(review): this is deliberately placed before the huggingface_hub import
# below — presumably the library reads HF_ENDPOINT when it is first imported;
# confirm before reordering these lines.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
from huggingface_hub import snapshot_download
from huggingface_hub.utils import HfHubHTTPError


def download_model(target_dir, repo_id):
    """
    Download an entire HuggingFace repository into a local directory.

    Failures are reported on stdout instead of being raised, so a failed
    download does not abort the calling cell. The return value tells the
    caller whether the download succeeded.

    Args:
        target_dir (str): Local directory the repository files are saved to.
        repo_id (str): Repository ID on the HuggingFace Hub.

    Returns:
        The path returned by ``snapshot_download`` on success, or ``None``
        when the download failed for any reason.
    """
    print(f"准备开始下载 '{repo_id}' ...")
    print(f"目标保存路径: {os.path.abspath(target_dir)}")

    try:
        # snapshot_download fetches the whole repository.
        # local_dir_use_symlinks=False requests real files in target_dir
        # rather than symlinks into the cache.
        # NOTE(review): recent huggingface_hub releases appear to deprecate
        # local_dir_use_symlinks / resume_download — confirm against the
        # installed version before removing them.
        local_path = snapshot_download(
            repo_id=repo_id,
            local_dir=target_dir,
            local_dir_use_symlinks=False,
            resume_download=True,  # resume interrupted downloads
            # max_workers=4        # optional: raise concurrency if bandwidth allows
        )
        print(f"✅ 下载成功！文件已保存在: {local_path}")
        return local_path

    except HfHubHTTPError as e:
        print(f"❌ 下载失败 (网络或权限错误): {e}")
    except Exception as e:
        # Deliberate best-effort: report and fall through to the None return.
        print(f"❌ 发生未知错误: {e}")

    # Reached only when one of the handlers above ran.
    return None


下载向量模型：Qwen/Qwen3-Embedding-8B

In [None]:
# Fetch the Qwen3 embedding model repository into the local model directory.
repo_id = "Qwen/Qwen3-Embedding-8B"
target_dir = "/root/autodl-tmp/model/qwen3-8b"
download_model(target_dir, repo_id)

下载识别模型：ds4sd/docling-models

In [None]:
# Fetch the docling model repository into the local model directory.
download_model("/root/autodl-tmp/model/docling", "ds4sd/docling-models")

# Pipeline

In [1]:
from pipeline import RunConfig, PipelineConfig, Pipeline

# Two preprocessing variants that differ only in the use_serialized_tables flag.
preprocess_configs = {
    label: RunConfig(use_serialized_tables=serialize_tables)
    for label, serialize_tables in (("ser_tab", True), ("no_ser_tab", False))
}

# Run configuration handed to the Pipeline later in this notebook.
# Keyword arguments are grouped by apparent purpose; the values are unchanged.
max_st_qwenturbo8k_reasoning_config = RunConfig(
    # retrieval-related switches
    use_serialized_tables=False,
    parent_document_retrieval=True,
    llm_reranking=True,
    # provider / model selection and request concurrency
    api_provider="qwen",
    answering_model="qwen-turbo",
    parallel_requests=1,
    # naming / metadata fields
    submission_name="",
    pipeline_details="",
    config_suffix="_max_qwen-turbo1-llmre-reasoning",
)

2026-01-17 16:15:25,292 - Loading faiss with AVX2 support.
2026-01-17 16:15:25,356 - Successfully loaded faiss with AVX2 support.


In [2]:
from pyprojroot import here

# Resolve the test-set directory relative to the project root located by here().
root_path = here().joinpath("data", "test_set")
print("root_path:", root_path)

# Build the pipeline over that directory with the run configuration defined earlier.
pipeline = Pipeline(root_path, run_config=max_st_qwenturbo8k_reasoning_config)

root_path: d:\program\project\python\MHier-RAG\data\test_set


此方法将 PDF 报告解析为 JSON 文件，并将其保存在 debug_data/01_parsed_reports 目录中。这些 JSON 文件将在后续步骤中使用。
它还会将 Docling 生成的原始输出存储在 debug_data/01_parsed_reports_debug 目录中。这些 JSON 文件包含大量元数据，但后续步骤不会使用。
在这一步，会调用多模态大模型生成图片描述。

In [3]:
import os

# Point Hugging Face downloads at the hf-mirror.com mirror endpoint.
# NOTE(review): `pipeline` was imported in an earlier cell; if that import
# already loaded huggingface_hub, this assignment may come too late to take
# effect — confirm, and consider setting it before the first import.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
# Parse the PDF reports under root_path into JSON.
pipeline.parse_pdf_reports_sequential()

2026-01-17 16:16:08,614 - Starting to process 1 documents
2026-01-17 16:16:08,720 - Going to convert document batch...
2026-01-17 16:16:50,913 - Accelerator device: 'cpu'
2026-01-17 16:16:51,145 - Accelerator device: 'cpu'
2026-01-17 16:16:51,730 - Processing document Elizabeth_I.pdf
2026-01-17 16:17:17,079 - Finished converting document Elizabeth_I.pdf in 68.45 sec.
2026-01-17 16:17:18,259 - HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 200 OK"
2026-01-17 16:17:28,142 - HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 200 OK"
2026-01-17 16:17:29,483 - HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 200 OK"
2026-01-17 16:17:41,019 - HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 200 OK"
2026-01-17 16:17:41,382 - HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completion

PDF reports parsed and saved to d:\program\project\python\MHier-RAG\data\test_set\debug_data\01_parsed_reports


此方法将 debug_data/01_parsed_reports 中的 JSON 转换为更简单的 JSON，即 Markdown 格式的页面列表。
新的 JSON 文件位于 debug_data/02_merged_reports 目录中。

In [4]:
# Merge the parsed report JSON into the simpler per-page Markdown JSON.
pipeline.merge_reports()

Reports saved to d:\program\project\python\MHier-RAG\data\test_set\debug_data\02_merged_reports


此方法将报告导出为纯 Markdown 格式。这些报告仅用于人工审阅，以及全文搜索配置（gemini_thinking_config）。
新文件位于 debug_data/03_reports_markdown 目录下。

In [5]:
# Export the reports as plain Markdown files.
pipeline.export_reports_to_markdown()

Reports saved to d:\program\project\python\MHier-RAG\data\test_set\debug_data\03_reports_markdown


此方法将报告分割成多个数据块，用于向量化处理
新的 JSON 文件位于 databases/chunked_reports 目录中。

In [None]:
# TODO: the Qwen embedding API calls made during this step are issued too fast
# concurrently and get rejected by the provider (see the RateLimitError / HTTP 429
# in this cell's output); the request rate needs to be throttled.
pipeline.chunk_reports()

2026-01-17 16:27:59,794 - Successfully initialized TreeBuilder with Config 
        TreeBuilderConfig:
            Tokenizer: <Encoding 'cl100k_base'>
            Max Tokens: 100
            Num Layers: 5
            Threshold: 0.5
            Top K: 5
            Selection Mode: top_k
            Summarization Length: 100
            Summarization Model: <raptor.SummarizationModels.QwenSummarizationModel object at 0x0000021B879B01F0>
            Embedding Models: {'EMB': <raptor.EmbeddingModels.Qwen3EmbeddingModel object at 0x0000021B879B31C0>}
            Cluster Embedding Model: EMB
        
        Reduction Dimension: 10
        Clustering Algorithm: RAPTOR_Clustering
        Clustering Parameters: {}
        
2026-01-17 16:27:59,795 - Successfully initialized ClusterTreeBuilder with Config 
        TreeBuilderConfig:
            Tokenizer: <Encoding 'cl100k_base'>
            Max Tokens: 100
            Num Layers: 5
            Threshold: 0.5
            Top K: 5
            Sel

RateLimitError: Error code: 429 - {'error': {'message': 'You have exceeded your current request limit. For details, see: https://help.aliyun.com/zh/model-studio/error-code#rate-limit', 'type': 'limit_requests', 'param': None, 'code': 'limit_requests'}, 'request_id': 'e6357145-c010-9507-b1df-b990cb137c20'}

此方法从分块报告中创建向量数据库
新文件位于 databases/vector_dbs 目录中。

In [None]:
# Build the vector databases from the chunked reports.
pipeline.create_vector_dbs()

此方法处理问题和答案
问题处理逻辑取决于run_config

In [None]:
# Process the questions and produce answers; behaviour depends on the run_config
# passed to Pipeline.
pipeline.process_questions()