# 第二步：导入与环境配置

In [None]:
# Cell 2: 导入库与配置
from __future__ import annotations
import os, io, math, json
from pathlib import Path
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)
from typing import Dict, Any, List
import fitz  # PyMuPDF
from PIL import Image
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as patches

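# --- Debug note: in a notebook you usually want to see figures inline, not only render them off-screen ---
# The line below is left commented out so matplotlib figures show up in the cell output;
# uncomment it to force the non-interactive "Agg" backend (off-screen rendering only).
# matplotlib.use("Agg")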

from langchain_unstructured import UnstructuredLoader
from unstructured.partition.pdf import partition_pdf
from html2text import html2text

# Root directory under which every per-file working directory is created.
DATA_ROOT = Path("data_debug") # separate directory name while debugging, to avoid polluting production data

# 辅助显示图片的函数（用于 Notebook）
from IPython.display import display, Image as IPImage

def show_image(path: Path):
    """Render the image at *path* inline in the notebook at 600 px width.

    If *path* does not exist, a short notice is printed instead and nothing
    is displayed.
    """
    if not path.exists():
        print(f"Image not found: {path}")
        return
    display(IPImage(filename=str(path), width=600))

# Confirm the cell ran to completion and show which data root is in use.
print("Imports ready & Data Root set to:", DATA_ROOT)

In [None]:
# Cell 3: 目录管理函数
def workdir(file_id: str) -> Path:
    """Return the working directory for *file_id* under DATA_ROOT.

    The directory (and any missing parents) is created on first use.
    """
    target = DATA_ROOT.joinpath(file_id)
    target.mkdir(parents=True, exist_ok=True)
    return target

def dir_original_pages(file_id: str) -> Path:
    """Return ``<workdir>/pages/original`` for *file_id*, creating it if needed."""
    pages_dir = workdir(file_id).joinpath("pages", "original")
    pages_dir.mkdir(parents=True, exist_ok=True)
    return pages_dir

def dir_parsed_pages(file_id: str) -> Path:
    """Return ``<workdir>/pages/parsed`` for *file_id*, creating it if needed."""
    parsed_dir = workdir(file_id).joinpath("pages", "parsed")
    parsed_dir.mkdir(parents=True, exist_ok=True)
    return parsed_dir

def original_pdf_path(file_id: str) -> Path:
    """Return the location of the stored PDF (``original.pdf``) for *file_id*.

    Only the working directory is created; the file itself may not exist yet.
    """
    base = workdir(file_id)
    return base.joinpath("original.pdf")

def markdown_output(file_id: str) -> Path:
    """Return the location of the generated Markdown (``output.md``) for *file_id*.

    Only the working directory is created; the file itself may not exist yet.
    """
    base = workdir(file_id)
    return base.joinpath("output.md")

def images_dir(file_id: str) -> Path:
    """Return ``<workdir>/images`` for *file_id*, creating it if needed."""
    img_root = workdir(file_id).joinpath("images")
    img_root.mkdir(parents=True, exist_ok=True)
    return img_root

# Confirm that the path-helper cell ran to completion.
print("Path helpers defined.")

In [None]:
# Cell 4: 模拟上传 PDF
def create_dummy_pdf(path: Path):
    """Create a one-page test PDF with a title, a paragraph and a boxed label.

    Args:
        path: Destination file for the generated PDF.

    The document is opened in a ``with`` block so it is closed even when one
    of the drawing calls or the save fails.
    """
    with fitz.open() as doc:
        page = doc.new_page()
        page.insert_text((50, 72), "Test Document Title", fontsize=24)
        page.insert_text((50, 150), "This is a paragraph of text to test OCR and layout analysis.", fontsize=12)
        # A blue rectangle with a label inside, standing in for an image or a table.
        page.draw_rect(fitz.Rect(50, 200, 300, 400), color=(0, 0, 1), width=2)
        page.insert_text((60, 220), "Table Data Here", fontsize=10)
        # Pass a plain string, matching how the other save call in this file is written.
        doc.save(str(path))

FILE_ID = "debug_001"
FILENAME = "test_doc.pdf"

# 1. Make sure the working directory exists and resolve where the PDF lives.
workdir(FILE_ID)
pdf_save_path = original_pdf_path(FILE_ID)

# 2. Reuse a PDF that is already in place (you can drop a real one at
#    data_debug/debug_001/original.pdf by hand); otherwise generate a dummy.
if pdf_save_path.exists():
    print(f"Using existing PDF at {pdf_save_path}")
else:
    print("Generating dummy PDF...")
    create_dummy_pdf(pdf_save_path)

# 3. Load the raw bytes, standing in for the payload an upload would deliver.
upload_bytes = pdf_save_path.read_bytes()

# 运行你的 save_upload 逻辑
def save_upload(file_id: str, upload_bytes: bytes, filename: str) -> Dict[str, Any]:
    """Store uploaded PDF bytes as the file's ``original.pdf`` and summarise it.

    Args:
        file_id: Identifier selecting the working directory.
        upload_bytes: Raw PDF content to write.
        filename: Display name echoed back in the result; not used for storage.

    Returns:
        A dict with keys ``fileId``, ``name`` and ``pages`` (the page count
        read back from the stored file).
    """
    target = original_pdf_path(file_id)
    target.write_bytes(upload_bytes)
    with fitz.open(target) as pdf:
        page_total = pdf.page_count
    summary: Dict[str, Any] = {"fileId": file_id, "name": filename, "pages": page_total}
    return summary

# Push the bytes through save_upload and show the summary dict it returns.
info = save_upload(FILE_ID, upload_bytes, FILENAME)
print("Upload result:", info)

In [None]:
# Cell 5: 渲染原始页面
def render_original_pages(file_id: str, dpi: int = 144):
    """Render every page of the stored PDF to ``page-NNNN.png`` files.

    Files are written to the directory returned by ``dir_original_pages`` and
    numbered from 1. One ``Saved: ...`` line is printed per page.

    Args:
        file_id: Identifier selecting the working directory.
        dpi: Target resolution; the render matrix scales by ``dpi / 72``.

    Raises:
        ValueError: If *dpi* is not positive.
    """
    if dpi <= 0:
        raise ValueError(f"dpi must be positive, got {dpi}")

    pdf_path = original_pdf_path(file_id)
    out_dir = dir_original_pages(file_id)

    # The scale is the same for every page, so build the matrix once.
    zoom = dpi / 72
    mat = fitz.Matrix(zoom, zoom)

    with fitz.open(pdf_path) as doc:
        for idx, page in enumerate(doc, start=1):
            pix = page.get_pixmap(matrix=mat)
            out_path = out_dir / f"page-{idx:04d}.png"
            out_path.write_bytes(pix.tobytes("png"))
            print(f"Saved: {out_path.name}")

# Render all pages of the debug PDF at the default DPI.
render_original_pages(FILE_ID)

# Sanity check: display the first rendered page.
print("\n--- Preview Page 1 ---")
show_image(dir_original_pages(FILE_ID) / "page-0001.png")

In [None]:
import os
import sys

# IMPORTANT: PATH changes made here live only in the current process, so this
# cell must be re-run after every kernel restart.


def add_dir_to_path_once(directory: str) -> bool:
    """Append *directory* to the process PATH unless it is already an entry.

    Entries are compared one by one after normalisation, so a directory is not
    mistaken for "already present" merely because its text is a substring of
    some other entry. A missing PATH variable is treated as empty.

    Returns:
        True if PATH was modified, False if the directory was already listed.
    """
    def _norm(p: str) -> str:
        return os.path.normcase(os.path.normpath(p))

    current = os.environ.get("PATH", "")
    wanted = _norm(directory)
    if any(_norm(entry) == wanted for entry in current.split(os.pathsep) if entry):
        return False
    os.environ["PATH"] = current + os.pathsep + directory if current else directory
    return True


# ---------------------------------------------------------------------------
# Poppler configuration
# ---------------------------------------------------------------------------
# POPPLER_PATH from the environment wins; otherwise fall back to the default
# location of the extracted Poppler "bin" directory.
poppler_path = os.getenv("POPPLER_PATH", r"V:\RAG\tools\poppler-25.12.0\Library\bin")

# Check that the directory exists before touching PATH, to catch typos early.
if os.path.exists(poppler_path):
    if add_dir_to_path_once(poppler_path):
        print(f"✅ Poppler 路径已临时添加: {poppler_path}")
else:
    print(f"❌ 警告：路径不存在，请检查: {poppler_path}")

# ---------------------------------------------------------------------------
# Tesseract OCR configuration
# ---------------------------------------------------------------------------
# TESSERACT_PATH from the environment wins; it names the executable, and the
# directory containing it is what gets added to PATH.
tesseract_path = os.getenv("TESSERACT_PATH", r"C:\Program Files\Tesseract-OCR\tesseract.exe")
tess_dir = os.path.dirname(tesseract_path)

if os.path.exists(tess_dir):
    if add_dir_to_path_once(tess_dir):
        print(f"✅ Tesseract 路径已临时添加: {tess_dir}")
else:
    print(f"❌ 警告：Tesseract 路径不存在: {tess_dir}")
    print("   请安装 Tesseract 并更新 .env 配置")


In [None]:
# Cell 6: 运行 UnstructuredLoader
def unstructured_segments(file_id: str) -> List[Any]:
    """Run UnstructuredLoader over the stored PDF and return all loaded items.

    The loader is configured with the ``hi_res`` strategy and table-structure
    inference. A progress line is printed before processing starts.

    Args:
        file_id: Identifier selecting the working directory.

    Returns:
        Every item yielded by ``loader.lazy_load()``, in order.
    """
    source = str(original_pdf_path(file_id))
    print(f"Processing {source} ... this may take a while.")

    # Optional settings left disabled here:
    #   ocr_languages="chi_sim+eng"  -> needs the Tesseract Chinese language pack
    #   ocr_engine="paddleocr"       -> needs paddle; the default engine is used otherwise
    loader = UnstructuredLoader(
        file_path=source,
        strategy="hi_res",
        infer_table_structure=True,
    )
    return list(loader.lazy_load())

# 运行并保存结果到变量 docs，方便后续反复使用而不必重跑
# Run once and keep the result in `docs` so later cells can reuse it without
# re-parsing the PDF.
try:
    docs = unstructured_segments(FILE_ID)
except Exception as e:
    # Notebook-level boundary: report the failure and fall back to an empty
    # list so the following cells can still execute.
    print("Error during Unstructured processing:", e)
    docs = []
else:
    # Inspection happens outside the try body so that a problem while printing
    # cannot discard an already-computed result.
    print(f"Extracted {len(docs)} elements.")
    if docs:
        # Peek at the first element to see its content and metadata layout.
        print("First element content:", docs[0].page_content)
        print("First element metadata:", docs[0].metadata)

In [None]:
# Cell 8: PDF 转 Markdown
def _extract_page_images(pdf_path: str, img_dir: Path) -> Dict[int, List[str]]:
    """Save every embedded image of the PDF as PNG and map page number -> file names."""
    image_map: Dict[int, List[str]] = {}
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            names: List[str] = []
            for img_index, img in enumerate(page.get_images(full=True), start=1):
                xref = img[0]
                pix = fitz.Pixmap(doc, xref)
                # PNG holds only grayscale or RGB (optionally with alpha). Four or
                # more colour components, not counting alpha, means CMYK or similar,
                # which has to be converted to RGB before saving.
                if pix.n - pix.alpha >= 4:
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                img_path = img_dir / f"page{page_num}_img{img_index}.png"
                pix.save(str(img_path))
                names.append(img_path.name)
            image_map[page_num] = names
    return image_map


def _elements_to_markdown(elements: List[Any], image_map: Dict[int, List[str]]) -> str:
    """Turn partitioned elements into Markdown text, linking extracted images by page."""
    md_lines: List[str] = []
    inserted_images = set()

    for el in elements:
        cat = getattr(el, "category", None)
        text = (getattr(el, "text", "") or "").strip()
        meta = getattr(el, "metadata", None)
        page_num = getattr(meta, "page_number", None) if meta else None

        # Elements without text carry nothing to emit, except images.
        if not text and cat != "Image":
            continue

        if cat == "Title":
            md_lines.append(f"# {text}\n")
        elif cat in ("Header", "Subheader"):
            md_lines.append(f"## {text}\n")
        elif cat == "Table":
            # Prefer the HTML rendering of the table when the metadata has one.
            html = getattr(meta, "text_as_html", None) if meta else None
            md_lines.append((html2text(html) if html else text) + "\n")
        elif cat == "Image" and page_num:
            # Simple heuristic: the first Image element on a page pulls in every
            # image extracted from that page. A precise match would need
            # coordinate comparison.
            for name in image_map.get(page_num, []):
                if (page_num, name) not in inserted_images:
                    md_lines.append(f"\n![Image](./images/{name})\n")
                    inserted_images.add((page_num, name))
        elif text:
            # Any other category is emitted as a plain paragraph. An Image element
            # with neither text nor a page number is skipped rather than emitted
            # as an empty line.
            md_lines.append(text + "\n")

    return "\n".join(md_lines)


def pdf_to_markdown(file_id: str) -> Dict[str, Path]:
    """Convert the stored PDF into a Markdown file plus extracted PNG images.

    Steps:
      1. Partition the PDF with ``partition_pdf`` (``hi_res`` strategy, table
         structure inference enabled).
      2. Save every embedded image to the images directory as
         ``page<P>_img<I>.png``.
      3. Assemble Markdown: Title -> ``#``, Header/Subheader -> ``##``,
         Table -> ``html2text`` of its HTML when available (plain text
         otherwise), Image -> links to that page's extracted images (each
         linked once), anything else -> plain paragraph.

    Note: this partitions the PDF again instead of reusing an earlier
    Unstructured result; that is kept deliberately so the step can be debugged
    in isolation.

    Args:
        file_id: Identifier selecting the working directory.

    Returns:
        ``{"markdown": <path of output.md>, "images_dir": <images directory>}``.
    """
    pdf_path = str(original_pdf_path(file_id))
    out_md = markdown_output(file_id)
    img_dir = images_dir(file_id)

    print("Partitioning PDF for Markdown generation...")
    elements = partition_pdf(
        filename=pdf_path,
        infer_table_structure=True,
        strategy="hi_res",
        # ocr_languages="chi_sim+eng",  # optional; left disabled
    )

    # 1. Extract embedded images.
    image_map = _extract_page_images(pdf_path, img_dir)
    print(f"Images extracted to {img_dir}")

    # 2. Assemble and write the Markdown.
    out_md.write_text(_elements_to_markdown(elements, image_map), encoding="utf-8")
    return {"markdown": out_md, "images_dir": img_dir}

# Convert the debug PDF and report where the Markdown file was written.
result = pdf_to_markdown(FILE_ID)
print(f"\nMarkdown saved to: {result['markdown']}")

# Read the file back and print its first 500 characters as a quick preview.
print("-" * 20)
print(result['markdown'].read_text(encoding='utf-8')[:500])
print("...")