In [None]:
# Notebook shell command: invoke `dolphin-run` with the Dolphin-1.5 model on the
# PDF given by --input_path, passing the result directory via --save_dir and a
# batch size of 1. The commented-out line below is the same command spelled with
# the absolute path of the executable inside the conda environment.
#!/Users/dev4/miniconda3/envs/dolphin311/bin/dolphin-run \
!dolphin-run \
    --model_path "/Volumes/sw/pretrained_models/Dolphin-1.5" \
        --input_path '/Volumes/sw/books/罗洛·梅文集 权力与无知：寻求暴力的根源 ([美]罗洛·梅著； 郭本宇 方红译（中国人民大学出版社2013年）) (Z-Library).pdf' \
            --save_dir /Volumes/sw/ocr_result/results_quanliyuwuzhi \
                --max_batch_size 1

In [None]:
# Extract the contents of all JSON files, filter out unwanted elements, and keep only the text content
import os
import json
import re
from pathlib import Path
from dolphin.utils.markdown_utils import MarkdownConverter

# from utils.utils import save_combined_pdf_results
#/Volumes/sw/ocr_result/results_quanliyuwuzhi

# Allow-list of element labels that are kept when page JSON is merged.
# Elements whose label is not listed here (for example 'header') are dropped.
label_filter = ['para', 'title', 'half_para', 'catalogue', 'sec','sub_sec', 'sec_0', 
                'sec_1', 'sec_2', 'sec_3', 'sec_4', 'sec_5', 'fig', 'tab', 'equ', 'list', 'code']

def save_combined_pdf_results_to_markdown(all_page_results, save_name, save_dir):
    """Save the combined page results as one JSON file and one Markdown file.

    The JSON file is written to ``<save_dir>/recognition_json/<save_name>.json``
    and the Markdown file to ``<save_dir>/markdown/<save_name>.md``. Markdown
    generation is best-effort: if it fails, the error is printed and the
    function still returns normally (the JSON file has already been written).

    Args:
        all_page_results: List of per-page dicts. The ``"elements"`` list of
            each page (when present and non-empty) is concatenated in page
            order and handed to ``MarkdownConverter.convert``.
        save_name: Base file name (without extension) used for both output
            files; also stored as ``"source_file"`` in the JSON.
        save_dir: Directory under which the ``recognition_json`` and
            ``markdown`` sub-directories are created.

    Returns:
        Path of the written Markdown file, or ``None`` when Markdown
        generation failed.
    """
    combined_results = {
        "source_file": save_name,
        "total_pages": len(all_page_results),
        "pages": all_page_results,
    }

    # Save combined JSON results.
    json_path = os.path.join(save_dir, "recognition_json", f"{save_name}.json")
    os.makedirs(os.path.dirname(json_path), exist_ok=True)
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(combined_results, f, indent=2, ensure_ascii=False)

    # Computed before the try block so the name is always bound; the function
    # previously raised UnboundLocalError at the final return whenever the
    # converter failed before this path had been assigned.
    markdown_path = os.path.join(save_dir, "markdown", f"{save_name}.md")

    try:
        markdown_converter = MarkdownConverter()

        # Flatten every page's elements into one list; pages are concatenated
        # directly, without a separator element between them.
        all_elements = []
        for page_data in all_page_results:
            all_elements.extend(page_data.get("elements") or [])

        markdown_content = markdown_converter.convert(all_elements)

        os.makedirs(os.path.dirname(markdown_path), exist_ok=True)
        with open(markdown_path, "w", encoding="utf-8") as f:
            f.write(markdown_content)
    except ImportError:
        print("MarkdownConverter not available, skipping markdown generation")
        return None
    except Exception as e:
        # Deliberate best-effort: report the failure and signal it with None
        # instead of aborting after the JSON has already been saved.
        print(f"Error generating markdown: {e}")
        return None

    return markdown_path

# Characters treated as the end of a clause or sentence (ASCII and full-width).
_END_PUNCTUATION = frozenset('.,?!;。，？！；')

# Per-page result files end in "_page_<number>.json".
_PAGE_FILE_RE = re.compile(r'_page_(\d+)\.json$')


def _join_text_pieces(pieces, separator):
    """Concatenate *pieces*, inserting *separator* only after end punctuation.

    A piece is glued directly onto the preceding text unless the last
    non-whitespace character accumulated so far is an end-punctuation mark, in
    which case *separator* is inserted first. Whitespace-only pieces do not
    change what counts as the "last character".
    """
    parts = []
    ends_with_punctuation = False  # nothing accumulated yet
    for piece in pieces:
        parts.append(separator + piece if ends_with_punctuation else piece)
        stripped = piece.strip()
        if stripped:
            ends_with_punctuation = stripped[-1] in _END_PUNCTUATION
    return ''.join(parts)


def combine_json(book_name, result_root='/Volumes/sw/ocr_result'):
    """Merge the per-page recognition JSON of one book into text and Markdown.

    Reads every ``*_page_<n>.json`` file in
    ``<result_root>/results_<book_name>/recognition_json/`` in ascending page
    order, keeps only elements whose label is in ``label_filter``, and then:

    * passes the filtered elements of all pages to
      ``save_combined_pdf_results_to_markdown``;
    * writes the merged plain text to
      ``<result_root>/results_<book_name>/merged_paragraphs.txt``.

    Text that does not end with punctuation is joined directly with the text
    that follows it; otherwise a newline (within a page) or a blank line
    (between pages) is inserted.

    Args:
        book_name: Suffix of the ``results_<book_name>`` directory; also used
            as the base name of the combined output files.
        result_root: Directory containing the ``results_<book_name>``
            directories. Defaults to the location used previously.

    Returns:
        None. Progress is reported on stdout.
    """
    base_dir = f'{result_root}/results_{book_name}'
    print(f'处理json目录: {base_dir}')
    json_dir = f"{base_dir}/recognition_json/"
    output_file = os.path.join(base_dir, "merged_paragraphs.txt")

    # Match each file name once and keep the page number for sorting.
    page_files = []
    for name in os.listdir(json_dir):
        match = _PAGE_FILE_RE.search(name)
        if match:
            page_files.append((int(match[1]), name))
    page_files.sort(key=lambda entry: entry[0])

    all_texts = []
    all_elements = []
    for page_number, filename in page_files:
        json_path = os.path.join(json_dir, filename)
        with open(json_path, "r", encoding="utf-8") as f:
            try:
                data = json.load(f)
            except json.JSONDecodeError:
                print(f"跳过无效 JSON 文件: {filename}")
                continue

        # Filter once, tolerating entries that are not dicts or lack a label.
        elements = [
            item
            for item in data
            if isinstance(item, dict) and item.get('label') in label_filter
        ]
        all_elements.append({
            # NOTE(review): assumes the number in the file name is 0-based,
            # hence the +1 — confirm against the OCR tool's naming.
            "page_number": page_number + 1,
            "elements": elements,
        })

        pieces = []
        for element in sorted(elements, key=lambda x: x['reading_order']):
            text = element['text'].replace('\n', '')
            if element.get('label') == 'sub_sec':
                # Mark sub-section headings so they stay recognisable in the
                # plain-text output.
                text = '章节: ' + text + ';'
            pieces.append(text)

        page_text = _join_text_pieces(pieces, '\n')
        if page_text:
            all_texts.append(page_text)

    md_path = save_combined_pdf_results_to_markdown(all_elements, book_name, base_dir)
    print(f'saved markdown file [{md_path}]')

    merged_text = _join_text_pieces(all_texts, '\n\n')
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as out_f:
        out_f.write(merged_text)
    print(f"保存完成，总共 {len(all_texts)} 个 JSON 文件内容合并到 {output_file}")

# Merge the per-page JSON for the book whose results directory is "results_quanliyuwuzhi".
combine_json("quanliyuwuzhi")

In [None]:
# Extract a single page of a PDF as an image, for testing or other verification
from pdf2image import convert_from_path

def extract_pdf_page_to_image(pdf_path, page_number, output_path):
    """Render one page of a PDF and save it as a PNG image.

    Any failure (including an empty conversion result) is reported on stdout
    as ``Error: <message>`` instead of being raised.

    Args:
        pdf_path: Path of the PDF file to read.
        page_number: Page to render; passed as both ``first_page`` and
            ``last_page`` to ``convert_from_path`` (1-based).
        output_path: Destination path of the PNG image.
    """
    # Restrict the conversion to exactly the requested page.
    page_range = {"first_page": page_number, "last_page": page_number}
    try:
        rendered = convert_from_path(pdf_path, **page_range)
        if not rendered:
            raise ValueError(f"No page found for page number {page_number}")

        image = rendered[0]
        image.save(output_path, 'PNG')
        print(f"Page {page_number} saved as {output_path}")
    except Exception as err:
        print(f"Error: {str(err)}")

# Example usage: render one page of a sample PDF to a PNG in the same directory.
pdf_path = "/Volumes/sw/MyDrive/data_src/独立宣言 一种全球史 (（美）大卫·阿米蒂奇) (Z-Library).pdf"
page_number = 20  # Page to extract (1-based)
output_path = f"/Volumes/sw/MyDrive/data_src/page_{page_number}.png"  # Output image path; embeds the page number

extract_pdf_page_to_image(pdf_path, page_number, output_path)