In [12]:
import re
import sys
from pathlib import Path

try:
    import pymupdf  # PyMuPDF
except ImportError:
    print("错误: 请先安装 PyMuPDF 库")
    print("安装命令: pip install pymupdf")
    sys.exit(1)


def extract_page_number(text):
    """Return the footer page number found in a page's text, or ``None``.

    A page marker is a line consisting solely of ``- N -`` (optional
    horizontal whitespace around the dashes and the digits), e.g. ``- 2 -``,
    ``- 19 -`` or ``-247-``.  The marker must occupy the whole line so that
    dashes inside ordinary content -- subtraction chains such as
    ``1000-200-300`` or dates such as ``2024-12-31`` -- are not mistaken for
    page numbers.

    Args:
        text: Plain text of one page; lines are separated by ``"\\n"``.

    Returns:
        The digits of the last marker on the page as a string, or ``None``
        when the page has no marker line.
    """
    # [^\S\n] is "whitespace except newline": it keeps a match on one line
    # while still accepting full-width / non-breaking spaces.
    pattern = r'^[^\S\n]*-[^\S\n]*(\d+)[^\S\n]*-[^\S\n]*$'
    matches = re.findall(pattern, text, flags=re.MULTILINE)
    # When several marker lines exist, the last one is taken (the footer is
    # expected to come last in the page text).
    return matches[-1] if matches else None


def remove_page_numbers(text):
    """Strip footer page markers of the form ``- N -`` from page text.

    Only lines that consist solely of the marker are blanked.  Dashes that
    are part of real content (``1000-200-300``, ``2024-12-31``) are left
    untouched; an unanchored pattern would delete the middle operand of
    such expressions.

    Args:
        text: Plain text of one page; lines are separated by ``"\\n"``.

    Returns:
        The text with every marker line replaced by an empty line (the
        line break itself is kept).
    """
    # [^\S\n] is "whitespace except newline", so a match never spans lines.
    return re.sub(
        r'^[^\S\n]*-[^\S\n]*\d+[^\S\n]*-[^\S\n]*$',
        '',
        text,
        flags=re.MULTILINE,
    )


def is_chapter_start(line):
    """Return True when ``line`` opens a new chapter.

    A chapter heading is any line whose first non-whitespace characters
    are "专项训练".
    """
    # Only leading whitespace can affect a prefix test.
    without_indent = line.lstrip()
    return without_indent.startswith("专项训练")


# Heading of the form "第<Chinese numeral>题" at the start of a line.  The
# numeral class includes 零/〇 (needed for 101-109, e.g. "第一百零一题") as
# well as 两 and 千, which a plain 一..十百 class would miss.
_QUESTION_START_RE = re.compile(r'^第[零〇一二两三四五六七八九十百千]+题')


def is_question_start(line):
    """Return True when ``line`` opens a new question.

    A question heading starts (after surrounding whitespace is stripped)
    with "第", one or more Chinese numeral characters, then "题" -- for
    example "第一题", "第十二题" or "第一百零一题".

    Args:
        line: A single line of page text.

    Returns:
        ``True`` if the line begins with a question heading, else ``False``.
    """
    return bool(_QUESTION_START_RE.match(line.strip()))


# A line that opens with a list / numbering marker such as "(1)", "（一）",
# "1.", "一、" or "[2]".  Such a line always starts on its own line, even
# when the previous line is full.  Both the ASCII ")" and the full-width
# "）" are accepted as closers.
_LIST_MARKER_RE = re.compile(r'^[\(（\[【\d一二三四五六七八九十]+[、\)）\]】.]')


def _display_width(line):
    """Return the width of ``line`` in full-width character units.

    Non-ASCII characters count as 1, ASCII characters as 0.5.
    """
    return sum(2 if ord(c) > 127 else 1 for c in line) / 2


def merge_lines(lines, full_line_width=38):
    """Re-join hard-wrapped text lines while keeping intentional breaks.

    Rules:
      1. A blank line ends a paragraph; runs of blank lines collapse into
         one, and leading / trailing blank lines are dropped.
      2. A line whose display width is at least ``full_line_width`` is
         treated as wrapped and is joined (with no separator) to the line
         after it.  This is repeated, so a paragraph wrapped over any
         number of full lines becomes a single line.
      3. A shorter line is treated as a natural line end and is kept.
      4. A following line that is blank or opens with a list marker is
         never joined to the line before it.

    Args:
        lines: Sequence of text lines; each is stripped before use.
        full_line_width: Minimum display width (full-width character
            units, ASCII counts 0.5) for a line to be considered wrapped.

    Returns:
        The merged text with lines separated by ``"\\n"`` and paragraphs
        separated by one empty line.  An empty input yields ``""``.
    """
    if not lines:
        return ""

    merged = []  # output lines; '' marks a paragraph break
    total = len(lines)
    i = 0
    while i < total:
        line = lines[i].strip()
        i += 1

        if not line:
            # Record at most one paragraph break, and none before any text.
            if merged and merged[-1] != '':
                merged.append('')
            continue

        # Keep absorbing following lines for as long as the most recently
        # absorbed physical line was itself full (i.e. wrapped).
        parts = [line]
        while i < total and _display_width(parts[-1]) >= full_line_width:
            following = lines[i].strip()
            if not following or _LIST_MARKER_RE.match(following):
                break
            parts.append(following)
            i += 1
        merged.append(''.join(parts))

    # A trailing paragraph break has nothing to separate.
    while merged and merged[-1] == '':
        merged.pop()

    return '\n'.join(merged)


def _collect_question_title(page_lines, start):
    """Gather the lines forming the question heading that begins at ``start``.

    The heading line itself is always included.  Following lines on the
    same page are appended while they look like a continuation:
      * any line at least 35 full-width units wide, or
      * the line directly after the heading, unless it starts with
        "要求" or "【".
    A line that is itself a question or chapter heading is never absorbed,
    so back-to-back headings each keep their own title.

    Returns:
        ``(title, next_index)`` -- the concatenated title and the index of
        the first line after the title.
    """
    title_lines = [page_lines[start]]
    j = start + 1
    while j < len(page_lines):
        candidate = page_lines[j]
        if is_question_start(candidate) or is_chapter_start(candidate):
            break
        # Same width measure as the body-text merger: non-ASCII = 1, ASCII = 0.5.
        width = sum(2 if ord(c) > 127 else 1 for c in candidate) / 2
        directly_after_heading = j == start + 1
        if width >= 35 or (
            directly_after_heading
            and not candidate.startswith('要求')
            and not candidate.startswith('【')
        ):
            title_lines.append(candidate)
            j += 1
        else:
            break
    return ''.join(title_lines), j


def _render_markdown(chapters):
    """Render the parsed chapter / question structure as Markdown text."""
    out = []
    for chapter in chapters:
        # Chapter heading.
        out.append(f"# {chapter['title']}\n")
        for question in chapter['questions']:
            # Question heading followed by a page-number note when known.
            out.append(f"## {question['title']}")
            if question['page']:
                out.append(f"*（第 {question['page']} 页）*\n")
            else:
                out.append("")
            # Question body with hard-wrapped lines re-joined.
            out.append(merge_lines(question['content']))
            out.append("")  # blank line between questions
    return '\n'.join(out)


def process_pdf(pdf_path, output_path):
    """Extract chapters and questions from a PDF and write them as Markdown.

    Every page's text is read, footer page markers are removed, and the
    remaining non-blank lines are classified in order:
      * a chapter heading starts a new chapter;
      * a question heading starts a new question inside the current
        chapter (its title may continue over following lines);
      * any other line is appended to the current question's content.
    Lines seen before the first question of a chapter are discarded, and
    a question encountered while no chapter is open is not saved.

    Args:
        pdf_path: Path of the PDF to read.
        output_path: Path of the Markdown file to write (UTF-8).

    Returns:
        ``(chapter_count, question_count)`` for the written document.
    """
    chapters = []
    current_chapter = None
    current_question = None
    current_page_num = None  # last footer page number seen, carried forward

    # The context manager closes the document even if parsing raises.
    with pymupdf.open(pdf_path) as doc:
        for page in doc:
            text = page.get_text()

            # Remember this page's footer number (if any) for new questions.
            page_number = extract_page_number(text)
            if page_number:
                current_page_num = page_number

            # Drop page markers first, then keep only the non-blank lines.
            stripped = (raw.strip() for raw in remove_page_numbers(text).split('\n'))
            page_lines = [ln for ln in stripped if ln]

            idx = 0
            while idx < len(page_lines):
                line = page_lines[idx]

                if is_chapter_start(line):
                    # Flush the open question and chapter before starting anew.
                    if current_question and current_chapter:
                        current_chapter['questions'].append(current_question)
                    if current_chapter:
                        chapters.append(current_chapter)
                    current_chapter = {'title': line, 'questions': []}
                    current_question = None
                    idx += 1
                    continue

                if is_question_start(line):
                    if current_question and current_chapter:
                        current_chapter['questions'].append(current_question)
                    # Title lines are consumed here, so the scan resumes
                    # after them and they never end up in the content.
                    full_title, idx = _collect_question_title(page_lines, idx)
                    current_question = {
                        'title': full_title,
                        'page': current_page_num,
                        'content': [],
                    }
                    continue

                if current_question is not None:
                    current_question['content'].append(line)
                idx += 1

    # Flush whatever is still open at the end of the document.
    if current_question and current_chapter:
        current_chapter['questions'].append(current_question)
    if current_chapter:
        chapters.append(current_chapter)

    output_text = _render_markdown(chapters)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(output_text)

    return len(chapters), sum(len(ch['questions']) for ch in chapters)


In [13]:
pdf_path_list = ["2026讲义答案.pdf", "2026刷题班讲义.pdf"]

for pdf_path in pdf_path_list:
    # A missing input aborts the whole batch immediately.
    if not Path(pdf_path).exists():
        print(f"错误: 文件不存在: {pdf_path}")
        sys.exit(1)

    # The Markdown file is named after the PDF: "<stem>_output.md".
    output_path = f"{Path(pdf_path).stem}_output.md"

    print(f"正在处理PDF文件: {pdf_path}")
    print(f"输出文件: {output_path}")

    try:
        chapter_count, question_count = process_pdf(pdf_path, output_path)
        # Summary of what was extracted from this PDF.
        print("\n提取完成!")
        print(f"- 章节数: {chapter_count}")
        print(f"- 题目数: {question_count}")
        print(f"- 输出文件: {output_path}")
    except Exception as exc:
        # Top-level boundary: report the failure with a traceback, then stop.
        print(f"错误: 处理PDF时出现异常: {exc}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


正在处理PDF文件: 2026讲义答案.pdf
输出文件: 2026讲义答案_output.md

提取完成!
- 章节数: 7
- 题目数: 115
- 输出文件: 2026讲义答案_output.md
正在处理PDF文件: 2026刷题班讲义.pdf
输出文件: 2026刷题班讲义_output.md

提取完成!
- 章节数: 7
- 题目数: 115
- 输出文件: 2026刷题班讲义_output.md
