In [None]:
import re
import sys
from pathlib import Path

try:
    import pymupdf  # PyMuPDF
except ImportError:
    print("错误: 请先安装 PyMuPDF 库")
    print("安装命令: pip install pymupdf")
    sys.exit(1)


def extract_page_number(text):
    """Return the footer page number found in a page's text.

    A footer page number is a run of digits wrapped in dashes, with optional
    whitespace inside the dashes (for example ``- 12 -`` or ``-12-``).

    Args:
        text: The plain text of a single page.

    Returns:
        The digits of the last footer-like match as a string, or ``None``
        when the text contains no such match.
    """
    # The lookarounds reject dashes that touch a digit on the outside, so
    # hyphenated digit runs such as the date "2020-12-31" are not mistaken
    # for a footer reading "12".
    pattern = r'(?<!\d)-\s*(\d+)\s*-(?!\d)'
    matches = re.findall(pattern, text)
    # The last match wins: footers sit at the end of the extracted text.
    return matches[-1] if matches else None


def remove_page_numbers(text):
    """Strip footer page numbers such as ``- 12 -`` from the text.

    Args:
        text: The plain text of a single page.

    Returns:
        The text with every dash-wrapped number removed. Surrounding
        whitespace and line breaks are left in place.
    """
    # The lookarounds keep hyphenated digit runs intact: without them a date
    # like "2020-12-31" would lose its "-12-" and collapse to "202031".
    return re.sub(r'(?<!\d)-\s*\d+\s*-(?!\d)', '', text)


def is_chapter_start(line):
    """Tell whether a line opens a new chapter.

    A chapter heading is any line whose first non-blank characters are the
    chapter marker below.
    """
    marker = "专项训练"
    trimmed = line.strip()
    return trimmed[:len(marker)] == marker


def is_question_start(line):
    """Tell whether a line opens a new question.

    A question heading starts with an ordinal written in Chinese numerals
    between the two fixed heading characters of the pattern below; Arabic
    digits are not recognised.
    """
    heading = re.match(r'^第[一二三四五六七八九十百]+题', line.strip())
    return heading is not None


def get_char_width_count(line):
    """Return the display width of a line, measured in CJK-character units.

    Every character with a code point above 127 counts as two columns and
    every other character as one; the column total is then halved, so the
    result is a float.
    """
    wide_chars = sum(1 for ch in line if ord(ch) > 127)
    # Each character contributes one column, and wide ones contribute one more.
    return (len(line) + wide_chars) / 2


def merge_lines(lines):
    """Join hard-wrapped text lines back into paragraphs.

    Lines are stripped and blank lines are ignored. A line is glued onto the
    paragraph being built when both of the following hold:

    * the previous non-blank line was at least a "full line" wide, which
      suggests it was wrapped rather than ended on purpose;
    * the current line does not look like a list item (a leading number,
      Chinese numeral or opening bracket followed by a list delimiter).

    Otherwise the paragraph being built is closed and the line starts a new
    one. Glued lines are concatenated with no separator.

    Returns:
        The paragraphs joined by blank lines, or an empty string when
        ``lines`` is empty.
    """
    if not lines:
        return ""

    # Width, in CJK-character units, from which a line is treated as full.
    full_line_width = 35
    list_item_re = re.compile(r'^[\(（\[【\d一二三四五六七八九十]+[、）\]】\.]')

    finished = []    # completed paragraphs
    pieces = []      # lines of the paragraph under construction
    prev_width = 0   # width of the previous non-blank line

    for raw in lines:
        text = raw.strip()
        if not text:
            continue

        # Close the running paragraph unless this line continues it.
        starts_new = prev_width < full_line_width or list_item_re.match(text)
        if pieces and starts_new:
            finished.append("".join(pieces))
            pieces = []

        pieces.append(text)
        prev_width = get_char_width_count(text)

    if pieces:
        finished.append("".join(pieces))

    # Markdown separates paragraphs with one blank line.
    return "\n\n".join(finished)


def _collect_question_title(lines, start):
    """Gather a question title that may wrap over several lines.

    ``lines[start]`` is the heading line. Following lines are appended while
    the line just before them is at least 30 CJK-character units wide and the
    candidate line is not a requirement line, a bracketed section, or the
    heading of another chapter or question.

    Returns:
        A ``(title, next_index)`` tuple, where ``next_index`` is the index of
        the first line that is not part of the title.
    """
    parts = [lines[start]]
    nxt = start + 1
    while nxt < len(lines):
        candidate = lines[nxt]
        # A short previous line means the title ended there on purpose.
        if get_char_width_count(lines[nxt - 1]) < 30:
            break
        if candidate.startswith(('要求', '【')):
            break
        # Never absorb the next heading into this title, however wide the
        # previous line was.
        if is_chapter_start(candidate) or is_question_start(candidate):
            break
        parts.append(candidate)
        nxt += 1
    return "".join(parts), nxt


def _render_markdown(chapters):
    """Render the parsed chapter/question structure as Markdown text."""
    out = []
    for chapter in chapters:
        out.append(f"# {chapter['title']}\n")
        for question in chapter['questions']:
            out.append(f"## {question['title']}")
            if question['page']:
                out.append(f"*（第 {question['page']} 页）*\n")
            else:
                out.append("")
            out.append(merge_lines(question['content']))
            out.append("")  # blank line between questions
    return '\n'.join(out)


def process_pdf(pdf_path, output_path):
    """Convert a PDF of chapters and questions into a Markdown file.

    The text of every page is scanned line by line. Chapter headings become
    level-1 headings, question headings become level-2 headings annotated
    with the most recently seen footer page number, and the remaining lines
    of each question are merged into paragraphs. Lines that appear before
    the first question heading of a chapter are ignored, and a question that
    appears before any chapter heading is discarded.

    Args:
        pdf_path: Path of the PDF file to read.
        output_path: Path of the Markdown file to write (UTF-8).

    Returns:
        A ``(chapter_count, question_count)`` tuple.
    """
    chapters = []
    current_chapter = None
    current_question = None
    current_page_num = None

    doc = pymupdf.open(pdf_path)
    try:
        for page in doc:
            text = page.get_text()

            # Remember the latest footer page number; pages without one
            # inherit the previous value.
            page_number = extract_page_number(text)
            if page_number:
                current_page_num = page_number

            stripped = (raw.strip() for raw in remove_page_numbers(text).split('\n'))
            valid_lines = [entry for entry in stripped if entry]

            i = 0
            while i < len(valid_lines):
                line = valid_lines[i]

                # 1. Chapter heading: close the open question and chapter.
                if is_chapter_start(line):
                    if current_question is not None and current_chapter is not None:
                        current_chapter['questions'].append(current_question)
                    if current_chapter is not None:
                        chapters.append(current_chapter)
                    current_chapter = {'title': line, 'questions': []}
                    current_question = None
                    i += 1
                    continue

                # 2. Question heading: close the open question, then read the
                #    (possibly multi-line) title and skip past it.
                if is_question_start(line):
                    if current_question is not None and current_chapter is not None:
                        current_chapter['questions'].append(current_question)
                    full_title, i = _collect_question_title(valid_lines, i)
                    current_question = {
                        'title': full_title,
                        'page': current_page_num,
                        'content': [],
                    }
                    continue

                # 3. Ordinary line: belongs to the open question, if any.
                if current_question is not None:
                    current_question['content'].append(line)
                i += 1
    finally:
        # Release the document even when parsing fails part-way through.
        doc.close()

    # Flush whatever is still open after the last page.
    if current_question is not None and current_chapter is not None:
        current_chapter['questions'].append(current_question)
    if current_chapter is not None:
        chapters.append(current_chapter)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(_render_markdown(chapters))

    return len(chapters), sum(len(ch['questions']) for ch in chapters)


# --- Script entry point ---
# Input PDFs, given as paths relative to the current working directory.
pdf_path_list = ["2026讲义答案.pdf", "2026刷题班讲义.pdf"]

for pdf_path in pdf_path_list:
    # Inputs that are not present are skipped without any message.
    if Path(pdf_path).exists():
        # The Markdown file is named after the PDF and written to the
        # current working directory.
        output_path = f"{Path(pdf_path).stem}_output.md"
        print(f"正在处理: {pdf_path} -> {output_path}")

        try:
            chapter_count, question_count = process_pdf(pdf_path, output_path)
            print(f"提取完成! 章节: {chapter_count}, 题目: {question_count}\n")
        except Exception as e:
            # Report the failure with its traceback and move on to the next
            # file instead of aborting the whole run.
            import traceback
            print(f"处理异常: {e}")
            traceback.print_exc()