# 预处理

### md文档转换成json

In [11]:
import re
import json

def parse_markdown_to_json(markdown_file_path, output_json_path):
    """
    Parse a markdown file of questions and answers and save them as JSON.

    Each question starts with a marker such as 【题 XX】, [題 XX], 【图XX】,
    【原 XX】 or [理 XX]. The text between that marker and the first answer
    marker (【分析】 or 【解】) is the question; the answer runs from the answer
    marker (inclusive) up to the next question marker. A block without an
    answer marker is stored as a question with an empty answer.

    Args:
        markdown_file_path: Path of the UTF-8 markdown file to read.
        output_json_path: Path of the JSON file to write. It receives a list
            of {"question_number", "question", "answer"} objects.

    Returns:
        None. Problems (missing or unreadable input, no markers found, write
        failure) are reported with print() instead of being raised.
    """
    try:
        with open(markdown_file_path, "r", encoding="utf-8") as f:
            markdown_content = f.read()
    except FileNotFoundError:
        print(f"Error: The file '{markdown_file_path}' was not found.")
        print("Please ensure the markdown file is in the same directory as the script,")
        print("or provide the correct path.")
        return
    except Exception as e:
        print(f"Error reading file '{markdown_file_path}': {e}")
        return

    # Question markers, in both bracket styles that occur in the document:
    #   【题...】 (e.g. 【题 21】), [題...] (e.g. [題 2]), 【图...】 (e.g. 【图24】),
    #   【原...】 (e.g. 【原 41】), [理...] (e.g. [理 23])
    problem_marker_pattern = r"(【题\s*[^】]+】|\[題\s*[^\]]+\]|【图\s*[^】]+】|【原\s*[^】]+】|\[理\s*[^\]]+\])"
    answer_markers = ("【分析】", "【解】")

    matches = list(re.finditer(problem_marker_pattern, markdown_content))
    if not matches:
        print("No questions found. Please check the markers in your markdown file.")
        return

    questions_data = []
    for i, match in enumerate(matches):
        # A block spans from the end of this marker to the start of the next
        # marker, or to the end of the file for the last question.
        if i + 1 < len(matches):
            block_end_index = matches[i + 1].start()
        else:
            block_end_index = len(markdown_content)
        current_block_content = markdown_content[match.end():block_end_index].strip()

        # "【题 21】" -> "题 21"
        q_question_number = re.sub(r"[【】\[\]]", "", match.group(1)).strip()

        # The answer starts at whichever answer marker appears first.
        marker_positions = [
            pos
            for pos in (current_block_content.find(marker) for marker in answer_markers)
            if pos != -1
        ]

        if marker_positions:
            answer_marker_pos = min(marker_positions)
            # Inner whitespace around the split point is kept on purpose
            # (the original left the .strip() variants commented out).
            question_text = current_block_content[:answer_marker_pos]
            answer_text = current_block_content[answer_marker_pos:]
        else:
            # No answer marker: the whole block is the question
            # (e.g. a 【图X】 item without a separate analysis/solution).
            question_text = current_block_content
            answer_text = ""

        questions_data.append({
            "question_number": q_question_number,
            "question": question_text,
            "answer": answer_text
        })

    try:
        with open(output_json_path, "w", encoding="utf-8") as f:
            json.dump(questions_data, f, ensure_ascii=False, indent=4)
        print(f"Successfully extracted {len(questions_data)} questions to '{output_json_path}'")
    except Exception as e:
        print(f"Error writing JSON to file '{output_json_path}': {e}")

# NOTE(review): nothing visible in this cell reads this module-level `question_text`;
# it looks like a leftover — confirm before removing.
question_text = ""
# 1. Save the markdown content you provided into a file named "questions.md"
#    in the same directory as this Python script.
# 2. Run this script. It will generate "questions_extracted.json".

markdown_file = "questions.md"  # Input file name
json_file = "questions_extracted.json" # Output file name

parse_markdown_to_json(markdown_file, json_file)

Successfully extracted 98 questions to 'questions_extracted.json'


In [4]:
!pip install python-dotenv



In [9]:
from openai import OpenAI
import os
import base64
from dotenv import load_dotenv 
load_dotenv()

# Prompt sent to the vision model: it asks the model to read the question/answer
# screenshots and return the extracted five-field records as strict JSON.
# NOTE(review): this raw string is not passed through str.format in the visible cells,
# so the doubled braces in rule 8 ({{AB}}) reach the model literally — confirm that is intended.
prompt_extra = r"""
识别图中的问题及其参考答案,按照以下规则提取出["question_number","condition","specific_questions","solution","final_answer"],并返回一个JSON列表,如果遇到换行，请使用\n表示。

其中:
"question_number": 字符串,题号
"condition": 原题的题干,直接复制原题的Latex内容,然后参考转换规则修改,
"specific_questions": 原题的设问,直接复制原题的Latex内容,然后参考转换规则修改,
"solution": 子问的逐步solution,改写为独立,不包含图示,且不直接引用其他部分结论。如需前部结果,应作为已知条件说明,
"final_answer": 一个数值或公式,不要任何汉字、条件、单位, 不要出现 "=","\n","\box"


转换规则:
1. 输出格式: 输出必须是一个JSON格式{"result":[字典列表]}, 每个字典必须是latex格式,确保能用latex编译器编译通过
2. 单个子问对应一个 JSON 对象: 若原题包含多个子问题(如 1., 2., a., b.),拆分为多个对象,question_number 用原 question_number.1/2/3/4...
3. 子问题独立: 每个对象必须尽量自包含。"condition"和"solution"避免出现"由第 1 部分得出"等表述,若需前部结果,请将结果写入该部分的"condition"
4. 转换选填题: 若"specific_questions"为选择题或填空题,需转换为计算题并给出数值或公式;不可行则跳过。
5. 内容完整: 保留题目核心物理概念和数值,改写不改变实质。
6. 结构: 严格包含"question_number"、"condition"、"specific_questions"、"solution"、"final_answer"五个字段。
7. 需要删除的内容 :【多余的补位符号(*#?-)】【题目序号】【如图xx所示】【某个大学】【某个省份】,"final_answer"里的汉字、单位、条件
8. 需要核对的内容 : 核对原题目和答案汉字的正确性:人射 vs 入射、代人 vs 代入、收玫 vs 收敛;latex代码的正确性:$\\overrightarrow{{AB}}$ vs $\\overline{{AB}}$
9. 需要修改的格式 : \\[替换为 换行$$,\\]替换为 $$换行,\\(替换为$,\\)替换为$;
10.确保你的回答可以被json.loads(your_answer)正确解析，包括检查：所有键名和字符串值必须使用 双引号 " 包裹，不能使用单引号 '。所有反斜杠 \ 必须写成 双反斜杠 \\，特别是用于 LaTeX 公式时。不允许对象或数组中最后一个元素后添加逗号（例如：[1, 2, 3] 是合法的，[1, 2, 3,] 是非法的）。如果包含换行，应使用 \\n 转义表示，不直接写入回车符。JSON 中的布尔值和空值必须是小写形式：true、false、null，不能使用 Python 的 True、False、None。确保整个 JSON 是一个结构完整、合法的对象或数组。输出必须是 纯 JSON 字符串，不要包含任何解释、注释或多余文本。Latex代码需要使用$$包裹。
"""
#  base 64 编码格式


def encode_image(image_path):
    """Read the file at ``image_path`` as bytes and return its Base64 encoding as a UTF-8 string."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

# Replace these paths with the location of your local images
base64_image1 = encode_image("images/1.png")
base64_image2 = encode_image("images/2.png")

# OpenAI-compatible client pointed at the DashScope endpoint
qwen_client = OpenAI(
    # If the environment variable is not configured, replace the next line with your Bailian API key: api_key="sk-xxx"
    api_key= os.getenv("qwen_api"),
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)

def call_qwen(prompt):
    """Send the two pre-encoded images, plus ``prompt``, to qwen-vl-max and return the reply text.

    The system message is the module-level ``prompt_extra``; the user message
    carries ``base64_image1`` and ``base64_image2`` as PNG data URLs. The
    original ignored ``prompt`` entirely; it is now appended to the user
    message as a text part when it is non-empty.

    Args:
        prompt: Extra user text for this request. An empty value sends the
            images only, which is what the original always did.

    Returns:
        The ``content`` of the first choice. JSON-object mode is requested,
        so this is expected to be a JSON string.
    """
    user_content = [
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{base64_image1}"}
        },
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{base64_image2}"}
        },
    ]
    if prompt:
        user_content.append({"type": "text", "text": prompt})

    qwen_completion = qwen_client.chat.completions.create(
        model="qwen-vl-max",  # Swap the model name as needed. Model list: https://help.aliyun.com/model-studio/getting-started/model
        messages=[
            {
                "role": "system",
                "content": [{"type": "text", "text": prompt_extra}]},
            {
                "role": "user",
                "content": user_content,
            }
        ],
        response_format={"type": "json_object"}
    )
    return qwen_completion.choices[0].message.content


[
    {
        "question_number": "12",
        "condition": "$$\\text{一根长为 } L \\text{ 的水平粗管与一根竖直细管连接成如图所示的形状．把细管下端插入密度为 } \\rho_{f} \\text{ 的液体之中，然后将粗管的管口封住，并使粗管绕细管以恒定的微小角速度 } \\omega \\text{ 旋转．已知空气的密度为 } \\rho_{a} \\text{, 细管的体积与粗管相比可以忽略, 毛细现象也可忽略．在温度恒定的条件下，试求细管中液面上升的高度 } h \\text{.}$$",
        "specific_questions": "$$\\text{在随粗管一起匀角速转动的非惯性系中, 粗管内被封闭的空气沿粗管水平长度方向有一定的惯性离心势能分布, 使得平衡时粗管内被封闭的空气分子的数密度 } n \\text{ 沿粗管长度水平方向有相应的玻耳兹曼分布．结果, 在粗管中高细管较远处的 } n \\text{ 大于封闭旋转前的 } n_{0} \\text{, 在粗管中靠近细管处的 } n \\text{ 小于 } n_{0} \\text{．封闭旋转前的 } n_{0} \\text{ 与大气压强 } p_{0} \\text{ 对应．使得粗管中靠近细管处的空气压强 } p_{0}' \\text{ 小于 } p_{0} \\text{, 导致液体被抽上细管．细管内液体上升的高度取决于 } p_{0}' \\text{ 与 } p_{0} \\text{ 之差, 最终达到平衡．}$$",
        "solution": "$$\\text{取以 } \\omega \\text{ 旋转的非惯性系, 取 } x \\text{ 轴沿粗管水平长度方向, 原点在细管处．则粗管中在 } x \\text{ 位置的空气分子具有的惯性离心势能为}$$\n$$E_{p}(x) = - \\frac{1}{2} m \\omega^{2} x^{2}$$\n$$\\text{式中 } m \\text{ 是分子质量, } E_{p}(x) \\text{ 使得粗管中的气体分子数密度沿 } x \\text{ 轴按玻耳兹曼分布, 为}$$\n$$n(x) = n(0

In [13]:
import json
# NOTE(review): `completion` is not assigned in any earlier visible cell; the only visible
# assignment is in the qwen-vl-plus request cell further down — confirm the execution order.
json.loads(completion.choices[0].message.content)

[{'question_number': '12',
  'condition': '$$\\text{一根长为 } L \\text{ 的水平粗管与一根竖直细管连接成如图所示的形状．把细管下端插入密度为 } \\rho_{f} \\text{ 的液体之中，然后将粗管的管口封住，并使粗管绕细管以恒定的微小角速度 } \\omega \\text{ 旋转．已知空气的密度为 } \\rho_{a} \\text{, 细管的体积与粗管相比可以忽略, 毛细现象也可忽略．在温度恒定的条件下，试求细管中液面上升的高度 } h \\text{.}$$',
  'specific_questions': "$$\\text{在随粗管一起匀角速转动的非惯性系中, 粗管内被封闭的空气沿粗管水平长度方向有一定的惯性离心势能分布, 使得平衡时粗管内被封闭的空气分子的数密度 } n \\text{ 沿粗管长度水平方向有相应的玻耳兹曼分布．结果, 在粗管中高细管较远处的 } n \\text{ 大于封闭旋转前的 } n_{0} \\text{, 在粗管中靠近细管处的 } n \\text{ 小于 } n_{0} \\text{．封闭旋转前的 } n_{0} \\text{ 与大气压强 } p_{0} \\text{ 对应．使得粗管中靠近细管处的空气压强 } p_{0}' \\text{ 小于 } p_{0} \\text{, 导致液体被抽上细管．细管内液体上升的高度取决于 } p_{0}' \\text{ 与 } p_{0} \\text{ 之差, 最终达到平衡．}$$",
  'solution': "$$\\text{取以 } \\omega \\text{ 旋转的非惯性系, 取 } x \\text{ 轴沿粗管水平长度方向, 原点在细管处．则粗管中在 } x \\text{ 位置的空气分子具有的惯性离心势能为}$$\n$$E_{p}(x) = - \\frac{1}{2} m \\omega^{2} x^{2}$$\n$$\\text{式中 } m \\text{ 是分子质量, } E_{p}(x) \\text{ 使得粗管中的气体分子数密度沿 } x \\text{ 轴按玻耳兹曼分布, 为}$$\n$$n(x) = n(0) e^{-E_{p}(x)/kT} = n(0) e^{m \

In [None]:
import os
from openai import OpenAI

# SECURITY: the API key used to be hard-coded here. Treat that key as leaked
# (revoke/rotate it) and supply the key through the environment instead,
# the same way the other cells of this notebook do.
client = OpenAI(
    api_key=os.getenv("qwen_api"),
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)
completion = client.chat.completions.create(
    model="qwen-vl-plus",  # qwen-vl-plus is used as an example; swap the model name as needed. Model list: https://help.aliyun.com/zh/model-studio/getting-started/models
    messages=[{"role": "user","content": [
            {"type": "text","text": "这是什么"},
            {"type": "image_url",
             "image_url": {"url": "https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg"}}
            ]}]
    )
print(completion.model_dump_json())

### 测试——查找文字所在pdf页数

In [19]:
import sys
import pdfplumber
import os
import json
import time

# 提取PDF文本并保存到文件中
def extract_pdf_text(pdf_path, save_dir=None):
    """
    Extract the text of every page of a PDF and cache it in a JSON file.

    The JSON file is named after the PDF (``<pdf name>.json``) and stores
    ``pdf_path``, ``extraction_time``, ``total_pages`` and a ``pages`` mapping
    from 1-based page number (as a string) to that page's text. If the JSON
    file already exists, extraction is skipped and its path is returned.

    Args:
        pdf_path (str): Path of the PDF file.
        save_dir (str, optional): Directory (not a file path) in which to
            store the JSON file. Defaults to the directory containing the PDF.

    Returns:
        str | None: Path of the JSON file, or None if extraction failed
        (the error is printed).
    """
    # Default to the PDF's own directory.
    if save_dir is None:
        save_dir = os.path.dirname(pdf_path)

    # dirname() of a bare file name is "", and os.makedirs("") raises; an empty
    # save_dir means "current directory", which needs no creation.
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    # Output file name: PDF base name + ".json"
    pdf_filename = os.path.basename(pdf_path)
    pdf_name_without_ext = os.path.splitext(pdf_filename)[0]
    save_path = os.path.join(save_dir, f"{pdf_name_without_ext}.json")

    # Reuse a previous extraction if there is one.
    if os.path.exists(save_path):
        print(f"找到已提取的文本文件: {save_path}")
        return save_path

    print("开始提取PDF文本...")
    start_time = time.time()

    extracted_data = {
        "pdf_path": pdf_path,
        "extraction_time": time.strftime("%Y-%m-%d %H:%M:%S"),
        "total_pages": 0,
        "pages": {}
    }

    try:
        with pdfplumber.open(pdf_path) as pdf:
            total_pages = len(pdf.pages)
            extracted_data["total_pages"] = total_pages

            print(f"PDF总页数: {total_pages}")

            for page_num, page in enumerate(pdf.pages):
                current_page = page_num + 1  # page numbers are 1-based
                print(f"正在提取第 {current_page}/{total_pages} 页...")

                # extract_text() result falls back to "" when it is falsy
                text = page.extract_text() or ""
                extracted_data["pages"][str(current_page)] = text

        with open(save_path, "w", encoding="utf-8") as f:
            json.dump(extracted_data, f, ensure_ascii=False, indent=2)

        end_time = time.time()
        print(f"文本提取完成！用时 {end_time - start_time:.2f} 秒")
        print(f"文本已保存至: {save_path}")

        return save_path

    except Exception as e:
        print(f"提取PDF文本时发生错误: {e}")
        return None

# 在已保存的PDF文本中查找指定文本
def find_text_in_saved_pdf(text_file_path, search_text):
    """
    Look up which pages of a saved PDF-text JSON file contain ``search_text``.

    Args:
        text_file_path (str): Path of the JSON file holding the saved text.
        search_text (str): Text to look for.

    Returns:
        list: 1-based page numbers whose text contains ``search_text``, in
        ascending order. If reading or searching raises, the error is printed
        and the pages found so far (possibly none) are returned.
    """
    matching_pages = []

    try:
        with open(text_file_path, "r", encoding="utf-8") as handle:
            saved = json.load(handle)

        page_count = saved.get("total_pages", 0)
        texts_by_page = saved.get("pages", {})

        page_number = 1
        while page_number <= page_count:
            # Pages missing from the mapping count as empty text.
            page_text = texts_by_page.get(str(page_number), "")
            if search_text in page_text:
                matching_pages.append(page_number)
            page_number += 1

    except Exception as e:
        print(f"查找文本时发生错误: {e}")

    return matching_pages

# Helpers for collecting 5-character all-Chinese substrings of a text
def is_chinese_char(char):
    """
    Tell whether ``char`` lies in the basic CJK Unified Ideographs block.

    Only U+4E00 through U+9FFF is checked; widen the bounds here if the
    extension blocks (A, B, ...) need to be recognised as well.
    """
    lowest, highest = "\u4e00", "\u9fff"
    return lowest <= char and char <= highest

def get_consecutive_chinese_chars(text, length=5):
    """
    Collect every run of ``length`` consecutive Chinese characters in ``text``.

    Despite what the old docstring said, nothing is chosen at random here:
    all qualifying substrings are returned, in order of their start position
    (overlapping windows included), and the caller picks from them.

    Args:
        text (str): Text to scan. ``None`` or empty text yields an empty list.
        length (int): Window size; defaults to 5, the size the original hard-coded.

    Returns:
        list[str]: All substrings of ``text`` with exactly ``length`` characters
        that consist solely of Chinese characters (per ``is_chinese_char``).
        Always a list — the original returned "" for short input, which made
        the return type depend on the input.

    Raises:
        ValueError: If ``length`` is smaller than 1.
    """
    if length < 1:
        raise ValueError("length must be at least 1")
    if not text or len(text) < length:
        return []

    possible_substrings = []
    # Slide a window of `length` characters over every start position.
    for i in range(len(text) - length + 1):
        substring = text[i:i + length]
        if all(is_chinese_char(char) for char in substring):
            possible_substrings.append(substring)

    return possible_substrings






# Paths of the PDF and of its extracted-text JSON file
pdf_path = r"物理学难题集萃(增订本)【舒幼生等】_part1(OCR).pdf"
json_path = r"物理学难题集萃(增订本)【舒幼生等】_part1(OCR).json"

# extract_pdf_text(pdf_path, save_dir=json_path)

# Sample problem statement (presumably the text whose source page is being looked up)
condition = "1 mol 单原子分子理想气体所经准静态循环过程是如热图2-21-1所示的圆，有关参量已在热图2-21-1 中标明。"

# NOTE(review): `page` is not assigned anywhere in this cell; the displayed value appears to
# come from kernel state left by code that is no longer in the notebook — confirm.
page

PDF总页数: 570
PDF总页数: 570
PDF总页数: 570
PDF总页数: 570
PDF总页数: 570


403

In [20]:
# NOTE(review): no earlier visible cell assigns a top-level `pages`; the list shown below
# presumably comes from kernel state left by a since-edited cell — confirm.
pages

[403, 372, 403, 398, 403]

In [None]:
import os
import json
import time
import datetime
import subprocess
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
from prompts import *
from utils import *
import random
import threading
import numpy as np
import argparse
import base64
import glob

# Load environment variables from a .env file, if one exists
load_dotenv()

# Configure the API client (OpenAI-compatible client pointed at the DeepSeek endpoint)
client = OpenAI(
    api_key=os.getenv("deepseek_api"),
    base_url="https://api.deepseek.com",
)
MODEL = "deepseek-chat"

def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a Base64 string (UTF-8 decoded)."""
    with open(image_path, "rb") as source:
        payload = source.read()
    return str(base64.b64encode(payload), "utf-8")

# NOTE(review): leftover comment from an earlier cell ("replace xxxx/eagle.png with the
# absolute path of your local image"); no image path is set in this cell.


# OpenAI-compatible client pointed at the DashScope endpoint
qwen_client = OpenAI(
    # If the environment variable is not configured, replace the next line with your Bailian API key: api_key="sk-xxx"
    api_key= os.getenv("qwen_api"),
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)

def call_qwen(prompt, base64_images=None):
    """Call the Qwen vision API with a text prompt and any number of images.

    The system message is the module-level ``prompt_extra``. The user message
    holds ``prompt`` as a text part followed by one ``image_url`` part per
    image. The original built the user message from the images only, so the
    question/answer text passed by callers never reached the model.

    Args:
        prompt: User text for this request; omitted from the message when empty.
        base64_images: Iterable of Base64-encoded images, each sent as a
            ``data:image/png;base64,...`` URL. ``None`` is treated as no images.

    Returns:
        The ``content`` of the first choice. JSON-object mode is requested,
        so this is expected to be a JSON string.
    """
    system_content = [{"type": "text", "text": prompt_extra}]

    # User message: the text first, then every image.
    user_content = []
    if prompt:
        user_content.append({"type": "text", "text": prompt})
    for img in base64_images or []:
        user_content.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{img}"}
        })

    qwen_completion = qwen_client.chat.completions.create(
        model="qwen-vl-max",
        messages=[
            {"role": "system", "content": system_content},
            {"role": "user", "content": user_content}
        ],
        response_format={"type": "json_object"}
    )
    return qwen_completion.choices[0].message.content

# 默认文件路径配置


def call_deepseek(prompt):
    """Send ``prompt`` as a single user message to the DeepSeek client and return the reply text.

    Uses the module-level ``client`` and ``MODEL`` and requests JSON-object mode.
    """
    user_turn = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model=MODEL,
        messages=[user_turn],
        response_format={"type": "json_object"},
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def run_llm_process(llm, ques_id):
    """Run the node CLI for one LLM and return the output lines it produced.

    The command's stdout and stderr are merged, echoed line by line and
    collected. Reading stops at end of output or right after the first line
    containing "全部处理完成"; the child process is not waited on afterwards.

    Args:
        llm: Value passed to the CLI's ``-l`` option.
        ques_id: Question id; the CLI reads ``./input/<ques_id>.json``.

    Returns:
        list[str]: The lines read, each with its trailing newline.
    """
    command = rf"node src\index.js -l {llm} -i ./input/{ques_id}.json -a zht"
    popen_options = dict(
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1,
        universal_newlines=True,
        encoding='utf-8',
    )
    child = subprocess.Popen(command, **popen_options)

    captured_lines = []
    while True:
        current_line = child.stdout.readline()
        if current_line == "":
            # readline() returns "" at end of stream
            break
        print(current_line, end="")
        captured_lines.append(current_line)
        if "全部处理完成" in current_line:
            break
    return captured_lines

# Each LLM step is tried this many times. The original loops gave up once the
# failure counter exceeded 5, i.e. after the 6th failed attempt.
_MAX_LLM_ATTEMPTS = 6

# (CLI/model folder name, key of that model's verdict in the judge reply, label in the result sheet)
_JUDGED_MODELS = (
    ("qianwen", "学生1", "千问"),
    ("deepseek", "学生2", "ds"),
    ("doubao", "学生3", "豆包"),
)


def _call_with_retries(banner, step):
    """Run ``step()`` until it returns without raising; None after _MAX_LLM_ATTEMPTS failures.

    ``banner(n)`` builds the progress line printed before attempt ``n``.
    """
    for attempt in range(1, _MAX_LLM_ATTEMPTS + 1):
        print(banner(attempt))
        try:
            return step()
        except Exception as e:
            print(e)
    return None


def _normalize_extracted_items(parsed):
    """Return the list of record dicts from a parsed extraction reply (bare list, {"result": [...]}, or one record)."""
    if isinstance(parsed, dict):
        parsed = parsed["result"] if isinstance(parsed.get("result"), list) else [parsed]
    if not isinstance(parsed, list) or not all(isinstance(entry, dict) for entry in parsed):
        raise ValueError("extraction reply is not a list of JSON objects")
    return parsed


def _model_artifact_path(llm, kind, ques_id, extension):
    r"""Build ``src\outputs\<llm>\<llm>_<kind>_<ques_id>.<extension>`` (backslash-separated, as before)."""
    return "\\".join(("src", "outputs", llm, f"{llm}_{kind}_{ques_id}.{extension}"))


def _read_model_answer(llm, ques_id, error_label, drop_text=None):
    """Return the first entry of "messages" in a model's output JSON, or "" if it is missing or unreadable."""
    try:
        with open(_model_artifact_path(llm, "output", ques_id, "json"), "r", encoding="utf-8") as f:
            data = json.load(f)
        messages = data.get("messages")
        answer = messages[0] if messages else ""
        if drop_text is not None:
            answer = answer.replace(drop_text, "")
        return answer
    except Exception as e:
        print(f"[{error_label}] {str(e)}")
        return ""


def process_question(problem_obj, json_path, base64_images=None):
    """Process one problem: extract sub-questions, collect three models' answers, and build sheet rows.

    Steps, per problem:
      1. Skip problems whose question text contains "图" (figure) or "证" (proof).
      2. Ask the Qwen vision model (``call_qwen``) to extract the five-field records.
      3. For every record: save it under ``input/``, run the three model CLIs in
         parallel threads, read their answers, and let DeepSeek judge them.
      4. If at least one model is judged wrong, query DeepSeek for grade/sub-subject,
         knowledge points/analysis and wrong-method/pitfalls, look up the source
         page in the saved PDF text, and append a result row.

    Fixes relative to the previous version:
      * ``exit()`` in the extraction retry loop is gone, so a bad reply is retried
        instead of stopping the kernel, and the raw reply is only printed if one
        was received.
      * The judge prompt separates the student answers with "\\n" (the old
        "\\学生N答案:" literals were invalid escapes that produced a backslash).
      * A ``{"result": [...]}`` reply is unwrapped instead of being iterated as keys.
      * The analysed wrong answer now comes from the same model that is reported
        as "错误解法模型" (the first model judged wrong).
      * If a follow-up step keeps failing, only that sub-question is skipped;
        rows already built for this problem are kept.

    Args:
        problem_obj: Dict with "id", "question" and "answer" (missing keys are
            treated as absent/empty).
        json_path: Path of the saved PDF-text JSON used for the source-page lookup.
        base64_images: Base64-encoded images passed on to ``call_qwen``;
            ``None`` means no images.

    Returns:
        list[dict] | None: One row per sub-question with at least one wrong model
        (possibly empty), or None if the problem was skipped or extraction failed.
    """
    question_id = problem_obj.get('id')
    print(f"\n-------------------处理题目 (question_number: {question_id})----------------------")

    question_text = problem_obj.get("question", "")
    if "图" in question_text:
        print("[跳过]跳过带图题")
        return None
    if "证" in question_text:
        print("[跳过]跳过证明题")
        return None

    # ---- Extract the five-field records ----
    extraction_prompt = (
        f"{prompt_extra}\nQURSTION:【{question_id}】{question_text}"
        f"\nANSWER:{problem_obj.get('answer', '')}"
    )
    ans_json = None
    for attempt in range(1, _MAX_LLM_ATTEMPTS + 1):
        ans = None
        print(f"[提取五元组]第{attempt}次尝试,使用模型qwen-vl-max...")
        try:
            ans = call_qwen(extraction_prompt, base64_images or [])
            # Raw newlines are dropped before parsing; escaped "\n" sequences are untouched.
            ans_json = _normalize_extracted_items(json.loads(ans.replace('\n', '')))
            break
        except Exception as e:
            print(f"[提取五元组]Failed:{e}")
            if ans is not None:
                print(ans)
    if ans_json is None:
        return None

    print(f"[提取五元组]Success:成功从【{question_id}】中提取出{len(ans_json)}个五元组")

    results = []
    os.makedirs("input", exist_ok=True)

    for index, item in enumerate(ans_json):
        step_tag = f"处理第{index+1}个问题"
        print(f"[{step_tag}]正在处理第{index+1}个五元组...")

        # A timestamp suffix is appended to the question number to form the file id.
        now = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        item["question_number"] = f"{item.get('question_number', index + 1)}_{now}"
        ques_id = item["question_number"]

        # Save the record as the input file of the model CLIs.
        with open(f"input/{ques_id}.json", "w", encoding="utf-8") as f:
            json.dump([item], f, ensure_ascii=False, indent=4)

        print(f"[{step_tag}]{ques_id}.json file saved")

        # Run the three models in parallel threads and wait for all of them.
        threads = []
        for llm in ["deepseek", "qianwen", "doubao"]:
            thread = threading.Thread(target=run_llm_process, args=(llm, ques_id))
            threads.append(thread)
            thread.start()
        for thread in threads:
            thread.join()

        print(f"\n[{step_tag}]三模型回答截图完毕")

        # ---- Read the model outputs ----
        answers = {
            "qianwen": _read_model_answer("qianwen", ques_id, "读取千问输出错误"),
            "deepseek": _read_model_answer("deepseek", ques_id, "读取deepseek输出错误"),
            "doubao": _read_model_answer("doubao", ques_id, "读取豆包输出错误", drop_text="正在搜索\n"),
        }
        if not any(answers.values()):
            print(f"[{step_tag}] 三个模型均无有效输出，跳过此题")
            continue

        # ---- Judge the three answers ----
        def judge():
            judge_prompt = (
                prompt_judge + item["condition"] + item["specific_questions"]
                + "\n正确答案:" + item["solution"]
                + "\n学生1答案:" + answers["qianwen"]
                + "\n学生2答案:" + answers["deepseek"]
                + "\n学生3答案:" + answers["doubao"]
            )
            verdicts = json.loads(call_deepseek(judge_prompt))
            # NOTE(review): substring test kept from the original; a verdict such as
            # "不正确" would also match "正确" — confirm the wording prompt_judge requests.
            return {llm: "正确" in verdicts[student] for llm, student, _ in _JUDGED_MODELS}

        is_correct = _call_with_retries(
            lambda n: f"    [{step_tag}-判断三模型答案对错]第{n}次尝试", judge)
        if is_correct is None:
            print(f"[{step_tag}] 判断答案对错多次失败，跳过此题")
            continue

        wrong = [(llm, label) for llm, _, label in _JUDGED_MODELS if not is_correct[llm]]
        if not wrong:
            print("[三模型都答对了]下一题]")
            continue
        print(f"[三模型有答错]正在提取内容，使用模型{MODEL}...")

        wrong_num = len(wrong)
        cwjfmx = wrong[0][1]                      # model whose wrong solution is analysed
        wrong_ans = answers[wrong[0][0]]          # ...and that same model's answer
        cwmx = ",".join(label for _, label in wrong)
        screenshots = {
            llm: _model_artifact_path(llm, "screenshot", ques_id, "png") for llm, _ in wrong
        }

        # ---- Suitable grade and sub-subject ----
        def classify():
            reply = json.loads(call_deepseek(
                prompt_question + item["condition"] + item["specific_questions"]))
            return reply["适合年级"], reply["子学科"]

        classified = _call_with_retries(
            lambda n: f"    [{step_tag}-提取['适合年级', '子学科']]第{n}次尝试", classify)
        if classified is None:
            print(f"[{step_tag}] 提取适合年级和子学科多次失败，跳过此题")
            continue
        shnj, zxk = classified

        # ---- Knowledge points and analysis ----
        def analyse():
            reply = json.loads(call_deepseek(
                prompt_answer + item["condition"] + item["specific_questions"]
                + "\n参考答案:" + item["solution"]))
            return reply["考察知识点"], reply["分析过程"]

        analysed = _call_with_retries(
            lambda n: f"    [{step_tag}-提取['考察知识点', '分析过程']]第{n}次尝试", analyse)
        if analysed is None:
            print(f"[{step_tag}] 提取考察知识点和分析过程多次失败，跳过此题")
            continue
        kczsd, fxgc = analysed

        # ---- Wrong solution method and common pitfalls ----
        def diagnose():
            reply = json.loads(call_deepseek(
                prompt_wrong + item["condition"] + item["specific_questions"]
                + "\n错误答案:" + wrong_ans + "\n正确答案:" + item["solution"]))
            return reply["错误解题方法"], reply["易错点"]

        diagnosed = _call_with_retries(
            lambda n: f"    [{step_tag}-提取['错误解题方法','易错点']]第{n}次尝试", diagnose)
        if diagnosed is None:
            print(f"[{step_tag}] 提取错误解题方法和易错点多次失败，跳过此题")
            continue
        cwjtff, ycd = diagnosed

        # ---- Source page: look up to five random 5-character snippets in the
        # saved PDF text and take the mode of the first-hit pages ----
        possible_substrings = get_consecutive_chinese_chars(item["condition"])
        pages = []
        if possible_substrings:
            for _ in range(min(5, len(possible_substrings))):
                search_text = random.choice(possible_substrings)
                page_result = find_text_in_saved_pdf(json_path, search_text)
                if page_result and page_result[0] is not None:
                    pages.append(page_result[0])

        page = find_mode(pages) if pages else None

        # ---- Build the result row ----
        result_row = {
            'id': f"zht_{len(results)+1:03d}",
            "问题条件": item["condition"],
            "具体问题": item["specific_questions"],
            "问题数目": 1,
            "适合年级": shnj,
            "题目类型": "计算题",
            "题目学科": "物理",
            "子学科": zxk,
            "领域类型": "自然科学",
            "是否包含图片": "否",
            "考察知识点": kczsd,
            "易错点": ycd,
            "思考过程/分析": fxgc,
            "解题过程": item["solution"],
            "最终答案": item.get("final_answer", ""),
            "错误解题方法": cwjtff,
            "错误解法模型": cwjfmx,
            "错误模型": cwmx,
            "三模型打分": wrong_num,
            "deepseek": screenshots.get("deepseek"),
            "千问": screenshots.get("qianwen"),
            "豆包": screenshots.get("doubao"),
            "题目来源": f"数学物理方程学习指导与习题解答 (陈才生) (Z-Library),第{page}页" if page else "未知",
        }
        results.append(result_row)

    return results



    
# 确保输出目录存在
   
pdf_path = "数学物理方程学习指导与习题解答 (陈才生) (Z-Library)(OCR).pdf"
json_path = "数学物理方程学习指导与习题解答 (陈才生) (Z-Library)(OCR).json"
output_dir = "三模型表"
images_dir = "../images"

# (Paths are hard-coded here; no command-line parsing is performed.)


print(f"使用参数:\nPDF文件: {pdf_path}\nJSON文件: {json_path}\n输出目录: {output_dir}\n图片目录: {images_dir}")

# Column order of the result sheet
result_columns = ['id',"问题条件","具体问题","问题数目","适合年级","题目类型","题目学科","子学科","领域类型","是否包含图片","考察知识点","易错点","思考过程/分析","解题过程","最终答案","错误解题方法","错误解法模型","错误模型","三模型打分","deepseek","千问","豆包","题目来源"]

# Extract the PDF text. extract_pdf_text() expects a *directory* in save_dir and
# names the file "<pdf name>.json" itself; passing json_path (a file path) made it
# create a directory called "....json", so the later lookups could not open it.
# NOTE(review): if an earlier run already created such a directory, remove it first.
extracted_json_path = extract_pdf_text(pdf_path, save_dir=os.path.dirname(os.path.abspath(json_path)))
if extracted_json_path:
    json_path = extracted_json_path

# Every sub-directory of images_dir is one problem; sorted for a reproducible order.
subfolders = sorted(f for f in os.listdir(images_dir) if os.path.isdir(os.path.join(images_dir, f)))
print(f"找到 {len(subfolders)} 个子文件夹作为问题输入")

# Rows are collected in a list and turned into a DataFrame once at the end
# (growing a DataFrame with pd.concat inside the loop is quadratic).
all_rows = []

for subfolder in subfolders:
    subfolder_path = os.path.join(images_dir, subfolder)
    print(f"处理子文件夹: {subfolder_path}")

    # All images of the problem, in file-name order so multi-image problems
    # are always sent in the same sequence.
    image_files = sorted(
        glob.glob(os.path.join(subfolder_path, "*.png"))
        + glob.glob(os.path.join(subfolder_path, "*.jpg"))
    )

    if not image_files:
        print(f"子文件夹 {subfolder} 中没有图片，跳过")
        continue

    # Optional problem description stored next to the images
    question_info_path = os.path.join(subfolder_path, "question.json")
    if os.path.exists(question_info_path):
        try:
            with open(question_info_path, "r", encoding="utf-8") as f:
                problem_obj = json.load(f)
        except Exception as e:
            print(f"读取问题信息文件失败: {e}")
            continue
    else:
        # No description file: fall back to a placeholder problem object.
        problem_obj = {
            "id": subfolder,
            "question": f"来自文件夹 {subfolder} 的问题",
            "answer": "无答案"
        }

    # Convert every image to Base64; unreadable images are reported and skipped.
    base64_images = []
    for image_file in image_files:
        try:
            base64_img = encode_image(image_file)
            base64_images.append(base64_img)
            print(f"已转换图片: {image_file}")
        except Exception as e:
            print(f"转换图片 {image_file} 失败: {e}")

    if not base64_images:
        print(f"子文件夹 {subfolder} 中没有可用的图片，跳过")
        continue

    results = process_question(problem_obj, json_path, base64_images)
    if results:
        all_rows.extend(results)

data = pd.DataFrame(all_rows, columns=result_columns)
# process_question numbers its rows from zht_001 for every problem; renumber so
# that ids are unique across the whole sheet.
data["id"] = [f"zht_{n:03d}" for n in range(1, len(data) + 1)]

# Make sure the output directory exists, then save the Excel file.
os.makedirs(output_dir, exist_ok=True)
now_time = time.strftime("%Y-%m-%d_%H%M%S", time.localtime())
file_path = os.path.join(output_dir, f"国内三模型_{now_time}.xlsx")
save_to_excel(data, file_path)
print(f"已保存Excel文件: {file_path}")
print("所有问题处理完毕")



    # python pipeline\test.py

使用参数:
PDF文件: 数学物理方程学习指导与习题解答 (陈才生) (Z-Library)(OCR).pdf
JSON文件: 数学物理方程学习指导与习题解答 (陈才生) (Z-Library)(OCR).json
输出目录: 三模型表
图片目录: ../images
找到 18 个子文件夹作为问题输入
处理子文件夹: ../images\043
已转换图片: ../images\043\Snipaste_2025-05-19_15-02-30.png

-------------------处理题目 (question_number: 043)----------------------
[提取五元组]第1次尝试,使用模型qwen-vl-max...
[提取五元组]Failed:Invalid \escape: line 1 column 241 (char 240)
[
  {
    "question_number": "3.5",
    "condition": "$$\\begin{cases} u_{tt} - u_{xx} = 0, & x > 0, t > 0, \\\\ u(x, 0) = u_t(x, 0) = 0, & x > 0, \\\\ u(0, t) = \\frac{t}{1 + t}, & t \\geqslant 0 \\end{cases}$$",
    "specific_questions": "求解 \( u(x, t) \)，然后证明对任意 \( c > 0 \)，极限 \( \\lim_{{x \\to +\\infty}} u(cx, x) \) 存在，并且求出该极限。",
    "solution": "设 \( u(x, t) = v(x, t) + \\frac{t}{1 + t} \)，则 \( v(x, t) \) 满足 $$\\begin{cases} v_{tt} - v_{xx} = \\frac{2}{(1 + t)^3}, & x > 0, t > 0, \\\\ v(x, 0) = 0, v_t(x, 0) = -1, & x \\geqslant 0, \\\\ v(0, t) = 0, & t \\geqslant 0. \\end{cases}$$ 由文献 [1] 中第 3 章半无

In [None]:

ans = """[
  {
    "question_number": "3.5",
    "condition": "$$\\begin{cases} u_{tt} - u_{xx} = 0, & x > 0, t > 0, \\\\ u(x, 0) = u_t(x, 0) = 0, & x > 0, \\\\ u(0, t) = \\frac{t}{1 + t}, & t \\geqslant 0 \\end{cases}$$",
    "specific_questions": "求解 \( u(x, t) \)，然后证明对任意 \( c > 0 \)，极限 \( \\lim_{{x \\to +\\infty}} u(cx, x) \) 存在，并且求出该极限。",
    "solution": "设 \( u(x, t) = v(x, t) + \\frac{t}{1 + t} \)，则 \( v(x, t) \) 满足 $$\\begin{cases} v_{tt} - v_{xx} = \\frac{2}{(1 + t)^3}, & x > 0, t > 0, \\\\ v(x, 0) = 0, v_t(x, 0) = -1, & x \\geqslant 0, \\\\ v(0, t) = 0, & t \\geqslant 0. \\end{cases}$$ 由文献 [1] 中第 3 章半无界弦的初边值问题解的表达式，当 \( x \\geqslant t \) 时，\n\n\[ v(x, t) = \\frac{1}{2} \\int_{x-t}^{x+t} (-1) \\mathrm{d}\\xi + \\frac{1}{2} \\int_{0}^{t} \\int_{x-(t-\\tau)}^{x+(t-\\tau)} \\frac{2}{(1 + \\tau)^3} \\mathrm{d}\\xi \\mathrm{d}\\tau \]\n\n\[ = -\\frac{t}{1 + t}, \]\n\n当 \( 0 \\leqslant x < t \) 时，\n\n\[ v(x, t) = \\frac{1}{2} \\int_{t-x}^{x+t} (-1) \\mathrm{d}\\xi + \\frac{1}{2} \\int_{0}^{t-x} \\int_{(t-\\tau)-x}^{x+(t-\\tau)} \\frac{2}{(1 + \\tau)^3} \\mathrm{d}\\xi \\mathrm{d}\\tau + \\frac{1}{2} \\int_{t-x}^{t} \\int_{x-(t-\\tau)}^{x+(t-\\tau)} \\frac{2}{(1 + \\tau)^3} \\mathrm{d}\\xi \\mathrm{d}\\tau \]\n\n\[ = \\frac{x}{(1 + t - x)(1 + t)}, \]\n\n所以\n\n\[ u(x, t) = \\begin{cases} 0, & x > t, \\\\ \\frac{x}{(1 + t)(1 + t - x)} + \\frac{t}{1 + t}, & 0 \\leqslant x < t, \\end{cases} \]\n\n因此当 \( c \\geqslant 1 \) 时，\( cx \\geqslant x \)，故 \( u(cx, x) = 0 \)，\( \\lim_{{x \\to +\\infty}} u(cx, x) = 0 \)。当 \( 0 < c < 1 \) 时，\( cx < x \)，故\n\n\[ u(cx, x) = \\frac{cx}{(1 + x)(1 + x - cx)} + \\frac{x}{1 + x}, \]\n\n因此，\( \\lim_{{x \\to +\\infty}} u(cx, x) = 1 \)。",
    "final_answer": "0"
  }
]
"""
ans = """[
  {
    "question_number": "3.5",
    "condition": "$$\\\\begin{cases} u_{tt} - u_{xx} = 0, & x > 0, t > 0, \\\\\\ u(x, 0) = u_t(x, 0) = 0, & x > 0, \\\\\\ u(0, t) = \\\\frac{t}{1 + t}, & t \\\\geqslant 0 \\\\end{cases}$$",
    "specific_questions": "求解 \\( u(x, t) \\)，然后证明对任意 \\( c > 0 \\)，极限 \\( \\\\lim_{{x \\\\to +\\\\infty}} u(cx, x) \\) 存在，并且求出该极限。",
    "solution": "设 \\( u(x, t) = v(x, t) + \\\\frac{t}{1 + t} \\)，则 \\( v(x, t) \\) 满足 $$\\\\begin{cases} v_{tt} - v_{xx} = \\\\frac{2}{(1 + t)^3}, & x > 0, t > 0, \\\\\\ v(x, 0) = 0, v_t(x, 0) = -1, & x \\\\geqslant 0, \\\\\\ v(0, t) = 0, & t \\\\geqslant 0. \\\\end{cases}$$ 由文献 [1] 中第 3 章半无界弦的初边值问题解的表达式，当 \\( x \\\\geqslant t \\) 时，\n\n\\[ v(x, t) = \\\\frac{1}{2} \\\\int_{x-t}^{x+t} (-1) \\\\mathrm{d}\\\\xi + \\\\frac{1}{2} \\\\int_{0}^{t} \\\\int_{x-(t-\\\\tau)}^{x+(t-\\\\tau)} \\\\frac{2}{(1 + \\\\tau)^3} \\\\mathrm{d}\\\\xi \\\\mathrm{d}\\\\tau \\]\n\n\\[ = -\\\\frac{t}{1 + t}, \\]\n\n当 \\( 0 \\\\leqslant x < t \\) 时，\n\n\\[ v(x, t) = \\\\frac{1}{2} \\\\int_{t-x}^{x+t} (-1) \\\\mathrm{d}\\\\xi + \\\\frac{1}{2} \\\\int_{0}^{t-x} \\\\int_{(t-\\\\tau)-x}^{x+(t-\\\\tau)} \\\\frac{2}{(1 + \\\\tau)^3} \\\\mathrm{d}\\\\xi \\\\mathrm{d}\\\\tau + \\\\frac{1}{2} \\\\int_{t-x}^{t} \\\\int_{x-(t-\\\\tau)}^{x+(t-\\\\tau)} \\\\frac{2}{(1 + \\\\tau)^3} \\\\mathrm{d}\\\\xi \\\\mathrm{d}\\\\tau \\]\n\n\\[ = \\\\frac{x}{(1 + t - x)(1 + t)}, \\]\n\n所以\n\n\\[ u(x, t) = \\\\begin{cases} 0, & x > t, \\\\\\ \\\\frac{x}{(1 + t)(1 + t - x)} + \\\\frac{t}{1 + t}, & 0 \\\\leqslant x < t, \\\\end{cases} \\]\n\n因此当 \\( c \\\\geqslant 1 \\) 时，\\( cx \\\\geqslant x \\)，故 \\( u(cx, x) = 0 \\)，\\( \\\\lim_{{x \\\\to +\\\\infty}} u(cx, x) = 0 \\)。当 \\( 0 < c < 1 \\) 时，\\( cx < x \\)，故\n\n\\[ u(cx, x) = \\\\frac{cx}{(1 + x)(1 + x - cx)} + \\\\frac{x}{1 + x}, \\]\n\n因此，\\( \\\\lim_{{x \\\\to +\\\\infty}} u(cx, x) = 1 \\)。",
    "final_answer": "0"
  }
]
"""

import json
# Parsing the second `ans` literal above still fails: the recorded output of this cell is
# "JSONDecodeError: Invalid \escape: line 4 column 75 (char 110)".
json.loads(ans)


JSONDecodeError: Invalid \escape: line 4 column 75 (char 110)

In [1]:
from call_llm import *

# Quick check of call_qwen (presumably the one provided by the star import from call_llm);
# the DeepSeek and Doubao checks are commented out.
call_qwen("hello")
# call_deepseek("hello")
# call_doubao("hello")


'Hello! How can I assist you today? 😊'