In [6]:
import json
import argparse
import logging
import sys
import os

# ... (logging setup remains the same) ...

def extract_evaluation_details_single_json(input_path: str, output_path: str = None):
    """
    Pull every 'evaluation_details' value out of a JSON file.

    The whole file is parsed as one JSON document. A top-level list
    contributes the 'evaluation_details' value of each dict element that has
    the key; a top-level dict contributes its own value; any other top-level
    type contributes nothing.

    Args:
        input_path: Path of the JSON file to read (UTF-8).
        output_path: Where to write the collected values as a single JSON
            array; missing parent directories are created. When falsy, the
            values are printed to stdout one at a time instead.

    Returns:
        None. Returns early, without writing or printing anything, when no
        value was collected.

    Raises:
        SystemExit: With status 1 when the input cannot be read or parsed, or
            when the output file cannot be written.
    """
    logging.info(f"Attempting to read single JSON object and extract 'evaluation_details' from: {input_path}")

    parsed = None
    try:
        # Slurp the file first, then parse it in one go.
        logging.info(f"Reading entire file content from {input_path}...")
        with open(input_path, 'r', encoding='utf-8') as source:
            raw_text = source.read()

        logging.info("Parsing file content as a single JSON object...")
        parsed = json.loads(raw_text)
        logging.info("Successfully parsed JSON content.")
    except FileNotFoundError:
        logging.error(f"Error: Input file not found at '{input_path}'")
        sys.exit(1)
    except json.JSONDecodeError as e:
        logging.error(f"Error: Failed to decode JSON from '{input_path}'. It might not be valid JSON. Details: {e}")
        sys.exit(1)
    except Exception as e:
        logging.error(f"An unexpected error occurred while reading or parsing '{input_path}': {e}", exc_info=True)
        sys.exit(1)

    # Gather the values; the document is either a list of records or one record.
    collected = []
    if isinstance(parsed, list):
        logging.info(f"Parsed data is a list. Processing {len(parsed)} items.")
        missing = 0
        for index, entry in enumerate(parsed):
            if not isinstance(entry, dict):
                logging.warning(f"Skipping non-dictionary item in list at index {index}")
                continue
            if 'evaluation_details' not in entry:
                logging.debug(f"Key 'evaluation_details' not found in list item {index}")
                missing += 1
                continue
            collected.append(entry['evaluation_details'])
        if missing > 0:
            logging.info(f"{missing} list items were missing the 'evaluation_details' key.")
    elif isinstance(parsed, dict):
        logging.info("Parsed data is a single dictionary object.")
        if 'evaluation_details' not in parsed:
            logging.warning("Key 'evaluation_details' not found in the main JSON object.")
        else:
            collected.append(parsed['evaluation_details'])
    else:
        logging.warning(f"Parsed JSON data is neither a list nor a dictionary (type: {type(parsed)}). Cannot extract details.")

    logging.info(f"Extracted {len(collected)} 'evaluation_details' objects.")

    if not collected:
        logging.warning(f"No 'evaluation_details' found or extracted from the file '{input_path}'.")
        return  # Nothing to output

    if not output_path:
        # No destination given: dump each value to the console separately.
        logging.info("\n--- Extracted 'evaluation_details' ---")
        for position, detail in enumerate(collected, start=1):
            print(f"\n--- Detail {position} ---")
            print(json.dumps(detail, indent=2, ensure_ascii=False))
        print("\n------------------------------------")
        return

    try:
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        logging.info(f"Writing {len(collected)} extracted details as a single JSON array to: {output_path}")
        with open(output_path, 'w', encoding='utf-8') as sink:
            # The whole list goes out as one JSON array.
            json.dump(collected, sink, ensure_ascii=False, indent=2)

        logging.info(f"Successfully wrote extracted details to: {output_path}")
    except IOError as e:
        logging.error(f"Error: Could not write to output file '{output_path}'. Details: {e}")
        sys.exit(1)
    except Exception as e:
        logging.error(f"An unexpected error occurred while writing '{output_path}': {e}", exc_info=True)
        sys.exit(1)


if __name__ == "__main__":
    # Hard-coded locations for the Static Units run:
    # the formatted results are read and the extracted details are written next to them.
    input_file = "/home/zyy/browser_use_myown/logs/qwen2.5_72b_naive_StacticUnits_result/formatted.json"
    output_file = "/home/zyy/browser_use_myown/logs/qwen2.5_72b_naive_StacticUnits_result/evaluation_details.json"

    extract_evaluation_details_single_json(input_path=input_file, output_path=output_file)

2025-05-01 11:43:44,102 - INFO - Attempting to read single JSON object and extract 'evaluation_details' from: /home/zyy/browser_use_myown/logs/qwen2.5_72b_naive_StacticUnits_result/formatted.json
2025-05-01 11:43:44,104 - INFO - Reading entire file content from /home/zyy/browser_use_myown/logs/qwen2.5_72b_naive_StacticUnits_result/formatted.json...
2025-05-01 11:43:44,113 - INFO - Parsing file content as a single JSON object...
2025-05-01 11:43:44,120 - INFO - Successfully parsed JSON content.
2025-05-01 11:43:44,121 - INFO - Parsed data is a list. Processing 68 items.


2025-05-01 11:43:44,122 - INFO - Extracted 68 'evaluation_details' objects.
2025-05-01 11:43:44,122 - INFO - Writing 68 extracted details as a single JSON array to: /home/zyy/browser_use_myown/logs/qwen2.5_72b_naive_StacticUnits_result/evaluation_details.json
2025-05-01 11:43:44,123 - INFO - Successfully wrote extracted details to: /home/zyy/browser_use_myown/logs/qwen2.5_72b_naive_StacticUnits_result/evaluation_details.json


In [3]:
# 找到有改进的任务和变差的任务
import json
import logging
from typing import Dict, Set, List

# Configure logging: INFO level, "timestamp - level - message" lines
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# ====== Edit the file paths below ======
# Path of the evaluation results file for the baseline (naive) run
NAIVE_RESULT_PATH = "/home/zyy/browser_use_myown/logs/qwen2.5_72b_naive_result/formatted.json"
# Path of the evaluation results file for the Static Units run
STATIC_UNITS_RESULT_PATH = "/home/zyy/browser_use_myown/logs/qwen2.5_72b_naive_StacticUnits_result/formatted.json"
# Path of the output file
OUTPUT_PATH = "task_performance_changes.json"
# Whether to include and print per-task details
VERBOSE = True
# ============================

def load_eval_results(file_path: str) -> Dict[str, bool]:
    """
    Load an evaluation results file and map each task ID to its outcome.

    The file is expected to hold a JSON list of task records; a single
    top-level record (a JSON object) is accepted as well. A record is used
    only when it is a dict carrying both "task_id" and "predicted_label".
    Non-dict entries are skipped with a warning, so one malformed entry no
    longer causes the whole file to be discarded.

    Args:
        file_path: Path of the JSON results file (UTF-8).

    Returns:
        ``{task_id: is_success}`` where ``is_success`` is True exactly when
        ``predicted_label == 1``. If the same task ID appears more than once,
        the last record wins. An empty dict is returned (and the error is
        logged) when the file cannot be read, parsed, or has an unsupported
        top-level type.
    """
    results = {}

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Normalise the document to a list of records.
        if isinstance(data, dict):
            records = [data]
        elif isinstance(data, list):
            records = data
        else:
            raise TypeError(
                f"expected a JSON list or object at top level, got {type(data).__name__}"
            )

        for index, task in enumerate(records):
            if not isinstance(task, dict):
                # Without this guard a stray scalar raises inside the loop and
                # a stray string is matched by substring instead of by key.
                logging.warning(f"跳过 {file_path} 中索引 {index} 处的非字典条目")
                continue
            if "task_id" in task and "predicted_label" in task:
                # predicted_label of 1 means success; anything else is a failure
                results[task["task_id"]] = task["predicted_label"] == 1

        logging.info(f"从 {file_path} 加载了 {len(results)} 个任务的评测结果")
        return results

    except Exception as e:
        # Best-effort loader: callers treat an empty dict as "load failed".
        logging.error(f"加载文件 {file_path} 时出错: {str(e)}")
        return {}

def find_improved_tasks(naive_results: Dict[str, bool], static_units_results: Dict[str, bool]) -> Set[str]:
    """
    Return the IDs of tasks that failed in the Naive run but succeeded in the
    Static Units run. Tasks missing from the Naive results are ignored.
    """
    improved_task_ids = {
        task_id
        for task_id, static_ok in static_units_results.items()
        # Only tasks present in both runs can be compared.
        if task_id in naive_results and not naive_results[task_id] and static_ok
    }

    logging.info(f"找到 {len(improved_task_ids)} 个表现有改进的任务")
    return improved_task_ids

def find_degraded_tasks(naive_results: Dict[str, bool], static_units_results: Dict[str, bool]) -> Set[str]:
    """
    Return the IDs of tasks that succeeded in the Naive run but failed in the
    Static Units run. Tasks missing from the Naive results are ignored.
    """
    degraded_task_ids = {
        task_id
        for task_id, static_ok in static_units_results.items()
        # Only tasks present in both runs can be compared.
        if task_id in naive_results and naive_results[task_id] and not static_ok
    }

    logging.info(f"找到 {len(degraded_task_ids)} 个表现变差的任务")
    return degraded_task_ids

def get_task_details(file_path: str, task_ids: Set[str]) -> List[Dict]:
    """
    Collect descriptive details for the given task IDs from a results file.

    Records are returned in file order. Each returned dict has the keys
    "task_id", "task" (placeholder text when the record has no "task") and
    "evaluation_details" (the "response" field of the record's
    "evaluation_details" dict, or placeholder text when that dict or field is
    absent).

    A record whose "evaluation_details" is not a dict (for example JSON
    ``null``) gets the placeholder instead of raising, and non-dict records
    are skipped, so one irregular record no longer empties the whole result.

    Args:
        file_path: Path of the JSON results file (UTF-8); a JSON list of
            records, or a single record object.
        task_ids: IDs of the tasks to look up.

    Returns:
        The list of detail dicts, or an empty list (with the error logged)
        when the file cannot be read or parsed.
    """
    no_details = "无评测详情"
    task_details = []

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Accept a single record as well as a list of records.
        records = [data] if isinstance(data, dict) else data

        for task in records:
            if not isinstance(task, dict):
                continue
            if "task_id" not in task or task["task_id"] not in task_ids:
                continue

            # Only a dict can carry a "response"; anything else (None, str, ...)
            # falls back to the placeholder rather than raising AttributeError.
            evaluation = task.get("evaluation_details")
            if isinstance(evaluation, dict):
                response = evaluation.get("response", no_details)
            else:
                response = no_details

            task_details.append({
                "task_id": task["task_id"],
                "task": task.get("task", "未知任务描述"),
                "evaluation_details": response
            })

        return task_details

    except Exception as e:
        # Best-effort lookup: failures are logged and reported as "no details".
        logging.error(f"从文件 {file_path} 获取任务详情时出错: {str(e)}")
        return []

def main():
    """
    Compare the Naive and Static Units evaluation runs.

    Loads both result files, determines which common tasks improved or
    degraded, writes the findings to OUTPUT_PATH as JSON, and prints the ID
    lists, an overall summary and (when VERBOSE is set) the task descriptions.
    Returns without producing output when either result set is empty.
    """
    naive_results = load_eval_results(NAIVE_RESULT_PATH)
    static_units_results = load_eval_results(STATIC_UNITS_RESULT_PATH)

    # An empty mapping means the load failed (or found nothing usable).
    if not (naive_results and static_units_results):
        logging.error("加载评测结果失败，无法继续")
        return

    improved_task_ids = find_improved_tasks(naive_results, static_units_results)
    degraded_task_ids = find_degraded_tasks(naive_results, static_units_results)

    output_data = {
        "improved_task_ids": list(improved_task_ids),
        "degraded_task_ids": list(degraded_task_ids)
    }

    # Task descriptions are looked up in the Naive results file.
    if VERBOSE:
        improved_task_details = get_task_details(NAIVE_RESULT_PATH, improved_task_ids)
        degraded_task_details = get_task_details(NAIVE_RESULT_PATH, degraded_task_ids)
        output_data.update(
            improved_task_details=improved_task_details,
            degraded_task_details=degraded_task_details,
        )

    with open(OUTPUT_PATH, 'w', encoding='utf-8') as out_file:
        json.dump(output_data, out_file, ensure_ascii=False, indent=2)

    logging.info(f"结果已保存到 {OUTPUT_PATH}")

    # Console listing of the two ID sets.
    id_listings = (
        ("\n表现有改进的任务ID列表:", improved_task_ids),
        ("\n表现变差的任务ID列表:", degraded_task_ids),
    )
    for heading, ids in id_listings:
        print(heading)
        for task_id in ids:
            print(task_id)

    # Overall summary, relative to the tasks present in both runs.
    total_common_tasks = len(naive_results.keys() & static_units_results.keys())
    if total_common_tasks > 0:
        improved_count = len(improved_task_ids)
        degraded_count = len(degraded_task_ids)
        unchanged_count = total_common_tasks - improved_count - degraded_count

        improved_percentage = improved_count / total_common_tasks * 100
        degraded_percentage = degraded_count / total_common_tasks * 100
        unchanged_percentage = 100 - improved_percentage - degraded_percentage

        print(f"\n总体变化情况 (共 {total_common_tasks} 个任务):")
        print(f"  改进: {improved_count} 个任务 ({improved_percentage:.2f}%)")
        print(f"  变差: {degraded_count} 个任务 ({degraded_percentage:.2f}%)")
        print(f"  不变: {unchanged_count} 个任务 ({unchanged_percentage:.2f}%)")

    # Optional per-task descriptions; a section is shown only if its ID set is non-empty.
    if VERBOSE:
        detail_sections = (
            ("\n===== 表现有改进的任务详情 =====", improved_task_ids, improved_task_details),
            ("\n===== 表现变差的任务详情 =====", degraded_task_ids, degraded_task_details),
        )
        for banner, ids, details in detail_sections:
            if not ids:
                continue
            print(banner)
            for detail in details:
                print(f"\nID: {detail['task_id']}")
                print(f"任务: {detail['task']}")
                print("-" * 80)

# Run the comparison only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

2025-05-01 15:05:06,429 - INFO - 从 /home/zyy/browser_use_myown/logs/qwen2.5_72b_naive_result/formatted.json 加载了 44 个任务的评测结果


2025-05-01 15:05:06,459 - INFO - 从 /home/zyy/browser_use_myown/logs/qwen2.5_72b_naive_StacticUnits_result/formatted.json 加载了 68 个任务的评测结果
2025-05-01 15:05:06,462 - INFO - 找到 7 个表现有改进的任务
2025-05-01 15:05:06,463 - INFO - 找到 5 个表现变差的任务
2025-05-01 15:05:06,482 - INFO - 结果已保存到 task_performance_changes.json



表现有改进的任务ID列表:
56f8890a837c49f7df766b9c981646f3
c1d6ea6f2196d25782cc3646ff3090db
070c907d34a4ce71dfdbea38f9c5d4d8
c698ff3fc0f6cbce39947c597ab5749b
9bb63ad0e38d5691a618932a8b31c05a
fb20658421aa80248d35444930bce2d9
92a3d4236f167af4afdc08876a902ba6

表现变差的任务ID列表:
75a1b5dcd2c28508a971d98d51fe5767
644a856c3897665e475e0dce50bf217d
4c186c6ed888d0c8d4cf4adb39443080
db1ffb5e60578597d1c3aa3c389ac7b1
871e7771cecb989972f138ecc373107b

总体变化情况 (共 41 个任务):
  改进: 7 个任务 (17.07%)
  变差: 5 个任务 (12.20%)
  不变: 29 个任务 (70.73%)

===== 表现有改进的任务详情 =====

ID: c698ff3fc0f6cbce39947c597ab5749b
任务: Browse the page with event planning tips on Eventbrite.
--------------------------------------------------------------------------------

ID: fb20658421aa80248d35444930bce2d9
任务: Find the latest 2 bed and 1.5+ bath apartment listing for rent in New York.
--------------------------------------------------------------------------------

ID: 56f8890a837c49f7df766b9c981646f3
任务: Show crazy credits for the movie " Prometheus" 

In [3]:
from scipy.optimize import linprog

# Objective: maximize 30*x1 + 10*x2 (x1 = tables, x2 = chairs per week).
# linprog minimizes, so the coefficients are negated.
c = [-30, -10]

# Inequality constraints in the form A @ x <= b
# Constraint 1: 6x1 + 3x2 <= 40
# Constraint 2: x1 <= 4
# Constraint 3: -3x1 + x2 >= 0  =>  3x1 - x2 <= 0
A = [
    [6, 3],
    [1, 0],
    [3, -1]
]
b = [40, 4, 0]

# Variable bounds
x0_bounds = (0, None)  # x1 >= 0
x1_bounds = (0, None)  # x2 >= 0

# Solve the linear program
res = linprog(c, A_ub=A, b_ub=b, bounds=[x0_bounds, x1_bounds], method='highs')

# Report the result
if res.success:
    print(f"每周应生产桌子数量 (x1): {res.x[0]:.2f}")
    print(f"每周应生产椅子数量 (x2): {res.x[1]:.2f}")
    # Undo the sign flip applied to the objective.
    print(f"最大利润: ${-res.fun:.2f}")
    print("\n约束松弛变量:")
    constraint_labels = [
        "约束1 (6x1 + 3x2 <= 40)",
        "约束2 (x1 <= 4)",
        "约束3 (3x1 - x2 <= 0)",
    ]
    # res.slack holds b - A @ x, one entry per inequality row.
    for label, slack in zip(constraint_labels, res.slack):
        print(f"  {label} 松弛量: {slack:.2f}")
else:
    # A failed solve is not necessarily an infeasible problem (it may be
    # unbounded, hit an iteration limit, ...), so report the solver status
    # instead of claiming infeasibility.
    print(f"线性规划求解失败 (status={res.status})。")
    print(f"状态: {res.message}")


每周应生产桌子数量 (x1): 2.67
每周应生产椅子数量 (x2): 8.00
最大利润: $160.00

约束松弛变量:
  约束1 (6x1 + 3x2 <= 40) 松弛量: 0.00
  约束2 (x1 <= 4) 松弛量: 1.33
  约束3 (3x1 - x2 <= 0) 松弛量: 0.00
