In [None]:
import os
import json
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
# import ast
import numpy as np

In [None]:
def load_all_results(base_dir: str) -> pd.DataFrame:
    """
    Load result.json files from all experiment subdirectories under the specified base directory,
    and merge them into a pandas DataFrame.

    Parameters:
        base_dir: Directory whose immediate subdirectories are scanned for a "result.json" file.

    Returns:
        DataFrame with one row per successfully parsed result.json, ordered by experiment
        folder name. Each row carries an extra "experiment" column holding that folder name.
        An empty DataFrame is returned when base_dir does not exist or nothing could be loaded.

    Problems with individual files are reported with print() and the file is skipped;
    no exception is raised for them.
    """
    if not os.path.isdir(base_dir):
        print(f"Error: Base directory '{base_dir}' does not exist.")
        return pd.DataFrame()

    all_results_data = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the row order
    # reproducible across runs and machines.
    for exp_folder_name in sorted(os.listdir(base_dir)):
        exp_path = os.path.join(base_dir, exp_folder_name)

        # Ensure it is a directory
        if not os.path.isdir(exp_path):
            continue

        # Check if result.json exists
        result_file_path = os.path.join(exp_path, "result.json")
        if not os.path.exists(result_file_path):
            continue

        try:
            with open(result_file_path, "r", encoding="utf-8") as f:
                result_data = json.load(f)
        except json.JSONDecodeError:
            print(f"Warning: Unable to parse JSON in file {result_file_path}.")
            continue
        except Exception as e:
            print(f"Warning: Error reading file {result_file_path}: {e}")
            continue

        # Only a JSON object can become a DataFrame row (and receive the extra column).
        if not isinstance(result_data, dict):
            print(
                f"Warning: Error reading file {result_file_path}: "
                f"expected a JSON object, got {type(result_data).__name__}"
            )
            continue

        # Add experiment folder name as a column for identification
        result_data["experiment"] = exp_folder_name
        all_results_data.append(result_data)

    if not all_results_data:
        print("No 'result.json' files found or processed.")
        return pd.DataFrame()

    # Convert list of dictionaries to DataFrame
    return pd.DataFrame(all_results_data)


In [None]:
def cal_area_under_curve(df: pd.DataFrame, start:int, end:int) -> pd.DataFrame:
    """
    Calculate the area under the curve (AUC) for specified metrics, subtracting the area below the minimum value in the interval.
    Also update pass_k and pass_mean to the accuracy at the end step.

    Each row's 'passk_at_T' / 'pass1_at_T' cell is read as a mapping from a turn number
    (stored as a string key) to a value. Turns missing inside [start, end] reuse the most
    recent value seen in that interval (0.0 before the first one). A cell that is not a
    dict (e.g. NaN because the experiment did not record the metric) is treated as an
    empty curve: its AUC is 0.0 and its end value is NaN.

    Parameters:
        df: DataFrame containing experiment results
        start: Starting turn number
        end: Ending turn number

    Returns:
        Copy of df with these columns added/overwritten:
            passk_auc, pass1_auc: baseline-subtracted trapezoidal area divided by max(end - start, 1)
            pass_k, pass_mean: curve value at turn `end`; if absent, the value at the closest
                earlier turn, else at the earliest later turn, else NaN
    """
    df = df.copy()

    # np.trapz was renamed to np.trapezoid in NumPy 2.0; use whichever this install provides.
    trapezoid = getattr(np, "trapezoid", None) or np.trapz

    def as_curve(cell):
        # Rows without the metric hold NaN/None instead of a dict.
        return cell if isinstance(cell, dict) else {}

    def extract_series(data_dict):
        values = []
        last_value = 0.0  # Value carried forward until the first recorded turn
        for t in range(start, end + 1):
            last_value = data_dict.get(str(t), last_value)
            values.append(last_value)
        return values

    def baseline_adjusted_auc(values):
        floor = min(values)
        area = trapezoid([value - floor for value in values], dx=1)
        return max(float(area), 0.0)

    def get_value_at_step(data_dict):
        end_key = str(end)
        if end_key in data_dict:
            return data_dict[end_key]
        if not data_dict:
            return np.nan
        # Remember the stored spelling of every key so that e.g. "05" is looked up as "05".
        key_by_step = {int(k): k for k in data_dict}
        earlier_steps = [step for step in key_by_step if step <= end]
        chosen = max(earlier_steps) if earlier_steps else min(key_by_step)
        return data_dict[key_by_step[chosen]]

    passk_auc = []
    pass1_auc = []
    passk_end_values = []
    pass1_end_values = []

    for _, row in df.iterrows():
        passk_dict = as_curve(row['passk_at_T'])
        pass1_dict = as_curve(row['pass1_at_T'])

        passk_auc.append(baseline_adjusted_auc(extract_series(passk_dict)))
        pass1_auc.append(baseline_adjusted_auc(extract_series(pass1_dict)))

        passk_end_values.append(get_value_at_step(passk_dict))
        pass1_end_values.append(get_value_at_step(pass1_dict))

    span = max(end - start, 1)  # Avoid division by zero
    df['passk_auc'] = np.array(passk_auc, dtype=float) / span
    df['pass1_auc'] = np.array(pass1_auc, dtype=float) / span
    df['pass_k'] = passk_end_values
    df['pass_mean'] = pass1_end_values

    return df

In [None]:
# Chat-format variant to analyse. The value is copied into
# selection_criteria['chat_format'] and used to filter the loaded experiments.
# NOTE(review): this name shadows the builtin format() for the rest of the notebook.
# Alternative value:
# format = "user_assistant"
format = "user_assistant_format"


In [None]:
# Experiment filter: each active entry is a (result field -> required value) pair.
# Commented-out entries are optional filters that are currently switched off.
selection_criteria = dict(
    # model_name="Qwen3-4B",
    # enable_thinking=False,
    state="env",
    chat_format=f"{format}",
    # alfworld_mode="eval_in_distribution",
    history_has_cot=True,
    stop_by_self=False,
    # offer_feedback=True,
    # prompt_example="fewshot",
    history_window_size=0,
)
# if format == "user_assistant_format_part":
#     selection_criteria["history_window_size"] = 1

In [None]:
# Display order of the models: rows are sorted by a model's position in this list.
model_order = [
    "Qwen3-4B",
    "Qwen3-30B-A3B",
    "Llama3-8B",
    "Llama3-70B",
    "Llama-3.1-8B",
    "Llama-3.3-70B",
    "Glm-9B-Chat",
    "Glm4-9B-Chat",
    "GLM-4-32B-0414",
    "Mistral-7B-Instruct-v0.3",
    "Ministral-3-14B-Instruct-2512",
    "phi-4",
    "deepseek-v3",
    "deepseek-v3.2",
    "gemini-2.5-flash",
    "gemini-2.5-flash-nothinking",
    "Phi-4-reasoning",
    "gpt-oss-120b",
    "deepseek-r1",
    "gemini-2.5-pro",
]
# Build a lookup dictionary that maps each model name to its sort index
model_order_map = dict(zip(model_order, range(len(model_order))))


In [None]:
# Turn window (start, end) passed to cal_area_under_curve for each task.
start_end_mapping = {
    "blocksworld": (0, 20),
    "frozenlake": (0, 30),
    "sodoku": (0, 20),
    "alfworld": (0, 60),
    "webshop": (0, 15)
}
# NOTE(review): get_config_label is not referenced in the cells visible here;
# the import is kept in case other cells rely on it.
from utils.analysis_files.analysis import get_config_label

# Define all tasks (derived from the mapping so the two can never drift apart)
all_tasks = list(start_end_mapping)

# Report "mean pass at the end turn" and "normalised AUC of the pass@1 curve" per task.
for task in all_tasks:
    print(f"\n{'='*50}")
    print(f"Task: {task}")
    print(f"{'='*50}")

    # Set root directory for all experiment results
    all_exp_base_dir = f"res/{task}/"

    # Load all experiment results into DataFrame
    df_experiments = load_all_results(all_exp_base_dir)

    if df_experiments.empty:
        print(f"Failed to load experiment results for task {task}.")
        continue

    # Filter: keep the rows matching every criterion whose key exists as a column.
    # Criteria for columns this task never recorded are ignored.
    selected_df = df_experiments
    for key, value in (selection_criteria or {}).items():
        if key in selected_df.columns:
            selected_df = selected_df[selected_df[key] == value]

    # Sort. assign() builds a new frame, so the filtered view is never written to
    # (no SettingWithCopyWarning) and df_experiments stays untouched.
    selected_df = selected_df.assign(
        model_sort_order=selected_df["model_name"].map(
            lambda name: model_order_map.get(name, 999)  # unknown models go last
        )
    )
    sort_direction = {
        "enable_thinking": True,
        "model_sort_order": True,
        "chat_format": True,
        "history_has_cot": True,
        "state": False,
    }
    # Like the filter above, tolerate configuration columns that are absent.
    sort_list = [column for column in sort_direction if column in selected_df.columns]
    selected_df = selected_df.sort_values(
        by=sort_list,
        ascending=[sort_direction[column] for column in sort_list],
    ).reset_index(drop=True)

    # Calculate AUC
    start, end = start_end_mapping[task]
    selected_df = cal_area_under_curve(selected_df, start=start, end=end)

    # Print results (values are scaled by 100)
    print("\nMEAN PASS\tAUV")
    for pass_mean, auv in zip(selected_df["pass_mean"], selected_df["pass1_auc"]):
        print(f"{pass_mean*100:.1f}\t{auv*100:.1f}")

In [None]:
# Report "normalised AUC of the pass@1 curve" next to the step-level loop ratio per task.
# NOTE(review): loading/filtering/sorting repeats the previous cell verbatim; consider
# extracting a shared helper once both reports are stable.
for task in all_tasks:
    print(f"\n{'='*50}")
    print(f"Task: {task}")
    print(f"{'='*50}")

    # Set root directory for all experiment results
    all_exp_base_dir = f"res/{task}/"

    # Load all experiment results into DataFrame
    df_experiments = load_all_results(all_exp_base_dir)

    if df_experiments.empty:
        print(f"Failed to load experiment results for task {task}.")
        continue

    # Filter: keep the rows matching every criterion whose key exists as a column.
    # Criteria for columns this task never recorded are ignored.
    selected_df = df_experiments
    for key, value in (selection_criteria or {}).items():
        if key in selected_df.columns:
            selected_df = selected_df[selected_df[key] == value]

    # Sort. assign() builds a new frame, so the filtered view is never written to
    # (no SettingWithCopyWarning) and df_experiments stays untouched.
    selected_df = selected_df.assign(
        model_sort_order=selected_df["model_name"].map(
            lambda name: model_order_map.get(name, 999)  # unknown models go last
        )
    )
    sort_direction = {
        "enable_thinking": True,
        "model_sort_order": True,
        "chat_format": True,
        "history_has_cot": True,
        "state": False,
    }
    # Like the filter above, tolerate configuration columns that are absent.
    sort_list = [column for column in sort_direction if column in selected_df.columns]
    selected_df = selected_df.sort_values(
        by=sort_list,
        ascending=[sort_direction[column] for column in sort_list],
    ).reset_index(drop=True)

    # Calculate AUC
    start, end = start_end_mapping[task]
    selected_df = cal_area_under_curve(selected_df, start=start, end=end)

    # Print results (values are scaled by 100)
    print("\nAUV\tLR")
    for _, row in selected_df.iterrows():
        loop_ratio_steplevel = row["mean_loop_ratio_after_invalid_steps_stepnorm"]
        # loop_ratio_trajlevel = row["mean_loop_ratio_after_invalid_steps_trajnorm"]
        auv = row["pass1_auc"]

        print(f"{auv*100:.1f}\t{loop_ratio_steplevel*100:.1f}")