In [None]:
import orjson
import json
import multiprocessing as mp
import os
import numpy as np
import pandas as pd


In [None]:
def load_all_results(base_dir: str) -> pd.DataFrame:
    """
    Collect every ``<base_dir>/<experiment>/result.json`` into one DataFrame.

    Each immediate subdirectory of ``base_dir`` that contains a ``result.json``
    contributes one row: the parsed JSON object plus an ``"experiment"`` column
    holding the subdirectory name (an existing ``"experiment"`` key in the JSON
    is overwritten). Subdirectories are visited in sorted name order so the row
    order is reproducible; ``os.listdir`` on its own returns entries in
    arbitrary order.

    Parameters:
        base_dir: Directory whose immediate subdirectories are experiment folders.

    Returns:
        One row per successfully loaded ``result.json``. An empty DataFrame is
        returned (after printing a message) when ``base_dir`` is not a directory
        or when no result file could be loaded.

    Files that cannot be read or parsed are reported with a printed warning and
    skipped; a single bad experiment never aborts the whole load.
    """
    if not os.path.isdir(base_dir):
        print(f"Error: Base directory '{base_dir}' does not exist.")
        return pd.DataFrame()

    all_results_data = []

    # Sorted iteration -> deterministic row order regardless of file system.
    for exp_folder_name in sorted(os.listdir(base_dir)):
        exp_path = os.path.join(base_dir, exp_folder_name)
        if not os.path.isdir(exp_path):
            continue  # stray files next to the experiment folders

        result_file_path = os.path.join(exp_path, "result.json")
        if not os.path.exists(result_file_path):
            continue  # experiment without a result file

        try:
            with open(result_file_path, "r", encoding="utf-8") as f:
                result_data = json.load(f)
            # Add experiment folder name as a column for identification.
            # A non-object payload (e.g. a JSON list) fails here and is
            # reported by the generic handler below.
            result_data["experiment"] = exp_folder_name
            all_results_data.append(result_data)
        except json.JSONDecodeError:
            print(f"Warning: Unable to parse JSON in file {result_file_path}.")
        except Exception as e:
            print(f"Warning: Error reading file {result_file_path}: {e}")

    if not all_results_data:
        print("No 'result.json' files found or processed.")
        return pd.DataFrame()

    # Convert list of dictionaries to DataFrame
    return pd.DataFrame(all_results_data)


In [None]:
# np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid; prefer the
# new name when it exists and fall back to the old one on NumPy 1.x.
_trapezoid = getattr(np, "trapezoid", None) or np.trapz


def _is_missing_curve(curve) -> bool:
    """Return True for the None/NaN placeholder of a row that has no curve."""
    return curve is None or (isinstance(curve, float) and np.isnan(curve))


def _forward_filled_series(curve, start: int, end: int) -> list:
    """Return the curve values for steps start..end, carrying the last seen value forward."""
    values = []
    last_value = 0.0  # steps before the first recorded key in the interval count as 0.0
    for t in range(start, end + 1):
        key = str(t)
        if key in curve:
            last_value = curve[key]
        values.append(last_value)
    return values


def _value_at_step(curve, end: int):
    """Return the value at step ``end``, else the nearest earlier step, else the earliest later one."""
    end_key = str(end)
    if end_key in curve:
        return curve[end_key]
    if not curve:
        return np.nan
    numeric_keys = sorted(int(k) for k in curve.keys())
    lower_keys = [k for k in numeric_keys if k <= end]
    # With no key <= end, every key is > end, so the smallest key is the closest one.
    nearest = lower_keys[-1] if lower_keys else numeric_keys[0]
    return curve[str(nearest)]


def _summarize_curve(curve, start: int, end: int):
    """Return ``(baseline-adjusted trapezoid area, value at end step)`` for one curve."""
    if start > end:
        raise ValueError(f"start ({start}) must not be greater than end ({end})")
    if _is_missing_curve(curve):
        return np.nan, np.nan
    values = _forward_filled_series(curve, start, end)
    baseline = min(values)
    adjusted = [value - baseline for value in values]
    # The adjusted values are >= 0, so the clamp only guards against rounding noise.
    area = max(_trapezoid(adjusted, dx=1), 0.0)
    return area, _value_at_step(curve, end)


def cal_area_under_curve(df: pd.DataFrame, start:int, end:int) -> pd.DataFrame:
    """
    Calculate the normalized area under the pass@k / pass@1 curves of each experiment.

    For every row, the ``passk_at_T`` and ``pass1_at_T`` cells are read as
    mappings from a step number (as a string key) to a value. Each curve is
    sampled at the integer steps ``start..end`` (inclusive); a missing step
    reuses the value of the previous step, and steps before the first recorded
    one in the interval count as 0.0. The minimum of the sampled values is
    subtracted, the result is integrated with the trapezoid rule (unit step),
    and the area is divided by ``max(end - start, 1)``.

    ``pass_k`` and ``pass_mean`` are overwritten with the curve value at step
    ``end``. If that step is absent, the nearest earlier step is used, then the
    earliest later step; an empty mapping gives NaN.

    Rows whose curve cell is ``None`` or NaN get NaN for both the AUC and the
    end value instead of raising.

    Parameters:
        df: DataFrame containing experiment results; it is not modified.
        start: Starting turn number (inclusive).
        end: Ending turn number (inclusive).

    Returns:
        A copy of ``df`` with the columns ``passk_auc``, ``pass1_auc``,
        ``pass_k`` and ``pass_mean`` added or replaced.

    Raises:
        ValueError: if ``start > end`` and ``df`` has at least one row.
        KeyError: if ``df`` has rows but lacks ``passk_at_T`` or ``pass1_at_T``.
    """
    df = df.copy()

    passk_auc = []
    pass1_auc = []
    passk_end_values = []
    pass1_end_values = []

    for _, row in df.iterrows():
        passk_area, passk_end = _summarize_curve(row['passk_at_T'], start, end)
        pass1_area, pass1_end = _summarize_curve(row['pass1_at_T'], start, end)

        passk_auc.append(passk_area)
        pass1_auc.append(pass1_area)
        passk_end_values.append(passk_end)
        pass1_end_values.append(pass1_end)

    span = max(end - start, 1)  # Avoid division by zero when start == end
    df['passk_auc'] = np.array(passk_auc, dtype=float) / span
    df['pass1_auc'] = np.array(pass1_auc, dtype=float) / span
    df['pass_k'] = passk_end_values
    df['pass_mean'] = pass1_end_values

    return df

In [None]:
# Root folder holding one subdirectory per experiment run.
all_exp_base_dir = "res/frozen_lake"

# Gather every experiment's result.json into a single DataFrame.
df_experiments = load_all_results(all_exp_base_dir)

# Report the outcome; to inspect the data, run display(df_experiments.head()).
if df_experiments.empty:
    print("Failed to load any experiment results.")
else:
    print(f"Successfully loaded results from {len(df_experiments)} experiments.")

In [None]:
# Column name -> required value. Disabled filters are kept as comments so they
# can be switched back on quickly.
selection_criteria = dict(
    model_name='Qwen3-4B',
    enable_thinking=False,
    state="env",
    chat_format='user_assistant_format_part',
    # alfworld_mode="eval_in_distribution",
    history_has_cot=True,
    stop_by_self=False,
    # history_window_size=3,
)


In [None]:
# Inclusive turn range passed to cal_area_under_curve.
start = 1
end = 20

# --- 2. Filter DataFrame based on criteria ---
if not selection_criteria:
    # No filter criteria: keep every experiment.
    selected_df = df_experiments
else:
    # AND together one "`column` == value" clause per criterion.
    query_str = " & ".join(
        f"`{column}` == {value!r}" for column, value in selection_criteria.items()
    )
    selected_df = df_experiments.query(query_str)

# Add the AUC / end-step columns, then order the rows for display.
selected_df = cal_area_under_curve(selected_df, start=start, end=end)
sort_list = ['history_window_size', 'pass_k', 'model_name']
selected_df = (
    selected_df
    .sort_values(by=sort_list, ascending=[True] * len(sort_list))
    .reset_index(drop=True)
)

print(f"Found {len(selected_df)} experiments matching the filter criteria.")
display(
    selected_df[
        [
            "experiment",
            'model_name',
            "history_window_size",
            "pass1_auc",
            "pass_mean",
        ]
    ]
)