In [None]:
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import numpy as np
from utils.analysis_files.analysis import plot_metric_bar, plot_metric_line

In [None]:
def load_all_results(base_dir: str) -> pd.DataFrame:
    """
    Collect the ``result.json`` of every experiment folder under ``base_dir``.

    Each immediate sub-directory of ``base_dir`` is treated as one experiment.
    If it contains a ``result.json`` file, the parsed JSON object becomes one
    row of the returned DataFrame, with the folder name stored in an extra
    ``experiment`` column. Files that cannot be read or parsed are skipped
    with a printed warning instead of aborting the whole load.

    Args:
        base_dir: Directory whose sub-directories are the experiment folders.

    Returns:
        A DataFrame with one row per successfully loaded experiment, ordered
        by experiment folder name. An empty DataFrame is returned when
        ``base_dir`` is not a directory or no result file could be loaded.
    """
    if not os.path.isdir(base_dir):
        print(f"Error: Base directory '{base_dir}' does not exist.")
        return pd.DataFrame()

    all_results_data = []

    # os.listdir() yields entries in arbitrary, filesystem-dependent order;
    # sorting makes the row order reproducible across runs and machines.
    for exp_folder_name in sorted(os.listdir(base_dir)):
        exp_path = os.path.join(base_dir, exp_folder_name)

        # Only sub-directories are experiments; stray files are ignored
        if not os.path.isdir(exp_path):
            continue

        result_file_path = os.path.join(exp_path, "result.json")
        if not os.path.exists(result_file_path):
            continue

        try:
            with open(result_file_path, "r", encoding="utf-8") as f:
                result_data = json.load(f)
            # Add experiment folder name as a column for identification.
            # A non-object JSON payload fails here and is reported below.
            result_data["experiment"] = exp_folder_name
            all_results_data.append(result_data)
        except json.JSONDecodeError:
            print(f"Warning: Unable to parse JSON in file {result_file_path}.")
        except Exception as e:
            # Deliberately broad: one broken experiment must not stop the load
            print(f"Warning: Error reading file {result_file_path}: {e}")

    if not all_results_data:
        print("No 'result.json' files found or processed.")
        return pd.DataFrame()

    # Convert list of dictionaries to DataFrame
    return pd.DataFrame(all_results_data)


In [None]:
# Name of the task whose experiments are analysed in this notebook
task = "webshop"
# Directory whose sub-folders each hold one experiment's result.json
all_exp_base_dir = f"./res_fake/{task}"

# Gather every experiment's results into a single DataFrame
df_experiments = load_all_results(all_exp_base_dir)

# Report whether anything was loaded
if df_experiments.empty:
    print("Failed to load any experiment results.")
else:
    print(f"Successfully loaded results from {len(df_experiments)} experiments.")
    # display(df_experiments.head())

In [None]:
# Column -> required value. Only experiments matching every active entry are
# kept by the filter step; commented-out entries are filters that are
# currently switched off.
selection_criteria = {
    # "model_name": "Qwen3-30B-A3B",
    # "enable_thinking": False,
    "state": "env",
    "chat_format": "user_assistant_format",
    # "alfworld_mode": "eval_in_distribution",
    "history_has_cot": True,
    "stop_by_self": False,
    "offer_feedback": True,
    "prompt_example": "fewshot",
}


In [None]:
# --- 2. Filter DataFrame based on criteria ---
if not selection_criteria:
    # Nothing to filter on: keep every experiment
    selected_df = df_experiments
else:
    # Build one "`column` == value" clause per criterion and AND them together;
    # backticks let the query reference column names that are not identifiers.
    query_str = " & ".join(
        f"`{column}` == {wanted!r}" for column, wanted in selection_criteria.items()
    )
    selected_df = df_experiments.query(query_str)
# Preferred display order of models; a model's position here is its sort rank.
model_order = [
    "Qwen3-4B",
    "Qwen3-30B-A3B",
    "Llama3-8B",
    "Llama3-70B",
    "Llama-3.1-8B",
    "Llama-3.3-70B",
    "Glm-9B-Chat",
    "Glm4-9B-Chat",
    "GLM-4-32B-0414",
    "Mistral-7B-Instruct-v0.3",
    "Ministral-3-14B-Instruct-2512",
    "phi-4",
    "deepseek-v3",
    "deepseek-v3.2",
    "gemini-2.5-flash",
    "gemini-2.5-flash-nothinking",
    "Phi-4-reasoning",
    "gpt-oss-120b",
    "deepseek-r1",
    "gemini-2.5-pro",
]
# Lookup table: model name -> position in model_order
model_order_map = dict(zip(model_order, range(len(model_order))))

# Attach the sort rank with .assign(), which returns a NEW frame. Writing the
# column in place would target the result of .query() (SettingWithCopyWarning)
# or, when no filter is active, silently add the column to df_experiments.
selected_df = selected_df.assign(
    model_sort_order=selected_df['model_name'].map(
        lambda x: model_order_map.get(x, 999)  # Models not in list are placed last
    )
)
# Sort by model rank first, then by configuration columns; "state" is the only
# key sorted in descending order.
sort_list = ['model_sort_order','chat_format','enable_thinking','history_has_cot',"state"]
selected_df = selected_df.sort_values(by=sort_list, ascending=[True, True, True, True, False]).reset_index(drop=True)
print(f"Found {len(selected_df)} experiments matching the filter criteria.")
# Show only the configuration columns and the headline metrics
display(
    selected_df[["model_name","state", "chat_format","enable_thinking", "history_has_cot", "pass_mean", "mean_loop_ratio_after_invalid_steps_stepnorm"]]
)

In [None]:

# plot_metric_line(
#     selected_df,
#     select_col="pass1_at_T",
#     xlabel="Number of Turns to Pass@1",
#     ylabel="pass@1",
#     title="pass@1 at T(urns)",
#     selection_criteria=selection_criteria,
# )
# plot_metric_line(
#     selected_df,
#     select_col="passk_at_T",
#     xlabel="Number of Turns to Pass@k",
#     ylabel="pass@k",
#     title="pass@k at T(urns)",
#     selection_criteria=selection_criteria,
# )

In [None]:
# Display names for the exported JSON files: maps a raw configuration label
# (model name plus its "enable_thinking" flag, separated by a newline) to the
# short model name used in the output. Labels without an entry are used as-is
# by the export cells.
label_mapping = {
    "Qwen3-4b\n-enable_thinking=F": "Qwen3-4B",
    "Qwen3-4B\n-enable_thinking=F": "Qwen3-4B",
    "Qwen3-4B\n-enable_thinking=T": "Qwen3-4B-Thinking",
    "Qwen3-30B-A3B\n-enable_thinking=F": "Qwen3-30B-A3B",
    "Qwen3-30B-A3B\n-enable_thinking=T": "Qwen3-30B-A3B-Thinking",
    "Llama3-8B\n-enable_thinking=F": "Llama3-8B",
    "Llama3-70B\n-enable_thinking=F": "Llama3-70B",
    "Llama-3.1-8B\n-enable_thinking=F": "Llama-3.1-8B",
    "Llama-3.3-70B\n-enable_thinking=F": "Llama-3.3-70B",
    "Glm-9B-Chat\n-enable_thinking=F": "Glm-4-9B-Chat",
    "Glm4-9B-Chat\n-enable_thinking=F": "Glm-4-9B-Chat",
    "GLM-4-32B-0414\n-enable_thinking=F": "GLM-4-32B-0414",
    "Mistral-7B-Instruct-v0.3\n-enable_thinking=F": "Mistral-7B",
    "Ministral-3-14B-Instruct-2512\n-enable_thinking=F": "Ministral-3-14B-Instruct-2512",
    "phi-4\n-enable_thinking=F": "Phi-4",
    "deepseek-v3\n-enable_thinking=F": "Deepseek-v3",
    "deepseek-v3.2\n-enable_thinking=F": "Deepseek-v3.2",
    "gemini-2.5-flash\n-enable_thinking=F": "Gemini2.5-Flash",
    "gemini-2.5-flash-nothinking\n-enable_thinking=F": "Gemini2.5-Flash",
    "Phi-4-reasoning\n-enable_thinking=T": "Phi-4-Reasoning",
    "gpt-oss-120b\n-enable_thinking=T": "GPT-OSS-120B",
    "deepseek-r1\n-enable_thinking=T": "Deepseek-R1",
    "gemini-2.5-pro\n-enable_thinking=T": "Gemini2.5-Pro",
}


In [None]:

from utils.analysis_files.analysis import get_config_label

# Export the pass@1-at-T value of every selected experiment to
# "<task>_loop_pass_1_at_t.json", one record per experiment.
# Columns already pinned by the filter are passed as exclude_keys
# (presumably so they are left out of the generated label — verify against
# get_config_label).
exclude_keys = set(selection_criteria.keys()) if selection_criteria else set()
# Named for its contents: these records hold pass@1 data, not entropy values
pass1_records = []
for _row_idx, row in selected_df.iterrows():
    label = get_config_label(row, exclude_keys=exclude_keys)
    # Use the short display name when one is registered, else the raw label
    label = label_mapping.get(label, label)
    pass1_records.append({
        "model": label,
        "pass_1_at_t": row["pass1_at_T"],
    })
# Explicit encoding keeps the output independent of the platform default
with open(f"{task}_loop_pass_1_at_t.json", 'w', encoding="utf-8") as f:
    json.dump(pass1_records, f, indent=4)

In [None]:
# Export the per-experiment entropy metrics to "<task>_loop_entropy.json"
from utils.analysis_files.analysis import get_config_label

# Result columns copied verbatim into each exported record, in output order
entropy_columns = [
    "mean_normal_step_action_entropy",
    "mean_loop_step_action_entropy",
    "mean_normal_step_analysis_entropy",
    "mean_loop_step_analysis_entropy",
]

exclude_keys = set(selection_criteria.keys()) if selection_criteria else set()
entropy = []
for _row_idx, row in selected_df.iterrows():
    raw_label = get_config_label(row, exclude_keys=exclude_keys)
    # Use the short display name when one is registered, else the raw label
    record = {"model": label_mapping.get(raw_label, raw_label)}
    for column in entropy_columns:
        record[column] = row[column]
    entropy.append(record)

with open(f'{task}_loop_entropy.json', 'w') as f:
    json.dump(entropy, f, indent=4)