In [None]:
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import numpy as np
from analysis_third_part import plot_metric_bar, plot_metric_line

In [None]:
def load_all_results(base_dir: str) -> pd.DataFrame:
    """
    Load ``result.json`` from every experiment subdirectory of ``base_dir``
    and merge them into a single pandas DataFrame (one row per experiment).

    Parameters:
        base_dir: Directory whose immediate subdirectories each may contain
            a ``result.json`` file.

    Returns:
        DataFrame with one row per successfully loaded ``result.json``. The
        subdirectory name is stored in the ``experiment`` column. Rows are
        ordered by subdirectory name. An empty DataFrame is returned when
        ``base_dir`` does not exist or nothing could be loaded.

    Files that cannot be read, cannot be parsed, or whose top-level JSON value
    is not an object are skipped with a printed warning (best-effort loading).
    """
    if not os.path.isdir(base_dir):
        print(f"Error: Base directory '{base_dir}' does not exist.")
        return pd.DataFrame()

    all_results_data = []

    # os.listdir() returns entries in filesystem-dependent order; sort so the
    # resulting row order is reproducible across machines and runs.
    for exp_folder_name in sorted(os.listdir(base_dir)):
        exp_path = os.path.join(base_dir, exp_folder_name)
        if not os.path.isdir(exp_path):
            continue

        result_file_path = os.path.join(exp_path, "result.json")
        if not os.path.exists(result_file_path):
            continue

        try:
            with open(result_file_path, "r", encoding="utf-8") as f:
                result_data = json.load(f)
        except json.JSONDecodeError:
            print(f"Warning: Unable to parse JSON in file {result_file_path}.")
            continue
        except Exception as e:
            print(f"Warning: Error reading file {result_file_path}: {e}")
            continue

        # Only a JSON object can become a DataFrame row and carry the extra
        # "experiment" key; report anything else explicitly.
        if not isinstance(result_data, dict):
            print(
                f"Warning: Error reading file {result_file_path}: "
                f"expected a JSON object, got {type(result_data).__name__}"
            )
            continue

        # Add experiment folder name as a column for identification
        result_data["experiment"] = exp_folder_name
        all_results_data.append(result_data)

    if not all_results_data:
        print("No 'result.json' files found or processed.")
        return pd.DataFrame()

    # Convert list of dictionaries to DataFrame
    return pd.DataFrame(all_results_data)


In [None]:
# Root directory that holds one subdirectory per experiment.
all_exp_base_dir = "analysis_third_part/waa"

# Merge every experiment's result.json into a single DataFrame.
df_experiments = load_all_results(all_exp_base_dir)

# Report how the load went; uncomment the display call to inspect the first rows.
if df_experiments.empty:
    print("Failed to load any experiment results.")
else:
    print(f"Successfully loaded results from {len(df_experiments)} experiments.")
    # display(df_experiments.head())

In [None]:
# Column -> required value pairs used to filter experiments.
# Left empty here, which means "select every experiment".
selection_criteria = {}


In [None]:
def cal_area_under_curve(df: pd.DataFrame, start:int, end:int) -> pd.DataFrame:
    """
    Calculate the normalized area under the pass@k / pass@1 curves over the
    turns ``start..end`` (inclusive), after subtracting each curve's minimum
    value in that interval. Also set ``pass_k`` and ``pass_mean`` to the
    accuracy at the end step.

    Each cell of ``passk_at_T`` / ``pass1_at_T`` is read as a dict mapping a
    turn number (as a string) to a value. Turns missing from the dict reuse the
    most recent value seen inside the interval, starting from 0.0.

    Parameters:
        df: DataFrame containing experiment results; must have the columns
            ``passk_at_T`` and ``pass1_at_T``. It is not modified.
        start: Starting turn number
        end: Ending turn number (must be >= start)

    Returns:
        Copy of ``df`` with these columns added or overwritten:
        ``passk_auc``, ``pass1_auc`` (trapezoidal area divided by
        ``max(end - start, 1)``), ``pass_k`` and ``pass_mean`` (value at
        ``end``, else at the closest earlier turn, else at the earliest later
        turn). Cells that are not dicts (e.g. NaN for an experiment that lacks
        the metric) produce NaN in all derived columns; an empty dict produces
        an AUC of 0.0 and a NaN end value.

    Raises:
        ValueError: if ``end < start``.
    """
    if end < start:
        raise ValueError(f"end ({end}) must be >= start ({start})")

    df = df.copy()

    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid; use
    # whichever this NumPy version provides.
    trapezoid = np.trapezoid if hasattr(np, "trapezoid") else np.trapz
    turn_keys = [str(t) for t in range(start, end + 1)]

    def extract_series(data_dict):
        # Forward-fill inside the interval; turns before the first known key are 0.0.
        values = []
        last_value = 0.0
        for key in turn_keys:
            last_value = data_dict.get(key, last_value)
            values.append(last_value)
        return values

    def baseline_adjusted_auc(data_dict):
        if not isinstance(data_dict, dict):
            return np.nan  # metric missing for this experiment
        values = extract_series(data_dict)
        floor = min(values)
        adjusted = [value - floor for value in values]
        return max(trapezoid(adjusted, dx=1), 0.0)

    def get_value_at_step(data_dict):
        if not isinstance(data_dict, dict) or not data_dict:
            return np.nan
        end_key = str(end)
        if end_key in data_dict:
            return data_dict[end_key]
        # Index by integer turn so the lookup does not depend on how the key
        # string was formatted.
        by_turn = {int(k): v for k, v in data_dict.items()}
        earlier_turns = [t for t in by_turn if t <= end]
        if earlier_turns:
            return by_turn[max(earlier_turns)]
        # Every recorded turn is after `end`: fall back to the earliest one.
        return by_turn[min(by_turn)]

    passk_curves = df['passk_at_T'].tolist()
    pass1_curves = df['pass1_at_T'].tolist()

    span = max(end - start, 1)  # Avoid division by zero
    df['passk_auc'] = np.array([baseline_adjusted_auc(c) for c in passk_curves], dtype=float) / span
    df['pass1_auc'] = np.array([baseline_adjusted_auc(c) for c in pass1_curves], dtype=float) / span
    df['pass_k'] = [get_value_at_step(c) for c in passk_curves]
    df['pass_mean'] = [get_value_at_step(c) for c in pass1_curves]

    return df

In [None]:
def cal_area_under_curve_with_loop_split(df: pd.DataFrame, start: int, end: int) -> pd.DataFrame:
    """
    Calculate the normalized, baseline-subtracted area under the pass@1 curve
    over the turns ``start..end`` (inclusive) for three trajectory groups:
    overall, with-loop, and without-loop.

    Each cell of the source columns is read as a dict mapping a turn number
    (as a string) to a value. Turns missing from the dict reuse the most recent
    value seen inside the interval, starting from 0.0. The minimum value in the
    interval is subtracted before integrating.

    Parameters:
        df: DataFrame containing experiment results; must have the columns
            ``pass1_at_T``, ``pass1_at_T_looped`` and ``pass1_at_T_non_looped``.
            It is not modified.
        start: Starting turn number
        end: Ending turn number (must be >= start)

    Returns:
        Copy of ``df`` with these columns added or overwritten:
        ``pass1_auc``, ``pass1_looped_auc``, ``pass1_non_looped_auc``
        (trapezoidal area divided by ``max(end - start, 1)``) and
        ``pass1_end``, ``pass1_looped_end``, ``pass1_non_looped_end`` (value at
        ``end``, else at the closest earlier turn, else at the earliest later
        turn). Cells that are not dicts (e.g. NaN for an experiment that lacks
        the metric) produce NaN in the derived columns; an empty dict produces
        an AUC of 0.0 and a NaN end value.

    Raises:
        ValueError: if ``end < start``.
    """
    if end < start:
        raise ValueError(f"end ({end}) must be >= start ({start})")

    df = df.copy()

    # Source column -> prefix of the derived "<prefix>_auc" / "<prefix>_end" columns.
    column_prefixes = {
        'pass1_at_T': 'pass1',
        'pass1_at_T_looped': 'pass1_looped',
        'pass1_at_T_non_looped': 'pass1_non_looped',
    }

    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid; use
    # whichever this NumPy version provides.
    trapezoid = np.trapezoid if hasattr(np, "trapezoid") else np.trapz
    turn_keys = [str(t) for t in range(start, end + 1)]

    def extract_series(data_dict):
        # Forward-fill inside the interval; turns before the first known key are 0.0.
        values = []
        last_value = 0.0
        for key in turn_keys:
            last_value = data_dict.get(key, last_value)
            values.append(last_value)
        return values

    def baseline_adjusted_auc(data_dict):
        if not isinstance(data_dict, dict):
            return np.nan  # metric missing for this experiment
        values = extract_series(data_dict)
        floor = min(values)
        adjusted = [value - floor for value in values]
        return max(trapezoid(adjusted, dx=1), 0.0)

    def get_value_at_step(data_dict):
        if not isinstance(data_dict, dict) or not data_dict:
            return np.nan
        end_key = str(end)
        if end_key in data_dict:
            return data_dict[end_key]
        # Index by integer turn so the lookup does not depend on how the key
        # string was formatted.
        by_turn = {int(k): v for k, v in data_dict.items()}
        earlier_turns = [t for t in by_turn if t <= end]
        if earlier_turns:
            return by_turn[max(earlier_turns)]
        # Every recorded turn is after `end`: fall back to the earliest one.
        return by_turn[min(by_turn)]

    # Read every source column up front so a missing column fails before any
    # derived column is written.
    curves_by_prefix = {
        prefix: df[source_col].tolist()
        for source_col, prefix in column_prefixes.items()
    }

    span = max(end - start, 1)  # Avoid division by zero

    # Write all AUC columns first, then all end-value columns.
    for prefix, curves in curves_by_prefix.items():
        df[f'{prefix}_auc'] = np.array(
            [baseline_adjusted_auc(c) for c in curves], dtype=float
        ) / span

    for prefix, curves in curves_by_prefix.items():
        df[f'{prefix}_end'] = [get_value_at_step(c) for c in curves]

    return df

In [None]:
# --- 2. Filter DataFrame based on criteria ---
if selection_criteria:
    # One equality clause per criterion, AND-ed together. Backticks let
    # DataFrame.query accept column names that are not valid identifiers.
    query_str = " & ".join(
        [f"`{k}` == {repr(v)}" for k, v in selection_criteria.items()]
    )
    selected_df = df_experiments.query(query_str).copy()
else:
    # If no filter criteria, select all experiments.
    selected_df = df_experiments.copy()
# Both branches copy, so adding columns below never mutates df_experiments
# and re-running this cell always starts from the unmodified base frame.

# Preferred presentation order of models.
model_order = [
    'Qwen3-4B',
    'Qwen3-30B-A3B',
    'Llama3-8B',
    'Llama3-70B',
    'Glm-9B-Chat',
    "GLM-4-32B-0414",
    'Mistral-7B-Instruct-v0.3',
    'deepseek-v3'
]
# Create a mapping dictionary to map model names to sort indices
model_order_map = {model: idx for idx, model in enumerate(model_order)}

# Add a helper column holding each row's position in model_order
selected_df['model_sort_order'] = selected_df['model_name'].map(
    lambda x: model_order_map.get(x, 999)  # Models not in list are placed last
)
# Apply the ordering; a stable sort keeps the load order among rows that
# share the same model.
selected_df = selected_df.sort_values('model_sort_order', kind='stable')

# start = 1
# end = 50

# selected_df = cal_area_under_curve(selected_df, start=start, end=end)
# print(f"Found {len(selected_df)} experiments matching the selection criteria.")
display(
    selected_df[["model_name", "pass_mean"]]
)


In [None]:
# start = 0
# end = 50

# selected_df = cal_area_under_curve_with_loop_split(selected_df, start=start, end=end)
# display(
#     selected_df[[
#         "model_name",
#         "pass_mean", 
#         "pass1_auc", 
#         "pass1_looped_auc",
#         "pass1_non_looped_auc",
#         "mean_loop_ratio_after_invalid_steps_stepnorm"
#     ]]
# )
# exclude_keys = set(selection_criteria.keys()) if selection_criteria else set()
# print("MEAN PASS\tAUV")
# for idx, row in selected_df.iterrows():
#     # print(label)
#     # pass_k = row["pass_k"]
#     pass_mean_looped = row["pass1_looped_end"]
#     pass_mean_non_looped = row["pass1_non_looped_end"]
#     auv_looped = row["pass1_looped_auc"]
#     auv_non_looped = row["pass1_non_looped_auc"]
#     print(f"{pass_mean_looped*100:.1f}\t{auv_looped*100:.1f}")
#     print(f"{pass_mean_non_looped*100:.1f}\t{auv_non_looped*100:.1f}")

In [None]:
# Turn interval (inclusive) over which the curves are integrated.
start = 0
end = 50

# Add the AUC / end-step accuracy columns to the selected experiments.
selected_df = cal_area_under_curve(selected_df, start=start, end=end)

# Columns shown in the summary table.
summary_columns = [
    "model_name",
    "pass_mean",
    "pass1_auc",
    "mean_loop_ratio_after_invalid_steps_stepnorm",
]
display(selected_df[summary_columns])

exclude_keys = set(selection_criteria.keys()) if selection_criteria else set()

# Plain-text dump: model name on one line, then AUC and step-level loop
# ratio as tab-separated percentages on the next.
print("AUV\t LR")
for idx, row in selected_df.iterrows():
    auv = row["pass1_auc"]
    loop_ratio_steplevel = row["mean_loop_ratio_after_invalid_steps_stepnorm"]
    label = row["model_name"]

    print(label)
    print(f"{auv*100:.1f}\t{loop_ratio_steplevel*100:.1f}")

In [None]:

# Keyword arguments for the two line plots: pass@1 and pass@k as a function of turns.
pass1_plot_kwargs = dict(
    select_col="pass1_at_T",
    xlabel="Number of Turns to Pass@1",
    ylabel="pass@1",
    title="pass@1 at T(urns)",
)
passk_plot_kwargs = dict(
    select_col="passk_at_T",
    xlabel="Number of Turns to Pass@k",
    ylabel="pass@k",
    title="pass@k at T(urns)",
)

# Same data and selection criteria for both plots; only the metric differs.
plot_metric_line(selected_df, selection_criteria=selection_criteria, **pass1_plot_kwargs)
plot_metric_line(selected_df, selection_criteria=selection_criteria, **passk_plot_kwargs)