In [None]:
import orjson
import multiprocessing as mp
import os
from utils.analysis_files.analysis import load_all_data
import numpy as np
import pandas as pd
from utils.analysis_files.analysis import load_all_data, identify_error_atomic_actions
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def process_line(line):
    """Decode a single JSONL line with orjson.

    Args:
        line: One raw line of the file (bytes or str).

    Returns:
        The decoded object, or None when the line is empty or contains only
        whitespace.
    """
    if line.strip():
        # The record is returned whole; the optional removal of
        # "rollout_trajectories" is intentionally left disabled.
        return orjson.loads(line)
    return None

def load_jsonl_parallel(data_dir, n_workers=32):
    """Read a JSONL file and decode its lines in a multiprocessing pool.

    Args:
        data_dir: Path of the JSONL file to read (despite the name, this is
            opened as a file, not listed as a directory).
        n_workers: Number of worker processes in the pool.

    Returns:
        A list with one decoded record per non-blank line, in file order.
    """
    # The whole file is read up front, in binary mode, before any parsing.
    with open(data_dir, "rb") as handle:
        raw_lines = handle.readlines()

    with mp.Pool(n_workers) as workers:
        parsed = workers.map(process_line, raw_lines)

    # process_line yields None for blank lines; those are dropped here.
    return [record for record in parsed if record is not None]

def find_jsonl_file(directory):
    """Return the path of a ``.jsonl`` file located directly in ``directory``.

    Entries are scanned in sorted name order, so when several ``.jsonl`` files
    exist the alphabetically first one is always chosen (``os.listdir`` on its
    own returns names in arbitrary order). Entries that are not regular files,
    such as a sub-directory whose name ends in ``.jsonl``, are skipped.

    Args:
        directory: Directory to search (not searched recursively).

    Returns:
        ``os.path.join(directory, name)`` for the first match, or None when
        the directory holds no ``.jsonl`` file.
    """
    for name in sorted(os.listdir(directory)):
        if not name.endswith('.jsonl'):
            continue
        candidate = os.path.join(directory, name)
        if os.path.isfile(candidate):
            return candidate
    return None

In [None]:
def _annotate_error_atomic_actions(df):
    """Sort the steps and attach the marks from identify_error_atomic_actions.

    Each (sample_idx, traj_idx) trajectory is passed to
    identify_error_atomic_actions on its own; the returned marks are keyed by
    sample_idx / traj_idx / step_idx and left-merged back onto the sorted frame.

    Returns:
        The sorted, annotated frame, or None when ``df`` contains no
        trajectory at all (there would be nothing to concatenate).
    """
    step_keys = ['sample_idx', 'traj_idx', 'step_idx']
    df_sorted = df.sort_values(step_keys).reset_index(drop=True)

    marks_per_traj = []
    for (sample_idx, traj_idx), group in df_sorted.groupby(['sample_idx', 'traj_idx']):
        group = group.reset_index(drop=True)
        marks = identify_error_atomic_actions(group)
        # Re-attach the keys so the marks can be merged back step by step.
        marks['sample_idx'] = sample_idx
        marks['traj_idx'] = traj_idx
        marks['step_idx'] = group['step_idx'].values
        marks_per_traj.append(marks)

    if not marks_per_traj:
        return None

    marks_df = pd.concat(marks_per_traj, ignore_index=True)
    return df_sorted.merge(marks_df, on=step_keys, how='left')


def _looped_atomic_action_lengths(df_marked):
    """Return one row per looped atomic action with its number of steps.

    A looped action is a distinct (sample_idx, traj_idx, error_atomic_id)
    triple that has at least one row with ``is_looped == True``; its length is
    the number of rows of ``df_marked`` carrying that triple.
    """
    id_keys = ['sample_idx', 'traj_idx', 'error_atomic_id']

    # `== True` is deliberate: after the left merge the column may hold NaN.
    looped_ids = df_marked.loc[df_marked['is_looped'] == True, id_keys].drop_duplicates()
    if looped_ids.empty:
        return looped_ids.assign(length=0)

    # One pass over the frame instead of re-filtering it once per looped id.
    step_counts = (
        df_marked.groupby(id_keys, sort=False).size().rename('length').reset_index()
    )
    lengths = looped_ids.merge(step_counts, on=id_keys, how='left')
    # groupby drops NaN keys, so an id containing NaN matches nothing and gets
    # length 0, the same result an equality filter on NaN would give.
    lengths['length'] = lengths['length'].fillna(0).astype(int)
    return lengths


def load_all_exp_data(base_dir: str, selection_criteria: dict = None, n_workers: int = 32):
    """
    Load data from all experiment subdirectories under the specified base directory,
    filter experiments according to selection_criteria,
    and return looped atomic action length statistics for each experiment.

    Args:
        base_dir: Directory whose immediate sub-directories are experiments.
        selection_criteria: Optional mapping of config key -> required value.
            An experiment is kept only when every listed key compares equal in
            its config. None or an empty mapping keeps every experiment.
        n_workers: Worker processes used to parse each experiment's JSONL file.

    Returns:
        A DataFrame with one row per loaded experiment holding 'experiment'
        (folder name), 'label', 'length_ratios' (dict: length -> share of
        looped atomic actions with that length), 'total_looped_actions', and
        every entry of the experiment config. An empty DataFrame is returned
        when base_dir does not exist or no experiment could be loaded.

    Notes:
        Any exception raised while processing one experiment is printed and
        that experiment is skipped, so a single bad folder does not abort the
        whole scan.
    """
    if not os.path.isdir(base_dir):
        print(f"Error: Base directory '{base_dir}' does not exist.")
        return pd.DataFrame()

    # Project helpers are imported lazily, only once there is work to do.
    from utils.analysis_files.analysis import get_config, get_config_label

    criteria = selection_criteria or {}
    exclude_keys = set(criteria)
    all_exp_results = []

    # Sorted so the row order of the result is reproducible across runs.
    for exp_folder_name in sorted(os.listdir(base_dir)):
        exp_path = os.path.join(base_dir, exp_folder_name)

        if not os.path.isdir(exp_path):
            continue

        try:
            # config_data is used as a mapping (.get lookups, ** unpacking).
            config_data = get_config(exp_path)

            if not all(config_data.get(k) == v for k, v in criteria.items()):
                continue

            jsonl_path = find_jsonl_file(exp_path)
            if not jsonl_path:
                print(f"Warning: No .jsonl file found in {exp_path}")
                continue

            print(f"Loading data from: {exp_folder_name}")
            raw_data = load_jsonl_parallel(jsonl_path, n_workers=n_workers)
            df = load_all_data(raw_data)

            df_marked = _annotate_error_atomic_actions(df)
            if df_marked is None:
                print(f"Warning: No trajectories found in {exp_path}")
                continue

            atomic_lengths_df = _looped_atomic_action_lengths(df_marked)

            # Share of looped atomic actions per length, keyed by length.
            if len(atomic_lengths_df) > 0:
                length_ratios_dict = (
                    atomic_lengths_df['length']
                    .value_counts(normalize=True)
                    .sort_index()
                    .to_dict()
                )
            else:
                length_ratios_dict = {}

            # The label omits the keys already fixed by the selection criteria.
            exp_label = get_config_label(pd.Series(config_data), exclude_keys)

            # Config entries are unpacked last, so a config key with the same
            # name as one of the fields above takes precedence.
            all_exp_results.append({
                'experiment': exp_folder_name,
                'label': exp_label,
                'length_ratios': length_ratios_dict,
                'total_looped_actions': len(atomic_lengths_df),
                **config_data
            })

            print(f"  - Found {len(atomic_lengths_df)} looped atomic actions")

        except Exception as e:
            print(f"Error processing {exp_path}: {e}")
            continue

    if not all_exp_results:
        print("No experiments found matching the criteria.")
        return pd.DataFrame()

    results_df = pd.DataFrame(all_exp_results)
    print(f"\nSuccessfully loaded {len(results_df)} experiments.")
    return results_df

In [None]:
# Config key/value pairs an experiment must match exactly to be loaded.
# Commented-out entries are optional filters that are currently disabled.
selection_criteria = {
    # "model_name": "Qwen3-30B-A3B",
    # "enable_thinking": False,
    "state": "env",
    "chat_format": "user_assistant_format",
    # "alfworld_mode": "eval_in_distribution",
    "history_has_cot": True,
    "stop_by_self": False,
    # "offer_feedback": True,
    # "prompt_example": "fewshot",
}

In [None]:
# Scan every experiment folder under the results directory and keep the ones
# whose config matches selection_criteria.
all_exp_base_dir = "./res/blocksworld"
data_all_experiments = load_all_exp_data(
    base_dir=all_exp_base_dir,
    selection_criteria=selection_criteria,
)


In [None]:
# Per-experiment summary: share of looped atomic actions of length 1 versus
# length > 1, ordered by thinking mode, model, chat format, CoT history, state.
if data_all_experiments.empty:
    # Keep stats_df defined (with the usual columns) so that later cells which
    # iterate over it still run on a fresh kernel instead of raising NameError.
    stats_df = pd.DataFrame(columns=[
        'experiment', 'enable_thinking', 'chat_format', 'history_has_cot',
        'state', 'label', 'model_name', 'total_looped_actions',
        'ratio_len_1', 'ratio_len_gt_1', 'model_sort_order',
    ])
    print("No experiments to analyze")
else:
    stats_results = []

    for _, row in data_all_experiments.iterrows():
        length_ratios = row['length_ratios']
        # An experiment without looped actions has an empty mapping; all of
        # its figures are reported as 0.
        has_loops = bool(length_ratios)

        stats_results.append({
            'experiment': row['experiment'],
            "enable_thinking": row["enable_thinking"],
            "chat_format": row["chat_format"],
            "history_has_cot": row["history_has_cot"],
            "state": row["state"],
            'label': row['label'],
            'model_name': row.get('model_name', 'Unknown'),
            'total_looped_actions': row['total_looped_actions'] if has_loops else 0,
            # Share of looped actions that are exactly one step long.
            'ratio_len_1': length_ratios.get(1, 0) if has_loops else 0,
            # Combined share of looped actions longer than one step.
            'ratio_len_gt_1': (
                sum(ratio for length, ratio in length_ratios.items() if length > 1)
                if has_loops else 0
            ),
        })

    stats_df = pd.DataFrame(stats_results)

    # Define model order
    model_order = [
        'Qwen3-4B',
        'Qwen3-30B-A3B',
        'Llama3-8B',
        'Llama3-70B',
        'Glm-9B-Chat',
        'Glm4-9B-Chat',
        "GLM-4-32B-0414",
        'Mistral-7B-Instruct-v0.3',
        'Ministral-3-14B-Instruct-2512',
        'phi-4',
        'deepseek-v3',
        'deepseek-v3.2',
        'gemini-2.5-flash',
        'gemini-2.5-flash-nothinking',
        'Phi-4-reasoning',
        'gpt-oss-120b',
        'deepseek-r1',
        'gemini-2.5-pro',
    ]

    # Rank of each model in the list above; models that are not listed get
    # 999 so they sort after every listed model.
    model_order_map = {model: idx for idx, model in enumerate(model_order)}
    stats_df['model_sort_order'] = stats_df['model_name'].map(
        lambda x: model_order_map.get(x, 999)
    )

    # Every key ascending except "state", which is sorted descending.
    sort_list = ['enable_thinking', 'model_sort_order', 'chat_format', 'history_has_cot', "state"]
    stats_df = stats_df.sort_values(
        by=sort_list, ascending=[True, True, True, True, False]
    ).reset_index(drop=True)

    # The auxiliary model_sort_order column is kept on stats_df; only the
    # columns below are displayed.
    display(
        stats_df[["model_name", "total_looped_actions", "ratio_len_1", "ratio_len_gt_1"]]
    )

In [None]:
# Tab-separated percentages (one decimal place), one line per row of stats_df
# in its current order: length-1 share first, then the length > 1 share.
print("Len1\tLen>1\t")
for len1_share, len_gt1_share in zip(stats_df["ratio_len_1"], stats_df["ratio_len_gt_1"]):
    print(f"{len1_share*100:.1f}\t{len_gt1_share*100:.1f}")

In [None]:
# Line chart of the looped atomic action length distribution, one line per
# experiment, on a shared x axis covering every length seen in any experiment.
if data_all_experiments.empty:
    print("No experiments to plot")
else:
    # Plotted series are also collected here; the commented-out cell at the
    # end of the notebook can dump this structure to JSON.
    export_data = {
        'metadata': {
            'total_experiments': len(data_all_experiments),
            'selection_criteria': selection_criteria
        },
        'experiments': []
    }

    # Union of the lengths that occur in at least one experiment.
    all_lengths = set()
    for ratios in data_all_experiments['length_ratios']:
        if ratios:
            all_lengths.update(ratios.keys())

    if not all_lengths:
        print("No looped atomic actions found in any experiment")
    else:
        all_lengths = sorted(all_lengths)

        plt.figure(figsize=(12, 7))
        colors = sns.color_palette("tab10", n_colors=len(data_all_experiments))

        for idx, row in data_all_experiments.iterrows():
            ratios = row['length_ratios']

            # Lengths an experiment never produced are plotted as 0.
            y_data = [ratios.get(length, 0) for length in all_lengths]

            export_data['experiments'].append({
                'experiment_name': row['experiment'],
                'label': row['label'],
                'x_data': all_lengths,
                'y_data': y_data
            })

            line_style = dict(marker='o', linewidth=2, markersize=6)
            plt.plot(all_lengths, y_data, label=row['label'], color=colors[idx], **line_style)

        plt.xlabel('Atomic Action Length', fontsize=12)
        plt.ylabel('Ratio', fontsize=12)
        plt.title('Looped Atomic Action Length Distribution (All Experiments)', fontsize=14)
        plt.xticks(all_lengths)
        plt.grid(True, alpha=0.3)
        # Legend sits outside the axes, to the right of the plot area.
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()
        plt.show()

In [None]:
# output_file = 'loop_unit_len_stats_data.json'
# with open(output_file, 'w', encoding='utf-8') as f:
#     import json
#     json.dump(export_data, f, indent=2, ensure_ascii=False)
# print(f"Data exported to {output_file}")