In [None]:
import pandas as pd
import json
import os
import re
from pathlib import Path
from tqdm import tqdm
import orjson
from collections import defaultdict
import multiprocessing as mp
from multiprocessing.pool import ThreadPool

In [None]:
def get_action_category(action_str):
    """
    Return the name of the first pyautogui function called in ``action_str``.

    The category of an action is simply the identifier that follows
    ``pyautogui.`` and is immediately followed by an opening parenthesis
    (e.g. ``"click"`` for ``"pyautogui.click(10, 20)"``).

    Returns ``None`` when ``action_str`` is empty/None, does not mention
    ``pyautogui``, or contains no ``pyautogui.<name>(`` call.
    """
    # Cheap substring test first; only then pay for the regex search.
    if action_str and 'pyautogui' in action_str:
        found = re.search(r'pyautogui\.(\w+)\(', action_str)
        if found is not None:
            return found.group(1)
    return None

def analyze_loop_action_types(base_dir):
    """
    Count the distribution of pyautogui action types in a trajectory JSONL file.

    Every action of every step of every trajectory in the file is classified
    with ``get_action_category``; actions that yield no category are ignored.

    Args:
        base_dir: Path of the JSONL file to read. Despite the name, this is
            passed directly to ``open`` and must be a file, not a directory.
            Each non-blank line must be a JSON object with a "trajectory"
            list whose items each have an "action" entry. "action" may be a
            list of action strings, a single action string, or null.

    Returns:
        pandas.DataFrame with the columns "Action Type", "Count" and
        "Percentage" (share of all counted actions, formatted as "12.34%"),
        sorted by "Count" in descending order. The frame is empty when no
        action could be classified.

    Raises:
        OSError: if the file cannot be opened.
        KeyError: if a record lacks "trajectory" or a step lacks "action".
        Decoding errors from ``orjson.loads`` propagate for malformed lines.
    """
    def process_line(line):
        # Blank lines carry no record; signal that with None so they can be dropped.
        if not line.strip():
            return None
        return orjson.loads(line)

    def load_jsonl_parallel(data_path, n_workers=64):
        with open(data_path, "rb") as f:
            lines = f.readlines()
        # Decode the lines on a thread pool; pool.map keeps the input order.
        with ThreadPool(n_workers) as pool:
            data = [x for x in pool.map(process_line, lines) if x is not None]
        return data

    def iter_actions(step):
        actions = step["action"]
        if actions is None:
            return ()
        # A bare string would otherwise be iterated character by character,
        # so nothing would ever match and the step would silently count as zero.
        if isinstance(actions, str):
            return (actions,)
        return actions

    all_traj = load_jsonl_parallel(base_dir)
    action_counts = defaultdict(int)

    for traj in all_traj:
        for step in traj["trajectory"]:
            for action in iter_actions(step):
                category = get_action_category(action)
                if category:
                    action_counts[category] += 1

    total_actions = sum(action_counts.values())

    # Every entry has count >= 1, so total_actions is non-zero whenever this runs.
    results = [
        {
            "Action Type": category,
            "Count": count,
            "Percentage": f"{count / total_actions * 100:.2f}%",
        }
        for category, count in action_counts.items()
    ]

    df = pd.DataFrame(results)
    if not df.empty:
        df = df.sort_values("Count", ascending=False).reset_index(drop=True)

    return df


In [None]:
# Driver cell: compute per-model action-type statistics for one dataset and
# write them, plus a cross-model summary, as tab-separated files under
# <root_dir>/<dataset>/.
root_dir = "analysis_third_part"
dataset="waa"
dataset_path = Path(root_dir) / dataset

if not dataset_path.exists():
    # Report the path that was actually checked, not just the dataset name.
    print(f"Dataset path not found: {dataset_path}")
else:
    print(f"Starting to analyze dataset: {dataset}")
    
    # Data for aggregating all models
    all_models_data = []
    all_action_types = set()
    
    # Iterate through all model folders under dataset.
    # sorted() makes the processing order (and the raw merged file) deterministic;
    # iterdir() yields entries in arbitrary order.
    for model_dir in sorted(dataset_path.iterdir()):
        if not model_dir.is_dir():
            continue
            
        model_name = model_dir.name
        print(f"\n{'='*30}\nProcessing model: {model_name}")
        
        # NOTE(review): the trajectory file is looked up under "<dataset>/"
        # relative to the working directory, not under dataset_path - confirm
        # that this is the intended location.
        all_traj_dir = f"{dataset}/{model_name}_transformed_trajectories.jsonl"
        if not Path(all_traj_dir).is_file():
            # A missing file for one model must not abort the remaining models.
            print(f"Trajectory file not found, skipping model {model_name}: {all_traj_dir}")
            continue

        # Run analysis
        loop_stats_df = analyze_loop_action_types(all_traj_dir)
        
        if not loop_stats_df.empty:
            print(f"Loop action type statistics ({model_name}):")
            print(loop_stats_df)
            
            # Save single model result (tab-separated despite the .csv suffix)
            output_file = f"{dataset_path}/all_action_stats_{model_name}.csv"
            loop_stats_df.to_csv(output_file, index=False, sep='\t')
            print(f"Results saved to: {output_file}")
            
            # Collect all action types
            all_action_types.update(loop_stats_df['Action Type'])
            
            # Add model name column (after saving, so the per-model file does not contain it)
            loop_stats_df['Model'] = model_name
            all_models_data.append(loop_stats_df)
        else:
            print(f"Model {model_name} has no Loop action data or directory is empty.")
    
    # Aggregate all model data
    if all_models_data:
        # Merge all model data
        combined_df = pd.concat(all_models_data, ignore_index=True)
        
        # Create pivot table, rows are models, columns are action types;
        # action types a model never used are filled with 0.
        pivot_df = combined_df.pivot_table(
            index='Model',
            columns='Action Type',
            values='Count',
            fill_value=0,
            aggfunc='sum'
        ).reset_index()
        
        # Ensure all action types are in columns (safety net; the pivot
        # normally already contains every collected action type)
        for action_type in all_action_types:
            if action_type not in pivot_df.columns:
                pivot_df[action_type] = 0
        
        # Calculate total and percentage for each model
        action_columns = [col for col in pivot_df.columns if col != 'Model']
        pivot_df['Total'] = pivot_df[action_columns].sum(axis=1)
        
        # Add percentage column for each action type
        for action_type in action_columns:
            pivot_df[f'{action_type}_pct'] = (pivot_df[action_type] / pivot_df['Total'] * 100).round(2)
        
        # Save aggregated results
        summary_file = f"{dataset_path}/all_action_stats_all_models_summary.csv"
        pivot_df.to_csv(summary_file, index=False, sep='\t')
        print(f"\n{'='*50}")
        print(f"All models aggregated results saved to: {summary_file}")
        print("\nAggregated statistics:")
        print(pivot_df)
        
        # Also save raw merged data
        raw_summary_file = f"{dataset_path}/all_action_stats_all_models_raw.csv"
        combined_df.to_csv(raw_summary_file, index=False, sep='\t')
        print(f"Raw merged data saved to: {raw_summary_file}")
    else:
        print("\nNo model data found.")