In [None]:
import orjson
import multiprocessing as mp
import os
from utils.analysis_files.analysis import load_all_data
from omegaconf import OmegaConf
import pandas as pd
from utils.analysis_files.analysis_wcr import cal_weighted_corrected_rate
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
def process_line(line):
    """Parse a single JSONL line.

    Args:
        line: One raw line (``str`` or ``bytes``) taken from a JSON Lines file.

    Returns:
        The value decoded by ``orjson.loads``, or ``None`` when the line is
        empty or contains only whitespace.
    """
    # Optional trimming that the original left disabled:
    #   item.pop("rollout_trajectories", None)
    has_content = bool(line.strip())
    return orjson.loads(line) if has_content else None

def load_jsonl_parallel(data_dir, n_workers=32):
    """Load a JSON Lines file, parsing its lines with a process pool.

    Args:
        data_dir: Path of the ``.jsonl`` file to read. Despite the name, the
            value is handed straight to ``open``, so it must be a file path.
        n_workers: Upper bound on the number of worker processes. The pool is
            never larger than the number of non-blank lines or the number of
            CPUs reported by ``os.cpu_count()``.

    Returns:
        A list with one parsed item per non-blank line, in file order. Items
        for which ``process_line`` returns ``None`` are dropped.
    """
    with open(data_dir, "rb") as f:
        # Filter blank lines here so they are never pickled and sent to a worker;
        # process_line would have turned them into None anyway.
        lines = [line for line in f if line.strip()]

    # Nothing to parse: do not pay for starting a pool at all.
    if not lines:
        return []

    # Spawning more processes than cores or than lines only adds start-up cost.
    pool_size = max(1, min(n_workers, len(lines), os.cpu_count() or 1))
    with mp.Pool(pool_size) as pool:
        parsed = pool.map(process_line, lines)

    return [item for item in parsed if item is not None]

def find_jsonl_file(directory):
    """Return the path of a ``.jsonl`` entry located directly in ``directory``.

    ``os.listdir`` returns entries in arbitrary order, so the listing is sorted
    first: when several ``.jsonl`` entries exist, the lexicographically smallest
    name is always the one returned, keeping re-runs reproducible.

    Args:
        directory: Directory to scan (not recursive).

    Returns:
        ``os.path.join(directory, name)`` for the first matching name in sorted
        order, or ``None`` when no entry ends with ``.jsonl``.
    """
    for file in sorted(os.listdir(directory)):
        if file.endswith('.jsonl'):
            return os.path.join(directory, file)
    return None

In [None]:

# Result directory of the run under analysis.
# Alternative run kept for reference:
# data_dir_base_env_state = "res/alfworld/Llama3-70B_user_assistant_format_20251206"
data_dir_base_env_state = "res/alfworld/Llama-3.3-70B_user_assistant_format_20251221_001"

# find_jsonl_file returns None when the directory holds no .jsonl file; stop here
# with a readable error instead of letting open(None) fail inside the loader.
jsonl_path_env_state = find_jsonl_file(data_dir_base_env_state)
if jsonl_path_env_state is None:
    raise FileNotFoundError(f"No .jsonl file found in {data_dir_base_env_state!r}")

# Disabled no-state variant (data_dir_base_no_state is not defined in the cells shown here):
# df_no_state = load_all_data(load_jsonl_parallel(find_jsonl_file(data_dir_base_no_state), n_workers=128))
df_env_state = load_all_data(load_jsonl_parallel(jsonl_path_env_state, n_workers=128))

In [None]:
def get_unique_id(row):
    """Pick the identifier used to group rows that belong to the same task.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with ``'seed'`` and
            ``'query'`` entries.

    Returns:
        ``row['seed']`` when it carries a usable value, otherwise
        ``row['query']``. A seed counts as unusable when it is ``None``, a
        pandas/NumPy missing value (NaN, ``pd.NA``), or the string ``'none'``.
    """
    seed = row['seed']
    # pandas stores absent values as NaN / pd.NA rather than None, so an
    # `is not None` check alone would let them through as the identifier.
    # The is_scalar guard keeps pd.isna from returning an array for list-like seeds.
    seed_missing = pd.api.types.is_scalar(seed) and pd.isna(seed)
    if seed_missing or seed == 'none':
        return row['query']
    return seed

# Add a `unique_id` column computed row by row with get_unique_id.
# Only the env-state frame is processed; the no-state line below is disabled.
# df_no_state['unique_id'] = df_no_state.apply(get_unique_id, axis=1)
df_env_state['unique_id'] = df_env_state.apply(get_unique_id, axis=1)


In [None]:
# 1. Calculate average recall distance for each sample
def calculate_obj_recall_distance(df, sample_idx, traj_idx):
    """Calculate the mean object recall distance of a single trajectory.

    Steps are walked in ascending ``step_idx`` order; step 0 and steps whose
    ``action_is_valid`` is falsy are skipped. At each remaining step:

    1. every object parsed from the step's observation is recorded as last
       seen at this step;
    2. every object parsed from the step's action that is also a task object
       and has been seen contributes ``step_idx - last_seen_step``.

    Because the observation is processed before the action, an object present
    in both at the same step contributes a distance of 0.

    Args:
        df: Frame with columns ``sample_idx``, ``traj_idx``, ``step_idx``,
            ``query``, ``action_is_valid``, ``observation`` and ``action``.
            It may hold many trajectories or just the one requested.
        sample_idx: Sample identifier of the trajectory.
        traj_idx: Trajectory identifier within the sample.

    Returns:
        Mean of the collected distances, or ``np.nan`` when the trajectory has
        no rows or no task object was referenced after being seen.
    """
    traj_df = df[(df['sample_idx'] == sample_idx) & (df['traj_idx'] == traj_idx)]

    # No rows for this trajectory: nothing to measure (previously an IndexError).
    if traj_df.empty:
        return np.nan

    from utils.analysis_files.alfworld_mem_recall import extract_objects_and_locations

    # The helper returns a 5-tuple. The task objects are taken from position 3
    # for the query, while position 2 is used for observations and actions.
    # NOTE(review): positions are copied from the original code; confirm
    # against the helper's definition.
    _, _, _, task_objects, _ = extract_objects_and_locations(traj_df.iloc[0]["query"])
    task_obj_set = {obj.lower() for obj in task_objects}

    # One row per step (the first one when a step is duplicated), ordered by
    # step, so each step is visited once without re-filtering the frame.
    steps = traj_df.drop_duplicates(subset='step_idx', keep='first').sort_values('step_idx')

    obj_last_seen = {}  # object name -> most recent step whose observation showed it
    distances = []

    step_columns = zip(
        steps['step_idx'], steps['action_is_valid'], steps['observation'], steps['action']
    )
    for step_idx, action_is_valid, obs, action in step_columns:
        if step_idx == 0 or not action_is_valid:
            continue

        # Update last appearance position for each object in the current observation
        _, _, obs_objects, _, _ = extract_objects_and_locations(obs)
        for obj_tuple in obs_objects:
            obj_last_seen[obj_tuple[0].lower()] = step_idx

        # Measure the distance for task objects referenced by the current action
        _, _, action_objects, _, _ = extract_objects_and_locations(action)
        for action_obj in action_objects:
            action_obj_name = action_obj[0].lower()
            if action_obj_name in task_obj_set and action_obj_name in obj_last_seen:
                distances.append(step_idx - obj_last_seen[action_obj_name])

    # Return average recall distance for this trajectory
    return np.mean(distances) if distances else np.nan


In [None]:
# Calculate recall distance for all trajectories
from utils.analysis_files.alfworld_mem_recall import extract_objects_and_locations

recall_distances = []
for (sample_idx, traj_idx), group in df_env_state.groupby(['sample_idx', 'traj_idx']):
    # `group` already contains exactly this trajectory's rows; passing it
    # instead of the full frame avoids re-scanning every row per trajectory.
    dist = calculate_obj_recall_distance(group, sample_idx, traj_idx)
    recall_distances.append({
        'sample_idx': sample_idx,
        'traj_idx': traj_idx,
        'obj_recall_distance': dist
    })

# Explicit columns keep the frame well-formed even when no trajectory was found.
recall_distance_df = pd.DataFrame(
    recall_distances, columns=['sample_idx', 'traj_idx', 'obj_recall_distance']
)

# Calculate average recall distance for each sample (NaN trajectories are ignored by mean)
sample_avg_distance = recall_distance_df.groupby('sample_idx')['obj_recall_distance'].mean().reset_index()
sample_avg_distance.columns = ['sample_idx', 'avg_obj_recall_distance']

# 2. Attach unique_id and avg_accuracy to the per-sample averages.
# The mapping is de-duplicated on all three columns, so a sample_idx that carries
# more than one (unique_id, avg_accuracy) pair yields more than one row after the merge.
sample_id_mapping = df_env_state[['sample_idx', 'unique_id','avg_accuracy']].drop_duplicates()
sample_avg_distance = sample_avg_distance.merge(sample_id_mapping, on='sample_idx', how='left')


In [None]:
# 5) Scatter plot of avg_obj_recall_distance against avg_accuracy (env-state only)
# Samples missing either metric are removed once; the same rows feed the plot
# and the summary statistics below.
valid = sample_avg_distance.dropna(subset=['avg_obj_recall_distance','avg_accuracy'])

plt.figure(figsize=(8,5), dpi=120)
p = sns.scatterplot(
    data=valid,
    x='avg_obj_recall_distance',
    y='avg_accuracy',
    s=60,
    alpha=0.7,
)
p.set_title("Env-state: Average Object Recall Distance vs Avg Accuracy", fontsize=14)
p.set_xlabel("Average Object Recall Distance (per sample)", fontsize=12)
p.set_ylabel("Average Accuracy (env_state)", fontsize=12)
p.grid(True, linestyle='--', alpha=0.3)
plt.show()

# 6) Summary statistics and Pearson correlation (needs at least two samples)
print(f"Sample count: {len(valid)}  Average recall distance: {valid['avg_obj_recall_distance'].mean():.3f}")
if len(valid) >= 2:
    corr_matrix = valid[['avg_obj_recall_distance','avg_accuracy']].corr()
    corr = corr_matrix.loc['avg_obj_recall_distance', 'avg_accuracy']
    print(f"Pearson correlation coefficient between recall distance and avg_accuracy: {corr:.3f}")

In [None]:
# Plot scatter plot and box plot of success vs recall_distance for each traj
from scipy import stats

# Prepare data: add success information for each traj
traj_success = df_env_state.groupby(['sample_idx', 'traj_idx'])['success'].first().reset_index()
recall_distance_with_success = recall_distance_df.merge(traj_success, on=['sample_idx', 'traj_idx'], how='left')

# Remove trajectories without a recall distance
valid_data = recall_distance_with_success.dropna(subset=['obj_recall_distance'])

# Split by success status (rows whose success is missing fall into neither group)
success_data = valid_data[valid_data['success'] == True]
fail_data = valid_data[valid_data['success'] == False]

# Scatter plot: one marker per trajectory, y position encodes the outcome
fig, ax = plt.subplots(figsize=(10, 6), dpi=120)
ax.scatter(fail_data['obj_recall_distance'],
           [0] * len(fail_data),
           alpha=0.6, s=80, c='red', label='Failed', marker='x')
ax.scatter(success_data['obj_recall_distance'],
           [1] * len(success_data),
           alpha=0.6, s=80, c='green', label='Success', marker='o')

ax.set_xlabel('Object Recall Distance', fontsize=14)
ax.set_ylabel('Trajectory Success', fontsize=14)
ax.set_yticks([0, 1])
ax.set_yticklabels(['Failed', 'Success'])
ax.set_title('Trajectory Success vs. Object Recall Distance', fontsize=16)
ax.legend(fontsize=12)
ax.grid(True, axis='x', linestyle='--', alpha=0.3)

plt.tight_layout()
plt.show()

# Output statistics
print("\nRecall distance statistics by Success status:")
print(f"Success trajectory count: {len(success_data)}, Average recall distance: {success_data['obj_recall_distance'].mean():.2f}")
print(f"Failed trajectory count: {len(fail_data)}, Average recall distance: {fail_data['obj_recall_distance'].mean():.2f}")

# Use box plot to show clearer comparison.
# The outcome label is attached to the data and the category order is fixed, so a
# box can never end up under the wrong label (overwriting tick labels by position
# mislabels the plot when only one outcome is present).
boxplot_df = pd.concat([
    fail_data.assign(outcome='Failed'),
    success_data.assign(outcome='Success'),
])
fig, ax = plt.subplots(figsize=(8, 6), dpi=120)
sns.boxplot(data=boxplot_df, x='outcome', y='obj_recall_distance',
            order=['Failed', 'Success'], ax=ax)
ax.set_xlabel('Trajectory Success', fontsize=14)
ax.set_ylabel('Object Recall Distance', fontsize=14)
ax.set_title('Object Recall Distance Distribution by Success Status', fontsize=16)
ax.grid(True, axis='y', linestyle='--', alpha=0.3)
plt.tight_layout()
plt.savefig('recall_distance_boxplot_by_success.pdf', bbox_inches='tight')
plt.show()

# Perform statistical test (independent two-sample t-test, SciPy defaults).
# With fewer than two observations in a group the statistic is NaN, and the
# `p_value < 0.05` check would then report "no significant difference".
if len(success_data) >= 2 and len(fail_data) >= 2:
    t_stat, p_value = stats.ttest_ind(
        success_data['obj_recall_distance'].dropna(),
        fail_data['obj_recall_distance'].dropna()
    )
    print(f"\nt-test result: t-statistic={t_stat:.3f}, p-value={p_value:.4f}")
    if p_value < 0.05:
        print("Conclusion: There is a significant difference in recall distance between Success and Failed trajectories")
    else:
        print("Conclusion: There is no significant difference in recall distance between Success and Failed trajectories")
else:
    print("\nt-test skipped: at least 2 Success and 2 Failed trajectories are required")

In [None]:
# Save box plot data as JSON format
import json

# Experiment name = last component of the result directory; normpath makes this
# work when the path is written with a trailing slash.
exp_name = os.path.basename(os.path.normpath(data_dir_base_env_state))
output_path = f'recall_distance_boxplot_data_{exp_name}.json'

boxplot_data = {
    'metadata': {
        'exp_name': exp_name,
        'total_trajectories': len(valid_data),
        'success_count': len(success_data),
        'failed_count': len(fail_data),
        'success_mean_recall_distance': float(success_data['obj_recall_distance'].mean()) if len(success_data) > 0 else None,
        'failed_mean_recall_distance': float(fail_data['obj_recall_distance'].mean()) if len(fail_data) > 0 else None,
    },
    'trajectories': []
}

# Add data for each trajectory
trajectory_columns = zip(
    valid_data['sample_idx'],
    valid_data['traj_idx'],
    valid_data['success'],
    valid_data['obj_recall_distance'],
)
for sample_idx_value, traj_idx_value, success_value, distance_value in trajectory_columns:
    boxplot_data['trajectories'].append({
        'sample_idx': int(sample_idx_value),
        'traj_idx': int(traj_idx_value),
        # bool(NaN) is True, so a missing success flag is written as null
        # rather than being silently recorded as a success.
        'success': None if pd.isna(success_value) else bool(success_value),
        'obj_recall_distance': float(distance_value)
    })

# Save as JSON file
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(boxplot_data, f, indent=2)

print(f"\n✓ Box plot data saved to {output_path}")
print(f"  - Total trajectories: {len(valid_data)}")
print(f"  - Success trajectories: {len(success_data)}, Failed trajectories: {len(fail_data)}")