# 2024-12-04 Robust Inference

This notebook analyzes results from:
- `dist_agent_1lm` (Baseline)
- `dist_agent_1lm_noise` (Noise)
- `dist_agent_1lm_randrot` (RR)
- `dist_agent_1lm_randrot_noise` (RR + Noise)

TODO: Should it be analyzing the `nohyp` versions of these results?


In [11]:
import os

import altair as alt
import numpy as np
import pandas as pd

# Root directory holding one sub-directory of results per experiment.
results_dir = "~/tbp/monty_lab/monty_capabilities_analysis/results/dmc"

# Each condition: build the path to its eval_stats.csv, then load it.

# Baseline
baseline_path = os.path.join(results_dir, "dist_agent_1lm", "eval_stats.csv")
baseline_df = pd.read_csv(baseline_path)

# Noise
noise_path = os.path.join(results_dir, "dist_agent_1lm_noise", "eval_stats.csv")
noise_df = pd.read_csv(noise_path)

# RR
rr_path = os.path.join(results_dir, "dist_agent_1lm_randrot", "eval_stats.csv")
rr_df = pd.read_csv(rr_path)

# RR + Noise
rr_noise_path = os.path.join(
    results_dir, "dist_agent_1lm_randrot_noise", "eval_stats.csv"
)
rr_noise_df = pd.read_csv(rr_noise_path)


## Performance-related Analyses

### Overall System Performance 
- Accuracy and rotation error across all objects for each condition
- Distribution of steps needed for recognition 
- Per-object accuracy to identify particularly robust or sensitive objects

In [10]:
# Accuracy for each condition

# An episode counts as correct when its primary_performance is one of these labels.
correct_values = ["correct", "correct_mlh"]

# Add a boolean "correct" column to every condition's results.
for condition_df in (baseline_df, noise_df, rr_df, rr_noise_df):
    condition_df["correct"] = condition_df["primary_performance"].isin(correct_values)

# Accuracy is the fraction of episodes flagged as correct.
baseline_acc, noise_acc, rr_acc, rr_noise_acc = (
    results["correct"].mean()
    for results in (baseline_df, noise_df, rr_df, rr_noise_df)
)

# Report each accuracy as a percentage.
for label, acc in (
    ("Baseline", baseline_acc),
    ("Noise", noise_acc),
    ("RR", rr_acc),
    ("RR + Noise", rr_noise_acc),
):
    print(f"{label} accuracy: {acc * 100:.2f}%")


Baseline accuracy: 98.14%
Noise accuracy: 96.01%
RR accuracy: 91.37%
RR + Noise accuracy: 89.61%


In [12]:
# Mean rotation error over all objects, per condition.
# Note: the rotation_error column is in radians and contains NaN values if the
# object was not recognized.

baseline_rot_err, noise_rot_err, rr_rot_err, rr_noise_rot_err = (
    results["rotation_error"].mean()
    for results in (baseline_df, noise_df, rr_df, rr_noise_df)
)

# Report each mean in both radians and degrees.
for label, rot_err in (
    ("Baseline", baseline_rot_err),
    ("Noise", noise_rot_err),
    ("RR", rr_rot_err),
    ("RR + Noise", rr_noise_rot_err),
):
    print(
        f"{label} rotation error: {rot_err:.4f} radians ({rot_err * 180 / np.pi:.2f} degrees)"
    )


Baseline rotation error: 0.0406 radians (2.32 degrees)
Noise rotation error: 0.0913 radians (5.23 degrees)
RR rotation error: 0.3753 radians (21.50 degrees)
RR + Noise rotation error: 0.4330 radians (24.81 degrees)


In [47]:
# Per-object accuracy to identify particularly robust or sensitive objects
def get_per_object_metrics(df, correct_labels=None):
    """Summarize recognition performance for each target object.

    Args:
        df: Evaluation results containing the columns `primary_target_object`,
            `primary_performance`, `rotation_error` and `monty_matching_steps`.
        correct_labels: `primary_performance` values that count as a correct
            episode. When omitted, the notebook-level `correct_values` list
            is used.

    Returns:
        A DataFrame indexed by `primary_target_object` with the columns
        `accuracy` (fraction of episodes whose performance is in
        `correct_labels`), `rotation_error` (mean), `monty_matching_steps`
        (mean) and `object_name` (a copy of the index, used for chart
        tooltips).
    """
    if correct_labels is None:
        correct_labels = correct_values

    # Flag correct episodes on a copy so the caller's frame is left untouched.
    scored = df.assign(accuracy=df["primary_performance"].isin(correct_labels))

    # Named aggregation ties every output column to its source explicitly,
    # instead of relabelling the columns by position afterwards.
    metrics = scored.groupby("primary_target_object").agg(
        accuracy=("accuracy", "mean"),
        rotation_error=("rotation_error", "mean"),
        monty_matching_steps=("monty_matching_steps", "mean"),
    )
    metrics["object_name"] = metrics.index
    return metrics


# Per-object metrics for every condition.
baseline_obj_acc = get_per_object_metrics(baseline_df)
noise_obj_acc = get_per_object_metrics(noise_df)
rr_obj_acc = get_per_object_metrics(rr_df)
rr_noise_obj_acc = get_per_object_metrics(rr_noise_df)

# Positions of the red reference lines that split the scatter plot into quadrants.
STEPS_THRESHOLD = 50
ACCURACY_THRESHOLD = 0.95

# Scatter plot of the baseline objects: x-axis = monty_matching_steps,
# y-axis = accuracy, with the object name shown as a hover tooltip.
baseline_obj_acc_chart = (
    alt.Chart(baseline_obj_acc)
    .mark_point()
    .encode(x="monty_matching_steps", y="accuracy", tooltip=["object_name"])
    .properties(title="dist_agent_1lm Object Accuracy", width=500, height=500)
).interactive()

# Vertical and horizontal rules drawn at the two thresholds.
x_line = (
    alt.Chart(pd.DataFrame({"x": [STEPS_THRESHOLD]}))
    .mark_rule(color="red")
    .encode(x="x")
)
y_line = (
    alt.Chart(pd.DataFrame({"y": [ACCURACY_THRESHOLD]}))
    .mark_rule(color="red")
    .encode(y="y")
)

baseline_obj_acc_chart + x_line + y_line


**How to Interpret the Plot**

The red lines (x = 50 `monty_matching_steps`, y = 0.95 accuracy) divide the plot into four "quadrants" for easier interpretation:
- **Top-Left**: Objects that are easy to recognize (high accuracy and low monty_matching_steps)
  - Quick and Accurate recognition 
- **Top-Right**: Objects that are slow to recognize (high accuracy and high monty_matching_steps)
  - Eventually accurate but takes a while (most are `correct_mlh`)
- **Bottom-Left**: Objects that are quickly misrecognized (low accuracy and low monty_matching_steps)
  - Most problematic? Gets to the wrong object quickly. 
- **Bottom-Right**: Objects that are hard to recognize (low accuracy and high monty_matching_steps)
  - Some mistakes, lots of MLH
  

In [50]:
# Columns to show, in display order.
columns_to_display_in_order = [
    "primary_performance",
    "num_steps",
    "rotation_error",
    "result",
    "primary_target_object",
    "monty_steps",
    "monty_matching_steps",
    "primary_target_rotation_euler",
]

# Keep only the baseline episodes whose target is the object being inspected.
object_of_interest = "bleach_cleanser"
is_object_of_interest = baseline_df["primary_target_object"] == object_of_interest
an_object = baseline_df.loc[is_object_of_interest]

an_object[columns_to_display_in_order]


Unnamed: 0,primary_performance,num_steps,rotation_error,result,primary_target_object,monty_steps,monty_matching_steps,primary_target_rotation_euler
70,correct,21,0.0,bleach_cleanser,bleach_cleanser,128,21,[0 0 0]
147,correct_mlh,25,2.9717,"['master_chef_can', 'mustard_bottle', 'bleach_...",bleach_cleanser,10000,25,[ 0 90 0]
224,confused_mlh,37,,"['power_drill', 'nine_hole_peg_test', 'sugar_b...",bleach_cleanser,10000,37,[ 0 180 0]
301,correct,23,0.0,bleach_cleanser,bleach_cleanser,297,23,[ 0 270 0]
378,correct,21,0.0261,bleach_cleanser,bleach_cleanser,185,21,[90 0 0]
455,correct,50,0.0,bleach_cleanser,bleach_cleanser,427,50,[ 90 180 0]
532,correct,21,2.9847,bleach_cleanser,bleach_cleanser,54,21,[35 45 0]
609,correct,23,0.0002,bleach_cleanser,bleach_cleanser,148,23,[325 45 0]
686,correct,21,0.0,bleach_cleanser,bleach_cleanser,234,21,[ 35 315 0]
763,correct,21,0.0,bleach_cleanser,bleach_cleanser,103,21,[325 315 0]


In [55]:
# Show the per-object scatter plots for noise, rr, and rr_noise next to the
# baseline chart (side by side)


def _steps_vs_accuracy_chart(metrics, title):
    """Return a point chart of accuracy against monty_matching_steps.

    `metrics` is a per-object metrics frame; the object name is shown as the
    hover tooltip and `title` is used as the chart title.
    """
    return (
        alt.Chart(metrics)
        .mark_point()
        .encode(
            x="monty_matching_steps",
            y="accuracy",
            tooltip=["object_name"],
        )
        .properties(title=title)
    )


noise_obj_acc_chart = _steps_vs_accuracy_chart(noise_obj_acc, "dist_agent_1lm_noise")
rr_obj_acc_chart = _steps_vs_accuracy_chart(rr_obj_acc, "dist_agent_1lm_randrot")
rr_noise_obj_acc_chart = _steps_vs_accuracy_chart(
    rr_noise_obj_acc, "dist_agent_1lm_randrot_noise"
)

baseline_obj_acc_chart | noise_obj_acc_chart | rr_obj_acc_chart | rr_noise_obj_acc_chart
