In [1]:
import pandas as pd

In [2]:
# Read the human ("eyeball") evaluation sheet.
# If the file is not comma-delimited, pass sep=... to read_csv.
EYEBALL_CSV_PATH = "evaluation_eyeball.csv"
eyeball_df = pd.read_csv(EYEBALL_CSV_PATH)

In [3]:
# Normalize the headers: trim surrounding whitespace and lower-case every name
# so later cells can address columns by a predictable key.
eyeball_df.columns = list(
    map(lambda header: header.strip().lower(), eyeball_df.columns)
)

In [4]:
# Treat a missing rating as 0 so every row can take part in the comparison.
# Only the rating columns are filled: a frame-wide fillna(0) would also write
# the integer 0 into the text columns (id, task_type, additional_info, owner).
# Assigning the result back (instead of inplace=True) keeps the step explicit.
rating_columns = [
    "single_aspect_overall",
    "correctness",
    "completeness",
    "relevance",
    "action_efficiency",
]
eyeball_df[rating_columns] = eyeball_df[rating_columns].fillna(0)

In [5]:
# Cast every rating column to int, one column at a time.
for score_column in (
    "single_aspect_overall",
    "correctness",
    "completeness",
    "relevance",
    "action_efficiency",
):
    eyeball_df[score_column] = eyeball_df[score_column].astype(int)

In [6]:
# Print the frame summary (column names, non-null counts, dtypes) to confirm
# the cleaning and integer casts took effect.
eyeball_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     41 non-null     object
 1   task_type              41 non-null     object
 2   additional_info        41 non-null     object
 3   owner                  41 non-null     object
 4   single_aspect_overall  41 non-null     int64 
 5   correctness            41 non-null     int64 
 6   completeness           41 non-null     int64 
 7   relevance              41 non-null     int64 
 8   action_efficiency      41 non-null     int64 
dtypes: int64(5), object(4)
memory usage: 3.0+ KB


In [7]:
# Containers for the human labels: one overall verdict per row, and one
# combined multi-aspect verdict per row. Two distinct, initially empty lists.
human_single_aspect, human_multi_aspect = [], []

In [8]:
# Derive the human labels directly from the cleaned frame.
# Plain assignment (rather than .extend() on lists created in an earlier cell)
# keeps this cell idempotent: re-running it can no longer append a second copy
# of the labels and leave the lists twice as long as the frame.
human_single_aspect = eyeball_df["single_aspect_overall"].tolist()

# A row passes the multi-aspect check only when every individual aspect is 1.
aspect_columns = ["correctness", "completeness", "relevance", "action_efficiency"]
human_multi_aspect = (
    eyeball_df[aspect_columns].eq(1).all(axis=1).astype(int).tolist()
)

In [9]:
# Sanity check: show both label counts first, then the raw label lists.
print(
    len(human_single_aspect),
    len(human_multi_aspect),
    human_single_aspect,
    human_multi_aspect,
    sep="\n",
)

41
41
[0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0]
[0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0]


In [10]:
# LLM evaluations, hard-coded as binary labels (nothing is read from disk here).
# The lists carry no ids: the comparison cell pairs entry i with row i of
# eyeball_df, so their order must match the row order of the CSV.
eval_single_aspect = [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0]
eval_multi_aspect = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0]

In [11]:
# Sanity check: number of LLM labels of each kind, one count per line.
print(len(eval_single_aspect), len(eval_multi_aspect), sep="\n")

41
41


In [12]:
# Comparison: pair each human label with the LLM label at the same row position.
# The LLM lists carry no ids, so alignment is purely positional. Verify that
# every source has the same length up front; zip() would otherwise silently
# drop the trailing entries of the longer inputs and hide a misalignment.
label_lengths = {
    "eyeball_df": len(eyeball_df),
    "human_single_aspect": len(human_single_aspect),
    "eval_single_aspect": len(eval_single_aspect),
    "human_multi_aspect": len(human_multi_aspect),
    "eval_multi_aspect": len(eval_multi_aspect),
}
if len(set(label_lengths.values())) != 1:
    raise ValueError(f"Label sources differ in length: {label_lengths}")

# One record per evaluated row; the *_Match flags are True when both raters agree.
# The id column is zipped in directly instead of a per-row .iloc lookup.
comparison = [
    {
        "id": row_id,
        "Human_Single": hm_s,
        "LLM_Single": llm_s,
        "Single_Match": hm_s == llm_s,
        "Human_Multi": hm_m,
        "LLM_Multi": llm_m,
        "Multi_Match": hm_m == llm_m,
    }
    for row_id, hm_s, llm_s, hm_m, llm_m in zip(
        eyeball_df["id"].tolist(),
        human_single_aspect,
        eval_single_aspect,
        human_multi_aspect,
        eval_multi_aspect,
    )
]

In [13]:
# Turn the list of per-row comparison records into a DataFrame
# (one row per record, one column per dict key) for analysis.
comparison_df = pd.DataFrame(comparison)

In [14]:
# Preview the first five comparison rows.
comparison_df.head()

Unnamed: 0,id,Human_Single,LLM_Single,Single_Match,Human_Multi,LLM_Multi,Multi_Match
0,NocNoc--0,0,0,True,0,0,True
1,NocNoc--1,0,0,True,0,0,True
2,NocNoc--2,0,0,True,0,0,True
3,NocNoc--3,1,0,False,0,0,True
4,NocNoc--4,1,0,False,1,0,False


In [15]:
# Summary statistics.
# Each value is the mean of a binary label column, i.e. the share of rows the
# rater labelled 1 (a success rate) -- not the agreement between the raters.
hm_single_accuracy, hm_multi_accuracy, llm_single_accuracy, llm_multi_accuracy = (
    comparison_df[label_column].mean()
    for label_column in ("Human_Single", "Human_Multi", "LLM_Single", "LLM_Multi")
)
print(
    f"Human Single Aspect Accuracy (Success rate): {hm_single_accuracy:.2%}",
    f"LLM Single Aspect Accuracy (Success rate): {llm_single_accuracy:.2%}",
    f"Human Multi Aspect Accuracy: {hm_multi_accuracy:.2%}",
    f"LLM Multi Aspect Accuracy: {llm_multi_accuracy:.2%}",
    sep="\n",
)

Human Single Aspect Accuracy (Success rate): 56.10%
LLM Single Aspect Accuracy (Success rate): 41.46%
Human Multi Aspect Accuracy: 43.90%
LLM Multi Aspect Accuracy: 31.71%


In [16]:
# Agreement rates: fraction of rows where the human and LLM labels coincide.
single_match, multi_match = (
    comparison_df[match_column].mean()
    for match_column in ("Single_Match", "Multi_Match")
)
print(
    f"Single Aspect Match: {single_match:.2%}",
    f"Multi Aspect Match: {multi_match:.2%}",
    sep="\n",
)

Single Aspect Match: 75.61%
Multi Aspect Match: 68.29%


In [17]:
# Rows where the raters disagree on at least one of the two verdicts,
# i.e. every row that is not a match on both counts.
mismatches_df = comparison_df[
    ~(comparison_df["Single_Match"] & comparison_df["Multi_Match"])
]
mismatches_df

Unnamed: 0,id,Human_Single,LLM_Single,Single_Match,Human_Multi,LLM_Multi,Multi_Match
3,NocNoc--3,1,0,False,0,0,True
4,NocNoc--4,1,0,False,1,0,False
6,NocNoc--6,1,0,False,1,0,False
7,NocNoc--7,1,0,False,1,0,False
10,NocNoc--10,1,1,True,0,1,False
17,NocNoc--17,1,0,False,1,1,True
19,NocNoc--19,0,0,True,1,0,False
20,NocNoc--20,0,0,True,1,0,False
21,NocNoc--22,1,1,True,1,0,False
22,NocNoc--23,1,0,False,0,0,True
