In [1]:
import ast
import pandas as pd
import re

In [2]:
# Read the human ("eyeball") judgements; keep the raw frame and work on a copy
evaluation_eyeball = pd.read_csv('evaluation_eyeball.csv')  # pass sep=... for a non-comma delimiter
eyeball_df = evaluation_eyeball.copy()

# Normalise headers: trim whitespace, lower-case, and tag each one as a human label
eyeball_df.columns = [f"human_{name.strip().lower()}" for name in eyeball_df.columns]

# Every missing cell becomes 0 so the columns can be cast and compared
eyeball_df = eyeball_df.fillna(0)

# Overall single-aspect verdict as an integer
eyeball_df["human_single_aspect_overall"] = eyeball_df["human_single_aspect_overall"].astype(int)

# Multi-aspect verdict: 1 only when all four aspect columns equal 1.
# Computed before the aspect columns are cast, exactly on the filled values.
human_aspect_cols = [
    "human_correctness",
    "human_completeness",
    "human_relevance",
    "human_action_efficiency",
]
eyeball_df["human_multi_aspect_overall"] = (
    eyeball_df[human_aspect_cols].eq(1).all(axis=1).astype(int)
)

# Finally cast the individual aspect columns to integers
for aspect_col in human_aspect_cols:
    eyeball_df[aspect_col] = eyeball_df[aspect_col].astype(int)

In [3]:
# Keep only the id and the six human verdict columns, in a fixed order
human_columns = [
    "human_id",
    "human_single_aspect_overall",
    "human_multi_aspect_overall",
    "human_correctness",
    "human_completeness",
    "human_relevance",
    "human_action_efficiency",
]
eyeball_df = eyeball_df[human_columns]

In [4]:
# Sanity check of the human labels: print the column dtypes / non-null counts,
# then display the first rows as the cell output.
eyeball_df.info()
eyeball_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 7 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   human_id                     41 non-null     object
 1   human_single_aspect_overall  41 non-null     int64 
 2   human_multi_aspect_overall   41 non-null     int64 
 3   human_correctness            41 non-null     int64 
 4   human_completeness           41 non-null     int64 
 5   human_relevance              41 non-null     int64 
 6   human_action_efficiency      41 non-null     int64 
dtypes: int64(6), object(1)
memory usage: 2.4+ KB


Unnamed: 0,human_id,human_single_aspect_overall,human_multi_aspect_overall,human_correctness,human_completeness,human_relevance,human_action_efficiency
0,NocNoc--0,0,0,0,0,0,0
1,NocNoc--1,0,0,0,0,1,1
2,NocNoc--2,0,0,0,0,1,1
3,NocNoc--3,1,0,1,0,0,0
4,NocNoc--4,1,1,1,1,1,1


In [5]:
def extract_last_list_from_file(path):
    """Return the last Python list literal that occupies a whole line of ``path``.

    Lines are scanned from the bottom of the file upward. A line is a candidate
    when, after stripping surrounding whitespace, it starts with ``[`` and ends
    with ``]``. Candidates that are not valid Python literals (for example a
    markdown line such as ``[see notes]``) or that do not evaluate to a ``list``
    are skipped instead of aborting the search.

    Args:
        path: Path of the UTF-8 text file to scan.

    Returns:
        The list parsed with ``ast.literal_eval`` from the last matching line.

    Raises:
        ValueError: If no line of the file holds a list literal.
    """
    with open(path, encoding="utf-8") as f:
        lines = f.readlines()

    for line in reversed(lines):
        line = line.strip()
        if not (line.startswith("[") and line.endswith("]")):
            continue
        try:
            value = ast.literal_eval(line)
        except (ValueError, TypeError, SyntaxError):
            # Bracketed text that is not a Python literal; keep looking further up.
            continue
        # "[1], [2]" also passes the bracket check but evaluates to a tuple.
        if isinstance(value, list):
            return value
    raise ValueError(f"No list found at the end of {path}")

def parse_aspect_reasoning_blocks(multi_eval_path):
    """Parse the per-aspect Yes/No verdicts of every task in an evaluation file.

    The file content is split on the ``"--------------------- "`` separator. A
    chunk counts as a task block when it contains ``taskNocNoc--<digits>``;
    chunks without that marker are ignored. Inside a block each aspect is
    expected in the form::

        <Aspect>: Yes|No|N/A <anything up to end of line>
        Reasoning: <free text>

    The aspect label is accepted with either ``_`` or a space between its words
    (``Action_Efficiency`` / ``Action Efficiency``). The reasoning text may run
    to the very end of the block, with or without a trailing newline. It is
    matched but not stored.

    Args:
        multi_eval_path: Path of the UTF-8 evaluation file.

    Returns:
        pandas.DataFrame with one row per task block: an ``ID`` column
        (``NocNoc--<n>``) plus ``Correctness``, ``Completeness``, ``Relevance``
        and ``Action_Efficiency`` columns holding 1 for "Yes", 0 for "No" and
        ``None`` for "N/A" or when the aspect was not found in the block.
    """
    aspects = ["Correctness", "Completeness", "Relevance", "Action_Efficiency"]
    verdict_scores = {"Yes": 1, "No": 0}  # anything else (N/A, not found) -> None

    # Compile one pattern per aspect up front; they do not depend on the block.
    aspect_patterns = {}
    for aspect in aspects:
        label = "[_ ]".join(re.escape(word) for word in aspect.split("_"))
        aspect_patterns[aspect] = re.compile(
            rf"{label}:\s*(Yes|No|N/A)[^\n]*\nReasoning:\s*(.*?)"
            # Reasoning ends before the next "Word:" / "Overall Verdict:" line,
            # or at the end of the block (no trailing newline required).
            r"(?=\n(?:\w+:|Overall Verdict:)|\Z)",
            re.DOTALL,
        )

    with open(multi_eval_path, encoding="utf-8") as f:
        content = f.read()

    parsed_results = []
    # Split by task block
    for block in content.split("--------------------- "):
        match = re.search(r"taskNocNoc--(\d+)", block)
        if not match:
            continue

        result = {"ID": f"NocNoc--{match.group(1)}"}
        for aspect, pattern in aspect_patterns.items():
            aspect_match = pattern.search(block)
            verdict = aspect_match.group(1) if aspect_match else None
            result[aspect] = verdict_scores.get(verdict)

        parsed_results.append(result)

    return pd.DataFrame(parsed_results)

In [6]:
# Parse the per-aspect LLM verdicts; keep the parsed frame and work on a copy
evaluation_llm = parse_aspect_reasoning_blocks("evaluation_multi_eval.md")
llm_df = evaluation_llm.copy()

# Normalise headers: trim whitespace, lower-case, and tag each one as an LLM label
llm_df.columns = [f"llm_{name.strip().lower()}" for name in llm_df.columns]

# Missing verdicts (N/A or not parsed) become 0 so the columns can be cast and compared
llm_df = llm_df.fillna(0)

# Cast the four aspect columns to integers
for aspect_col in (
    "llm_correctness",
    "llm_completeness",
    "llm_relevance",
    "llm_action_efficiency",
):
    llm_df[aspect_col] = llm_df[aspect_col].astype(int)

In [7]:
# Load LLM evaluations
llm_single_aspect_list = extract_last_list_from_file("evaluation_single_eval.md")
llm_multi_aspect_list = extract_last_list_from_file("evaluation_multi_eval.md")
# llm_single_aspect_list = [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0]
# llm_multi_aspect_list = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0]
llm_df["llm_single_aspect_overall"] = llm_single_aspect_list
llm_df["llm_multi_aspect_overall"] = llm_multi_aspect_list
llm_df["llm_single_aspect_overall"] = llm_df["llm_single_aspect_overall"].astype(int)
llm_df["llm_multi_aspect_overall"] = llm_df["llm_multi_aspect_overall"].astype(int)

In [8]:
# Keep only the id and the six LLM verdict columns, in the same order as the human frame
llm_columns = [
    "llm_id",
    "llm_single_aspect_overall",
    "llm_multi_aspect_overall",
    "llm_correctness",
    "llm_completeness",
    "llm_relevance",
    "llm_action_efficiency",
]
llm_df = llm_df[llm_columns]

In [9]:
# Sanity check of the LLM labels: print the column dtypes / non-null counts,
# then display the first rows as the cell output.
llm_df.info()
llm_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   llm_id                     41 non-null     object
 1   llm_single_aspect_overall  41 non-null     int64 
 2   llm_multi_aspect_overall   41 non-null     int64 
 3   llm_correctness            41 non-null     int64 
 4   llm_completeness           41 non-null     int64 
 5   llm_relevance              41 non-null     int64 
 6   llm_action_efficiency      41 non-null     int64 
dtypes: int64(6), object(1)
memory usage: 2.4+ KB


Unnamed: 0,llm_id,llm_single_aspect_overall,llm_multi_aspect_overall,llm_correctness,llm_completeness,llm_relevance,llm_action_efficiency
0,NocNoc--0,0,0,0,0,0,0
1,NocNoc--1,0,0,0,0,0,0
2,NocNoc--2,0,0,0,0,0,0
3,NocNoc--3,0,0,0,0,0,0
4,NocNoc--4,0,0,0,0,0,0


In [11]:
# Join eyeball_df and llm_df on human_id and llm_id
comparison_df = pd.merge(eyeball_df, llm_df, left_on="human_id", right_on="llm_id")
comparison_df["single_match"] = comparison_df["human_single_aspect_overall"] == comparison_df["llm_single_aspect_overall"]
comparison_df["multi_match"] = comparison_df["human_multi_aspect_overall"] == comparison_df["llm_multi_aspect_overall"]
comparison_df["correctness_match"] = comparison_df["human_correctness"] == comparison_df["llm_correctness"]
comparison_df["completeness_match"] = comparison_df["human_completeness"] == comparison_df["llm_completeness"]
comparison_df["celevance_match"] = comparison_df["human_relevance"] == comparison_df["llm_relevance"]
comparison_df["action_efficiency_match"] = comparison_df["human_action_efficiency"] == comparison_df["llm_action_efficiency"]

In [12]:
# Sanity check of the merged frame: print the column dtypes / non-null counts,
# then display the first rows as the cell output.
comparison_df.info()
comparison_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   human_id                     41 non-null     object
 1   human_single_aspect_overall  41 non-null     int64 
 2   human_multi_aspect_overall   41 non-null     int64 
 3   human_correctness            41 non-null     int64 
 4   human_completeness           41 non-null     int64 
 5   human_relevance              41 non-null     int64 
 6   human_action_efficiency      41 non-null     int64 
 7   llm_id                       41 non-null     object
 8   llm_single_aspect_overall    41 non-null     int64 
 9   llm_multi_aspect_overall     41 non-null     int64 
 10  llm_correctness              41 non-null     int64 
 11  llm_completeness             41 non-null     int64 
 12  llm_relevance                41 non-null     int64 
 13  llm_action_efficiency        41 non-n

Unnamed: 0,human_id,human_single_aspect_overall,human_multi_aspect_overall,human_correctness,human_completeness,human_relevance,human_action_efficiency,llm_id,llm_single_aspect_overall,llm_multi_aspect_overall,llm_correctness,llm_completeness,llm_relevance,llm_action_efficiency,single_match,multi_match,correctness_match,completeness_match,celevance_match,action_efficiency_match
0,NocNoc--0,0,0,0,0,0,0,NocNoc--0,0,0,0,0,0,0,True,True,True,True,True,True
1,NocNoc--1,0,0,0,0,1,1,NocNoc--1,0,0,0,0,0,0,True,True,True,True,False,False
2,NocNoc--2,0,0,0,0,1,1,NocNoc--2,0,0,0,0,0,0,True,True,True,True,False,False
3,NocNoc--3,1,0,1,0,0,0,NocNoc--3,0,0,0,0,0,0,False,True,False,True,True,True
4,NocNoc--4,1,1,1,1,1,1,NocNoc--4,0,0,0,0,0,0,False,False,False,False,False,False


In [18]:
# Summary statistics
hm_single_accuracy = comparison_df["human_single_aspect_overall"].mean()
hm_multi_accuracy = comparison_df["human_multi_aspect_overall"].mean()
hm_correctness_accuracy = comparison_df["human_correctness"].mean()
hm_completeness_accuracy = comparison_df["human_completeness"].mean()
hm_relevance_accuracy = comparison_df["human_relevance"].mean()
hm_action_efficiency_accuracy = comparison_df["human_action_efficiency"].mean()
llm_single_accuracy = comparison_df["llm_single_aspect_overall"].mean()
llm_multi_accuracy = comparison_df["llm_multi_aspect_overall"].mean()
llm_correctness_accuracy = comparison_df["llm_correctness"].mean()
llm_completeness_accuracy = comparison_df["llm_completeness"].mean()
llm_relevance_accuracy = comparison_df["llm_relevance"].mean()
llm_action_efficiency_accuracy = comparison_df["llm_action_efficiency"].mean()
print("==========================")
print("Summary Statistics")
print("==========================")
print(f"Human Single Aspect Accuracy (Success rate): {hm_single_accuracy:.2%}")
print(f"Human Multi Aspect Accuracy: {hm_multi_accuracy:.2%}")
print(f"Human Correctness Accuracy: {hm_correctness_accuracy:.2%}")
print(f"Human Completeness Accuracy: {hm_completeness_accuracy:.2%}")
print(f"Human Relevance Accuracy: {hm_relevance_accuracy:.2%}")
print(f"Human Action Efficiency Accuracy: {hm_action_efficiency_accuracy:.2%}")
print("==========================")
print(f"LLM Single Aspect Accuracy (Success rate): {llm_single_accuracy:.2%}")
print(f"LLM Multi Aspect Accuracy: {llm_multi_accuracy:.2%}")
print(f"LLM Correctness Accuracy: {llm_correctness_accuracy:.2%}")
print(f"LLM Completeness Accuracy: {llm_completeness_accuracy:.2%}")
print(f"LLM Relevance Accuracy: {llm_relevance_accuracy:.2%}")
print(f"LLM Action Efficiency Accuracy: {llm_action_efficiency_accuracy:.2%}")

Summary Statistics
Human Single Aspect Accuracy (Success rate): 56.10%
Human Multi Aspect Accuracy: 43.90%
Human Correctness Accuracy: 63.41%
Human Completeness Accuracy: 51.22%
Human Relevance Accuracy: 73.17%
Human Action Efficiency Accuracy: 53.66%
LLM Single Aspect Accuracy (Success rate): 41.46%
LLM Multi Aspect Accuracy: 31.71%
LLM Correctness Accuracy: 31.71%
LLM Completeness Accuracy: 31.71%
LLM Relevance Accuracy: 24.39%
LLM Action Efficiency Accuracy: 0.00%


In [19]:
# Show matches
single_match = comparison_df["single_match"].mean()
multi_match = comparison_df["multi_match"].mean()
correctness_match = comparison_df["correctness_match"].mean()
completeness_match = comparison_df["completeness_match"].mean()
celevance_match = comparison_df["celevance_match"].mean()
action_efficiency_match = comparison_df["action_efficiency_match"].mean()
print(f"Single Aspect Match: {single_match:.2%}")
print(f"Multi Aspect Match: {multi_match:.2%}")
print(f"Correctness Match: {correctness_match:.2%}")
print(f"Completeness Match: {completeness_match:.2%}")
print(f"Relevance Match: {celevance_match:.2%}")
print(f"Action Efficiency Match: {action_efficiency_match:.2%}")

Single Aspect Match: 75.61%
Multi Aspect Match: 68.29%
Correctness Match: 63.41%
Completeness Match: 65.85%
Relevance Match: 51.22%
Action Efficiency Match: 46.34%


In [17]:
# Show mismatches
mismatches_df = comparison_df[~comparison_df["Single_Match"] | ~comparison_df["Multi_Match"]]
mismatches_df

Unnamed: 0,id,Human_Single,LLM_Single,Single_Match,Human_Multi,LLM_Multi,Multi_Match
3,NocNoc--3,1,0,False,0,0,True
4,NocNoc--4,1,0,False,1,0,False
6,NocNoc--6,1,0,False,1,0,False
7,NocNoc--7,1,0,False,1,0,False
10,NocNoc--10,1,1,True,0,1,False
17,NocNoc--17,1,0,False,1,1,True
19,NocNoc--19,0,0,True,1,0,False
20,NocNoc--20,0,0,True,1,0,False
21,NocNoc--22,1,1,True,1,0,False
22,NocNoc--23,1,0,False,0,0,True
