# 0. Overall Comparison

In [None]:
import pandas as pd

# Load eyeball evaluation CSV
eyeball_df = pd.read_csv('evaluation/evaluation_eyeball.csv')  # Adjust delimiter if needed

# Ensure correct column names (strip spaces and fix headers if necessary)
eyeball_df.columns = [col.strip() for col in eyeball_df.columns]

# Extract human labels
human_single_aspect = []
human_multi_aspect = []

# Fill missing values as 0 for comparison purposes
eyeball_df.fillna(0, inplace=True)

# Ensure numeric types
eyeball_df["SingleAspect"] = eyeball_df["SingleAspect"].astype(int)
eyeball_df["Correctness"] = eyeball_df["Correctness"].astype(int)
eyeball_df["Completeness"] = eyeball_df["Completeness"].astype(int)
eyeball_df["Relevance"] = eyeball_df["Relevance"].astype(int)
eyeball_df["Action Efficiency"] = eyeball_df["Action Efficiency"].astype(int)

for _, row in eyeball_df.iterrows():
    human_single_aspect.append(row["SingleAspect"])
    # If all 4 aspects pass, multi_aspect = 1
    multi_pass = (
        row["Correctness"] == 1 and 
        row["Completeness"] == 1 and 
        row["Relevance"] == 1 and 
        row["Action Efficiency"] == 1
    )
    human_multi_aspect.append(int(multi_pass))

# Load model evaluations
eval_single_aspect = [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0]
eval_multi_aspect = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0]

# Comparison
comparison = []
for i, (h_s, m_s, h_m, m_m) in enumerate(zip(human_single_aspect, eval_single_aspect, human_multi_aspect, eval_multi_aspect)):
    comparison.append({
        "ID": eyeball_df.iloc[i]["ID"],
        "Human_Single": h_s,
        "Model_Single": m_s,
        "Single_Match": h_s == m_s,
        "Human_Multi": h_m,
        "Model_Multi": m_m,
        "Multi_Match": h_m == m_m,
    })

# Create DataFrame for analysis
comparison_df = pd.DataFrame(comparison)

# Summary statistics
single_accuracy = comparison_df["Single_Match"].mean()
multi_accuracy = comparison_df["Multi_Match"].mean()

print(f"Single Aspect Match Accuracy: {single_accuracy:.2%}")
print(f"Multi Aspect Match Accuracy: {multi_accuracy:.2%}")

# Show mismatches
mismatches_df = comparison_df[~comparison_df["Single_Match"] | ~comparison_df["Multi_Match"]]
mismatches_df

Single Aspect Match Accuracy: 58.54%
Multi Aspect Match Accuracy: 65.85%


Unnamed: 0,ID,Human_Single,Model_Single,Single_Match,Human_Multi,Model_Multi,Multi_Match
3,NocNoc--3,1,0,False,0,0,True
9,NocNoc--9,0,1,False,0,0,True
10,NocNoc--10,0,1,False,0,1,False
11,NocNoc--11,0,1,False,0,1,False
15,NocNoc--15,0,1,False,0,1,False
17,NocNoc--17,0,0,True,0,1,False
18,NocNoc--18,0,1,False,0,1,False
21,NocNoc--22,1,1,True,1,0,False
22,NocNoc--23,1,0,False,0,0,True
26,NocNoc--27,0,1,False,0,1,False


In [26]:
import re, ast
import pandas as pd

# ------------------------------------------------------------------
# 1. Load the eyeball CSV  (comma-separated now, no need for sep='\t')
# ------------------------------------------------------------------
eyeball_df = pd.read_csv("evaluation/evaluation_eyeball.csv")

# keep only rows that a human actually judged (Owner == "YG")
# yg_df = eyeball_df[eyeball_df["Owner"] == "YG"].copy()

# ------------------------------------------------------------------
# 2. Rename / coerce columns
# ------------------------------------------------------------------
eyeball_df = eyeball_df.rename(columns={
    "SingleAspect" : "human_single",
    "Correctness"  : "H_corr",
    "Completeness" : "H_comp",
    "Relevance"    : "H_rel",
    "Action Efficiency" : "H_eff",
})


# make sure the numeric columns are ints (NaN → 0)
for col in ["human_single","H_corr","H_comp","H_rel","H_eff"]:
    eyeball_df[col] = eyeball_df[col].fillna(0).astype(int)

# Aggregate the four sub-aspects into one “all pass = 1, otherwise 0”
eyeball_df["human_multi"] = (
    eyeball_df[["H_corr","H_comp","H_rel","H_eff"]].min(axis=1)
)

# ------------------------------------------------------------------
# 3. Read the *auto* single- and multi-aspect lists from the logs
# ------------------------------------------------------------------
def extract_last_list_from_file(path):
    with open(path, encoding="utf-8") as f:
        for line in reversed(f.readlines()):
            line = line.strip()
            if line.startswith("[") and line.endswith("]"):
                return ast.literal_eval(line)
    raise ValueError(f"No list found at the end of {path}")

auto_single = extract_last_list_from_file("evaluation/evaluation.log")
auto_multi  = extract_last_list_from_file("evaluation/evaluation_multi_eval.log")

auto_df = pd.DataFrame({
    "ID"        : [f"NocNoc--{i}" for i in range(len(auto_single))],
    "auto_single": auto_single,
    "auto_multi" : auto_multi,
})

# ------------------------------------------------------------------
# 4. Merge and compare
# ------------------------------------------------------------------
merged = eyeball_df.merge(auto_df, on="ID", how="left")

merged["single_match"] = merged["human_single"] == merged["auto_single"]
merged["multi_match"]  = merged["human_multi"]  == merged["auto_multi"]

# Optional: compare aspect-by-aspect, too
for aspect, col in zip(
    ["Correctness","Completeness","Relevance","Action efficiency"],
    ["H_corr","H_comp","H_rel","H_eff"]
):
    merged[f"{aspect}_match"] = merged[col] == merged["auto_multi"]  # or replace with per-aspect list if you have it

# ------------------------------------------------------------------
# 5. Quick report
# ------------------------------------------------------------------
print(f"Tasks compared: {len(merged)}")
print(f"Single-aspect agreement: {merged['single_match'].mean():.2%}")
print(f"Multi-aspect  agreement: {merged['multi_match'].mean():.2%}\n")

print("Single-aspect mismatches:")
print(merged.loc[~merged["single_match"], ["ID","human_single","auto_single"]])

print("\nMulti-aspect mismatches:")
print(merged.loc[~merged["multi_match"], ["ID","human_multi","auto_multi"]])


Tasks compared: 41
Single-aspect agreement: 53.66%
Multi-aspect  agreement: 63.41%

Single-aspect mismatches:
            ID  human_single  auto_single
3    NocNoc--3             1          0.0
9    NocNoc--9             0          1.0
10  NocNoc--10             0          1.0
11  NocNoc--11             0          1.0
15  NocNoc--15             0          1.0
18  NocNoc--18             0          1.0
21  NocNoc--22             1          0.0
23  NocNoc--24             1          0.0
25  NocNoc--26             0          1.0
26  NocNoc--27             0          1.0
27  NocNoc--28             0          1.0
28  NocNoc--29             0          1.0
30  NocNoc--31             0          1.0
31  NocNoc--32             0          1.0
32  NocNoc--33             0          1.0
33  NocNoc--34             0          1.0
35  NocNoc--36             0          1.0
36  NocNoc--37             0          1.0
40  NocNoc--41             0          NaN

Multi-aspect mismatches:
            ID  human_mu

# 0. Extract Overall Evaluation

In [23]:
import ast

# ------------------------------------------------------------------
# 3. Read the *auto* single- and multi-aspect lists from the logs
# ------------------------------------------------------------------
def extract_last_list_from_file(path):
    with open(path, encoding="utf-8") as f:
        for line in reversed(f.readlines()):
            line = line.strip()
            if line.startswith("[") and line.endswith("]"):
                return ast.literal_eval(line)
    raise ValueError(f"No list found at the end of {path}")

auto_single = extract_last_list_from_file("evaluation/evaluation.log")
auto_multi  = extract_last_list_from_file("evaluation/evaluation_multi_eval.log")

print('auto_single')
print(auto_single)
print()
print('auto_multi')
print(auto_multi)

auto_single
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0]

auto_multi
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0]


# 1. Multi Aspect Analysis

In [19]:
import re
import pandas as pd

def parse_aspect_reasoning_blocks(log_path):
    with open(log_path, encoding="utf-8") as f:
        content = f.read()

    # Split by task block
    task_blocks = content.split("--------------------- ")
    parsed_results = []

    for block in task_blocks:
        match = re.search(r"taskNocNoc--(\d+)", block)
        if not match:
            continue

        task_id = f"NocNoc--{match.group(1)}"
        result = {"ID": task_id}

        # Extract aspect judgments + reasoning
        for aspect in ["Correctness", "Completeness", "Relevance", "Action Efficiency"]:
            pattern = rf"{aspect}:\s*(Yes|No|N/A)[^\n]*\nReasoning:\s*(.*?)(?=\n(?:\w+:|Overall Verdict:|$))"
            aspect_match = re.search(pattern, block, re.DOTALL)
            if aspect_match:
                val = aspect_match.group(1)
                reasoning = aspect_match.group(2).strip()
                result[f"A_{aspect}"] = 1 if val == "Yes" else 0 if val == "No" else None
                result[f"R_{aspect}"] = reasoning
            else:
                result[f"A_{aspect}"] = None
                result[f"R_{aspect}"] = None

        parsed_results.append(result)

    return pd.DataFrame(parsed_results)

# เรียกใช้งาน
aspect_df = parse_aspect_reasoning_blocks("evaluation/evaluation_multi_eval.log")
aspect_df

Unnamed: 0,ID,A_Correctness,R_Correctness,A_Completeness,R_Completeness,A_Relevance,R_Relevance,A_Action Efficiency,R_Action Efficiency
0,NocNoc--0,,,,,,,,
1,NocNoc--1,0.0,The task was to find the most affordable air p...,0.0,The task required sorting air purifiers from l...,,The task was not about finding the best deal o...,0.0,The screenshots do not show any sorting action...
2,NocNoc--2,0.0,The task was to find budget-friendly projector...,0.0,"The task required sorting projectors by price,...",0.0,The response did not provide relevant informat...,0.0,The agent did not efficiently find and sort pr...
3,NocNoc--3,,,,,,,,
4,NocNoc--4,0.0,The task was to find the newest hair dryers an...,0.0,The task required both finding the newest hair...,,This task does not involve finding the best de...,0.0,The agent took multiple steps to change the so...
5,NocNoc--5,,,,,,,,
6,NocNoc--6,0.0,The task was to find the Dyson vacuum from the...,0.0,The task required sorting by best selling to f...,,The task was not about finding the best deal o...,1.0,"The agent performed unnecessary actions, such ..."
7,NocNoc--7,0.0,"The agent's response states ""Garmin Official S...",0.0,The task required identifying the highest-rate...,,The task was not about finding the best deal o...,0.0,The agent did not perform the necessary action...
8,NocNoc--8,,,,,,,,
9,NocNoc--9,0.0,"The response claims the newest is the ""Samsung...",0.0,The task required identifying both the newest ...,,The task was not about finding the best deal o...,1.0,"The agent's actions appear efficient, as there..."
