## Initialize Feedback Function(s)

In [None]:
# Create snowpark session.
#
# The account and user are read from the environment when available so the
# notebook is not tied to one person's login; the previous hard-coded values
# remain as defaults so existing setups keep working unchanged.

import os

from snowflake.snowpark import Session

# Password-based alternative (every value comes from the environment):
# snowflake_connection_parameters = {
#     "account": os.environ["SNOWFLAKE_ACCOUNT"],
#     "user": os.environ["SNOWFLAKE_USER"],
#     "password": os.environ["SNOWFLAKE_USER_PASSWORD"],
#     "database": os.environ["SNOWFLAKE_DATABASE"],
#     "schema": os.environ["SNOWFLAKE_SCHEMA"],
#     "role": os.environ["SNOWFLAKE_ROLE"],
#     "warehouse": os.environ["SNOWFLAKE_WAREHOUSE"],
# }

# Browser-based SSO login: no password is stored in the notebook.
snowflake_connection_parameters = {
    "account": os.environ.get("SNOWFLAKE_ACCOUNT", "SNOWHOUSE"),
    "user": os.environ.get("SNOWFLAKE_USER", "ajia"),
    "authenticator": "externalbrowser",
}
snowpark_session = Session.builder.configs(
    snowflake_connection_parameters
).create()

# TruSession is no longer required as long as snowflake connector exists
# sf_connector = SnowflakeConnector(snowpark_session=snowpark_session)

In [None]:
from trulens.providers.cortex import Cortex

# Cortex-hosted LLM that backs every feedback function used in this notebook.
provider = Cortex(
    snowpark_session=snowpark_session,
    model_engine="claude-4-sonnet",
    reasoning_effort="high",
)
# provider = OpenAI(model_engine="o3")

# Display name -> provider method. The evaluation cells iterate over this
# mapping, so un-commenting an entry is all that is needed to enable a metric.
feedback_functions = {
    "Logical Consistency": provider.logical_consistency_with_cot_reasons,
    # "Execution Efficiency": provider.execution_efficiency_with_cot_reasons,
    # "Plan Adherence": provider.plan_adherence_with_cot_reasons,
    # "Plan Quality": provider.plan_quality_with_cot_reasons,
    # "TRAIL": provider.trail_with_cot_reasons,
}

# List the metrics that are currently enabled.
for metric_name in feedback_functions.keys():
    print(f"- {metric_name}")

## PROCESS ALL GAIA FILES

In [None]:
import json
import os

import pandas as pd
from trail_traversal import build_span_tree
from trail_traversal import reset_state

# Render raw GAIA JSON traces into the text form consumed by the feedback
# functions: one `<trace>.txt` is written next to each `<trace>.json`.
gaia_dir = "GAIA"

# Suffix a file in `gaia_dir` must end with to be rendered. ".json" processes
# every trace, as the section title promises; set it to a full file name such
# as "0140b3f657eddf76ca82f72c49ac8e58.json" to re-render a single trace.
gaia_trace_suffix = ".json"

all_files = []
# Sorted so the processing order (and the printed log) is reproducible;
# os.listdir() returns entries in arbitrary order.
for filename in sorted(os.listdir(gaia_dir)):
    if not filename.endswith(gaia_trace_suffix):
        continue

    # Clear the traversal module's state before each trace.
    reset_state()
    print(filename)

    filepath = os.path.join(gaia_dir, filename)
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(filepath, "r", encoding="utf-8") as f:
        trace_data = json.load(f)

    # Swap only the final extension; str.replace would also rewrite a
    # ".json" occurring earlier in the name.
    output_file = os.path.splitext(filepath)[0] + ".txt"

    # Start the output file with a header line (truncates any previous
    # rendering); display() below presumably appends to it.
    with open(output_file, "w") as f:
        f.write(f"Trace ID: {trace_data['trace_id']}\n\n")

    root_spans = build_span_tree(trace_data)
    for root_span in root_spans:
        root_span.display(output_file=output_file)
    all_files.append(output_file)

print(len(all_files))

## GAIA CUSTOM INSTRUCTIONS

In [None]:
# Description of agent architecture and trace structure.
# This text is prepended to every metric-specific GAIA prompt and passed to the
# feedback functions as `custom_instructions`, so it is runtime text: edits
# here change what the judge model sees.
GAIA_trace_explanation = """
Agent Architecture and Trace Structure: The agent architecture consists of a primary manager Agent (also referred to as CodeAgent) that delegates tasks to a search_agent (also referred to as ToolCallingAgent).

Overall Flow:
Every trace consists of several spans (with span_id numbers and parent span_id numbers). Each trace begins with the manager (CodeAgent). The process follows a clear, hierarchical structure where the manager outlines a high-level plan and the search_agent executes the detailed, tool-based steps for each part of that plan.

1. Manager Agent Initiation:
The trace starts with the manager. In its initial child spans, you will observe the following sequence:
- A preparatory survey is created based on the user's query.
- A high-level plan is formulated from this survey.

The Manager agent begins executing Step 1 of its plan.

2. Manager Agent Step 1:
Within the child span for Step 1, the Manager agent decides how to proceed given the initial fact survey and plan. The Manager agent will produce a thought, which may call the search_agent to perform the necessary actions or research.

3. search_agent (ToolCallingAgent) Execution Loop:
Once called, the search_agent begins its own execution loop. In its child spans, you will observe the following sequence:
- A preparatory survey to the specific sub-task it received from the Manager agent.
- A plan tailored to the specific sub-task it received from the Manager agent.

The search_agent executes an initial set of up to four steps. Each step involves an LLM call to generate a tool-call, followed by the tool's execution.
After these initial steps, search_agent synthesizes the information gathered into an updated fact list and refines its plan.
The search_agent may then continue to execute more tool-steps based on this updated plan.

This loop continues until the search_agent has gathered enough information to comprehensively answer the manager's sub-task, at which point it calls final_answer.

4. Returning Control to the Manager agent
The final_answer from the search_agent is returned to the Manager agent, concluding the Manager agent's Step 1. The Manager agent then proceeds to Step 2 of its high-level plan, using the result from the previous step as context. 
This entire cycle repeats for all subsequent steps in the Manager Agent's plan.

Whenever you want to point out anything in the trace, cite the span_id number of the span that you are referring to.
"""

In [None]:
# Custom instructions for each metric (mostly around shaping the output)

# Logical Consistency: asks the judge to report, per agent, the agent's system
# instructions followed by its consistency issues, and supplies two example
# issues. Appended to GAIA_trace_explanation when the metric is evaluated.
# The string is not raw, so `\"` and `\n` inside the examples become a literal
# quote and a newline in the text that is sent.
logical_consistency_prompt = """
Track each agent's system instructions and conversation history, ensuring all subsequent outputs from that agent adhere to its established guidelines and prior dialogue, even when agents speak interchangeably. 
For the manager agent and each unique search_agent that may exist in the trace, evaluate the logical consistency for the agent's actions and responses. For each agent, ensure that each response is consistent with the system instructions and prior dialogue.
You must structure your entire response: 
**Manager Agent** 
**System Instructions**
[Paste all system instructions associated with the manager agent.]
**Logical Consistency issues**
[All Logical Consistency issues associated with the manager agent] 

**search_agent 0** (if exists)
**System Instructions**
[Paste all system instructions associated with the search_agent.]
**Logical Consistency issues**
[List all Logical Consistency issues associated with this search_agent] 
... 
**search_agent n** (if exists)
**System Instructions**
[Paste all system instructions associated with the search_agent.]
**Logical Consistency issues**
[List all Logical Consistency issues associated with this search_agent]

Here are some examples of logical consistency issues:
        {
            "evidence": "The plan output content ends with the last step of the plan instead of the `<end_plan>` tag.",
            "description": "The plan generation step did not conclude with the required '<end_plan>' tag as specified in the instructions for plan generation.",
        },
        {
            "evidence": "Thought: I recall that Girls Who Code published a statistic showing that in the 1980s, women accounted for 37% of computer scientists (or computer science graduates) and that over a span of three decades that figure dropped by 13 percentage points (from 37% down to 24%)",
            "description": "The system recalled a statistic about Girls Who Code and the percentage of women in computer science, but this information was not verified using the search tool as planned. The system states \"Thought: I recall that\nGirls Who Code published a statistic showing that in the 1980s, women accounted for 37% of computer scientists (or computer science graduates) and that over a span of three decades that figure dropped by 13 percentage points\n(from 37% down to 24%). In other words, it took 30 years for that change to occur. Based on that well-circulated statistic that Girls Who Code highlighted, I will output 30 years as the final answer."
        }
Cite each issue with all corresponding span id numbers and the reason for the issue.
"""

# Execution Efficiency: asks the judge to list efficiency issues per agent and
# supplies one example issue (a tool repeatedly invoked with an invalid
# parameter). Appended to GAIA_trace_explanation when the metric is evaluated.
# The string is not raw, so each `\"` in the example becomes a plain quote in
# the text that is sent.
execution_efficiency_prompt = """
Track each agent's system instructions and conversation history, ensuring all subsequent outputs from that agent adhere to its established guidelines and prior dialogue, even when agents speak interchangeably. 
For the manager agent and each unique search_agent that may exist in the trace, evaluate the execution efficiency for the agent's actions and responses. 
You must structure your entire response: 
**Manager Agent** 
[List all execution efficiency issues associated with the manager agent] 

**search_agent 0** (if exists)
[List all execution efficiency issues associated with this search_agent] 
...
**search_agent n** (if exists)
[List all execution efficiency issues associated with this search_agent] 

Here are some examples of execution efficiency issues:
        {
            "evidence": "{'input.value': '{\"args\": [], \"sanitize_inputs_outputs\": true, \"kwargs\": {\"\": \"\"}}', 'openinference.span.kind': 'TOOL', 'pat.app': 'GAIA-Samples', 'pat.project.id': 'a69d64fc-5115-468e-95ed-0950bd37f06a', 'pat.project.name': 'gaia-annotation-samples', 'tool.description': 'Scroll the viewport DOWN one page-length in the current webpage and return the new viewport content.', 'tool.name': 'page_down', 'tool.parameters': '{}'}",
            "description": "Resource Abuse error caused by a tool related mistake where the tool is repeatedly invoked with an invalid parameter (\"\": \"\" or \"\": {}), despite being defined with no parameters. This repeated misuse signals abnormal or excessive use of the tool with incorrect input, triggering a Resource Abuse error.",
        }
Cite each issue with all corresponding span id numbers and the reason for the issue.
"""

# Plan Quality: tells the judge to locate plans via the '[PLAN]' keyword and to
# report plan-quality issues per agent, with two example issues. Appended to
# GAIA_trace_explanation when the metric is evaluated.
# The string is not raw, so `\u2011`, `\u2019` and `\n` in the first example are
# decoded by Python before the text is sent.
plan_quality_prompt = """
Look for the keyword '[PLAN]' to identify plans for the manager agent and each unique search_agent that may exist in the trace. 
Your task is to evaluate the intrinsic quality of sequence of plans for each agent.
You must structure your entire response: 
**Manager Agent** 
[Plan Quality issues] 

**search_agent 0** (if exists)
[Plan Quality issues] 

... 
**search_agent n** (if exists)
[Plan Quality issues]

Here are some examples of plan quality issues:
    {
            "evidence": "1. Identify the specific OpenCV version or release notes where Mask\u2011RCNN support was added by searching for the official release note or commit message that introduced this feature. \n2. Retrieve the commit history or changelog details for that version to determine the list of contributors responsible for adding Mask\u2011RCNN support. \n3. Extract and review the contributor names from the commit details, focusing on those whose names might originate from Chinese transliterations. \n4. Research a reliable list of former Chinese heads of government with their names transliterated into the Latin alphabet. \n5. Compare and cross-match the contributor names with the list of former Chinese heads of government to identify the one whose Latin name exactly matches. \n6. Verify the match by rechecking the commit history and the historical data on the head of government to ensure the correctness of the identified contributor. \n7. Conclude with the final contributor\u2019s name as the correct answer.",
            "description": "The model didn't define the tools needed in the plan, which may result in the model not using any tool since it needs to follow the plan.",
        },
        {
            "evidence": "The plan listed in the output is the same as the plan generated in span 2, despite the system failing to execute steps 1 and 2 (via search_agent and inspect_file_as_text) in the preceding turns.",
            "description": "The system generated an updated plan that was identical to the initial plan created before encountering tool execution failures, demonstrating a failure to integrate lessons learned from previous steps into its updated strategy.",
        },

Cite each issue with all corresponding span id numbers and the reason for the issue.
"""

# Plan Adherence: tells the judge to locate plans via the '[PLAN]' keyword and
# to check whether the tool calls that follow each plan adhere to it, reporting
# issues per agent, with two example issues. Appended to
# GAIA_trace_explanation when the metric is evaluated.
# The string is not raw, so each `\"` in the examples becomes a plain quote in
# the text that is sent.
plan_adherence_prompt = """
Look for the keyword '[PLAN]' to identify plans for the manager agent and each unique search_agent that may exist in the trace. 
Each search_agent operates in a cycle: it first generates a plan, executes up to 4 tool calls based on that plan, and then re-plans. Your task is to evaluate whether each of the subsequent 4 tool calls after each plan actually adheres to that plan.
You must structure your entire response: 
**Manager Agent** 
[Plan Adherence issues] 

**search_agent 0** (if exists)
[Plan Adherence issues] 
... 
**search_agent n** (if exists)
[Plan Adherence issues]

Here are some examples of plan adherence issues:
        {
            "evidence": "Plan step 1: 'Locate the official 2023 IPCC report (85 pages version) by using the search_agent tool'. Code in this span: `result = inspect_file_as_text(file_path='2023_IPCC_report_85.pdf', ...)`",
            "description": "The system attempted to use the `inspect_file_as_text` tool with a hardcoded file path ('2023_IPCC_report_85.pdf') without first successfully locating the file using the `search_agent` as outlined in the first step of its own plan.",
        }
        {
            "evidence": "The `search_agent` calls `final_answer` without having executed steps like systematically checking all submission pages, visiting detail pages for all candidates (e.g. Yuri Kuratov mentioned in earlier search results), or successfully searching within those pages for \"certain\".",
            "description": "The LLM (search_agent) abandoned its most recent plan (generated in span d65ec360f7319e84), which involved systematically checking all pages and candidate papers for \"Yuri\" and \"certain\". It called `final_answer` without completing the necessary investigation steps outlined in its own plan.",
        }
        
Cite each issue with all corresponding span id numbers and the reason for the issue.
"""

## GAIA: TEST SINGULAR TRACE

In [None]:
# Smoke-test every enabled feedback function on one rendered GAIA trace.
test_file = "GAIA/fb3333ca30eb8af56d4f31839ca9e317.txt"
with open(test_file, "r") as f:
    test_data = f.read()

# Metric name -> metric-specific instructions appended to the shared trace
# explanation. "TRAIL" receives the trace explanation only.
gaia_metric_prompts = {
    "Logical Consistency": logical_consistency_prompt,
    "Execution Efficiency": execution_efficiency_prompt,
    "Plan Quality": plan_quality_prompt,
    "Plan Adherence": plan_adherence_prompt,
    "TRAIL": "",
}

for feedback_name, feedback_func in feedback_functions.items():
    # A metric without instructions is skipped explicitly. With a chain of
    # independent `if`s, `result` would be unbound (NameError) or, worse,
    # still hold the previous metric's result and be reported under the
    # wrong name.
    if feedback_name not in gaia_metric_prompts:
        print(f"{feedback_name}: skipped (no custom instructions defined)")
        continue

    result = feedback_func(
        test_data,
        custom_instructions=GAIA_trace_explanation
        + gaia_metric_prompts[feedback_name],
    )

    # The feedback functions are expected to return (score, metadata).
    if isinstance(result, tuple) and len(result) == 2:
        score, metadata = result
        reason = metadata.get("reason", "")
        print(f"{feedback_name}: {score} {reason}")
    else:
        print(f"{feedback_name}: unexpected result format")

## GAIA: RUN ON ALL TRACES

In [None]:
import os

from sklearn.model_selection import train_test_split

# Collect every rendered GAIA trace (*.txt) and split the list 50/50 into
# train and test halves with a fixed seed so the split is reproducible.
gaia_dir = "GAIA"
all_files = sorted(
    os.path.join(gaia_dir, entry)
    for entry in os.listdir(gaia_dir)
    if entry.endswith(".txt")
)

train_files, test_files = train_test_split(
    all_files, random_state=42, test_size=0.5
)
train_files = sorted(train_files)
test_files = sorted(test_files)
print(f"train_files: {train_files}")
print(f"test_files: {test_files}")

In [None]:
# Which half of the GAIA traces to evaluate: "train" or "test".
split = "train"
if split not in ("train", "test"):
    # Fail loudly instead of silently treating a typo as "test".
    raise ValueError(f"split must be 'train' or 'test', got {split!r}")

all_files = train_files if split == "train" else test_files

# The results file is named after the split so a test run can never append
# its rows to the train CSV. For split == "train" the name is unchanged.
csv_path = f"planning_{split}_eval_trail_sep6.csv"

In [None]:
# Evaluate every enabled feedback function on each selected GAIA trace and
# append one CSV row per trace as soon as that trace is finished, so a crash
# part-way through loses at most the trace in progress.
all_results = []

print(f"Status: {len(all_results)} completed, {len(all_files)} remaining")
print(f" Next files to process: {all_files[:3]}")

# Metric name -> metric-specific instructions appended to the shared trace
# explanation. "TRAIL" receives the trace explanation only.
gaia_metric_prompts = {
    "Logical Consistency": logical_consistency_prompt,
    "Execution Efficiency": execution_efficiency_prompt,
    "Plan Quality": plan_quality_prompt,
    "Plan Adherence": plan_adherence_prompt,
    "TRAIL": "",
}

# Process remaining files one by one
for i, file in enumerate(all_files):
    # Strip only the final extension; splitting on the first "." breaks for
    # paths such as "./GAIA/x.txt" or directories containing a dot.
    file_name = os.path.splitext(file)[0]
    print(f"\n{'=' * 60}")
    print(f"Processing {i + 1}/{len(all_files)}: {file_name}")
    print(f"{'=' * 60}")

    try:
        # Read file
        with open(file, "r") as f:
            gaia_file = f.read()

        results = {"filename": file_name}

        # Process each feedback function
        for j, (feedback_name, feedback_func) in enumerate(
            feedback_functions.items()
        ):
            print(
                f"[{j + 1}/{len(feedback_functions)}] Evaluating: {feedback_name}"
            )

            score_key = f"{feedback_name}_score"
            reasons_key = f"{feedback_name}_reasons"
            # Pre-fill both columns so every row has the same set of columns.
            # Rows are appended to the CSV without a header after the first
            # write, so a row with missing keys would shift its values under
            # the wrong headers.
            results[score_key] = None
            results[reasons_key] = ""

            metric_prompt = gaia_metric_prompts.get(feedback_name)
            if metric_prompt is None:
                # No instructions for this metric: skip it rather than reuse
                # a stale `result` from the previous metric or trace.
                print("Skipped: no custom instructions defined")
                results[reasons_key] = "Skipped: no custom instructions defined"
                continue

            try:
                result = feedback_func(
                    gaia_file,
                    custom_instructions=GAIA_trace_explanation + metric_prompt,
                )
                # The feedback functions are expected to return
                # (score, metadata).
                if isinstance(result, tuple) and len(result) == 2:
                    score, metadata = result
                    results[score_key] = score
                    results[reasons_key] = metadata.get("reason", "")
                    print(f"Score: {score}")
                else:
                    print("Unexpected result format")
                    results[reasons_key] = "Unexpected result format"

            except Exception as e:
                # One failing metric must not abort the rest of the trace;
                # record a truncated error message in its place.
                print(f"Error: {str(e)[:100]}...")
                results[score_key] = None
                results[reasons_key] = f"Error: {str(e)[:200]}"

        # Add to results and save immediately
        all_results.append(results)

        results_df = pd.DataFrame([results])
        # Write the header only when the file does not exist yet.
        results_df.to_csv(
            csv_path, mode="a", header=not os.path.exists(csv_path), index=False
        )
        print(
            f"Completed {file_name} | Total: {len(all_results)}/{len(all_files)}"
        )

    except Exception as e:
        # Typically a failure to read the trace or to write the CSV row.
        print(f"FAILED {file_name}: {e}")
        continue

# Every row was already appended to `csv_path` above; no final save is needed.
print(f"\nFinished processing {len(all_results)} files")
print(all_results)

## PROCESS ALL SWE-BENCH TRACES

In [None]:
import json
import os

import pandas as pd
from trail_process_swebench import build_span_tree
from trail_process_swebench import reset_state

# Render raw SWE-Bench JSON traces into the text form consumed by the feedback
# functions: one `<trace>.txt` is written next to each `<trace>.json`.
swebench_dir = "SWE_Bench"
all_files = []
# Sorted so the processing order (and the printed log) is reproducible;
# os.listdir() returns entries in arbitrary order.
for filename in sorted(os.listdir(swebench_dir)):
    if not filename.endswith(".json"):
        continue

    # Clear the processing module's state before each trace.
    reset_state()
    print(filename)

    filepath = os.path.join(swebench_dir, filename)
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(filepath, "r", encoding="utf-8") as f:
        trace_data = json.load(f)

    # Swap only the final extension; str.replace would also rewrite a
    # ".json" occurring earlier in the name.
    output_file = os.path.splitext(filepath)[0] + ".txt"

    # Start the output file with a header line (truncates any previous
    # rendering); display() below presumably appends to it.
    with open(output_file, "w") as f:
        f.write(f"Trace ID: {trace_data['trace_id']}\n\n")

    root_spans = build_span_tree(trace_data)
    for root_span in root_spans:
        root_span.display(output_file=output_file)
    all_files.append(output_file)

print(len(all_files))

## SWE-BENCH CUSTOM INSTRUCTIONS

In [None]:
# Description of agent architecture and trace structure.
# This text is prepended to every metric-specific SWE-Bench prompt and passed
# to the feedback functions as `custom_instructions`, so it is runtime text.
# NOTE(review): the first paragraph contains the misspellings "reposistory"
# and "codebaes"; they are left untouched here because changing the string
# changes what is sent to the judge model. Confirm before correcting.
SWEBench_trace_explanation = """
Agent Architecture and Trace Structure: The agent architecture consists of a CodeAgent that has access to a sandboxed environment, a python interpreter, and the "gitingest" library that can turn any Git reposistory into a text digest of its codebaes.

Overall Flow:
Every trace consists of several spans (with span_id numbers and parent span_id numbers). Each trace begins with the CodeAgent which performs actions through a cycle of steps, with existing variables and knowledge being incorporated into the agent’s context. Specifically, the CodeAgent will plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.  

At each step, in the 'Thought:' sequence, the CodeAgent should first explain its reasoning towards solving the task and the tools that it wants to use. Then in the 'Code:' sequence, it should write the code in simple Python. The code sequence must end with '<end_code>' sequence. During each intermediate step, the CodeAgent can use 'print()' to save whatever important information it will then need. These print outputs will then appear in the 'Observation:' field, which will be available as input for the next step.
Each tool call and tool response will also be shown in each step. 

In the end, the CodeAgent will have to return a final answer using the `final_answer` tool

Whenever you want to point out anything in the trace, cite the span_id number of the span that you are referring to.
"""

In [None]:
# Custom instructions for each metric (mostly around shaping the output)

# Logical Consistency (SWE-Bench): asks the judge to paste the agent's system
# instructions and then list its consistency issues, with two example issues.
# Appended to SWEBench_trace_explanation when the metric is evaluated.
# The string is not raw, so `\"` and `\n` inside the examples become a literal
# quote and a newline in the text that is sent.
swe_logical_consistency_prompt = """
Evaluate the logical consistency for the agent's actions and responses. Ensure that each response is consistent with the system instructions and prior dialogue.
You must structure your entire response: 
**Agent** 
**System Instructions**
[Paste all system instructions associated with the agent]
**Logical Consistency issues**
[List ALL Logical Consistency issues associated with the agent] 

Here are some examples of logical consistency issues:
        {
            "description": "The model did not adhere to the size of the file as stated in instruction, where the instruction say \" STRICTLY DO NOT print file contents to the terminal for analysis at all costs. If you are unsure about the file size, simply print upto the first 500 characters to scan the contents of the file and then find the required information using regex.\".",
        },
        {
            "description": "The system provided output in the final shard, \nbut this information was not verified in the code or using any tool.",
        }
Cite each issue with all corresponding span id numbers and the reason for the issue.
"""

# Execution Efficiency (SWE-Bench): asks the judge to list efficiency issues
# for the single agent, with one example issue. Appended to
# SWEBench_trace_explanation when the metric is evaluated.
swe_execution_efficiency_prompt = """
Evaluate the execution efficiency for the agent's actions and responses. 
You must structure your entire response: 
**Agent** 
[List all execution efficiency issues associated with the agent.] 

Here are some examples of execution efficiency issues:
        {
            "description": "The model repeatedly printed the tree, trying to find the jsonrep.py file path, without reaching useful results, which might be considered abusing the resources and not using the code effectively.",
        }
Be specific with each issue and cite each issue with all corresponding span id numbers and the reason for the issue.
"""

# Plan Quality (SWE-Bench): treats each 'Thought' section as one plan step and
# asks the judge to assess the quality of the whole sequence. No example
# issues are supplied. Appended to SWEBench_trace_explanation when the metric
# is evaluated.
swe_plan_quality_prompt = """
Each of the 'Thought' sections identified will be considered a step in the plan. Your task is to evaluate the quality of the entire plan, or the entire sequence of 'Thought' sections.
You must structure your entire response: 
**Agent** 
[Plan Quality issues] 

Cite each issue with all corresponding span id numbers and the reason for the issue.
"""

# Plan Adherence (SWE-Bench): treats each 'Thought' section as one plan step
# and asks the judge whether the tool calls that follow each 'Thought' adhere
# to it, with one example issue. Appended to SWEBench_trace_explanation when
# the metric is evaluated.
# A sentence copied from the plan-quality prompt ("evaluate the quality of the
# entire plan ...") has been removed: it gave the judge a second, conflicting
# task in a prompt that is meant to measure adherence only.
swe_plan_adherence_prompt = """
Each of the 'Thought' sections identified will be considered a step in the plan.
Your task is to evaluate whether each of the subsequent tool calls after each "Thought" section actually adheres to that "Thought".
You must structure your entire response: 
**Agent** 
[Plan Adherence issues] 

Here are some examples of plan adherence issues:
        {
            "description": "The system did not follow the plan that was provided in the thought in the previous shard and fabricated a final answer instead.",
        }
        
Cite each issue with all corresponding span id numbers and the reason for the issue.
"""

## RUN ONE SWE-BENCH

In [None]:
# Smoke-test every enabled feedback function on one rendered SWE-Bench trace.
test_file = "SWE_Bench/2102eea2af6327834c8bd97b1488474c.txt"
with open(test_file, "r") as f:
    test_data = f.read()

# Metric name -> metric-specific instructions appended to the shared trace
# explanation. "TRAIL" receives the trace explanation only.
swe_metric_prompts = {
    "Logical Consistency": swe_logical_consistency_prompt,
    "Execution Efficiency": swe_execution_efficiency_prompt,
    "Plan Quality": swe_plan_quality_prompt,
    "Plan Adherence": swe_plan_adherence_prompt,
    "TRAIL": "",
}

for feedback_name, feedback_func in feedback_functions.items():
    # A metric without instructions is skipped explicitly. With a chain of
    # independent `if`s, `result` would be unbound (NameError) or, worse,
    # still hold the previous metric's result and be reported under the
    # wrong name.
    if feedback_name not in swe_metric_prompts:
        print(f"{feedback_name}: skipped (no custom instructions defined)")
        continue

    result = feedback_func(
        test_data,
        custom_instructions=SWEBench_trace_explanation
        + swe_metric_prompts[feedback_name],
    )

    # The feedback functions are expected to return (score, metadata).
    if isinstance(result, tuple) and len(result) == 2:
        score, metadata = result
        reason = metadata.get("reason", "")
        print(f"{feedback_name}: {score} {reason}")
    else:
        print(f"{feedback_name}: unexpected result format")

## RUN ALL SWE-BENCH

In [None]:
import os

from sklearn.model_selection import train_test_split

# Collect every rendered SWE-Bench trace (*.txt) and split the list 50/50 into
# train and test halves with a fixed seed so the split is reproducible.
swebench_dir = "SWE_Bench"
all_files = sorted(
    os.path.join(swebench_dir, entry)
    for entry in os.listdir(swebench_dir)
    if entry.endswith(".txt")
)

train_files, test_files = train_test_split(
    all_files, random_state=42, test_size=0.5
)
train_files = sorted(train_files)
test_files = sorted(test_files)
print(f"train_files: {train_files}")
print(f"test_files: {test_files}")

In [None]:
# Which half of the SWE-Bench traces to evaluate: "train" or "test".
split = "train"
if split not in ("train", "test"):
    # Fail loudly instead of silently treating a typo as "test".
    raise ValueError(f"split must be 'train' or 'test', got {split!r}")

all_files = train_files if split == "train" else test_files

# The results file is named after the split so a test run can never append
# its rows to the train CSV. For split == "train" the name is unchanged.
csv_path = f"swebench_{split}_eval_sep8.csv"

In [None]:
import pandas as pd

# Evaluate every enabled feedback function on each selected SWE-Bench trace
# and append one CSV row per trace as soon as that trace is finished, so a
# crash part-way through loses at most the trace in progress.
all_results = []

print(f"Status: {len(all_results)} completed, {len(all_files)} remaining")
print(f" Next files to process: {all_files[:3]}")

# Metric name -> metric-specific instructions appended to the shared trace
# explanation. "TRAIL" receives the trace explanation only.
swe_metric_prompts = {
    "Logical Consistency": swe_logical_consistency_prompt,
    "Execution Efficiency": swe_execution_efficiency_prompt,
    "Plan Quality": swe_plan_quality_prompt,
    "Plan Adherence": swe_plan_adherence_prompt,
    "TRAIL": "",
}

# Process remaining files one by one
for i, file in enumerate(all_files):
    # Strip only the final extension; splitting on the first "." breaks for
    # paths such as "./SWE_Bench/x.txt" or directories containing a dot.
    file_name = os.path.splitext(file)[0]
    print(f"\n{'=' * 60}")
    print(f"Processing {i + 1}/{len(all_files)}: {file_name}")
    print(f"{'=' * 60}")

    try:
        # Read file
        with open(file, "r") as f:
            swe_file = f.read()

        results = {"filename": file_name}

        # Process each feedback function
        for j, (feedback_name, feedback_func) in enumerate(
            feedback_functions.items()
        ):
            print(
                f"[{j + 1}/{len(feedback_functions)}] Evaluating: {feedback_name}"
            )

            score_key = f"{feedback_name}_score"
            reasons_key = f"{feedback_name}_reasons"
            # Pre-fill both columns so every row has the same set of columns.
            # Rows are appended to the CSV without a header after the first
            # write, so a row with missing keys would shift its values under
            # the wrong headers.
            results[score_key] = None
            results[reasons_key] = ""

            metric_prompt = swe_metric_prompts.get(feedback_name)
            if metric_prompt is None:
                # No instructions for this metric: skip it rather than reuse
                # a stale `result` from the previous metric or trace.
                print("Skipped: no custom instructions defined")
                results[reasons_key] = "Skipped: no custom instructions defined"
                continue

            try:
                result = feedback_func(
                    swe_file,
                    custom_instructions=SWEBench_trace_explanation
                    + metric_prompt,
                )
                # The feedback functions are expected to return
                # (score, metadata).
                if isinstance(result, tuple) and len(result) == 2:
                    score, metadata = result
                    results[score_key] = score
                    results[reasons_key] = metadata.get("reason", "")
                    print(f"Score: {score}")
                else:
                    print("Unexpected result format")
                    results[reasons_key] = "Unexpected result format"

            except Exception as e:
                # One failing metric must not abort the rest of the trace;
                # record a truncated error message in its place.
                print(f"Error: {str(e)[:100]}...")
                results[score_key] = None
                results[reasons_key] = f"Error: {str(e)[:200]}"

        # Add to results and save immediately
        all_results.append(results)

        results_df = pd.DataFrame([results])
        # Write the header only when the file does not exist yet.
        results_df.to_csv(
            csv_path, mode="a", header=not os.path.exists(csv_path), index=False
        )
        print(
            f"Completed {file_name} | Total: {len(all_results)}/{len(all_files)}"
        )

    except Exception as e:
        # Typically a failure to read the trace or to write the CSV row.
        print(f"FAILED {file_name}: {e}")
        continue

# Every row was already appended to `csv_path` above; no final save is needed.
print(f"\nFinished processing {len(all_results)} files")
print(all_results)