In [None]:
from dotenv import load_dotenv
import pandas as pd

# Load environment variables from a local .env file, if one is present.
load_dotenv()

# Read the staged dataset; later cells read columns such as "ResponseFull"
# and "conversation_id" from this frame.
df = pd.read_csv("staged_thinking_data.csv")

# Last expression of the cell: preview the first rows.
df.head()

In [None]:
import json
from typing import Any, Dict, List

# Pick one stored response to inspect by hand before processing the whole frame.
sample_response = df.iloc[4]["ResponseFull"]


# Try to parse it as JSON. Everything that needs the parsed payload lives in the
# `else` branch so it only runs after a successful parse.
try:
    parsed_try = json.loads(sample_response)
except (json.JSONDecodeError, TypeError) as e:
    # TypeError covers a missing cell (float NaN) being handed to json.loads.
    print(f"Not valid JSON: {e}")
    print("Raw text content:")
    # print(sample_response[:200])
    # Re-raise: the statements below (and later cells) need `parsed_try`, and
    # continuing would either hit a NameError or silently reuse a stale value
    # left in the kernel by an earlier run.
    raise
else:
    print("Parsed JSON structure:")
    # print(json.dumps(parsed_try, indent=2))
    print(len(parsed_try["content"]))
    print(parsed_try["content"][6:8])
    # Compact JSON string form of the steps, kept for ad-hoc inspection.
    formatted_response = json.dumps(parsed_try["content"])


def format_execution_trace(content: List[Dict]) -> str:
    """
    Render a list of execution steps as text for trajectory evaluation.

    Args:
        content: Execution steps, e.g. the "content" list of a parsed
            ResponseFull payload.

    Returns:
        The steps serialised as 2-space-indented JSON (non-ASCII characters
        kept as-is) with every escaped ``\\n`` sequence expanded into a real
        line break. Anything that is not a list is returned as ``str(content)``.
    """
    if isinstance(content, list):
        pretty_json = json.dumps(content, ensure_ascii=False, indent=2)
        # Expand JSON-escaped newlines so multi-line strings read naturally.
        return pretty_json.replace("\\n", "\n")

    # Non-list payloads are passed through in their plain string form.
    return str(content)


def format_as_readable_steps(content: List[Dict]) -> str:
    """
    Alternative: Format as more readable step-by-step execution trace.

    Each step is a dict with a "type" key. Recognised types and the lines they
    produce:

    * "thinking"     -> ``thinking: <text>``; the payload under "thinking" may
      be a ``{"text": ...}`` dict or a bare string.
    * "tool_use"     -> ``tool_name``, ``tool_type`` and ``input`` lines taken
      from the dict under "tool_use".
    * "tool_results" -> ``tool_results: <json>`` from the "tool_results" value.
    * "text"         -> ``text: <text>``.

    Steps of any other type contribute only the blank separator line.

    Args:
        content: List of step dicts.

    Returns:
        The rendered lines joined with newlines; every step is followed by an
        empty line.
    """
    lines = []
    for step in content:
        step_type = step.get("type", "unknown")

        if step_type == "thinking":
            # `or {}` also covers an explicit null payload.
            thinking = step.get("thinking") or {}
            # Accept a bare string as well as the {"text": ...} form instead of
            # failing with AttributeError on `.get`.
            if isinstance(thinking, str):
                thinking_text = thinking
            else:
                thinking_text = thinking.get("text", "")
            lines.append(f"thinking: {thinking_text}")

        elif step_type == "tool_use":
            tool_info = step.get("tool_use") or {}
            tool_name = tool_info.get("name", "unknown")
            tool_type = tool_info.get("type", "unknown")
            tool_input = tool_info.get("input", {})
            lines.append(f"tool_name: {tool_name}")
            lines.append(f"tool_type: {tool_type}")
            lines.append(f"input: {json.dumps(tool_input, indent=2)}")

        elif step_type == "tool_results":
            tool_results = step.get("tool_results", {})
            lines.append(f"tool_results: {json.dumps(tool_results, indent=2)}")

        elif step_type == "text":
            text_content = step.get("text", "")
            lines.append(f"text: {text_content}")

        lines.append("")  # Empty line between steps

    return "\n".join(lines)


# Disabled alternative: render the same steps with the line-oriented formatter.
# readable_trace = format_as_readable_steps(parsed_try['content'])
# print(f"readable_trace: {readable_trace}")
# Render the sample response's steps with the JSON-based formatter and print
# the result for a visual check.
formatted_trace = format_execution_trace(parsed_try["content"])
print(f"raw_trace: {formatted_trace}")

## Initialize Feedback Function(s)

In [None]:
# Create snowpark session.

from dotenv import load_dotenv
from snowflake.snowpark import Session

# Load variables from a local .env file, if present (also done in the first cell).
load_dotenv()


# Disabled alternative: password-based connection configured entirely from
# environment variables. It references `os`, which is not imported in the
# cells shown here, so re-enabling it would also need `import os`.
# snowflake_connection_parameters = {
#     "account": os.environ["SNOWFLAKE_ACCOUNT"],
#     "user": os.environ["SNOWFLAKE_USER"],
#     "password": os.environ["SNOWFLAKE_USER_PASSWORD"],
#     "database": os.environ["SNOWFLAKE_DATABASE"],
#     "schema": os.environ["SNOWFLAKE_SCHEMA"],
#     "role": os.environ["SNOWFLAKE_ROLE"],
#     "warehouse": os.environ["SNOWFLAKE_WAREHOUSE"],
# }


# Active configuration: no password is stored; authentication is delegated to
# the "externalbrowser" authenticator.
# NOTE(review): account and user are hard-coded to one person's login, so the
# notebook will not run unmodified for anyone else - consider reading them
# from the environment.
snowflake_connection_parameters = {
    "account": "SNOWHOUSE",
    "user": "ajia",
    "authenticator": "externalbrowser",
}
# NOTE(review): this echoes the whole dict into the cell output; keep it free
# of secrets if the password-based variant above is ever re-enabled.
print(snowflake_connection_parameters)
# Opens the Snowpark session that the Cortex provider is constructed with below.
snowpark_session = Session.builder.configs(
    snowflake_connection_parameters
).create()

# TruSession is no longer required as long as snowflake connector exists
# sf_connector = SnowflakeConnector(snowpark_session=snowpark_session)

In [None]:
from trulens.providers.cortex import Cortex

# Judge model used for every trajectory metric below.
provider = Cortex(
    snowpark_session=snowpark_session, model_engine="claude-3-7-sonnet"
)
# provider = OpenAI(model_engine="gpt-4o")

# The metrics are looked up as bound provider methods (no selectors), so each
# one can be called directly on a formatted trace string while iterating over
# the DataFrame.
_TRAJECTORY_METRICS = (
    ("Step Relevance", "trajectory_step_relevance_with_cot_reasons"),
    ("Logical Consistency", "trajectory_logical_consistency_with_cot_reasons"),
    ("Workflow Efficiency", "trajectory_workflow_efficiency_with_cot_reasons"),
    ("Plan Adherence", "trajectory_plan_adherence_with_cot_reasons"),
)
feedback_functions = {
    label: getattr(provider, method_name)
    for label, method_name in _TRAJECTORY_METRICS
}

# List the configured metric names.
for name in feedback_functions.keys():
    print(f"- {name}")

### TRAIL: GAIA Traces

In [None]:
import json
from typing import Dict, List

# Load one exported GAIA trace. The context manager closes the file handle
# (the previous bare `open(...)` never did), and the encoding is pinned to
# UTF-8 so decoding does not depend on the platform default.
with open("0242ca2533fac5b8b604a9060b3e15d6.json", encoding="utf-8") as gaia_fp:
    gaia_file = json.load(gaia_fp)
# Only the first top-level span is inspected.
gaia_log = gaia_file["spans"][0]

# Show which fields the second child span carries, then format its own child
# spans as the trace text handed to the feedback functions.
print(gaia_log["child_spans"][1].keys())
gaia_trace = format_execution_trace(gaia_log["child_spans"][1]["child_spans"])
print(gaia_trace)

In [None]:
# Score the GAIA trace for step relevance; as the cell's only expression, the
# returned value is displayed as the cell output.
provider.trajectory_step_relevance_with_cot_reasons(gaia_trace)

In [None]:
# Score the GAIA trace for logical consistency; the returned value is
# displayed as the cell output.
provider.trajectory_logical_consistency_with_cot_reasons(gaia_trace)

In [None]:
# Score the GAIA trace for workflow efficiency; the returned value is
# displayed as the cell output.
provider.trajectory_workflow_efficiency_with_cot_reasons(gaia_trace)

In [None]:
# Score the GAIA trace for plan adherence; the returned value is displayed as
# the cell output.
provider.trajectory_plan_adherence_with_cot_reasons(gaia_trace)

### Tyler V2 Dataset

In [None]:
from typing import Dict, List


def evaluate_row_content(
    row: pd.Series, feedback_functions: Dict[str, Any]
) -> Dict[str, Any]:
    """
    Evaluate a single row's execution trace with every feedback function.

    Args:
        row: A pandas Series representing a row from the DataFrame. It must
            provide the columns copied into the result: conversation_id,
            Question, Answer, Status, ExpectedAnswer, Analysis, Reasoning,
            Rating and ResponseFull.
        feedback_functions: Mapping of display name -> callable. Each callable
            is invoked with the formatted trace string and may return either a
            bare score or a ``(score, metadata_dict)`` tuple.

    Returns:
        Dictionary holding the copied row fields plus, for every feedback
        name, ``<name>_score``, ``<name>_reasons`` and ``<name>_metadata``
        (a JSON string). An unparseable or missing ResponseFull and errors
        raised by a feedback function are recorded as NaN scores with the
        error text instead of being raised, so one bad row cannot abort a
        batch run.
    """
    results = {
        "conversation_id": row["conversation_id"],
        "question": row["Question"],
        "answer": row["Answer"],
        "status": row["Status"],
        "expected_answer": row["ExpectedAnswer"],
        "LLM-judge analysis": row["Analysis"],
        "LLM-judge reasoning": row["Reasoning"],
        "LLM-judge rating": row["Rating"],
        "ResponseFull": row["ResponseFull"],
    }

    try:
        raw_response = row["ResponseFull"]
        # An empty CSV cell is read back as float NaN; json.loads would raise
        # TypeError on it, so route it through the parse-error path instead.
        if not isinstance(raw_response, (str, bytes, bytearray)):
            raise ValueError(
                "ResponseFull is not a JSON string "
                f"(got {type(raw_response).__name__})"
            )
        parsed = json.loads(raw_response)
    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
        print(f"Failed to parse JSON for {row['conversation_id']}: {e}")
        # Set all feedback results to NaN for this row
        for feedback_name in feedback_functions:
            results[f"{feedback_name}_score"] = float("nan")
            results[f"{feedback_name}_reasons"] = f"JSON Parse Error: {str(e)}"
            # json.dumps escapes quotes/backslashes in the message, keeping
            # the stored value valid JSON.
            results[f"{feedback_name}_metadata"] = json.dumps(
                {"json_error": str(e)}
            )
        return results

    # A JSON object carries its steps under "content"; any other top-level
    # value is treated as the content itself rather than crashing on `.get`.
    content = parsed.get("content", []) if isinstance(parsed, dict) else parsed

    # format_execution_trace pretty-prints lists and falls back to str() for
    # everything else.
    content_str = format_execution_trace(content)

    # Apply each feedback function
    for feedback_name, feedback_func in feedback_functions.items():
        try:
            # Call the feedback function directly with the formatted trace string
            result = feedback_func(content_str)

            if isinstance(result, tuple) and len(result) == 2:
                score, metadata = result
                results[f"{feedback_name}_score"] = score
                results[f"{feedback_name}_reasons"] = metadata.get("reason", "")
                # Convert metadata to string for CSV storage; default=str keeps
                # non-JSON-native values from raising.
                results[f"{feedback_name}_metadata"] = json.dumps(
                    metadata, default=str
                )
            else:
                results[f"{feedback_name}_score"] = result
                results[f"{feedback_name}_reasons"] = ""
                results[f"{feedback_name}_metadata"] = "{}"

        except Exception as e:
            # Deliberately broad: a failing judge call must not abort the run.
            print(
                f"Error evaluating {feedback_name} on {row['conversation_id']}: {e}"
            )
            results[f"{feedback_name}_score"] = float("nan")
            results[f"{feedback_name}_reasons"] = f"Error: {str(e)}"
            results[f"{feedback_name}_metadata"] = json.dumps({"error": str(e)})

    return results


# Run every feedback function over each conversation and gather one result
# dict per row.
print("Processing rows for feedback evaluation...")
all_results = []

for i, (_idx, row) in enumerate(df.iterrows()):
    print(f"Processing row {i + 1}/{len(df)}: {row['conversation_id']}")

    # Evaluate this row using the direct feedback functions
    row_results = evaluate_row_content(row, feedback_functions)
    all_results += [row_results]

# One row per conversation_id, built in a single step from the collected dicts.
results_df = pd.DataFrame(all_results)

# Basic information about the results, followed by a short preview.
print(
    f"\nProcessing complete! Evaluated {len(results_df)} rows.",
    f"Results DataFrame shape: {results_df.shape}",
    f"Columns: {list(results_df.columns)}",
    "\nFirst 3 rows of results:",
    results_df.head(3),
    sep="\n",
)

In [None]:
# Spot check: score the hand-inspected sample response (`parsed_try` from the
# parsing cell) for step relevance; the returned value is shown as cell output.
provider.trajectory_step_relevance_with_cot_reasons(
    format_execution_trace(parsed_try["content"])
)

In [None]:
# Spot check: score the hand-inspected sample response (`parsed_try` from the
# parsing cell) for logical consistency; the returned value is shown as cell
# output.
provider.trajectory_logical_consistency_with_cot_reasons(
    format_execution_trace(parsed_try["content"])
)

In [None]:
# Print the Step Relevance reason text of the third evaluated row.
# NOTE(review): position 2 is hard-coded, so this raises IndexError when
# fewer than three rows were evaluated.
print(results_df["Step Relevance_reasons"].iloc[2])

In [None]:
# Show statistics for each feedback function. The figures are computed once
# here and reused for the summary CSV further down.
summary_stats = []
print("\nFeedback Score Statistics:")
for feedback_name in feedback_functions:
    score_col = f"{feedback_name}_score"
    if score_col not in results_df.columns:
        continue
    valid_scores = results_df[score_col].dropna()
    if len(valid_scores) == 0:
        print(f"\n{feedback_name}: No valid scores")
        continue

    stats = {
        "feedback_function": feedback_name,
        "mean_score": valid_scores.mean(),
        "std_score": valid_scores.std(),
        "min_score": valid_scores.min(),
        "max_score": valid_scores.max(),
    }
    summary_stats.append(stats)

    print(f"\n{feedback_name}:")
    print(f"  Mean: {stats['mean_score']:.3f}")
    print(f"  Std:  {stats['std_score']:.3f}")
    print(f"  Min:  {stats['min_score']:.3f}")
    print(f"  Max:  {stats['max_score']:.3f}")

# Save results to CSV
output_filename = "claude37sonnet_staged_thinking_feedback_raw_results_3.csv"
results_df.to_csv(output_filename, index=False)
print(f"\nResults saved to '{output_filename}'")

# Also save a summary statistics CSV. The filename is bound unconditionally so
# the final report below cannot hit a NameError when no scores were valid.
summary_filename = (
    "claude37sonnet_staged_thinking_feedback_raw_summary_stats_3.csv"
)
summary_written = bool(summary_stats)
if summary_written:
    summary_df = pd.DataFrame(summary_stats)
    summary_df.to_csv(summary_filename, index=False)
    print(f"Summary statistics saved to '{summary_filename}'")
    print("\nSummary Statistics:")
    print(summary_df)

# Optional: Display some example reasons for debugging
print("\nExample Feedback Reasons:")
for feedback_name in feedback_functions:
    reasons_col = f"{feedback_name}_reasons"
    if reasons_col not in results_df.columns:
        continue
    # Get first non-empty reason
    reasons = results_df[reasons_col]
    non_empty_reasons = reasons[reasons.notna() & (reasons != "")]
    if len(non_empty_reasons) > 0:
        print(f"\n{feedback_name} - Example reason (first 200 chars):")
        first_reason = str(non_empty_reasons.iloc[0])
        print(f"  {first_reason[:200]}...")
    else:
        print(f"\n{feedback_name}: No reasons available")

# Report only the files that were actually written.
print("\nFiles created:")
print(
    f"- {output_filename}: Full results with all feedback scores and metadata"
)
if summary_written:
    print(f"- {summary_filename}: Summary statistics for each feedback function")