In [None]:
import pandas as pd

# Load the bucketed staged-thinking results into `df`, the frame every later
# cell reads from, then preview the first rows as the cell output.
results_path = "analysis - bucketed_raw_staged_thinking_results.csv"
df = pd.read_csv(results_path)

df.head()

In [None]:
import pandas as pd

# Trajectory feedback metrics (columns of `df`) and the two reference ratings
# each of them is correlated against.
metrics = [
    "Step Relevance_score",
    "Logical Consistency_score",
    "Workflow Efficiency_score",
]
targets = ["LLM-Judge Rating", "GT"]

# One row per metric, one entry per target. Series.corr uses its default
# method (Pearson) because no `method` argument is passed.
correlation_data = [
    [df[metric].corr(df[target]) for target in targets] for metric in metrics
]

# Wrap the raw numbers in a labelled table; the row/column labels are listed
# in the same order as `metrics` / `targets` above.
correlation_table = pd.DataFrame(
    correlation_data,
    index=["Step Relevance", "Logical Consistency", "Workflow Efficiency"],
    columns=["LLM Judge Rating", "Ground Truth"],
)

print("=== Trajectory Feedback Metrics Correlation Table ===")
print(correlation_table.round(3))

# Correlation between the two reference ratings themselves (LLM judge vs GT)
print("\n=== Additional Analysis ===")
judge_vs_gt = df["LLM-Judge Rating"].corr(df["GT"])
print(f"LLM Judge vs Ground Truth correlation: {judge_vs_gt:.3f}")

# Check for any data issues: a metric column needs at least two distinct
# non-missing values for its correlations above to be defined.
print("\n=== Data Quality Check ===")
for metric in metrics:
    # nunique() drops NaN by default, so an all-NaN (or empty) column
    # reports 0 here rather than 1 and needs its own warning.
    unique_values = df[metric].nunique()
    print(f"{metric}: {unique_values} unique values")
    if unique_values == 0:
        print(f"  Warning: {metric} has no non-missing values")
    elif unique_values == 1:
        print(f"  Warning: {metric} has no variance (all values are the same)")

# Summary insights: for each reference rating, pick the metric whose
# correlation has the largest magnitude (sign is ignored via abs()).
print("\n=== Key Insights ===")
correlation_strength = correlation_table.abs()
best_gt_predictor = correlation_strength["Ground Truth"].idxmax()
best_llm_predictor = correlation_strength["LLM Judge Rating"].idxmax()
print(f"Best predictor of Ground Truth: {best_gt_predictor}")
print(f"Best predictor of LLM Judge Rating: {best_llm_predictor}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Confusion matrices against Ground Truth: panel 0 is the LLM judge, panels
# 1-3 are the trajectory metrics. Each panel also gets a printed
# classification report.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
axes = axes.flatten()

metrics = [
    "Step Relevance_score",
    "Logical Consistency_score",
    "Workflow Efficiency_score",
]
labels = [0, 1, 2]  # All possible classes
label_names = ["0", "1", "2"]  # String labels for plotting


def _compare_with_ground_truth(predicted, ax, title, xlabel, heading):
    """Plot and report how `predicted` agrees with the Ground Truth column.

    Reads the notebook-level `df`, `labels` and `label_names`. Draws the
    confusion matrix (rows: Ground Truth, columns: `predicted`) as a heatmap
    on `ax`, prints `heading` followed by the classification report, and
    returns the confusion matrix.
    """
    matrix = confusion_matrix(df["GT"], predicted, labels=labels)
    sns.heatmap(
        matrix,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=label_names,
        yticklabels=label_names,
        ax=ax,
    )
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Ground Truth")

    print(heading)
    print(
        classification_report(
            df["GT"], predicted, labels=labels, zero_division="warn"
        )
    )
    return matrix


print("=== Confusion Matrix Analysis (0=Poor, 1=Moderate, 2=Good) ===\n")

# Panel 0: LLM Judge vs Ground Truth
cm_llm_gt = _compare_with_ground_truth(
    df["LLM-Judge Rating"],
    axes[0],
    title="LLM Judge vs Ground Truth",
    xlabel="LLM Judge Rating",
    heading="LLM Judge vs Ground Truth:",
)

# Panels 1-3: each trajectory metric vs Ground Truth
for ax, metric in zip(axes[1:], metrics):
    metric_name = metric.replace("_score", "")
    cm = _compare_with_ground_truth(
        df[metric],
        ax,
        title=f"{metric_name} vs Ground Truth",
        xlabel=f"{metric_name}",
        heading=f"\n{metric_name} vs Ground Truth:",
    )

plt.tight_layout()
plt.show()

# Data distribution analysis: per-column value counts, ordered by class value,
# for the two reference ratings followed by every trajectory metric.
print("\n=== Data Distribution Analysis ===")
distribution_sections = [
    ("Ground Truth distribution:", "GT"),
    ("\nLLM Judge Rating distribution:", "LLM-Judge Rating"),
]
distribution_sections += [
    (f"\n{metric} distribution:", metric) for metric in metrics
]

for heading, column in distribution_sections:
    print(heading)
    print(df[column].value_counts().sort_index())

# Check for missing classes: report, per rating column, which of the classes
# 0/1/2 never occur in the data.
print("\n=== Missing Classes Analysis ===")
expected_classes = {0, 1, 2}
all_columns = ["GT", "LLM-Judge Rating", *metrics]
for col in all_columns:
    missing_classes = expected_classes.difference(df[col].unique())
    if not missing_classes:
        print(f"{col}: All classes present")
        continue
    print(f"{col}: Missing classes {sorted(missing_classes)}")