In [None]:
import pandas as pd

# Ground-truth benchmark data, one CSV file per split.
TRAIN_CSV = "NEW_all_eval_trail_benchmark - GT_train.csv"
TEST_CSV = "GT_test.csv"

train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

# Quick previews of both frames. A notebook cell renders only its final
# expression, so of these two only the test-split preview is displayed.
train_df.head()
test_df.head()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error

# Compare each judge metric's score against its ground-truth label by drawing
# one confusion matrix per metric.
metric_columns = [
    "Logical Consistency",
    "Execution Efficiency",
    "Plan Adherence",
    "Plan Quality",
]

# Choose the evaluation split before doing anything else. An unrecognised
# value must fail loudly: otherwise `df` is either undefined or, in a live
# kernel, silently left over from a previous run of this cell.
split = "train"

if split == "train":
    df = train_df
elif split == "test":
    df = test_df
else:
    raise ValueError(f"Unknown split {split!r}; expected 'train' or 'test'")

# 2x2 grid of panels, flattened so each metric can be addressed by its index
# in `metric_columns`.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
axes = axes.flatten()

# Per-metric statistics keyed by metric name; filled in by the loop below.
results = {}

# Scores sit on a four-point scale (0, 1/3, 2/3, 1) but are stored with varying
# precision (e.g. 0.33 vs 0.3333333333). Buckets are therefore derived
# arithmetically rather than by exact float lookup, which would silently turn
# any other spelling of the same score into None.
labels = [0, 1, 2, 3]
max_bucket = labels[-1]
bucket_tolerance = 0.05  # accepts 0.33 / 0.67, rejects off-scale values like 0.5


def _to_buckets(scores, column):
    """Map scores on the 0-1 scale to integer buckets 0..max_bucket.

    Args:
        scores: pandas Series of non-null numeric scores.
        column: Name of the source column; used only in the error message.

    Returns:
        numpy integer array holding one bucket index per score.

    Raises:
        ValueError: If any score is further than ``bucket_tolerance`` (on the
            bucket scale) from a valid bucket, or falls outside 0..max_bucket.
    """
    scaled = scores.astype(float) * max_bucket
    nearest = scaled.round()
    off_scale = (
        ((scaled - nearest).abs() > bucket_tolerance)
        | (nearest < 0)
        | (nearest > max_bucket)
    )
    if off_scale.any():
        raise ValueError(
            f"{column}: unexpected score value(s) "
            f"{sorted(scores[off_scale].unique().tolist())}"
        )
    return nearest.astype(int).to_numpy()


for idx, metric in enumerate(metric_columns):
    gt_clean = df[f"{metric}_GT"]
    metric_clean = df[f"{metric}_score"]

    # Keep only rows where both the ground truth and the judge score exist.
    mask = gt_clean.notna() & metric_clean.notna()
    gt_clean = gt_clean[mask]
    metric_clean = metric_clean[mask]
    n_samples = int(mask.sum())

    print(f"\n=== {metric} vs Ground Truth ===")
    print(f"Samples: {n_samples}")

    if n_samples == 0:
        # Nothing to score: record placeholders instead of dividing by zero.
        print("No rows with both a ground-truth label and a judge score; skipped.")
        axes[idx].set_title(f"{metric} vs {metric} Ground Truth (no data)")
        axes[idx].axis("off")
        results[metric] = {
            "confusion_matrix": np.zeros((len(labels), len(labels)), dtype=int),
            "accuracy": float("nan"),
            "off_by_one_accuracy": float("nan"),
            "n_samples": 0,
            "labels": labels,
        }
        continue

    gt_bucketed = _to_buckets(gt_clean, f"{metric}_GT")
    metric_bucketed = _to_buckets(metric_clean, f"{metric}_score")

    # Passing `labels` pins the matrix to 4x4 even when a bucket never occurs,
    # so rows/columns always line up with the tick labels below.
    cm = confusion_matrix(gt_bucketed, metric_bucketed, labels=labels)

    # Plot heatmap
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=labels,
        yticklabels=labels,
        ax=axes[idx],
    )
    axes[idx].set_title(f"{metric} vs {metric} Ground Truth")
    axes[idx].set_xlabel(f"{metric} Judge Score")
    axes[idx].set_ylabel("Ground Truth")

    # Exact-match accuracy: diagonal of the confusion matrix over all samples.
    accuracy = np.trace(cm) / np.sum(cm)

    # Off-by-one accuracy: the prediction is at most one bucket away from the
    # ground truth, in either direction, for every bucket.
    off_by_one_accuracy = float(
        np.mean(np.abs(gt_bucketed - metric_bucketed) <= 1)
    )

    # Store results
    results[metric] = {
        "confusion_matrix": cm,
        "accuracy": accuracy,
        "off_by_one_accuracy": off_by_one_accuracy,
        "n_samples": n_samples,
        "labels": labels,
    }

    print(f"Confusion Matrix Accuracy: {accuracy:.3f}")
    print(f"Off-by-One Accuracy: {off_by_one_accuracy:.3f}")

    # MAE is measured in bucket units (0..3); dividing by the top bucket
    # rescales it to the 0-1 range.
    mae = mean_absolute_error(gt_bucketed, metric_bucketed)
    print(f"Mean Absolute Error: {mae:.3f}")
    print(f"Normalized Mean Absolute Error: {mae / max_bucket:.3f}")
    print("Classification Report:")
    print(
        classification_report(
            gt_bucketed, metric_bucketed, zero_division="warn"
        )
    )

# Title the whole figure by the evaluated split. The per-metric loop variable
# must not be used here: after the loop it only holds the last metric, which
# would mislabel a figure that shows every metric.
fig.suptitle(f"Judge Score vs Ground Truth Confusion Matrices ({split} split)")
plt.tight_layout()
plt.show()

# Tab-separated summary table: one row per metric stored in `results`.
print("\n=== SUMMARY COMPARISON ===")
print("Metric\t\t\tAccuracy\tOff-by-1 Acc\tSamples")
print("-" * 65)
for metric_name, metric_stats in results.items():
    exact_acc = metric_stats["accuracy"]
    near_acc = metric_stats["off_by_one_accuracy"]
    sample_count = metric_stats["n_samples"]
    print(f"{metric_name:<20}\t{exact_acc:.3f}\t\t{near_acc:.3f}\t\t{sample_count}")