# LLM-Human Agreement: SHAP Narrative Evaluation (Krippendorff's Alpha)

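Computes Krippendorff's Alpha to measure agreement between human and LLM judges on ML SHAP explanation narratives. **Note:** the human scores used below are randomly generated placeholders, so the reported alpha values only exercise the pipeline and should not be read as real human–LLM agreement.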

In [1]:
import pandas as pd
import numpy as np
import krippendorff

# --- Configuration ---
NUM_NARRATIVES = 60               # length of each generated human score array
VALUE_DOMAIN = list(range(1, 6))  # rating scale 1..5 handed to krippendorff.alpha
METRIC = 'interval'               # level-of-measurement label intended for the alpha computation

# --- Load LLM Judge Data ---
# Gzip-compressed CSV of LLM judge results; the 'evaluation' column is parsed later.
llm_data = pd.read_csv(
    '../../data/output/llm_judge_evaluation_results.csv.gz',
    compression='gzip',
)

In [2]:
# --- Generate Dummy Human Scores ---
# NOTE: these are placeholder ratings drawn independently and uniformly at
# random from the rating scale. They are NOT derived from, or correlated with,
# the LLM scores, so any agreement statistic computed from them reflects chance
# only. Replace them with real human annotations before interpreting results.
np.random.seed(42)  # For reproducibility

# Take the bounds from VALUE_DOMAIN instead of repeating the literals 1 and 6.
# np.random.randint excludes its upper bound, hence the +1.
_score_low = min(VALUE_DOMAIN)
_score_high_exclusive = max(VALUE_DOMAIN) + 1

# The draw order (faithfulness, completeness, clarity) is kept fixed so the
# seeded stream yields the same three arrays on every Restart & Run All.
human_scores_faithfulness = np.random.randint(_score_low, _score_high_exclusive, NUM_NARRATIVES)
human_scores_completeness = np.random.randint(_score_low, _score_high_exclusive, NUM_NARRATIVES)
human_scores_clarity = np.random.randint(_score_low, _score_high_exclusive, NUM_NARRATIVES)

In [3]:
# Extract LLM scores for the three metrics
import ast


def _parse_evaluation(raw):
    """Return one evaluation as a dict, parsing it from its string form if needed.

    Values that are already dicts are passed through unchanged. Without this
    guard, re-running the cell would call ``ast.literal_eval`` on a dict, which
    raises ``ValueError``.
    """
    if isinstance(raw, dict):
        return raw
    return ast.literal_eval(raw)


def _extract_scores(evaluations, criterion):
    """Return ``evaluation[criterion]['score']`` for every row as a NumPy array."""
    return evaluations.apply(lambda evaluation: evaluation[criterion]['score']).values


# Convert string representations of dicts to actual dicts (safe to re-run).
llm_data['evaluation'] = llm_data['evaluation'].apply(_parse_evaluation)

# Now extract scores; the notebook's metric names map onto the judge's criterion keys.
llm_scores_faithfulness = _extract_scores(llm_data['evaluation'], 'factual_accuracy')
llm_scores_completeness = _extract_scores(llm_data['evaluation'], 'completeness_integration')
llm_scores_clarity = _extract_scores(llm_data['evaluation'], 'clarity_readability')

In [4]:
# Show the three generated human score arrays, one labelled entry per metric.
for _label, _scores in (
    ("human_scores_faithfulness:", human_scores_faithfulness),
    ("\nhuman_scores_completeness", human_scores_completeness),
    ("\nhuman_scores_clarity", human_scores_clarity),
):
    print(_label, _scores)

human_scores_faithfulness: [4 5 3 5 5 2 3 3 3 5 4 3 5 2 4 2 4 5 1 4 2 5 4 1 1 3 3 2 4 4 3 4 4 1 3 5 3
 5 1 2 4 1 4 2 2 1 2 5 2 4 4 4 4 5 3 1 4 2 4 2]

human_scores_completeness [2 4 5 2 2 4 2 2 4 4 1 5 5 2 5 2 1 4 4 4 5 1 5 5 1 1 1 1 4 3 3 1 3 3 1 3 5
 2 2 1 4 1 4 2 1 5 3 4 3 3 1 3 5 3 1 5 2 3 1 2]

human_scores_clarity [2 4 5 3 1 4 5 4 5 5 3 5 4 5 3 3 4 2 2 5 1 5 4 4 4 4 4 3 2 4 1 1 1 1 3 1 4
 5 1 3 3 1 5 1 3 2 4 3 1 4 1 1 2 4 4 2 3 1 5 1]


In [5]:
# Show the three extracted LLM score arrays, one labelled entry per metric.
for _label, _scores in (
    ("llm_scores_faithfulness:", llm_scores_faithfulness),
    ("\nllm_scores_completeness", llm_scores_completeness),
    ("\nllm_scores_clarity", llm_scores_clarity),
):
    print(_label, _scores)

llm_scores_faithfulness: [3 3 3 3 3 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 1 1 3 3 1 1 1 3 1 3 3 1 3 1
 3 3 3 3 1 1 3 1 3 3 3 3 1 3 1 1 1 1 3 3 3 3 1]

llm_scores_completeness [3 3 3 3 3 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 2 2 3 3 2 2 2 3 2 3 3 2 3 2
 3 3 3 3 2 2 3 2 2 3 3 3 1 3 2 2 2 2 3 3 3 3 2]

llm_scores_clarity [4 4 4 4 4 3 4 3 4 4 4 4 4 4 4 4 4 4 4 4 4 3 4 3 3 4 4 3 3 3 4 4 4 4 3 4 3
 4 4 4 4 3 3 4 3 4 4 4 4 3 4 3 3 3 3 4 4 4 4 3]


In [6]:
# --- Prepare Reliability Data Arrays ---
def _stack_raters(human_scores, llm_scores):
    """Build the two-row reliability matrix: row 0 = human, row 1 = LLM.

    Raises ``ValueError`` when the two raters did not score the same number of
    narratives, because the scores could not then be aligned unit by unit.
    """
    human_scores = np.asarray(human_scores)
    llm_scores = np.asarray(llm_scores)
    if human_scores.shape != llm_scores.shape:
        raise ValueError(
            "Human and LLM score arrays differ in shape: "
            f"{human_scores.shape} vs {llm_scores.shape}"
        )
    return np.array([human_scores, llm_scores])


def _krippendorff_alpha(reliability_data):
    """Compute alpha using the notebook-wide VALUE_DOMAIN and METRIC settings.

    METRIC is passed explicitly so that changing the configuration constant
    actually changes the level of measurement used here.
    """
    return krippendorff.alpha(reliability_data=reliability_data,
                              value_domain=VALUE_DOMAIN,
                              level_of_measurement=METRIC)


reliability_data_faithfulness = _stack_raters(human_scores_faithfulness, llm_scores_faithfulness)
reliability_data_completeness = _stack_raters(human_scores_completeness, llm_scores_completeness)
reliability_data_clarity = _stack_raters(human_scores_clarity, llm_scores_clarity)

# --- Calculate Krippendorff's Alpha for each metric ---

print("--- Faithfulness ---")
alpha_faithfulness = _krippendorff_alpha(reliability_data_faithfulness)
print(f"Krippendorff's Alpha for Faithfulness: {alpha_faithfulness:.3f}")

print("\n--- Completeness ---")
alpha_completeness = _krippendorff_alpha(reliability_data_completeness)
print(f"Krippendorff's Alpha for Completeness: {alpha_completeness:.3f}")

print("\n--- Clarity ---")
alpha_clarity = _krippendorff_alpha(reliability_data_clarity)
print(f"Krippendorff's Alpha for Clarity: {alpha_clarity:.3f}")

# --- Interpretation Guidance ---
# Static reference text; it does not depend on the alpha values computed above.
print("\n--- Interpretation ---")
print("Krippendorff's Alpha values typically range from -1 to 1:")
print("  1.0   = Perfect agreement")
print("  0.80+ = Excellent agreement")
print("  0.67+ = Good agreement (often considered acceptable for drawing tentative conclusions)")
print("  <0.67 = Questionable agreement (requires careful consideration)")
print("  0.0   = Agreement expected by chance")
print("  <0.0  = Systematic disagreement")
print("\nYour alpha values indicate the level of consistency between your human judge and LLM judge for each aspect.")
print("A higher alpha value suggests better alignment between their scores.")

--- Faithfulness ---
Krippendorff's Alpha for Faithfulness: -0.054

--- Completeness ---
Krippendorff's Alpha for Completeness: 0.060

--- Clarity ---
Krippendorff's Alpha for Clarity: -0.139

--- Interpretation ---
Krippendorff's Alpha values typically range from -1 to 1:
  1.0   = Perfect agreement
  0.80+ = Excellent agreement
  0.67+ = Good agreement (often considered acceptable for drawing tentative conclusions)
  <0.67 = Questionable agreement (requires careful consideration)
  0.0   = Agreement expected by chance
  <0.0  = Systematic disagreement

Your alpha values indicate the level of consistency between your human judge and LLM judge for each aspect.
A higher alpha value suggests better alignment between their scores.
