## Prepare the data for Human Evaluation

In [1]:
import pandas as pd

# LLM-generated narratives, one row per explained prediction
llm_narratives = pd.read_csv(
    '../../data/output/llm_generated_narratives_on_shap.csv.gz',
    compression='gzip',
)

# LLM-judge scores; the persisted 'index' column is discarded right after loading
llm_judge = pd.read_csv(
    '../../data/output/llm_judge_evaluation_results.csv.gz',
    compression='gzip',
).drop(columns=['index'])

In [6]:
import ast

# Put the narrative columns and the judge columns side by side in one frame
combined_llm_narratives_and_judge = pd.concat(
    [llm_narratives, llm_judge],
    axis='columns',
)

# Number the rows 1..N instead of 0..N-1
combined_llm_narratives_and_judge.index = pd.RangeIndex(
    1, len(combined_llm_narratives_and_judge) + 1
)

def get_top_shap_values_dict(shap_str, top_n=5):
    """Return the ``top_n`` SHAP entries with the largest absolute value.

    Args:
        shap_str: Either a dict mapping feature name -> SHAP value, or the
            string form of such a dict literal (parsed with
            ``ast.literal_eval``).
        top_n: Maximum number of entries to keep. ``None`` keeps every entry.
            Must not be negative.

    Returns:
        A new dict ordered from largest to smallest absolute SHAP value.
        The original (signed) values are kept; entries with equal magnitude
        stay in their input order because the sort is stable.

    Raises:
        ValueError: If ``top_n`` is negative. A negative slice bound would
            otherwise silently drop entries from the *end* of the ranking
            rather than selecting the top ones.
    """
    if top_n is not None and top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Convert string to dict if needed
    shap_dict = ast.literal_eval(shap_str) if isinstance(shap_str, str) else shap_str

    # Rank by magnitude only; the sign is preserved in the returned values
    ranked = sorted(shap_dict.items(), key=lambda item: abs(item[1]), reverse=True)

    return dict(ranked[:top_n])

def sort_shap_values_by_abs(shap_str):
    """Reorder a SHAP mapping from largest to smallest absolute value.

    ``shap_str`` may be a dict or the string form of a dict literal; strings
    are parsed with ``ast.literal_eval`` first. The returned dict holds the
    same signed values, only the key order changes.
    """
    mapping = ast.literal_eval(shap_str) if isinstance(shap_str, str) else shap_str

    # Stable in-place sort on magnitude, biggest first
    pairs = list(mapping.items())
    pairs.sort(key=lambda pair: abs(pair[1]), reverse=True)

    return dict(pairs)

# Re-order every row's SHAP dictionary so the largest-magnitude features come first
combined_llm_narratives_and_judge['shap_values'] = (
    combined_llm_narratives_and_judge['shap_values'].apply(sort_shap_values_by_abs)
)

# Keep the five strongest contributions in a dictionary column of their own
combined_llm_narratives_and_judge['top_5_shap_values'] = (
    combined_llm_narratives_and_judge['shap_values'].apply(get_top_shap_values_dict, top_n=5)
)

# Tracking columns for the human evaluation step
combined_llm_narratives_and_judge['finished'] = False
combined_llm_narratives_and_judge['human_evaluation'] = None  # Will store JSON evaluation data

# Shuffled copy for randomized evaluation order; the seed is fixed so the order is
# reproducible, and the previous row label is kept as 'original_index'
evaluation_data = (
    combined_llm_narratives_and_judge
    .sample(frac=1, random_state=42)
    .reset_index(drop=False)
    .rename(columns={'index': 'original_index'})
)

# Persist the ordered frame and the shuffled frame
combined_llm_narratives_and_judge.to_csv(
    "../../data/output/combined_llm_narratives_and_judge.csv.gz",
    index=True,
)
evaluation_data.to_csv(
    "../../data/output/evaluation_data_shuffled.csv.gz",
    index=True,
    compression='gzip',
)

print(f"Created evaluation dataset with {len(evaluation_data)} rows")
print("Columns: finished (True/False), human_evaluation (JSON)")
print("Saved to: ../../data/output/evaluation_data_shuffled.csv.gz")

Created evaluation dataset with 60 rows
Columns: finished (True/False), human_evaluation (JSON)
Saved to: ../../data/output/evaluation_data_shuffled.csv.gz


In [3]:
# Display the results
def _as_dict(cell):
    """Return ``cell`` as a dict, parsing it with ``ast.literal_eval`` when it is a string."""
    return ast.literal_eval(cell) if isinstance(cell, str) else cell


def _print_ranked(pairs):
    """Print one numbered ``feature: value (abs: ...)`` line per (feature, value) pair."""
    for rank, (name, val) in enumerate(pairs, start=1):
        print(f"  {rank}. {name}: {val:.4f} (abs: {abs(val):.4f})")


print("=== SAMPLE DATA ===")
display(combined_llm_narratives_and_judge[['shap_values', 'top_5_shap_values']].head(2))

print("\n=== VERIFICATION: Check if sorting by absolute values works correctly ===")
for row_pos in range(2):
    print(f"\nRow {row_pos+1}:")

    # Full SHAP dictionary of this row (should be sorted by abs value)
    all_shap = _as_dict(combined_llm_narratives_and_judge['shap_values'].iloc[row_pos])
    print("All SHAP values (sorted by abs):")
    _print_ranked(list(all_shap.items())[:8])  # Show first 8

    # Top-5 dictionary of the same row
    top_five = _as_dict(combined_llm_narratives_and_judge['top_5_shap_values'].iloc[row_pos])
    print("Top 5 SHAP values:")
    _print_ranked(top_five.items())

print("\n=== SUMMARY ===")
print("✅ Removed top3_shap_values column")
print("✅ top_5_shap_values is now in dictionary format")
print("✅ Both shap_values and top_5_shap_values are ordered by largest absolute values")
print("✅ Original positive/negative signs are preserved")
print("\n=== COLUMNS IN DATASET ===")
print("Available columns:", list(combined_llm_narratives_and_judge.columns))

=== SAMPLE DATA ===


Unnamed: 0,shap_values,top_5_shap_values
1,"{'membership_category': -8.010722160339355, 'a...","{'membership_category': -8.010722160339355, 'a..."
2,"{'membership_category': -8.259814262390137, 'a...","{'membership_category': -8.259814262390137, 'a..."



=== VERIFICATION: Check if sorting by absolute values works correctly ===

Row 1:
All SHAP values (sorted by abs):
  1. membership_category: -8.0107 (abs: 8.0107)
  2. avg_frequency_login_days: -3.1072 (abs: 3.1072)
  3. year: -0.1845 (abs: 0.1845)
  4. medium_of_operation_Desktop: -0.1695 (abs: 0.1695)
  5. days_since_last_login: -0.1516 (abs: 0.1516)
  6. points_in_wallet: -0.1434 (abs: 0.1434)
  7. preferred_offer_types_Gift Vouchers/Coupons: 0.1286 (abs: 0.1286)
  8. avg_transaction_value: -0.1206 (abs: 0.1206)
Top 5 SHAP values:
  1. membership_category: -8.0107 (abs: 8.0107)
  2. avg_frequency_login_days: -3.1072 (abs: 3.1072)
  3. year: -0.1845 (abs: 0.1845)
  4. medium_of_operation_Desktop: -0.1695 (abs: 0.1695)
  5. days_since_last_login: -0.1516 (abs: 0.1516)

Row 2:
All SHAP values (sorted by abs):
  1. membership_category: -8.2598 (abs: 8.2598)
  2. avg_frequency_login_days: -2.9948 (abs: 2.9948)
  3. avg_time_spent: 0.2575 (abs: 0.2575)
  4. points_in_wallet: 0.2389 (abs:

## Manual Evaluation Interface

To evaluate LLM-generated narratives of ML SHAP explanations manually:

- **Open the Gradio app**: From a terminal in the project root directory, run `python notebook/modeling/llm_evaluation_app.py`, then open http://127.0.0.1:7860/ in your browser.
- **Upload the data file**: Use the generated `evaluation_data_shuffled.csv.gz` from above
- **Start evaluating**: Follow the on-screen instructions to complete evaluations
- **Auto-backup**: Each evaluation automatically creates a backup file
- **Final export**: Download the final CSV to replace the original data file

**Features:**
- ✅ Auto-save after each evaluation
- ✅ Progress tracking and resume capability  
- ✅ Randomized presentation order
- ✅ Browser localStorage backup
- ✅ Direct CSV export for `../../data/output/evaluation_data_shuffled.csv.gz`