# Evaluation of LLM-generated narratives for SHAP explanations

## Part 1: Data loading & Processing

In [1]:
# Load the cleaned dataset and separate the target column from the features.
import pandas as pd

merged_data_final = pd.read_csv(
    "../../data/processed/cleaned_data.csv.gz",
    compression="gzip",
)

# `churn_risk_score` is the label; every remaining column stays in the feature frame.
y = merged_data_final["churn_risk_score"]
X = merged_data_final.drop(columns=["churn_risk_score"])

In [2]:
from sklearn.model_selection import train_test_split

# Step 1: keep 60% of the rows for training, stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    train_size=0.6,
    stratify=y,
    random_state=42,
)

# Step 2: split the held-out rows in half to obtain validation and test sets,
# again stratified so both keep the label distribution of the held-out rows.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp,
    y_temp,
    train_size=0.5,
    stratify=y_temp,
    random_state=42,
)

In [3]:
# Export the train, validation and test splits to the `data/input` folder
import os

# Create the target folder first so the export also works on a fresh checkout
# (the output folder is created the same way before the narratives are saved).
os.makedirs("../../data/input", exist_ok=True)

# Features and target are written side by side; gzip compression is requested
# explicitly instead of relying on inference from the ".gz" suffix.
pd.concat([X_train, y_train], axis=1).to_csv("../../data/input/train.csv.gz", index=False, compression="gzip")
pd.concat([X_valid, y_valid], axis=1).to_csv("../../data/input/valid.csv.gz", index=False, compression="gzip")
pd.concat([X_test, y_test], axis=1).to_csv("../../data/input/test.csv.gz", index=False, compression="gzip")

### Prepare Structured Data

In [4]:
# Structured training data: keep a full copy (with `id` and `feedback`),
# then derive the model input by dropping those two columns.
X_train_structured_with_id = X_train.copy()
X_train_structured = X_train_structured_with_id.drop(["id", "feedback"], axis=1)

print(
    f"Training structured data shape: {X_train_structured.shape}",
    f"Training target shape: {y_train.shape}",
    f"Training structured columns: {list(X_train_structured.columns)}",
    sep="\n",
)

Training structured data shape: (22194, 22)
Training target shape: (22194,)
Training structured columns: ['gender_M', 'region_category_Town', 'region_category_Village', 'joined_through_referral_Yes', 'preferred_offer_types_Gift Vouchers/Coupons', 'preferred_offer_types_Without Offers', 'medium_of_operation_Desktop', 'medium_of_operation_Smartphone', 'internet_option_Mobile_Data', 'internet_option_Wi-Fi', 'used_special_discount_Yes', 'offer_application_preference_Yes', 'past_complaint_Yes', 'years_since_joining', 'membership_category', 'complaint_status', 'age', 'days_since_last_login', 'avg_time_spent', 'avg_transaction_value', 'avg_frequency_login_days', 'points_in_wallet']


In [5]:
# Structured validation data: keep a full copy (with `id` and `feedback`),
# then derive the model input by dropping those two columns.
X_valid_structured_with_id = X_valid.copy()
X_valid_structured = X_valid_structured_with_id.drop(["id", "feedback"], axis=1)

print(
    f"Validation structured data shape: {X_valid_structured.shape}",
    f"Validation target shape: {y_valid.shape}",
    f"Validation structured columns: {list(X_valid_structured.columns)}",
    sep="\n",
)

Validation structured data shape: (7398, 22)
Validation target shape: (7398,)
Validation structured columns: ['gender_M', 'region_category_Town', 'region_category_Village', 'joined_through_referral_Yes', 'preferred_offer_types_Gift Vouchers/Coupons', 'preferred_offer_types_Without Offers', 'medium_of_operation_Desktop', 'medium_of_operation_Smartphone', 'internet_option_Mobile_Data', 'internet_option_Wi-Fi', 'used_special_discount_Yes', 'offer_application_preference_Yes', 'past_complaint_Yes', 'years_since_joining', 'membership_category', 'complaint_status', 'age', 'days_since_last_login', 'avg_time_spent', 'avg_transaction_value', 'avg_frequency_login_days', 'points_in_wallet']


In [6]:
# Structured test data: keep a full copy (with `id` and `feedback`),
# then derive the model input by dropping those two columns.
X_test_structured_with_id = X_test.copy()
X_test_structured = X_test_structured_with_id.drop(["id", "feedback"], axis=1)

print(
    f"Test structured data shape: {X_test_structured.shape}",
    f"Test target shape: {y_test.shape}",
    f"Test structured columns: {list(X_test_structured.columns)}",
    sep="\n",
)

Test structured data shape: (7399, 22)
Test target shape: (7399,)
Test structured columns: ['gender_M', 'region_category_Town', 'region_category_Village', 'joined_through_referral_Yes', 'preferred_offer_types_Gift Vouchers/Coupons', 'preferred_offer_types_Without Offers', 'medium_of_operation_Desktop', 'medium_of_operation_Smartphone', 'internet_option_Mobile_Data', 'internet_option_Wi-Fi', 'used_special_discount_Yes', 'offer_application_preference_Yes', 'past_complaint_Yes', 'years_since_joining', 'membership_category', 'complaint_status', 'age', 'days_since_last_login', 'avg_time_spent', 'avg_transaction_value', 'avg_frequency_login_days', 'points_in_wallet']


### Pick the best model - "XGBoost with Structured Data"

In [7]:
import xgboost as xgb
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Fit the XGBoost classifier on the structured training features.
xgb_model = xgb.XGBClassifier(
    max_depth=10,
    random_state=42,
    # Row/feature subsampling adds randomness to speed up training and reduce overfitting.
    subsample=0.8,         # each tree uses 80% of the data
    colsample_bytree=0.8,  # each tree uses 80% of the features
    n_jobs=-1,             # enable parallelism so training runs faster
)
xgb_model.fit(X_train_structured, y_train)

# Predicted labels for the training, validation and test splits
y_train_pred_xgb = xgb_model.predict(X_train_structured)
y_valid_pred_xgb = xgb_model.predict(X_valid_structured)
y_test_pred_xgb = xgb_model.predict(X_test_structured)

# Training-split metrics (F1 / precision / recall use weighted averaging)
train_accuracy = accuracy_score(y_train, y_train_pred_xgb)
train_f1_score = f1_score(y_train, y_train_pred_xgb, average="weighted")
train_precision = precision_score(y_train, y_train_pred_xgb, average="weighted")
train_recall = recall_score(y_train, y_train_pred_xgb, average="weighted")

# Validation-split metrics
valid_accuracy = accuracy_score(y_valid, y_valid_pred_xgb)
valid_f1_score = f1_score(y_valid, y_valid_pred_xgb, average="weighted")
valid_precision = precision_score(y_valid, y_valid_pred_xgb, average="weighted")
valid_recall = recall_score(y_valid, y_valid_pred_xgb, average="weighted")

# Test-split metrics
test_accuracy = accuracy_score(y_test, y_test_pred_xgb)
test_f1_score = f1_score(y_test, y_test_pred_xgb, average="weighted")
test_precision = precision_score(y_test, y_test_pred_xgb, average="weighted")
test_recall = recall_score(y_test, y_test_pred_xgb, average="weighted")

# Report one train/valid/test group per metric, separated by a blank line.
for metric_label, train_value, valid_value, test_value in (
    ("Accuracy", train_accuracy, valid_accuracy, test_accuracy),
    ("F1-score", train_f1_score, valid_f1_score, test_f1_score),
    ("Precision", train_precision, valid_precision, test_precision),
    ("Recall", train_recall, valid_recall, test_recall),
):
    print(f"Train {metric_label}: {train_value:.4f}")
    print(f"Valid {metric_label}:  {valid_value:.4f}")
    print(f"Test {metric_label}:  {test_value:.4f}\n")

# Per-class breakdown and confusion matrix on the test split
print("Classification Report (Test):")
print(classification_report(y_test, y_test_pred_xgb))

print("Confusion Matrix (Test):")
print(confusion_matrix(y_test, y_test_pred_xgb))

Train Accuracy: 1.0000
Valid Accuracy:  0.9258
Test Accuracy:  0.9311

Train F1-score: 1.0000
Valid F1-score:  0.9256
Test F1-score:  0.9310

Train Precision: 1.0000
Valid Precision:  0.9265
Test Precision:  0.9314

Train Recall: 1.0000
Valid Recall:  0.9258
Test Recall:  0.9311

Classification Report (Test):
              precision    recall  f1-score   support

           0       0.94      0.91      0.92      3396
           1       0.92      0.95      0.94      4003

    accuracy                           0.93      7399
   macro avg       0.93      0.93      0.93      7399
weighted avg       0.93      0.93      0.93      7399

Confusion Matrix (Test):
[[3077  319]
 [ 191 3812]]


## Part 2: Generate SHAP explanations

### 2.1 Prepare SHAP values to be fed into the LLM

In [8]:
# Calculate SHAP values for a stratified sample of the test data
import shap
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

n_samples = 30

# The fitted XGBoost classifier is the model being explained
best_model = xgb_model

# Tree-based SHAP explainer built on that model
explainer = shap.TreeExplainer(best_model)

# Draw `n_samples` test rows, stratified on the true label.
# With n_splits=1 the splitter yields a single (remaining, sampled) index pair,
# and the same positions are taken from the frames with and without `id`/`feedback`.
sss = StratifiedShuffleSplit(n_splits=1, test_size=n_samples, random_state=42)
_, test_index = next(sss.split(X_test_structured, y_test))
X_test_sampled_with_id = X_test_structured_with_id.iloc[test_index]
X_test_sampled_structured = X_test_structured.iloc[test_index]

# SHAP values are computed on the structured features only (no 'id' / 'feedback')
shap_values = explainer.shap_values(X_test_sampled_structured)
expected_value = explainer.expected_value

print(
    "SHAP values calculated",
    f"SHAP values shape: {shap_values.shape}",
    f"Sampled data shape: {X_test_sampled_structured.shape}",
    f"Columns used for SHAP: {list(X_test_sampled_structured.columns)}",
    sep="\n",
)

SHAP values calculated
SHAP values shape: (30, 22)
Sampled data shape: (30, 22)
Columns used for SHAP: ['gender_M', 'region_category_Town', 'region_category_Village', 'joined_through_referral_Yes', 'preferred_offer_types_Gift Vouchers/Coupons', 'preferred_offer_types_Without Offers', 'medium_of_operation_Desktop', 'medium_of_operation_Smartphone', 'internet_option_Mobile_Data', 'internet_option_Wi-Fi', 'used_special_discount_Yes', 'offer_application_preference_Yes', 'past_complaint_Yes', 'years_since_joining', 'membership_category', 'complaint_status', 'age', 'days_since_last_login', 'avg_time_spent', 'avg_transaction_value', 'avg_frequency_login_days', 'points_in_wallet']


In [9]:
# Convert SHAP values to DataFrame for easier manipulation
shap_df = pd.DataFrame(shap_values, columns=X_test_sampled_structured.columns)

# Add 'id' column to shap_df for alignment
shap_df['id'] = X_test_sampled_with_id['id'].values

# Reset index so rows are numbered 0..n-1
shap_df = shap_df.reset_index(drop=True)

# Map each customer id to its {feature: SHAP value} dictionary.
# The mapping is built from the id column and the value matrix directly rather than
# with iterrows(): iterrows() packs every row into a single-dtype Series, which
# converts the integer ids to floats before they are used as dictionary keys.
feature_columns = list(X_test_sampled_structured.columns)
customer_ids = shap_df['id'].tolist()
json_structures = {
    customer_id: {
        # float() yields plain Python floats, so the dict stays JSON-serialisable
        feature: float(value)
        for feature, value in zip(feature_columns, row_values)
    }
    for customer_id, row_values in zip(customer_ids, shap_df[feature_columns].to_numpy())
}

print(f"Generated SHAP structures for {len(json_structures)} customers")
print(f"Example SHAP features: {list(list(json_structures.values())[0].keys())[:5]}")

Generated SHAP structures for 30 customers
Example SHAP features: ['gender_M', 'region_category_Town', 'region_category_Village', 'joined_through_referral_Yes', 'preferred_offer_types_Gift Vouchers/Coupons']


In [10]:
def sort_and_get_top_features(features, top_n=10):
    """Return the `top_n` features with the largest absolute value.

    Args:
        features: Mapping of feature name -> numeric value (e.g. a SHAP value).
        top_n: Maximum number of features to return. Defaults to 10, the
            cut-off that was previously hard-coded.

    Returns:
        A list of (feature_name, value) tuples ordered by descending absolute
        value; values keep their original sign. Ties keep the mapping's
        iteration order because the sort is stable.
    """
    sorted_features = sorted(features.items(), key=lambda item: abs(item[1]), reverse=True)
    return sorted_features[:top_n]

# Collect one record per customer and build the DataFrame once at the end.
# Growing a DataFrame with pd.concat inside the loop re-copies all previous rows
# on every iteration and starts from an untyped empty frame.
top_feature_records = []
for id_key, features in json_structures.items():
    ranked_features = sort_and_get_top_features(features)
    top_feature_records.append({
        "ID": id_key,
        "top10_feature": [name for name, _ in ranked_features],
        "top10_shap_values": [value for _, value in ranked_features],
    })

# Explicit columns keep the schema stable even when there are no records.
features_shap_values = pd.DataFrame(
    top_feature_records,
    columns=["ID", "top10_feature", "top10_shap_values"],
)

In [11]:
# Combine predict and predict_proba in a DataFrame
import pandas as pd
import json

# Hard labels and class probabilities for the sampled rows (structured features only)
labels = xgb_model.predict(X_test_sampled_structured)
proba = xgb_model.predict_proba(X_test_sampled_structured)

# One probability column per class, with the predicted label placed first
predictions = pd.DataFrame(
    proba,
    columns=[f"prediction_score_{cls}" for cls in xgb_model.classes_],
)
predictions.insert(0, "prediction_label", labels)

# Give both frames a fresh 0..n-1 index so the column-wise concat lines rows up by position
X_test_sampled_with_id = X_test_sampled_with_id.reset_index(drop=True)
predictions = predictions.reset_index(drop=True)
combined_df = pd.concat([X_test_sampled_with_id, predictions], axis=1)

# Round-trip through JSON to obtain a list of per-row dictionaries
parsed_json = json.loads(combined_df.to_json(orient='records'))

# Example output for the first record
first_record = parsed_json[0]
print(
    "Sample combined data:",
    f"Keys: {list(first_record.keys())}",
    f"Prediction label: {first_record['prediction_label']}",
    f"Sample features: {list(first_record.items())[:5]}",
    sep="\n",
)

Sample combined data:
Keys: ['gender_M', 'region_category_Town', 'region_category_Village', 'joined_through_referral_Yes', 'preferred_offer_types_Gift Vouchers/Coupons', 'preferred_offer_types_Without Offers', 'medium_of_operation_Desktop', 'medium_of_operation_Smartphone', 'internet_option_Mobile_Data', 'internet_option_Wi-Fi', 'used_special_discount_Yes', 'offer_application_preference_Yes', 'past_complaint_Yes', 'id', 'years_since_joining', 'membership_category', 'complaint_status', 'feedback', 'age', 'days_since_last_login', 'avg_time_spent', 'avg_transaction_value', 'avg_frequency_login_days', 'points_in_wallet', 'prediction_label', 'prediction_score_0', 'prediction_score_1']
Prediction label: 0
Sample features: [('gender_M', 1), ('region_category_Town', 0), ('region_category_Village', 0), ('joined_through_referral_Yes', 0), ('preferred_offer_types_Gift Vouchers/Coupons', 1)]


In [12]:
def prepare_input_data(user_id, parsed_json, json_structures):
    """Look up the predicted label and SHAP values for one customer.

    Args:
        user_id: Customer id; anything `int()` accepts (int, numeric string, numpy integer).
        parsed_json: List of record dicts, each holding at least 'id' and 'prediction_label'.
        json_structures: Mapping of customer id -> {feature: SHAP value}.

    Returns:
        Tuple `(result, shap_values)`: the record's 'prediction_label' and the
        customer's SHAP dictionary.

    Raises:
        KeyError: If the id is missing from `parsed_json` or from `json_structures`.
    """
    customer_id = int(user_id)

    # A default is passed to next() so a missing id surfaces as a descriptive KeyError
    # (the same exception type the json_structures lookup raises) instead of a bare
    # StopIteration escaping from the function.
    desired_data = next(
        (item for item in parsed_json if item['id'] == customer_id),
        None,
    )
    if desired_data is None:
        raise KeyError(f"No prediction record found for customer id {customer_id}")

    # get predicted_label from parsed_json
    result = desired_data['prediction_label']

    shap_values = json_structures[customer_id]

    return result, shap_values

### 2.2 Prompt Template to generate narratives for SHAP explanations

```markdown
# System Prompt
You are a churn explanation assistant. Provide clear, descriptive, business-oriented narratives that non-technical users can understand.

# User Prompt
The model predicted this customer will <CHURN_STATUS>.
Top <TOP_N> features affecting the prediction:
- <FEATURE_1>: <direction - in what direction it affects churn, either increase or decrease>, <magnitude - how strong the impact of the feature is> effect
- <FEATURE_2>: <direction>, <magnitude> effect
...
- <FEATURE_N>: <direction>, <magnitude> effect

Write a concise, business-friendly explanation for a non-technical user (50 words).  
Describe why these features contribute to the prediction, using clear, empathetic language.  
Do not use SHAP jargon or the phrase "based on".
```

---

For example:

```markdown
# System Prompt
You are a churn explanation assistant. Provide clear, descriptive, business-oriented narratives that non-technical users can understand.

# User Prompt
The model predicted this customer will not churn.
Top 5 features affecting the prediction:
- Joined Through Referral Yes: decreases churn, very strong effect
- Past Complaint Yes: increases churn, moderate effect
- Membership Category: decreases churn, strong effect
- Avg Frequency Login Days: decreases churn, strong effect
- Points In Wallet: decreases churn, moderate effect

Write a concise, business-friendly explanation for a non-technical user (50 words).  
Describe why these features contribute to the prediction, using clear, empathetic language.  
Do not use SHAP jargon or the phrase "based on".
```

In [13]:
import os
from typing import Dict, Any
import openai

from dotenv import load_dotenv, find_dotenv

# Load variables from the .env file located by find_dotenv();
# the return value is not needed, so it is assigned to a throwaway name.
_ = load_dotenv(find_dotenv())

# API key handed to the OpenAI client; None when the variable is not set.
openai_api_key = os.environ.get("OPENAI_API_KEY")

# Feature magnitude mapping for SHAP values to reduce ambiguity
def map_shap_magnitude(value):
    """Translate a SHAP value into a qualitative strength label.

    The sign is ignored. Bounds are exclusive: > 5 is "very strong", > 1 is
    "strong", > 0.3 is "moderate", and anything else is "weak".
    """
    strength = abs(value)
    # Checked from the largest bound down; the first bound exceeded wins.
    for lower_bound, label in ((5, "very strong"), (1, "strong"), (0.3, "moderate")):
        if strength > lower_bound:
            return label
    return "weak"

# Generate LLM-generated narratives for ML shap explanations
def generate_churn_explainability(result: int, 
                                  shap_values: dict,
                                  api_key: str,
                                  top_n: int = 5, # Top 5 features are mentioned
                                  model: str = "gpt-4o-mini",
                                  temperature: float = 0.2,
                                  show_prompt=False) -> Dict[str, Any]:
    """
    Generate a business-friendly churn narrative from SHAP values via the OpenAI SDK.

    Args:
        result: Predicted label; 1 is worded as "churn", anything else as "not churn".
        shap_values: Mapping of feature name -> SHAP value for one customer.
        api_key: OpenAI API key used to create the client.
        top_n: Number of features (by absolute SHAP value) listed in the prompt.
        model: Chat model name passed to the completions endpoint.
        temperature: Sampling temperature passed to the completions endpoint.
        show_prompt: When True, print the system and user prompts before the call.

    Returns:
        Dict with:
          - "top_features": the ranked feature lines included in the prompt
          - "explanation": the narrative text ("" if the model returned no text)
    """
    
    # Initialize OpenAI client
    client = openai.OpenAI(api_key=api_key)
    
    # Step 1: Select top features by absolute SHAP value
    top_features = sorted(
        shap_values.items(), key=lambda x: abs(x[1]), reverse=True
    )[:top_n]

    # Step 2: Preprocess features for LLM input with explicit ranking
    feature_list: list[str] = []
    for i, (feat, val) in enumerate(top_features, 1):
        direction = "increases churn" if val > 0 else "decreases churn"
        magnitude = map_shap_magnitude(val)
        description = feat.replace("_", " ").title()
        # Only the first-ranked feature carries the "(MOST IMPORTANT)" tag
        rank_tag = f"#{i} (MOST IMPORTANT)" if i == 1 else f"#{i}"
        feature_list.append(
            f"- {rank_tag}: {description} - {direction}, {magnitude} effect (SHAP: {val:.3f})"
        )

    feature_text = "\n".join(feature_list)

    explanation_prompt = f"""
The model predicted this customer will {'churn' if result == 1 else 'not churn'}.
Top {top_n} features affecting the prediction (ranked by importance, #1 is MOST important):
{feature_text}

Write a concise, business-friendly explanation for a non-technical user (50 words).
IMPORTANT: Address features in order of their importance (#1 FIRST, then #2, etc.).
Start with the most important factor (#1), then mention other key factors in descending order.
Use clear, empathetic language and avoid SHAP jargon or the phrase "based on".
"""

    system_prompt = """You are a churn explanation assistant. Provide clear, descriptive, business-oriented narratives that non-technical users can understand. 

CRITICAL: Always address the most important features first in your explanation. Follow the ranking order provided (#1, #2, #3, etc.). The #1 feature should be mentioned first and emphasized as the primary factor."""
    
    if show_prompt is True:
        print("PROMPT TEMPLATE FOR generate_churn_explainability function")
        print("===========")
        print("# System Prompt")
        print(system_prompt)
        print("\n# User Prompt")
        print(explanation_prompt)
        print("===========")
    
    response = client.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": explanation_prompt}
        ]
    )
    # `content` can be None (e.g. when the model returns no text); fall back to an
    # empty string so .strip() cannot raise AttributeError.
    explanation = (response.choices[0].message.content or "").strip()
    
    return {
        "top_features": feature_list,
        'explanation': explanation
    }

- Demo: run `generate_churn_explainability` on the first sampled row

In [14]:
# Generate explanation for the first row only
output_list = []

# Id of the first sampled customer
first_user_id = X_test_sampled_with_id['id'].iloc[0]

# Predicted label and SHAP dictionary for that customer
result, shap_values = prepare_input_data(first_user_id, parsed_json, json_structures)

# Ask the LLM for the narrative, echoing the prompt for inspection
output = generate_churn_explainability(
    result=result,
    shap_values=shap_values,
    api_key=openai_api_key,
    show_prompt=True,
)

# Attach the inputs so the stored record is self-contained
output.update(shap_values=shap_values, predicted_label=result)
output_list.append(output)

# Demo print
print("Demo: first row output")
print(json.dumps(output, indent=2))


PROMPT TEMPLATE FOR generate_churn_explainability function
# System Prompt
You are a churn explanation assistant. Provide clear, descriptive, business-oriented narratives that non-technical users can understand. 

CRITICAL: Always address the most important features first in your explanation. Follow the ranking order provided (#1, #2, #3, etc.). The #1 feature should be mentioned first and emphasized as the primary factor.

# User Prompt

The model predicted this customer will not churn.
Top 5 features affecting the prediction (ranked by importance, #1 is MOST important):
- #1 (MOST IMPORTANT): Membership Category - decreases churn, very strong effect (SHAP: -8.528)
- #2: Avg Frequency Login Days - decreases churn, strong effect (SHAP: -1.584)
- #3: Age - increases churn, moderate effect (SHAP: 0.412)
- #4: Used Special Discount Yes - increases churn, weak effect (SHAP: 0.183)
- #5: Internet Option Wi-Fi - increases churn, weak effect (SHAP: 0.174)

Write a concise, business-friendly e

In [15]:
# Generate an LLM narrative for the SHAP explanation of every sampled customer
output_list = []

# total number of samples
n_samples = len(X_test_sampled_with_id)

for idx, user_id in enumerate(X_test_sampled_with_id['id'], start=1):
    # Predicted label and SHAP dictionary for this customer
    result, shap_values = prepare_input_data(user_id, parsed_json, json_structures)

    output = generate_churn_explainability(
        result=result, shap_values=shap_values, api_key=openai_api_key
    )

    # Keep the inputs alongside the narrative so each record is self-contained
    output.update(shap_values=shap_values, predicted_label=result)
    output_list.append(output)

In [16]:
# Make sure the output folder exists before writing the narratives file.
os.makedirs("../../data/output", exist_ok=True)

# One row per generated narrative, written without the index column.
shap_output_df = pd.DataFrame.from_records(output_list)
shap_output_df.to_csv(
    "../../data/output/llm_generated_narratives_on_shap.csv.gz",
    index=False,
)

In [17]:
# Preview the first three generated narratives
shap_output_df.head(3)

Unnamed: 0,top_features,explanation,shap_values,predicted_label
0,[- #1 (MOST IMPORTANT): Membership Category - ...,The model predicts this customer is unlikely t...,"{'gender_M': 0.09423382580280304, 'region_cate...",0
1,[- #1 (MOST IMPORTANT): Membership Category - ...,The prediction indicates this customer is like...,"{'gender_M': -0.03530433401465416, 'region_cat...",1
2,[- #1 (MOST IMPORTANT): Membership Category - ...,The prediction indicates this customer is like...,"{'gender_M': -0.017987653613090515, 'region_ca...",1


In [18]:
# Sanity-check the dimensions of shap_output_df (one row per generated narrative)
shap_output_df.shape

(30, 4)

In [19]:
# Check how the predicted labels are distributed across the sampled narratives
shap_output_df["predicted_label"].value_counts()

predicted_label
1    18
0    12
Name: count, dtype: int64

## Part 3: LLM As A Judge

### Prompt Template for LLM As A Judge

```markdown
# System Prompt
You are an impartial judge and an expert in machine learning explainability.  
Your task is to evaluate the alignment between a generated churn narrative and the provided SHAP values.

You will receive:
- The model prediction result
- The customer's profile description
- The top 5 SHAP values
- A generated narrative to evaluate

Your job is to check which features in the narrative align with the SHAP values in terms of **direction** (towards churn or away from churn) and **magnitude** (strong, weak, etc.).

# User Prompt
**[Context Data]**
- Predicted Churn Result: <PREDICTED_LABEL> (0 = will not churn, 1 = will churn)
- Top 5 SHAP Values: <TOP_5_SHAP_VALUES>

**[Generated Narrative to Evaluate]**
<GENERATED_NARRATIVE>

Please return your evaluation in this format:

```json
[
  {
    "feature": "<feature_name>",
    "feature_mentioned": <True|False>,
    "direction_text": "<direction extracted from text_direction>",
    "direction_shap": "<direction extracted from shap_direction_magnitude>",
    "magnitude_shap": "<magnitude extracted from shap_direction_magnitude>",
    "direction_match": <True|False>,
    "magnitude_match": <True|False>
  },
  ...
]
```

In [20]:
# Reload the saved narratives from disk.
# NOTE(review): list/dict columns (top_features, shap_values) appear to come back as
# their string representations after the CSV round-trip — parse them before use.
prepare_for_judge_df = pd.read_csv('../../data/output/llm_generated_narratives_on_shap.csv.gz', compression='gzip')

In [21]:
# Preview the first three reloaded rows
prepare_for_judge_df.head(3)

Unnamed: 0,top_features,explanation,shap_values,predicted_label
0,['- #1 (MOST IMPORTANT): Membership Category -...,The model predicts this customer is unlikely t...,"{'gender_M': 0.09423382580280304, 'region_cate...",0
1,['- #1 (MOST IMPORTANT): Membership Category -...,The prediction indicates this customer is like...,"{'gender_M': -0.03530433401465416, 'region_cat...",1
2,['- #1 (MOST IMPORTANT): Membership Category -...,The prediction indicates this customer is like...,"{'gender_M': -0.017987653613090515, 'region_ca...",1


In [22]:
# Confirm the reloaded frame has the same dimensions as the one that was saved
prepare_for_judge_df.shape

(30, 4)

In [23]:
# get top 5 most important features from SHAP values
import ast

def get_top_shap_values_dict(shap_str, top_n=5):
    """Return the `top_n` SHAP entries with the largest absolute value.

    Args:
        shap_str: Either a {feature: value} dict or its string representation
            (parsed with ast.literal_eval).
        top_n: Number of entries to keep.

    Returns:
        Dict of feature -> original (signed) value, ordered from largest to
        smallest absolute value.
    """
    # Strings are parsed into a dict; anything else is used as given
    shap_dict = ast.literal_eval(shap_str) if isinstance(shap_str, str) else shap_str

    # Rank by magnitude but keep the signed values
    ranked_pairs = sorted(shap_dict.items(), key=lambda pair: abs(pair[1]), reverse=True)

    # dict() preserves the ranked insertion order
    return dict(ranked_pairs[:top_n])

# Attach each row's five largest-magnitude SHAP values as a new column.
prepare_for_judge_df['top_5_shap_values'] = prepare_for_judge_df['shap_values'].apply(
    get_top_shap_values_dict, top_n=5
)



In [24]:
import pandas as pd
import json
from openai import OpenAI

def evaluate_narrative_with_llm(explanation_text, top_5_shap_values, predicted_label, show_prompt=False,
                                model="gpt-4o", temperature=0.1, max_tokens=1500, api_key=None):
    """
    Use OpenAI LLM as a judge to evaluate narrative quality feature-by-feature

    Args:
        explanation_text: The generated narrative to evaluate.
        top_5_shap_values: The top SHAP values, interpolated into the prompt as-is.
        predicted_label: Model prediction (0 = will not churn, 1 = will churn).
        show_prompt: When True, print the system and user prompts before the call.
        model: Chat model used as the judge (defaults to the previous hard-coded value).
        temperature: Sampling temperature (defaults to the previous hard-coded value).
        max_tokens: Completion token limit (defaults to the previous hard-coded value).
        api_key: OpenAI API key; when None, the module-level `openai_api_key` is used.

    Returns:
        The raw text of the judge's reply, or None if the API call raised an exception
        (the error is printed).
    """
    
    # Create magnitude mapping info for the judge
    magnitude_info = """
    SHAP Magnitude Mapping (IMPORTANT - use this exact mapping):
    - very strong: absolute value > 5
    - strong: absolute value > 1 
    - moderate: absolute value > 0.3
    - weak: absolute value ≤ 0.3
    
    Direction Mapping:
    - Positive SHAP value = "towards churn" 
    - Negative SHAP value = "away from churn"
    """
    
    system_prompt = f"""You are an impartial judge and an expert in machine learning explainability. 
Your task is to evaluate the alignment between a generated churn narrative and the provided SHAP values.

{magnitude_info}

You will receive:
- The model prediction result
- The top 5 SHAP values (ordered by importance, #1 most important first)
- A generated narrative to evaluate

Your job is to check which features in the narrative align with the SHAP values in terms of:

1. **Direction Agreement (sign_agreement)**: 
   - True if the narrative direction matches SHAP direction
   - False if they contradict

2. **Ranking Agreement (rank_agreement)**: 
   - True if features appear in the narrative in roughly the same importance order as SHAP ranking
   - False if a less important SHAP feature (#4, #5) is emphasized more than a more important one (#1, #2, #3)
   - If feature is not mentioned, automatically False

CRITICAL: Use the exact magnitude mapping provided above. Check SHAP values carefully against the thresholds.
"""
    user_prompt = f"""
**[Context Data]**
- Predicted Churn Result: {predicted_label} (0 = will not churn, 1 = will churn)
- Top 5 SHAP Values (ordered by importance): {top_5_shap_values}

**[Generated Narrative to Evaluate]**
{explanation_text}

Evaluate each of the top 5 SHAP features and return your evaluation in this format:

```json
[
  {{
    "feature": "<exact_feature_name_from_shap>",
    "feature_mentioned": <True|False>,
    "direction_text": "<direction extracted from narrative text, or 'not mentioned'>",
    "direction_shap": "<'towards churn' if SHAP > 0, 'away from churn' if SHAP < 0>",
    "magnitude_shap": "<very strong|strong|moderate|weak using the mapping above>",
    "sign_agreement": <True if directions match, False if they contradict, False if not mentioned>,
    "rank_agreement": <True if feature appears in appropriate importance order in text, False otherwise>
  }},
  ...
]
```

IMPORTANT: 
- Evaluate ALL 5 SHAP features, even if not mentioned in the narrative
- Use the exact magnitude thresholds provided
- For rank_agreement: compare narrative emphasis order with SHAP importance ranking
"""

    # Initialize OpenAI client; an explicit api_key takes precedence over the global one
    client = OpenAI(api_key=api_key if api_key is not None else openai_api_key)
    
    if show_prompt is True:
        print("PROMPT TEMPLATE FOR evaluate_narrative_with_llm function")
        print("===========")
        print("# System Prompt")
        print(system_prompt)
        print("\n# User Prompt")
        print(user_prompt)
        print("===========")

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=temperature,
            max_tokens=max_tokens
        )
        
        return response.choices[0].message.content

    except Exception as e:
        # Best-effort: report the failure and let the caller skip this row
        print(f"Error calling OpenAI API: {e}")
        return None

In [25]:
# Evaluate only the first explanation as a demo
import ast
import re

results = []

print("Evaluating first explanation only...")

first_row = prepare_for_judge_df.dropna(subset=['explanation']).iloc[0]

evaluation = evaluate_narrative_with_llm(
    first_row['explanation'],
    first_row['top_5_shap_values'],
    first_row['predicted_label'],
    show_prompt=True
)

if evaluation:
    # Extract JSON inside ```json ... ``` ; otherwise fall back to the whole reply
    match = re.search(r"```json\s*(\[.*?\])\s*```", evaluation, re.DOTALL)
    clean_eval = match.group(1) if match else evaluation.strip()

    try:
        feature_data = json.loads(clean_eval)  # list of per-feature dicts
    except json.JSONDecodeError:
        # The judge prompt shows booleans as True/False, so the reply may use Python
        # literals that json.loads rejects; literal_eval parses those without
        # executing any code.
        try:
            feature_data = ast.literal_eval(clean_eval)
        except (ValueError, SyntaxError):
            feature_data = None

    if isinstance(feature_data, list) and all(isinstance(f, dict) for f in feature_data):
        # ---- Compute faithfulness score (0–10) ----
        # One point per feature for direction agreement and one for rank agreement
        sign_points = sum(1 for f in feature_data if f.get("sign_agreement"))
        rank_points = sum(1 for f in feature_data if f.get("rank_agreement"))
        correct_points = sign_points + rank_points
        max_points = 2 * len(feature_data)
        faithfulness_scores = (correct_points / max_points) * 10 if max_points > 0 else 0

        # ---- Compute completeness score (0–10) ----
        # Share of the supplied top SHAP features that the narrative mentions
        mentioned_features = sum(1 for f in feature_data if f.get("feature_mentioned"))
        expected_features = len(first_row['top_5_shap_values'])
        completeness_score_10 = (
            round((mentioned_features / expected_features) * 10, 1) if expected_features > 0 else 0
        )

        # ---- Build eval_data summary ----
        eval_data = {
            "completeness": {"score": completeness_score_10},
            "faithfulness": {
                "score": round(faithfulness_scores, 1),
                "sign_points": sign_points,
                "rank_points": rank_points
            },
            "raw_features": feature_data
        }

        results.append({
            'index': first_row.name,
            'explanation': first_row['explanation'],
            'predicted_label': first_row['predicted_label'],
            "top_5_shap_magnitudes": {feature: map_shap_magnitude(value) for feature, value in first_row['top_5_shap_values'].items()},
            "top_5_shap_values": first_row['top_5_shap_values'],
            'evaluation': eval_data
        })
    else:
        print("Failed to parse JSON for the first row, storing raw output.")
        results.append({
            'index': first_row.name,
            'evaluation': evaluation
        })

# Save results
evaluation_df = pd.DataFrame(results)
print("\nDemo:\n=========")
if "predicted_label" in evaluation_df.columns:
    print("\nPredicted Label: ", evaluation_df["predicted_label"][0])
    print("\nExplanation: ", evaluation_df["explanation"][0])
    print("\nTop 5 SHAP magnitudes: ", evaluation_df["top_5_shap_magnitudes"][0])
    print("\nTop 5 SHAP values: ", evaluation_df["top_5_shap_values"][0])
    print("\nEvaluation: ", json.dumps(evaluation_df["evaluation"][0], indent=2))
else:
    # The judge call failed or its reply could not be parsed, so the structured
    # columns do not exist; report that instead of raising a KeyError.
    print("\nNo structured evaluation is available for the first row.")


Evaluating first explanation only...
PROMPT TEMPLATE FOR evaluate_narrative_with_llm function
# System Prompt
You are an impartial judge and an expert in machine learning explainability. 
Your task is to evaluate the alignment between a generated churn narrative and the provided SHAP values.


    SHAP Magnitude Mapping (IMPORTANT - use this exact mapping):
    - very strong: absolute value > 5
    - strong: absolute value > 1 
    - moderate: absolute value > 0.3
    - weak: absolute value ≤ 0.3
    
    Direction Mapping:
    - Positive SHAP value = "towards churn" 
    - Negative SHAP value = "away from churn"
    

You will receive:
- The model prediction result
- The top 5 SHAP values (ordered by importance, #1 most important first)
- A generated narrative to evaluate

Your job is to check which features in the narrative align with the SHAP values in terms of:

1. **Direction Agreement (sign_agreement)**: 
   - True if the narrative direction matches SHAP direction
   - False if t

In [26]:
# Evaluate every generated explanation with the LLM judge and score the result.
import re

results = []
n_explanations = len(prepare_for_judge_df)
# Completeness is measured against the five top SHAP features passed to the judge.
n_expected_features = 5

print(f"Evaluating {n_explanations} explanations...")

# Count rows by position so the progress message does not depend on the
# DataFrame index being 0-based integers.
for position, (idx, row) in enumerate(prepare_for_judge_df.iterrows(), start=1):
    if pd.isna(row['explanation']):
        continue

    print(f"Evaluating explanation {position}/{n_explanations}")

    evaluation = evaluate_narrative_with_llm(
        row['explanation'],
        row['top_5_shap_values'],
        row['predicted_label']
    )

    # A falsy judge response (e.g. None or "") produces no result row.
    if not evaluation:
        continue

    try:
        # Prefer the JSON array inside a ```json ... ``` fence; otherwise try
        # to parse the whole (stripped) response.
        match = re.search(r"```json\s*(\[.*?\])\s*```", evaluation, re.DOTALL)
        clean_eval = match.group(1) if match else evaluation.strip()

        feature_data = json.loads(clean_eval)  # expected: list of per-feature dicts

        # Valid JSON with the wrong shape (e.g. a single object) would otherwise
        # crash on `.get` below and abort the whole loop; treat it as a parse failure.
        if not isinstance(feature_data, list) or not all(isinstance(f, dict) for f in feature_data):
            raise ValueError("judge output is not a list of per-feature objects")
    except ValueError:  # json.JSONDecodeError is a subclass of ValueError
        print(f"Failed to parse JSON for row {idx}, storing raw output.")
        results.append({
            'index': idx,
            'evaluation': evaluation
        })
        continue

    # ---- Faithfulness score (0–10) ----
    # One point per feature for sign agreement and one for rank agreement,
    # normalised by the maximum attainable points.
    sign_points = sum(1 for f in feature_data if f.get("sign_agreement"))
    rank_points = sum(1 for f in feature_data if f.get("rank_agreement"))
    correct_points = sign_points + rank_points
    max_points = 2 * len(feature_data)
    faithfulness_score = (correct_points / max_points) * 10 if max_points > 0 else 0

    # ---- Completeness score (0–10) ----
    # Number of features the judge flagged as mentioned, scaled from 0–5 to 0–10.
    mentioned_features = sum(1 for f in feature_data if f.get("feature_mentioned"))
    completeness_score_10 = round((mentioned_features / n_expected_features) * 10, 1)

    # ---- Build eval_data summary ----
    eval_data = {
        "completeness": {"score": completeness_score_10},
        "faithfulness": {
            "score": round(faithfulness_score, 1),
            "sign_points": sign_points,
            "rank_points": rank_points
        },
        "raw_features": feature_data
    }

    results.append({
        'index': idx,
        'explanation': row['explanation'],
        'predicted_label': row['predicted_label'],
        "top_5_shap_magnitudes": {feature: map_shap_magnitude(value) for feature, value in row['top_5_shap_values'].items()},
        "top_5_shap_values": row['top_5_shap_values'],
        'evaluation': eval_data
    })

# Collect all per-row results into a DataFrame (written to disk in a later cell)
evaluation_df = pd.DataFrame(results)


Evaluating 30 explanations...
Evaluating explanation 1/30
Evaluating explanation 2/30
Evaluating explanation 2/30
Evaluating explanation 3/30
Evaluating explanation 3/30
Evaluating explanation 4/30
Evaluating explanation 4/30
Evaluating explanation 5/30
Evaluating explanation 5/30
Evaluating explanation 6/30
Evaluating explanation 6/30
Evaluating explanation 7/30
Evaluating explanation 7/30
Evaluating explanation 8/30
Evaluating explanation 8/30
Evaluating explanation 9/30
Evaluating explanation 9/30
Evaluating explanation 10/30
Evaluating explanation 10/30
Evaluating explanation 11/30
Evaluating explanation 11/30
Evaluating explanation 12/30
Evaluating explanation 12/30
Evaluating explanation 13/30
Evaluating explanation 13/30
Evaluating explanation 14/30
Evaluating explanation 14/30
Evaluating explanation 15/30
Evaluating explanation 15/30
Evaluating explanation 16/30
Evaluating explanation 16/30
Evaluating explanation 17/30
Evaluating explanation 17/30
Evaluating explanation 18/30
E

In [27]:
# Write the judge results to the output folder
output_path = '../../data/output/llm_judge_evaluation_results.csv.gz'

evaluation_df = pd.DataFrame(results)
evaluation_df.to_csv(output_path, index=False)

print(f"Results saved to '{output_path}'")

Results saved to '../../data/output/llm_judge_evaluation_results.csv.gz'


In [28]:
# Sanity check: dimensions of evaluation_df as (rows, columns)
evaluation_df.shape

(30, 6)

In [29]:
# Preview the first three rows of the judge results
evaluation_df.head(3)

Unnamed: 0,index,explanation,predicted_label,top_5_shap_magnitudes,top_5_shap_values,evaluation
0,0,The model predicts this customer is unlikely t...,0,"{'membership_category': 'very strong', 'avg_fr...","{'membership_category': -8.527676582336426, 'a...","{'completeness': {'score': 8.0}, 'faithfulness..."
1,1,The prediction indicates this customer is like...,1,"{'membership_category': 'strong', 'avg_frequen...","{'membership_category': 2.813594102859497, 'av...","{'completeness': {'score': 10.0}, 'faithfulnes..."
2,2,The prediction indicates this customer is like...,1,"{'membership_category': 'strong', 'avg_frequen...","{'membership_category': 3.10160756111145, 'avg...","{'completeness': {'score': 10.0}, 'faithfulnes..."


## Part 4: Evaluation Results Analysis

In [30]:
import ast


def _parse_evaluation(raw):
    """Turn a stringified evaluation back into a Python object when possible.

    Non-string values are returned unchanged. Strings that are not valid
    Python literals (e.g. raw judge output kept after a parse failure) are
    also returned unchanged instead of raising.
    """
    if not isinstance(raw, str):
        return raw
    try:
        return ast.literal_eval(raw)
    except (ValueError, TypeError, SyntaxError):
        return raw


def _metric_score(evaluation, metric):
    """Return evaluation[metric]['score'] as a float, or NaN when unavailable."""
    try:
        return float(evaluation[metric]['score'])
    except (KeyError, TypeError, ValueError):
        # Row has no structured evaluation (e.g. raw text) or lacks this metric.
        return float('nan')


# Convert string representations of dicts to actual dicts if needed
evaluation_df['evaluation'] = evaluation_df['evaluation'].apply(_parse_evaluation)

# Extract individual scores for each row; rows without a structured evaluation
# become NaN so the arrays stay aligned with evaluation_df.
faithfulness_scores = evaluation_df['evaluation'].apply(lambda x: _metric_score(x, 'faithfulness')).astype(float).values
completeness_scores = evaluation_df['evaluation'].apply(lambda x: _metric_score(x, 'completeness')).astype(float).values

# Print individual scores for each row
print("Individual Scores for Each Row:")
print("=" * 50)
for idx, (faith, comp) in enumerate(zip(faithfulness_scores, completeness_scores)):
    print(f"Row {idx+1}: Faithfulness = {faith}, Completeness = {comp}")

# pandas reductions skip NaN, so unscored rows do not turn the summary into NaN.
faithfulness_summary = pd.Series(faithfulness_scores)
completeness_summary = pd.Series(completeness_scores)

print("\n" + "=" * 50)
print("Summary Statistics:")
print(f"Average Faithfulness Score: {faithfulness_summary.mean():.2f}")
print(f"Average Completeness Score: {completeness_summary.mean():.2f}")
print(f"Faithfulness Score Range: {faithfulness_summary.min():.1f} - {faithfulness_summary.max():.1f}")
print(f"Completeness Score Range: {completeness_summary.min():.1f} - {completeness_summary.max():.1f}")

# Only report unscored rows when there are any, so the usual output is unchanged.
unscored_rows = int((faithfulness_summary.isna() | completeness_summary.isna()).sum())
if unscored_rows:
    print(f"Rows without a usable evaluation (excluded from statistics): {unscored_rows}")

# Optional: Create a DataFrame for easier viewing
scores_df = pd.DataFrame({
    'Row': range(1, len(faithfulness_scores) + 1),
    'Faithfulness_Score': faithfulness_scores,
    'Completeness_Score': completeness_scores,
    'Predicted_Label': evaluation_df['predicted_label'].values
})

print("\nDetailed Scores DataFrame:")
print(scores_df)

Individual Scores for Each Row:
Row 1: Faithfulness = 8.0, Completeness = 8.0
Row 2: Faithfulness = 10.0, Completeness = 10.0
Row 3: Faithfulness = 10.0, Completeness = 10.0
Row 4: Faithfulness = 10.0, Completeness = 10.0
Row 5: Faithfulness = 10.0, Completeness = 10.0
Row 6: Faithfulness = 10.0, Completeness = 10.0
Row 7: Faithfulness = 10.0, Completeness = 10.0
Row 8: Faithfulness = 10.0, Completeness = 10.0
Row 9: Faithfulness = 10.0, Completeness = 10.0
Row 10: Faithfulness = 10.0, Completeness = 10.0
Row 11: Faithfulness = 10.0, Completeness = 10.0
Row 12: Faithfulness = 10.0, Completeness = 10.0
Row 13: Faithfulness = 10.0, Completeness = 10.0
Row 14: Faithfulness = 10.0, Completeness = 10.0
Row 15: Faithfulness = 8.0, Completeness = 10.0
Row 16: Faithfulness = 10.0, Completeness = 10.0
Row 17: Faithfulness = 10.0, Completeness = 10.0
Row 18: Faithfulness = 7.0, Completeness = 8.0
Row 19: Faithfulness = 10.0, Completeness = 10.0
Row 20: Faithfulness = 8.0, Completeness = 10.0
Row

In [31]:
# Key statistical analysis: how often each score value occurs
def _score_distribution(scores):
    """Count occurrences of each distinct score, ordered by score value."""
    return pd.Series(scores).value_counts().sort_index()


section_rule = "=" * 60
print("\n" + section_rule)
print("KEY STATISTICAL ANALYSIS")
print(section_rule)

# 1. Score distributions
print("\n1. SCORE DISTRIBUTIONS:")
print("-" * 40)

faithfulness_counts = _score_distribution(faithfulness_scores)
print("Faithfulness Score Distribution:")
print(faithfulness_counts)

completeness_counts = _score_distribution(completeness_scores)
print("\nCompleteness Score Distribution:")
print(completeness_counts)



KEY STATISTICAL ANALYSIS

1. SCORE DISTRIBUTIONS:
----------------------------------------
Faithfulness Score Distribution:
6.0      1
7.0      2
8.0      3
9.0      1
10.0    23
Name: count, dtype: int64

Completeness Score Distribution:
8.0      4
10.0    26
Name: count, dtype: int64


In [32]:

# 2. Performance by predicted label (most important)
print("\n2. PERFORMANCE BY PREDICTED LABEL:")
print("-" * 40)

# One row per evaluated explanation: predicted label plus both scores.
label_analysis = pd.DataFrame({
    'Predicted_Label': evaluation_df['predicted_label'].values,
    'Faithfulness_Score': faithfulness_scores,
    'Completeness_Score': completeness_scores
})

# The same summary statistics are computed for each score column.
summary_stats = ['min', 'mean', 'median', 'max', 'count']
score_columns = ['Faithfulness_Score', 'Completeness_Score']

label_stats = (
    label_analysis
    .groupby('Predicted_Label')
    .agg({column: list(summary_stats) for column in score_columns})
    .round(2)
)
label_stats


2. PERFORMANCE BY PREDICTED LABEL:
----------------------------------------


Unnamed: 0_level_0,Faithfulness_Score,Faithfulness_Score,Faithfulness_Score,Faithfulness_Score,Faithfulness_Score,Completeness_Score,Completeness_Score,Completeness_Score,Completeness_Score,Completeness_Score
Unnamed: 0_level_1,min,mean,median,max,count,min,mean,median,max,count
Predicted_Label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
0,7.0,9.33,10.0,10.0,12,8.0,9.67,10.0,10.0,12
1,6.0,9.5,10.0,10.0,18,8.0,9.78,10.0,10.0,18


In [33]:
# 3. High-quality explanations count (business-relevant)
# Minimum score (on the 0–10 scale) for an explanation to count as "high" quality.
HIGH_QUALITY_THRESHOLD = 8


def _share(count, total):
    """Format a count as 'count/total (pct%)'; reports 0.0% when total is 0."""
    pct = count / total * 100 if total else 0.0
    return f"{count}/{total} ({pct:.1f}%)"


print("\n3. HIGH-QUALITY EXPLANATIONS:")
print("-" * 40)

# Boolean masks over the per-row score arrays.
faithful_mask = faithfulness_scores >= HIGH_QUALITY_THRESHOLD
complete_mask = completeness_scores >= HIGH_QUALITY_THRESHOLD

high_faithfulness = faithful_mask.sum()
high_completeness = complete_mask.sum()
both_high = (faithful_mask & complete_mask).sum()

print(f"High faithfulness (≥{HIGH_QUALITY_THRESHOLD}): {_share(high_faithfulness, len(faithfulness_scores))}")
print(f"High completeness (≥{HIGH_QUALITY_THRESHOLD}): {_share(high_completeness, len(completeness_scores))}")
print(f"Both high (≥{HIGH_QUALITY_THRESHOLD}): {_share(both_high, len(faithfulness_scores))}")


3. HIGH-QUALITY EXPLANATIONS:
----------------------------------------
High faithfulness (≥8): 27/30 (90.0%)
High completeness (≥8): 30/30 (100.0%)
Both high (≥8): 27/30 (90.0%)
