# Evaluation on LLM-generated narratives for SHAP explanations

## Part 1: Data loading & Processing

In [1]:
# Load the cleaned dataset from the processed-data folder
import pandas as pd

# gzip-compressed CSV; it contains the `churn_risk_score` column used as target below
merged_data_final = pd.read_csv("../../data/processed/cleaned_data.csv.gz", compression="gzip")

# Feature matrix = every column except the target; target = churn_risk_score
X = merged_data_final.drop(['churn_risk_score'], axis = 1)
y = merged_data_final['churn_risk_score']

In [2]:
# NOTE(review): these two statements repeat the feature/target split of the previous
# cell verbatim. Re-running them yields the same X and y, so the cell is redundant.
X = merged_data_final.drop(['churn_risk_score'], axis = 1)
y = merged_data_final['churn_risk_score']

In [3]:
from sklearn.model_selection import train_test_split

# 60/20/20 split: keep 60% of the rows for training, then cut the remaining 40%
# in half to obtain the validation and test sets. Both cuts are stratified on the
# target and use a fixed random_state so the split is reproducible.
X_train, X_temp, y_train, y_temp = train_test_split(
                    X, y, train_size=0.6, 
                    stratify= y,
                    random_state=42)
# Second cut: half of the held-out rows become validation, the other half test
X_valid, X_test, y_valid, y_test = train_test_split(
                    X_temp, y_temp, train_size=0.5,
                    stratify=y_temp,
                    random_state=42)

In [4]:
# Persist every split (features and target side by side, no index column) under
# `data/input`. Each destination path is paired with the frames that belong in it.
split_exports = {
    "../../data/input/train.csv.gz": (X_train, y_train),
    "../../data/input/valid.csv.gz": (X_valid, y_valid),
    "../../data/input/test.csv.gz": (X_test, y_test),
}
for export_path, (split_features, split_target) in split_exports.items():
    pd.concat([split_features, split_target], axis=1).to_csv(export_path, index=False)

### Text Representation with LLM embedding

In [5]:
import os
import pandas as pd 
from utils.prepare_llm_embedding import generate_embeddings_from_series 

# Cache file for the training-split embeddings. The embeddings are generated only
# when this file does not exist yet, so re-running the cell does not repeat the work.
EMBEDDING_TRAIN = "../../data/processed/llm_embedding_train.csv.gz"
if not os.path.exists(EMBEDDING_TRAIN):
    # Feedback text indexed by customer id
    processed_text_series = pd.Series(X_train["feedback"].to_list(),
                                      index=X_train['id'].to_list())
    # The output path is the cache constant itself, so the existence check above and
    # the file written here can never drift apart.
    # presumably generate_embeddings_from_series writes the CSV itself — verify in utils
    llm_embedding_train = generate_embeddings_from_series(
        processed_text_series,
        additional_data={"churn_risk_score": y_train.to_list()},
        output_csv_path=EMBEDDING_TRAIN,
        max_workers=20)
    print(llm_embedding_train)
    

In [6]:
import os
import pandas as pd 
from utils.prepare_llm_embedding import generate_embeddings_from_series 

# Cache file for the validation-split embeddings. The embeddings are generated only
# when this file does not exist yet, so re-running the cell does not repeat the work.
EMBEDDING_VALID = "../../data/processed/llm_embedding_valid.csv.gz"
if not os.path.exists(EMBEDDING_VALID):
    # Feedback text indexed by customer id
    processed_text_series = pd.Series(X_valid["feedback"].to_list(),
                                      index=X_valid['id'].to_list())
    # The output path is the cache constant itself, so the existence check above and
    # the file written here can never drift apart.
    # presumably generate_embeddings_from_series writes the CSV itself — verify in utils
    llm_embedding_valid = generate_embeddings_from_series(
        processed_text_series,
        additional_data={"churn_risk_score": y_valid.to_list()},
        output_csv_path=EMBEDDING_VALID,
        max_workers=20)
    print(llm_embedding_valid)

In [7]:
import os
import pandas as pd 
from utils.prepare_llm_embedding import generate_embeddings_from_series 

# Cache file for the test-split embeddings. The embeddings are generated only
# when this file does not exist yet, so re-running the cell does not repeat the work.
EMBEDDING_TEST = "../../data/processed/llm_embedding_test.csv.gz"
if not os.path.exists(EMBEDDING_TEST):
    # Feedback text indexed by customer id
    processed_text_series = pd.Series(X_test["feedback"].to_list(),
                                      index=X_test['id'].to_list())
    # The output path is the cache constant itself, so the existence check above and
    # the file written here can never drift apart.
    # presumably generate_embeddings_from_series writes the CSV itself — verify in utils
    llm_embedding_test = generate_embeddings_from_series(
        processed_text_series,
        additional_data={"churn_risk_score": y_test.to_list()},
        output_csv_path=EMBEDDING_TEST,
        max_workers=20)
    print(llm_embedding_test)

### Combine LLM embeddings with structured data

In [8]:
# Read the three cached embedding files (train / valid / test, in that order)
train_text_vectorized, valid_text_vectorized, test_text_vectorized = (
    pd.read_csv(embedding_path, compression="gzip")
    for embedding_path in (
        "../../data/processed/llm_embedding_train.csv.gz",
        "../../data/processed/llm_embedding_valid.csv.gz",
        "../../data/processed/llm_embedding_test.csv.gz",
    )
)

In [9]:
import ast

# Join the training features with their embedding and label on `id`
train_df = X_train.merge(
    train_text_vectorized[['id', 'embedding_json', 'churn_risk_score']],
    on='id',
    how='outer',
)
# embedding_json is stored as text; turn every cell back into a Python object
train_df['embedding_json'] = train_df['embedding_json'].map(ast.literal_eval)

# Text features: one column per embedding dimension, named text_feature_1..N
X_train_unstructured_llm_embedding = pd.DataFrame(train_df['embedding_json'].tolist())
X_train_unstructured_llm_embedding.columns = [
    f"text_feature_{dim}"
    for dim in range(1, X_train_unstructured_llm_embedding.shape[1] + 1)
]

# Structured features: everything except the raw text, the embedding and the label
X_train_structured_with_id = train_df.drop(columns=['feedback', 'embedding_json', 'churn_risk_score'])
X_train_structured = X_train_structured_with_id.drop(columns=["id"])

# Structured + text features side by side; the id-free frame is derived from the
# id-carrying one so the concatenation happens only once
X_train_vectorized_with_id = pd.concat(
    [X_train_structured_with_id, X_train_unstructured_llm_embedding], axis=1
)
X_train_vectorized = X_train_vectorized_with_id.drop(columns=['id'])

# Target taken from the merged frame, so it follows train_df's row order.
# NOTE: this rebinds y_train, which was previously aligned with X_train.
y_train = train_df['churn_risk_score']

In [10]:
# Join the validation features with their embedding and label on `id`
valid_df = X_valid.merge(
    valid_text_vectorized[['id', 'embedding_json', 'churn_risk_score']],
    on='id',
    how='outer',
)
# embedding_json is stored as text; turn every cell back into a Python object
valid_df['embedding_json'] = valid_df['embedding_json'].map(ast.literal_eval)

# Text features: one column per embedding dimension, named text_feature_1..N
X_valid_unstructured_llm_embedding = pd.DataFrame(valid_df['embedding_json'].tolist())
X_valid_unstructured_llm_embedding.columns = [
    f"text_feature_{dim}"
    for dim in range(1, X_valid_unstructured_llm_embedding.shape[1] + 1)
]

# Structured features: everything except the raw text, the embedding and the label
X_valid_structured_with_id = valid_df.drop(columns=['feedback', 'embedding_json', 'churn_risk_score'])
X_valid_structured = X_valid_structured_with_id.drop(columns=["id"])

# Structured + text features side by side; the id-free frame is derived from the
# id-carrying one so the concatenation happens only once
X_valid_vectorized_with_id = pd.concat(
    [X_valid_structured_with_id, X_valid_unstructured_llm_embedding], axis=1
)
X_valid_vectorized = X_valid_vectorized_with_id.drop(columns=['id'])

# Target taken from the merged frame, so it follows valid_df's row order.
# NOTE: this rebinds y_valid, which was previously aligned with X_valid.
y_valid = valid_df['churn_risk_score']

In [11]:
# Join the test features with their embedding and label on `id`
test_df = X_test.merge(
    test_text_vectorized[['id', 'embedding_json', 'churn_risk_score']],
    on='id',
    how='outer',
)
# embedding_json is stored as text; turn every cell back into a Python object
test_df['embedding_json'] = test_df['embedding_json'].map(ast.literal_eval)

# Text features: one column per embedding dimension, named text_feature_1..N
X_test_unstructured_llm_embedding = pd.DataFrame(test_df['embedding_json'].tolist())
X_test_unstructured_llm_embedding.columns = [
    f"text_feature_{dim}"
    for dim in range(1, X_test_unstructured_llm_embedding.shape[1] + 1)
]

# Structured features: everything except the raw text, the embedding and the label
X_test_structured_with_id = test_df.drop(columns=['feedback', 'embedding_json', 'churn_risk_score'])
X_test_structured = X_test_structured_with_id.drop(columns=["id"])

# Structured + text features side by side; the id-free frame is derived from the
# id-carrying one so the concatenation happens only once
X_test_vectorized_with_id = pd.concat(
    [X_test_structured_with_id, X_test_unstructured_llm_embedding], axis=1
)
X_test_vectorized = X_test_vectorized_with_id.drop(columns=['id'])

# Target taken from the merged frame, so it follows test_df's row order.
# NOTE: this rebinds y_test, which was previously aligned with X_test.
y_test = test_df['churn_risk_score']

### Pick the best model - "XGBoost with Structured Data"

In [12]:
import xgboost as xgb
from sklearn.metrics import (f1_score, accuracy_score, precision_score, 
                            recall_score, classification_report, confusion_matrix)

# Train an XGBoost classifier on the structured features only (no text embeddings).
# NOTE(review): the recorded output shows perfect training scores against ~0.93 on
# validation/test, so these depth-10 trees appear to overfit the training set.
xgb_model = xgb.XGBClassifier(max_depth=10,
                            random_state=42,
                            # Introduce randomness to make training faster and reduce overfitting
                            subsample=0.8, ## Uses 80% of the data for each tree.
                            colsample_bytree=0.8, ## Uses 80% of the features for each tree.
                            # n_jobs=-1 enables parallel training to speed up fitting
                            n_jobs = -1)
xgb_model.fit(X_train_structured, y_train)

# Predictions on the training, validation and test sets
y_train_pred_xgb = xgb_model.predict(X_train_structured)
y_valid_pred_xgb = xgb_model.predict(X_valid_structured)
y_test_pred_xgb = xgb_model.predict(X_test_structured)

# Accuracy scores
train_accuracy = accuracy_score(y_train, y_train_pred_xgb)
valid_accuracy = accuracy_score(y_valid, y_valid_pred_xgb)
test_accuracy = accuracy_score(y_test, y_test_pred_xgb)

# F1 scores (weighted average over the classes, like precision and recall below)
train_f1_score = f1_score(y_train, y_train_pred_xgb, average='weighted')
valid_f1_score = f1_score(y_valid, y_valid_pred_xgb, average='weighted')
test_f1_score = f1_score(y_test, y_test_pred_xgb, average='weighted')

# Precision scores
train_precision = precision_score(y_train, y_train_pred_xgb, average='weighted')
valid_precision = precision_score(y_valid, y_valid_pred_xgb, average='weighted')
test_precision = precision_score(y_test, y_test_pred_xgb, average='weighted')

# Recall scores
train_recall = recall_score(y_train, y_train_pred_xgb, average='weighted')
valid_recall = recall_score(y_valid, y_valid_pred_xgb, average='weighted')
test_recall = recall_score(y_test, y_test_pred_xgb, average='weighted')

# Report every metric for the three splits, four decimals each
print(f"Train Accuracy: {train_accuracy:.4f}")
print(f"Valid Accuracy:  {valid_accuracy:.4f}")
print(f"Test Accuracy:  {test_accuracy:.4f}\n")

print(f"Train F1-score: {train_f1_score:.4f}")
print(f"Valid F1-score:  {valid_f1_score:.4f}")
print(f"Test F1-score:  {test_f1_score:.4f}\n")

print(f"Train Precision: {train_precision:.4f}")
print(f"Valid Precision:  {valid_precision:.4f}")
print(f"Test Precision:  {test_precision:.4f}\n")

print(f"Train Recall: {train_recall:.4f}")
print(f"Valid Recall:  {valid_recall:.4f}")
print(f"Test Recall:  {test_recall:.4f}\n")

# Per-class breakdown and confusion matrix, test split only
print("Classification Report (Test):")
print(classification_report(y_test, y_test_pred_xgb))

print("Confusion Matrix (Test):")
print(confusion_matrix(y_test, y_test_pred_xgb))

Train Accuracy: 1.0000
Valid Accuracy:  0.9273
Test Accuracy:  0.9273

Train F1-score: 1.0000
Valid F1-score:  0.9271
Test F1-score:  0.9272

Train Precision: 1.0000
Valid Precision:  0.9279
Test Precision:  0.9275

Train Recall: 1.0000
Valid Recall:  0.9273
Test Recall:  0.9273

Classification Report (Test):
              precision    recall  f1-score   support

           0       0.94      0.90      0.92      3396
           1       0.92      0.95      0.93      4003

    accuracy                           0.93      7399
   macro avg       0.93      0.93      0.93      7399
weighted avg       0.93      0.93      0.93      7399

Confusion Matrix (Test):
[[3070  326]
 [ 212 3791]]


## Part 2: Generate SHAP explanations

### 2.1 Prepare SHAP values to be fed into the LLM

In [13]:
# Calculate SHAP values for a stratified sample of the test data (the `id` column is excluded)
import os
import joblib

import shap
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

# Number of test rows to explain
n_samples = 30

# The fitted XGBoost classifier is explained directly
best_model = xgb_model
explainer = shap.TreeExplainer(best_model)

# Label-stratified sample of `n_samples` test rows. With n_splits=1 the splitter
# yields a single index pair, so it is taken with next() instead of a loop.
sss = StratifiedShuffleSplit(n_splits=1, test_size=n_samples, random_state=42)
_, test_index = next(sss.split(X_test_structured_with_id, y_test))
X_test_sampled_with_id = X_test_structured_with_id.iloc[test_index]

# SHAP values are computed on the feature columns only; `id` is dropped first
shap_values = explainer.shap_values(X_test_sampled_with_id.drop(columns=['id']))
expected_value = explainer.expected_value

print("SHAP values calculated")

SHAP values calculated


In [14]:
# Convert SHAP values to a DataFrame: one row per sampled customer, one column per feature
feature_names = X_test_structured_with_id.drop(columns=['id']).columns
shap_df = pd.DataFrame(shap_values, columns=feature_names)

# Add the 'id' column so every SHAP row can be traced back to its customer
shap_df['id'] = X_test_sampled_with_id['id'].values

# Map each customer id to its {feature_name: shap_value} dictionary.
# This is built column-wise rather than with iterrows(): iterrows() returns every
# row as a single-dtype Series, which casts the integer id to float before it is
# used as the dictionary key. If an id repeats, the last row wins.
json_structures = dict(zip(
    shap_df['id'].tolist(),
    shap_df[feature_names].to_dict(orient='records'),
))

In [15]:
def sort_and_get_top_features(features, top_n=10):
    """Return the strongest features of one customer, ranked by absolute value.

    Args:
        features: mapping of feature name -> numeric value (here: SHAP value).
        top_n: how many entries to keep; defaults to 10, the previously fixed size.

    Returns:
        A list of at most `top_n` (feature_name, value) tuples, ordered from the
        largest to the smallest absolute value. Values keep their sign. Ties keep
        the mapping's original order because the sort is stable.
    """
    ranked = sorted(features.items(), key=lambda item: abs(item[1]), reverse=True)
    return ranked[:top_n]

# One record per customer id: its ten most influential features and their SHAP values.
# The records are collected in a list and converted to a DataFrame once. Growing a
# DataFrame with pd.concat inside the loop re-copies all previous rows on every
# iteration and starts from an empty frame, which newer pandas versions warn about.
top_feature_records = []
for id_key, features in json_structures.items():
    top_features = sort_and_get_top_features(features)
    top_feature_records.append({
        "ID": id_key,
        "top10_feature": [name for name, _value in top_features],
        "top10_shap_values": [value for _name, value in top_features],
    })

# Explicit column list keeps the schema stable even when there are no records
features_shap_values = pd.DataFrame(
    top_feature_records,
    columns=["ID", "top10_feature", "top10_shap_values"],
)

In [16]:
# Combine predict and predict_proba in a DataFrame
import pandas as pd
import json

# Hard labels and class probabilities for every test row (structured features only)
labels = xgb_model.predict(X_test_structured)
proba = xgb_model.predict_proba(X_test_structured)

# prediction_label comes first, followed by one prediction_score_<class> column per class
predictions = pd.concat(
    [
        pd.Series(labels, name="prediction_label"),
        pd.DataFrame(proba, columns=[f"prediction_score_{cls}" for cls in xgb_model.classes_]),
    ],
    axis=1,
)

# Give both frames a plain positional index so the column-wise concat pairs rows 1:1
X_test_vectorized_with_id = X_test_vectorized_with_id.reset_index(drop=True)
predictions = predictions.reset_index(drop=True)
combined_df = pd.concat([X_test_vectorized_with_id, predictions], axis=1)

# Round-trip through JSON to obtain plain Python records (a list of dicts)
parsed_json = json.loads(combined_df.to_json(orient='records'))

# Peek at the first record
print(parsed_json[:1])

[{'gender_M': 0, 'region_category_Town': 0, 'region_category_Village': 0, 'joined_through_referral_Yes': 0, 'preferred_offer_types_Gift Vouchers/Coupons': 1, 'preferred_offer_types_Without Offers': 0, 'medium_of_operation_Desktop': 1, 'medium_of_operation_Smartphone': 0, 'internet_option_Mobile_Data': 1, 'internet_option_Wi-Fi': 0, 'used_special_discount_Yes': 1, 'offer_application_preference_Yes': 0, 'past_complaint_Yes': 1, 'id': 2, 'years_since_joining': 4, 'membership_category': 5.0, 'complaint_status': 3.0, 'age': -0.3225616825, 'days_since_last_login': 0.5927148131, 'avg_time_spent': -0.8047866074, 'avg_transaction_value': -0.8451602355, 'avg_frequency_login_days': -0.8803947756, 'points_in_wallet': 0.0337832844, 'text_feature_1': 0.02197793, 'text_feature_2': -0.046620663, 'text_feature_3': -0.03335001, 'text_feature_4': -0.03717942, 'text_feature_5': 0.012610546, 'text_feature_6': -0.005627555, 'text_feature_7': 0.010386296, 'text_feature_8': -0.022074727, 'text_feature_9': -0.

In [17]:
def prepare_input_data(user_id, parsed_json, json_structures):
    """Look up the predicted label and the SHAP values of one customer.

    Args:
        user_id: customer id; anything `int()` accepts (int, numeric string, ...).
        parsed_json: list of record dicts, each with an 'id' and a
            'prediction_label' entry.
        json_structures: mapping of customer id -> {feature_name: shap_value}.

    Returns:
        Tuple `(result, shap_values)`: the record's 'prediction_label' and the
        SHAP dictionary stored for that id.

    Raises:
        KeyError: when the id has no record in `parsed_json` or no entry in
            `json_structures`. Previously a missing record surfaced as a bare
            StopIteration with no message.
    """
    customer_id = int(user_id)

    # First record whose id matches; None when there is no such record
    desired_data = next(
        (item for item in parsed_json if item['id'] == customer_id),
        None,
    )
    if desired_data is None:
        raise KeyError(f"No prediction record found for id {customer_id}")

    # get predicted_label from parsed_json
    result = desired_data['prediction_label']

    shap_values = json_structures[customer_id]

    return result, shap_values

### 2.2 Prompt Template to generate narratives for SHAP explanations

```markdown
# System Prompt
You are a churn explanation assistant. Provide clear, descriptive, business-oriented narratives that non-technical users can understand.

# User Prompt
The model predicted this customer will <CHURN_STATUS>.
Top <TOP_N> features affecting the prediction:
- <FEATURE_1>: <direction - in what direction it affects churn, either increase or decrease>, <magnitude - how strong the impact of the feature is> effect
- <FEATURE_2>: <direction>, <magnitude> effect
...
- <FEATURE_N>: <direction>, <magnitude> effect

Write a concise, business-friendly explanation for a non-technical user (50 words).  
Describe why these features contribute to the prediction, using clear, empathetic language.  
Do not use SHAP jargon or the phrase "based on".
```

---

For example:

```markdown
# System Prompt
You are a churn explanation assistant. Provide clear, descriptive, business-oriented narratives that non-technical users can understand.

# User Prompt
The model predicted this customer will not churn.
Top 5 features affecting the prediction:
- Joined Through Referral Yes: decreases churn, very strong effect
- Past Complaint Yes: increases churn, moderate effect
- Membership Category: decreases churn, strong effect
- Avg Frequency Login Days: decreases churn, strong effect
- Points In Wallet: decreases churn, moderate effect

Write a concise, business-friendly explanation for a non-technical user (50 words).  
Describe why these features contribute to the prediction, using clear, empathetic language.  
Do not use SHAP jargon or the phrase "based on".
```

In [18]:
import os
from typing import Dict, Any
import openai

from dotenv import load_dotenv, find_dotenv

# Load environment variables from the .env file located by find_dotenv();
# the return value is discarded
_ = load_dotenv(find_dotenv())

# API key for the OpenAI calls below; os.getenv returns None when OPENAI_API_KEY is unset
openai_api_key = os.getenv("OPENAI_API_KEY")

# Feature magnitude mapping for SHAP values to reduce ambiguity
def map_shap_magnitude(value):
    """Translate a SHAP value into a coarse strength label.

    Only the absolute size matters. The thresholds are exclusive lower bounds:
    above 5 -> "very strong", above 1 -> "strong", above 0.3 -> "moderate",
    everything else -> "weak".
    """
    size = abs(value)
    # Checked from the highest threshold down; the first one exceeded wins
    for lower_bound, label in ((5, "very strong"), (1, "strong"), (0.3, "moderate")):
        if size > lower_bound:
            return label
    return "weak"

# Generate LLM-generated narratives for ML shap explanations
def generate_churn_explainability(result: int, 
                                  shap_values: dict,
                                  api_key: str,
                                  top_n: int = 5, # Top 5 features are mentioned
                                  model: str = "gpt-4o-mini",
                                  temperature: float = 0.2,
                                  show_prompt=False) -> Dict[str, Any]:
    """
    Generate a business-friendly churn narrative with the OpenAI SDK.

    Args:
        result: predicted label; 1 is worded as "churn", anything else as "not churn".
        shap_values: mapping of feature name -> SHAP value for one customer.
        api_key: key passed to the OpenAI client.
        top_n: maximum number of features (ranked by absolute SHAP value) to describe.
        model: chat model name sent to the API.
        temperature: sampling temperature sent to the API.
        show_prompt: when exactly True, print the system and user prompts first.

    Returns:
        Dict with "top_features" (the bullet lines given to the model) and
        "explanation" (the model's reply, stripped; empty string when the reply
        carries no text content).
    """
    
    # Initialize OpenAI client
    client = openai.OpenAI(api_key=api_key)
    
    # Step 1: Select top features by absolute SHAP value
    top_features = sorted(
        shap_values.items(), key=lambda x: abs(x[1]), reverse=True
    )[:top_n]

    # Step 2: Describe each feature in words (direction + strength) so the LLM never
    # sees raw numbers. A value of exactly 0 is worded as "decreases churn".
    feature_list: list[str] = []
    for feat, val in top_features:
        direction = "increases churn" if val > 0 else "decreases churn"
        magnitude = map_shap_magnitude(val)
        description = feat.replace("_", " ").title()
        feature_list.append(f"- {description}: {direction}, {magnitude} effect")

    feature_text = "\n".join(feature_list)

    # Step 3: Create prompt for explanation.
    # The count in the prompt is the number of features actually listed, which is
    # smaller than top_n when fewer SHAP values are available.
    explanation_prompt = f"""
The model predicted this customer will {'churn' if result == 1 else 'not churn'}.
Top {len(top_features)} features affecting the prediction:
{feature_text}

Write a concise, business-friendly explanation for a non-technical user (50 words).
Describe why these features contribute to the prediction, using clear, empathetic language.
Do not use SHAP jargon or the phrase "based on".
"""
    system_prompt = "You are a churn explanation assistant. Provide clear, descriptive, business-oriented narratives that non-technical users can understand."
    
    if show_prompt is True:
        print("PROMPT TEMPLATE FOR generate_churn_explainability function")
        print("===========")
        print("# System Prompt")
        print(system_prompt)
        print("\n# User Prompt")
        print(explanation_prompt)
        print("===========")
    
    response = client.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": explanation_prompt}
        ]
    )
    # message.content can be None (e.g. when the model returns no text); fall back
    # to an empty string instead of failing on None.strip()
    explanation = (response.choices[0].message.content or "").strip()
    
    # Return results in the same format as the original function
    return {
        "top_features": feature_list,
        'explanation': explanation
    }


- Demo: run `generate_churn_explainability` on the first sampled row

In [19]:
# Run the whole generation step once, for the first sampled customer only,
# printing the prompt that is sent to the model
output_list = []

first_user_id = X_test_sampled_with_id['id'].iloc[0]

# Predicted label and per-feature SHAP dictionary for that customer.
# NOTE: this rebinds `shap_values`; the SHAP array computed earlier is replaced by a dict.
result, shap_values = prepare_input_data(first_user_id, parsed_json, json_structures)

output = generate_churn_explainability(
    result=result, shap_values=shap_values, api_key=openai_api_key, show_prompt=True
)

# Keep the inputs next to the generated narrative
output.update({"shap_values": shap_values, "predicted_label": result})
output_list.append(output)

# Show the complete record
print("Demo: first row output")
print(json.dumps(output, indent=2))


PROMPT TEMPLATE FOR generate_churn_explainability function
# System Prompt
You are a churn explanation assistant. Provide clear, descriptive, business-oriented narratives that non-technical users can understand.

# User Prompt

The model predicted this customer will not churn.
Top 5 features affecting the prediction:
- Membership Category: decreases churn, very strong effect
- Avg Transaction Value: decreases churn, strong effect
- Avg Frequency Login Days: decreases churn, moderate effect
- Age: increases churn, weak effect
- Avg Time Spent: decreases churn, weak effect

Write a concise, business-friendly explanation for a non-technical user (50 words).
Describe why these features contribute to the prediction, using clear, empathetic language.
Do not use SHAP jargon or the phrase "based on".

Demo: first row output
{
  "top_features": [
    "- Membership Category: decreases churn, very strong effect",
    "- Avg Transaction Value: decreases churn, strong effect",
    "- Avg Frequency 

In [20]:
# Generate one narrative per sampled customer and collect the records
output_list = []

# total number of samples
n_samples = len(X_test_sampled_with_id)

for idx, user_id in enumerate(X_test_sampled_with_id['id'], start=1):
    # Predicted label and SHAP dictionary for this customer
    result, shap_values = prepare_input_data(user_id, parsed_json, json_structures)

    # Narrative from the LLM, stored together with the inputs it was built from
    output = {
        **generate_churn_explainability(
            result=result, shap_values=shap_values, api_key=openai_api_key
        ),
        "shap_values": shap_values,
        "predicted_label": result,
    }
    output_list.append(output)

In [21]:
# Make sure the output folder exists (exist_ok=True: no error when it already does)
_ = os.makedirs("../../data/output", exist_ok=True)

# One row per generated narrative, columns taken from the record keys.
# NOTE: the list/dict columns come back as plain strings when this CSV is re-read
# (see the reloaded frame in Part 3), so they have to be parsed again there.
shap_output_df = pd.DataFrame.from_records(output_list)
shap_output_df.to_csv("../../data/output/llm_generated_narratives_on_shap.csv.gz", index=False)

In [22]:
shap_output_df.head(3)

Unnamed: 0,top_features,explanation,shap_values,predicted_label
0,"[- Membership Category: decreases churn, very ...",The model suggests this customer is likely to ...,"{'gender_M': -0.027097957208752632, 'region_ca...",0
1,"[- Membership Category: increases churn, stron...",The model indicates this customer may leave du...,"{'gender_M': 0.02640422061085701, 'region_cate...",1
2,"[- Membership Category: increases churn, stron...",The model indicates this customer is likely to...,"{'gender_M': 0.09955321252346039, 'region_cate...",1


In [23]:
# double-check the dimensions of shap_output_df
shap_output_df.shape

(30, 4)

In [24]:
# check that the two predicted classes are roughly balanced in the sample
shap_output_df["predicted_label"].value_counts()

predicted_label
1    16
0    14
Name: count, dtype: int64

## Part 3: LLM As A Judge

### Prompt Template for LLM As A Judge

```markdown
# System Prompt
You are an impartial judge and an expert in machine learning explainability.  
Your task is to evaluate the alignment between a generated churn narrative and the provided SHAP values.

You will receive:
- The model prediction result
- The customer's profile description
- The top 5 SHAP values
- A generated narrative to evaluate

Your job is to check which features in the narrative align with the SHAP values in terms of **direction** (towards churn or away from churn) and **magnitude** (strong, weak, etc.).

# User Prompt
**[Context Data]**
- Predicted Churn Result: <PREDICTED_LABEL> (0 = will not churn, 1 = will churn)
- Top 5 SHAP Values: <TOP_5_SHAP_VALUES>

**[Generated Narrative to Evaluate]**
<GENERATED_NARRATIVE>

Please return your evaluation in this format:

```json
[
  {
    "feature": "<feature_name>",
    "mentioned_in_text": <True|False>,
    "direction_text": "<direction extracted from text_direction>",
    "direction_shap": "<direction extracted from shap_direction_magnitude>",
    "magnitude_shap": "<magnitude extracted from shap_direction_magnitude>",
    "direction_match": <True|False>,
    "magnitude_match": <True|False>
  },
  ...
]
```

In [25]:
# Reload the narratives saved in Part 2. After the CSV round trip the list/dict
# columns (top_features, shap_values) are plain strings and must be parsed before use.
prepare_for_judge_df = pd.read_csv('../../data/output/llm_generated_narratives_on_shap.csv.gz', compression='gzip')

In [26]:
prepare_for_judge_df.head(3)

Unnamed: 0,top_features,explanation,shap_values,predicted_label
0,"['- Membership Category: decreases churn, very...",The model suggests this customer is likely to ...,"{'gender_M': -0.027097957208752632, 'region_ca...",0
1,"['- Membership Category: increases churn, stro...",The model indicates this customer may leave du...,"{'gender_M': 0.02640422061085701, 'region_cate...",1
2,"['- Membership Category: increases churn, stro...",The model indicates this customer is likely to...,"{'gender_M': 0.09955321252346039, 'region_cate...",1


In [27]:
# get top 5 most important features from SHAP values
def get_top_shap_values_dict(shap_str, top_n=5):
    """Return the `top_n` SHAP entries with the largest absolute value.

    Args:
        shap_str: either a {feature_name: shap_value} dict or its string
            representation (parsed with ast.literal_eval).
        top_n: number of entries to keep.

    Returns:
        A dict of feature name -> original (signed) value, inserted from the
        largest to the smallest absolute value.
    """
    # Accept both the parsed dict and its textual form
    shap_dict = ast.literal_eval(shap_str) if isinstance(shap_str, str) else shap_str

    # Rank by magnitude but keep the signed values
    ranked = sorted(shap_dict.items(), key=lambda pair: abs(pair[1]), reverse=True)
    return dict(ranked[:top_n])

# Keep only the five strongest SHAP contributions of every narrative for the judge
prepare_for_judge_df['top_5_shap_values'] = (
    prepare_for_judge_df['shap_values'].apply(get_top_shap_values_dict, top_n=5)
)



In [28]:
import pandas as pd
import json
from openai import OpenAI

def evaluate_narrative_with_llm(explanation_text, top_5_shap_values, predicted_label, show_prompt=False):
    """
    Ask an OpenAI chat model to judge a narrative against its SHAP values, feature by feature.

    Args:
        explanation_text: the generated narrative to evaluate.
        top_5_shap_values: the SHAP values placed in the prompt as context.
        predicted_label: model prediction (0 = will not churn, 1 = will churn).
        show_prompt: when exactly True, print the system and user prompts first.

    Returns:
        The raw text of the model's reply, or None when the API call raises
        (the error is printed instead of propagated).
    """

    system_prompt = """You are an impartial judge and an expert in machine learning explainability. 
Your task is to evaluate the alignment between a generated churn narrative and the provided SHAP values.

You will receive:
- The model prediction result
- The customer's profile description
- The top 5 SHAP values
- A generated narrative to evaluate

Your job is to check which features in the narrative align with the SHAP values in terms of **direction** 
(towards churn or away from churn) and **magnitude** (strong, weak, etc.).
"""

    user_prompt = f"""
**[Context Data]**
- Predicted Churn Result: {predicted_label} (0 = will not churn, 1 = will churn)
- Top 5 SHAP Values: {top_5_shap_values}

**[Generated Narrative to Evaluate]**
{explanation_text}

Please return your evaluation in this format:

```json
[
  {{
    "feature": "<feature_name>",
    "mentioned_in_text": <True|False>,
    "direction_text": "<direction extracted from text_direction>",
    "direction_shap": "<direction extracted from shap_direction_magnitude>",
    "magnitude_shap": "<magnitude extracted from shap_direction_magnitude>",
    "direction_match": <True|False>,
    "magnitude_match": <True|False>
  }},
  ...
]
```
"""
    # The client uses the notebook-level API key
    client = OpenAI(api_key=openai_api_key)

    if show_prompt is True:
        prompt_dump = (
            "PROMPT TEMPLATE FOR evaluate_narrative_with_llm function",
            "===========",
            "# System Prompt",
            system_prompt,
            "\n# User Prompt",
            user_prompt,
            "===========",
        )
        for dump_line in prompt_dump:
            print(dump_line)

    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # Best effort: any failure while calling the API or reading the reply is
    # reported and turned into None so a batch run can continue
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=chat_messages,
            temperature=0.1,
            max_tokens=1000,
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error calling OpenAI API: {e}")
        return None


In [29]:
# Just evaluate the first explanation
import re

results = []

print("Evaluating first explanation only...")

# First row that actually has a narrative
first_row = prepare_for_judge_df.dropna(subset=['explanation']).iloc[0]

evaluation = evaluate_narrative_with_llm(
    first_row['explanation'],
    first_row['top_5_shap_values'],
    first_row['predicted_label'],
    show_prompt=True
)

if evaluation:
    try:
        # Extract JSON inside ```json ... ```; fall back to the whole reply
        match = re.search(r"```json\s*(\[.*?\])\s*```", evaluation, re.DOTALL)
        if match:
            clean_eval = match.group(1)
        else:
            clean_eval = evaluation.strip()

        feature_data = json.loads(clean_eval)  # list of per-feature dicts

        # ---- Compute factual accuracy score (0–10) ----
        # One point per feature for a matching direction, one for a matching magnitude
        direction_points = sum(1 for f in feature_data if f.get("direction_match"))
        magnitude_points = sum(1 for f in feature_data if f.get("magnitude_match"))
        correct_points = direction_points + magnitude_points
        max_points = 2 * len(feature_data)
        accuracy_score = (correct_points / max_points) * 10 if max_points > 0 else 0

        # ---- Calculate completeness based on mentioned features ----
        # Share of the 5 top features that the narrative mentions, scaled to 0-10
        mentioned_features = sum(1 for f in feature_data if f.get("mentioned_in_text"))
        completeness_raw = mentioned_features  # Count of features mentioned in text (0-5)
        completeness_score_10 = round((completeness_raw / 5) * 10, 1)  # Scale to 0-10

        # ---- Build eval_data summary ----
        eval_data = {
            "completeness": {"score": completeness_score_10},
            "accuracy": {
                "score": round(accuracy_score, 1),
                "direction_points": direction_points,
                "magnitude_points": magnitude_points
            },
            "raw_features": feature_data
        }

        results.append({
            'index': first_row.name,
            'explanation': first_row['explanation'],
            'predicted_label': first_row['predicted_label'],
            "top_5_shap_magnitudes": {feature: map_shap_magnitude(value) for feature, value in first_row['top_5_shap_values'].items()},
            "top_5_shap_values": first_row['top_5_shap_values'],
            'evaluation': eval_data
        })

    except json.JSONDecodeError:
        print("Failed to parse JSON for the first row, storing raw output.")
        results.append({
            'index': first_row.name,
            'evaluation': evaluation
        })

# Save results
evaluation_df = pd.DataFrame(results)
print("\nDemo:\n=========")
if "predicted_label" in evaluation_df.columns:
    # Parsed successfully: show the full summary
    print("\nPredicted Label: ", evaluation_df["predicted_label"][0])
    print("\nExplanation: ", evaluation_df["explanation"][0])
    print("\nTop 5 SHAP magnitudes: ", evaluation_df["top_5_shap_magnitudes"][0])
    print("\nTop 5 SHAP values: ", evaluation_df["top_5_shap_values"][0])
    print("\nEvaluation: ", json.dumps(evaluation_df["evaluation"][0], indent=2))
elif results:
    # The judge answered but its reply was not valid JSON: only the raw text is stored
    print("\nRaw (unparsed) evaluation: ", evaluation_df["evaluation"][0])
else:
    # The API call failed or returned nothing, so there is nothing to show
    print("\nNo evaluation available for the first row.")


Evaluating first explanation only...
PROMPT TEMPLATE FOR evaluate_narrative_with_llm function
# System Prompt
You are an impartial judge and an expert in machine learning explainability. 
Your task is to evaluate the alignment between a generated churn narrative and the provided SHAP values.

You will receive:
- The model prediction result
- The customer's profile description
- The top 5 SHAP values
- A generated narrative to evaluate

Your job is to check which features in the narrative align with the SHAP values in terms of **direction** 
(towards churn or away from churn) and **magnitude** (strong, weak, etc.).


# User Prompt

**[Context Data]**
- Predicted Churn Result: 0 (0 = will not churn, 1 = will churn)
- Top 5 SHAP Values: {'membership_category': -7.076250076293945, 'avg_transaction_value': -3.1553311347961426, 'avg_frequency_login_days': -0.8629294037818909, 'age': 0.22118932008743286, 'avg_time_spent': -0.1626385748386383}

**[Generated Narrative to Evaluate]**
The model s

In [30]:
# Evaluate all explanations
import re

results = []

print(f"Evaluating {len(prepare_for_judge_df)} explanations...")

for idx, row in prepare_for_judge_df.iterrows():
    if pd.isna(row['explanation']):
        continue
    
    print(f"Evaluating explanation {idx+1}/{len(prepare_for_judge_df)}")
    
    evaluation = evaluate_narrative_with_llm(
        row['explanation'],
        row['top_5_shap_values'],
        row['predicted_label']
    )

    if evaluation:
        try:
            # Extract JSON inside ```json ... ```
            match = re.search(r"```json\s*(\[.*?\])\s*```", evaluation, re.DOTALL)
            if match:
                clean_eval = match.group(1)
            else:
                clean_eval = evaluation.strip()

            feature_data = json.loads(clean_eval)  # list of per-feature dicts

            # ---- Compute factual accuracy score (0–10) ----
            direction_points = sum(1 for f in feature_data if f.get("direction_match"))
            magnitude_points = sum(1 for f in feature_data if f.get("magnitude_match"))
            correct_points = direction_points + magnitude_points
            max_points = 2 * len(feature_data)
            accuracy_score = (correct_points / max_points) * 10 if max_points > 0 else 0

            # ---- Scale completeness (from LLM 1–5 → 1–10) ----
            mentioned_features = sum(1 for f in feature_data if f.get("mentioned_in_text"))
            completeness_raw = mentioned_features  # Count of features mentioned in text (0-5)
            completeness_score_10 = round((completeness_raw / 5) * 10, 1)  # Scale to 0-10

            # ---- Build eval_data summary ----
            eval_data = {
                "completeness": {"score": completeness_score_10},
                "accuracy": {
                    "score": round(accuracy_score, 1),
                    "direction_points": direction_points,
                    "magnitude_points": magnitude_points
                },
                "raw_features": feature_data
            }

            results.append({
                'index': idx,
                'explanation': row['explanation'],
                'predicted_label': row['predicted_label'],
                "top_5_shap_magnitudes": {feature: map_shap_magnitude(value) for feature, value in row['top_5_shap_values'].items()},
                "top_5_shap_values": row['top_5_shap_values'],
                'evaluation': eval_data
            })

        except json.JSONDecodeError:
            print(f"Failed to parse JSON for row {idx}, storing raw output.")
            results.append({
                'index': idx,
                'evaluation': evaluation
            })

# Save results
evaluation_df = pd.DataFrame(results)


Evaluating 30 explanations...
Evaluating explanation 1/30
Evaluating explanation 2/30
Evaluating explanation 3/30
Evaluating explanation 4/30
Evaluating explanation 5/30
Evaluating explanation 6/30
Evaluating explanation 7/30
Evaluating explanation 8/30
Evaluating explanation 9/30
Evaluating explanation 10/30
Evaluating explanation 11/30
Evaluating explanation 12/30
Evaluating explanation 13/30
Evaluating explanation 14/30
Evaluating explanation 15/30
Evaluating explanation 16/30
Evaluating explanation 17/30
Evaluating explanation 18/30
Evaluating explanation 19/30
Evaluating explanation 20/30
Evaluating explanation 21/30
Evaluating explanation 22/30
Evaluating explanation 23/30
Evaluating explanation 24/30
Evaluating explanation 25/30
Evaluating explanation 26/30
Evaluating explanation 27/30
Evaluating explanation 28/30
Evaluating explanation 29/30
Evaluating explanation 30/30


In [31]:
# Persist the judge results; pandas infers gzip compression from the .gz suffix.
results_path = '../../data/output/llm_judge_evaluation_results.csv.gz'

evaluation_df = pd.DataFrame(results)
evaluation_df.to_csv(results_path, index=False)

print("Results saved to 'data/output/llm_judge_evaluation_results.csv.gz'")

Results saved to 'data/output/llm_judge_evaluation_results.csv.gz'


In [32]:
# Double-check the dimensions of evaluation_df as a (rows, columns) tuple.
evaluation_df.shape

(30, 6)

In [33]:
# Preview the first three evaluation records.
evaluation_df.head(3)

Unnamed: 0,index,explanation,predicted_label,top_5_shap_magnitudes,top_5_shap_values,evaluation
0,0,The model suggests this customer is likely to ...,0,"{'membership_category': 'very strong', 'avg_tr...","{'membership_category': -7.076250076293945, 'a...","{'completeness': {'score': 10.0}, 'accuracy': ..."
1,1,The model indicates this customer may leave du...,1,"{'membership_category': 'strong', 'avg_frequen...","{'membership_category': 2.8404221534729004, 'a...","{'completeness': {'score': 10.0}, 'accuracy': ..."
2,2,The model indicates this customer is likely to...,1,"{'membership_category': 'strong', 'avg_frequen...","{'membership_category': 2.958069324493408, 'av...","{'completeness': {'score': 10.0}, 'accuracy': ..."


## Part 4: Evaluation Results Analysis

In [34]:
import ast


def _as_dict(cell):
    """Turn a stringified dict back into a dict; pass anything else through."""
    if isinstance(cell, str):
        return ast.literal_eval(cell)
    return cell


# Normalise the 'evaluation' column so every entry is an actual dict
evaluation_df['evaluation'] = evaluation_df['evaluation'].map(_as_dict)

# Pull the two headline scores out of each evaluation dict
accuracy_scores = evaluation_df['evaluation'].map(lambda ev: ev['accuracy']['score']).values
completeness_scores = evaluation_df['evaluation'].map(lambda ev: ev['completeness']['score']).values

divider = "=" * 50

# Per-row listing
print("Individual Scores for Each Row:")
print(divider)
for row_number, (acc, comp) in enumerate(zip(accuracy_scores, completeness_scores), start=1):
    print(f"Row {row_number}: Accuracy = {acc}, Completeness = {comp}")

# Aggregate summary
print("\n" + divider)
print("Summary Statistics:")
print(f"Average Accuracy Score: {accuracy_scores.mean():.2f}")
print(f"Average Completeness Score: {completeness_scores.mean():.2f}")
print(f"Accuracy Score Range: {accuracy_scores.min():.1f} - {accuracy_scores.max():.1f}")
print(f"Completeness Score Range: {completeness_scores.min():.1f} - {completeness_scores.max():.1f}")

# Tabular view of the same scores alongside the predicted label
n_scored = len(accuracy_scores)
scores_df = pd.DataFrame({
    'Row': range(1, n_scored + 1),
    'Accuracy_Score': accuracy_scores,
    'Completeness_Score': completeness_scores,
    'Predicted_Label': evaluation_df['predicted_label'].values
})

print("\nDetailed Scores DataFrame:")
print(scores_df)

Individual Scores for Each Row:
Row 1: Accuracy = 10.0, Completeness = 10.0
Row 2: Accuracy = 10.0, Completeness = 10.0
Row 3: Accuracy = 10.0, Completeness = 10.0
Row 4: Accuracy = 8.0, Completeness = 8.0
Row 5: Accuracy = 8.0, Completeness = 8.0
Row 6: Accuracy = 10.0, Completeness = 10.0
Row 7: Accuracy = 7.0, Completeness = 8.0
Row 8: Accuracy = 9.0, Completeness = 10.0
Row 9: Accuracy = 10.0, Completeness = 10.0
Row 10: Accuracy = 10.0, Completeness = 10.0
Row 11: Accuracy = 10.0, Completeness = 10.0
Row 12: Accuracy = 8.0, Completeness = 10.0
Row 13: Accuracy = 9.0, Completeness = 10.0
Row 14: Accuracy = 10.0, Completeness = 10.0
Row 15: Accuracy = 10.0, Completeness = 10.0
Row 16: Accuracy = 10.0, Completeness = 10.0
Row 17: Accuracy = 9.0, Completeness = 10.0
Row 18: Accuracy = 10.0, Completeness = 10.0
Row 19: Accuracy = 9.0, Completeness = 10.0
Row 20: Accuracy = 10.0, Completeness = 10.0
Row 21: Accuracy = 8.0, Completeness = 8.0
Row 22: Accuracy = 8.0, Completeness = 8.0
Ro

In [35]:
# Key statistical analysis: section banner
banner = "=" * 60
print("\n" + banner)
print("KEY STATISTICAL ANALYSIS")
print(banner)

# 1. Score distributions: number of explanations per distinct score, ordered by score
print("\n1. SCORE DISTRIBUTIONS:")
print("-" * 40)

print("Accuracy Score Distribution:")
accuracy_series = pd.Series(accuracy_scores)
accuracy_counts = accuracy_series.value_counts().sort_index()
print(accuracy_counts)

print("\nCompleteness Score Distribution:")
completeness_series = pd.Series(completeness_scores)
completeness_counts = completeness_series.value_counts().sort_index()
print(completeness_counts)



KEY STATISTICAL ANALYSIS

1. SCORE DISTRIBUTIONS:
----------------------------------------
Accuracy Score Distribution:
4.0      1
7.0      2
8.0      9
9.0      4
10.0    14
Name: count, dtype: int64

Completeness Score Distribution:
8.0      9
10.0    21
Name: count, dtype: int64


In [36]:

# 2. Performance by predicted label (most important):
#    mean score and row count per predicted class.
print("\n2. PERFORMANCE BY PREDICTED LABEL:")
print("-" * 40)

predicted_labels = evaluation_df['predicted_label'].values
label_analysis = pd.DataFrame({
    'Predicted_Label': predicted_labels,
    'Accuracy_Score': accuracy_scores,
    'Completeness_Score': completeness_scores
})

# Same pair of aggregations for both score columns
score_aggregations = {
    column: ['mean', 'count']
    for column in ('Accuracy_Score', 'Completeness_Score')
}
grouped_by_label = label_analysis.groupby('Predicted_Label')
label_stats = grouped_by_label.agg(score_aggregations).round(2)
print(label_stats)


2. PERFORMANCE BY PREDICTED LABEL:
----------------------------------------
                Accuracy_Score       Completeness_Score      
                          mean count               mean count
Predicted_Label                                              
0                         9.07    14               9.29    14
1                         8.69    16               9.50    16


In [37]:

# 3. High-quality explanations count (business-relevant):
#    a score of 7 or more on the 0-10 scale counts as "high".
print("\n3. HIGH-QUALITY EXPLANATIONS:")
print("-" * 40)

# Boolean masks, computed once and reused for the combined count
accuracy_is_high = accuracy_scores >= 7
completeness_is_high = completeness_scores >= 7

high_accuracy = accuracy_is_high.sum()
high_completeness = completeness_is_high.sum()
both_high = (accuracy_is_high & completeness_is_high).sum()

n_accuracy = len(accuracy_scores)
n_completeness = len(completeness_scores)

print(f"High accuracy (≥7): {high_accuracy}/{n_accuracy} ({high_accuracy/n_accuracy*100:.1f}%)")
print(f"High completeness (≥7): {high_completeness}/{n_completeness} ({high_completeness/n_completeness*100:.1f}%)")
print(f"Both high (≥7): {both_high}/{n_accuracy} ({both_high/n_accuracy*100:.1f}%)")


3. HIGH-QUALITY EXPLANATIONS:
----------------------------------------
High accuracy (≥7): 29/30 (96.7%)
High completeness (≥7): 30/30 (100.0%)
Both high (≥7): 29/30 (96.7%)
