# Evaluation on LLM-generated narratives for SHAP explanations

## Part 1: Data loading & Processing

In [2]:
# output the cleaned data
import pandas as pd

merged_data_final = pd.read_csv("../../data/processed/cleaned_data.csv.gz", compression="gzip")

X = merged_data_final.drop(['churn_risk_score'] ,axis = 1)
y = merged_data_final['churn_risk_score']

In [3]:
X = merged_data_final.drop(['churn_risk_score'] ,axis = 1)
y = merged_data_final['churn_risk_score']

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(
                    X, y, train_size=0.6, 
                    stratify= y,
                    random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(
                    X_temp, y_temp, train_size=0.5,
                    stratify=y_temp,
                    random_state=42)

In [5]:
# Export train and test dataset to `data` folder
pd.concat([X_train, y_train], axis=1).to_csv("../../data/input/train.csv.gz", index=False)
pd.concat([X_valid, y_valid], axis=1).to_csv("../../data/input/valid.csv.gz", index=False)
pd.concat([X_test, y_test], axis=1).to_csv("../../data/input/test.csv.gz", index=False)

### Text Representation with LLM embedding

In [6]:
import os
import pandas as pd 
from utils.prepare_llm_embedding import generate_embeddings_from_series 

EMBEDDING_TRAIN = "../../data/processed/llm_embedding_train.csv.gz"
if os.path.exists(EMBEDDING_TRAIN):
    pass
else:
    processed_text_series = pd.Series(X_train["feedback"].to_list(),
                                    index=X_train['id'].to_list()) 
    llm_embedding_train = generate_embeddings_from_series(processed_text_series,
                            additional_data={"churn_risk_score": y_train.to_list()},
                            output_csv_path="../../data/processed/llm_embedding_train.csv.gz",
                            max_workers=20)
    print(llm_embedding_train)
    

In [7]:
import os
import pandas as pd 
from utils.prepare_llm_embedding import generate_embeddings_from_series 

EMBEDDING_VALID = "../../data/processed/llm_embedding_valid.csv.gz"
if os.path.exists(EMBEDDING_VALID):
    pass
else:
    processed_text_series = pd.Series(X_valid["feedback"].to_list(),
                                    index=X_valid['id'].to_list()) 
    llm_embedding_valid = generate_embeddings_from_series(processed_text_series,
                            additional_data={"churn_risk_score": y_valid.to_list()},
                            output_csv_path="../../data/processed/llm_embedding_valid.csv.gz",
                            max_workers=20)
    print(llm_embedding_valid)

In [8]:
import os
import pandas as pd 
from utils.prepare_llm_embedding import generate_embeddings_from_series 

EMBEDDING_TEST = "../../data/processed/llm_embedding_test.csv.gz"
if os.path.exists(EMBEDDING_TEST):
    pass
else:
    processed_text_series = pd.Series(X_test["feedback"].to_list(),
                                    index=X_test['id'].to_list()) 
    llm_embedding_test= generate_embeddings_from_series(processed_text_series,
                            additional_data={"churn_risk_score": y_test.to_list()},
                            output_csv_path="../../data/processed/llm_embedding_test.csv.gz",
                            max_workers=20)
    print(llm_embedding_test)

### Combine LLM embeddings with structured data

In [9]:
train_text_vectorized = pd.read_csv("../../data/processed/llm_embedding_train.csv.gz", compression="gzip")
valid_text_vectorized = pd.read_csv("../../data/processed/llm_embedding_valid.csv.gz", compression="gzip")
test_text_vectorized = pd.read_csv("../../data/processed/llm_embedding_test.csv.gz", compression="gzip")

In [10]:
import ast

# Create train df which combines all structured data and textual data
train_df = pd.merge(X_train, train_text_vectorized[['id', 'embedding_json', 'churn_risk_score']], on='id', how='outer')
train_df['embedding_json'] = train_df['embedding_json'].apply(ast.literal_eval)

# create dataframe with columns with 'unstructured data'
X_train_unstructured_llm_embedding = pd.DataFrame(train_df['embedding_json'].tolist())
X_train_unstructured_llm_embedding.columns = [f"text_feature_{i+1}" for i in range(X_train_unstructured_llm_embedding.shape[1])]

# create dataframe with columns with 'structured data'
X_train_structured = train_df.drop(columns=['feedback', 'embedding_json', 'churn_risk_score'])

# Concatenate with both dataframe with structured and unstructured data
X_train_vectorized = pd.concat([X_train_structured, X_train_unstructured_llm_embedding], axis=1).drop(columns=['id'])
X_train_vectorized_with_id = pd.concat([X_train_structured, X_train_unstructured_llm_embedding], axis=1)

# Create target variable
y_train = train_df['churn_risk_score']

In [11]:
# Create validation df which combines all structured data and textual data
valid_df = pd.merge(X_valid, valid_text_vectorized[['id', 'embedding_json', 'churn_risk_score']], on='id', how='outer')
valid_df['embedding_json'] = valid_df['embedding_json'].apply(ast.literal_eval)

# create dataframe with columns with 'unstructured data'
X_valid_unstructured_llm_embedding = pd.DataFrame(valid_df['embedding_json'].tolist())
X_valid_unstructured_llm_embedding.columns = [f"text_feature_{i+1}" for i in range(X_valid_unstructured_llm_embedding.shape[1])]

# create dataframe with columns with 'structured data'
X_valid_structured = valid_df.drop(columns=['feedback', 'embedding_json', 'churn_risk_score'])

# Concatenate with both dataframe with structured and unstructured data
X_valid_vectorized  = pd.concat([X_valid_structured, X_valid_unstructured_llm_embedding], axis=1).drop(columns=['id'])
X_valid_vectorized_with_id  = pd.concat([X_valid_structured, X_valid_unstructured_llm_embedding], axis=1)

# Create target variable
y_valid = valid_df['churn_risk_score']

In [12]:
# Create test df which combines all structured data and textual data
test_df = pd.merge(X_test, test_text_vectorized[['id', 'embedding_json', 'churn_risk_score']], on='id', how='outer')
test_df['embedding_json'] = test_df['embedding_json'].apply(ast.literal_eval)

# create dataframe with columns with 'unstructured data'
X_test_unstructured_llm_embedding = pd.DataFrame(test_df['embedding_json'].tolist())
X_test_unstructured_llm_embedding.columns = [f"text_feature_{i+1}" for i in range(X_test_unstructured_llm_embedding.shape[1])]

# create dataframe with columns with 'structured data'
X_test_structured = test_df.drop(columns=['feedback', 'embedding_json', 'churn_risk_score'])

## Concatenate with original dataframe (drop embedding_json)
X_test_vectorized = pd.concat([X_test_structured, X_test_unstructured_llm_embedding], axis=1).drop(columns=['id'])
X_test_vectorized_with_id = pd.concat([X_test_structured, X_test_unstructured_llm_embedding], axis=1)

# Create target variable
y_test = test_df['churn_risk_score']

### Pick the best model

In [13]:
import xgboost as xgb
from sklearn.metrics import (f1_score, accuracy_score, precision_score, 
                            recall_score, classification_report, confusion_matrix)

# Train the model
xgb_model = xgb.XGBClassifier(max_depth=10,
                            random_state=42,
                            # Introduce randomness to make training faster and reduce overfitting
                            subsample=0.8, ## Uses 80% of the data for each tree.
                            colsample_bytree=0.8, ## Uses 80% of the features for each tree.
                            # the parameters below make the model trained faster by enabling parallelism
                            n_jobs = -1)
xgb_model.fit(X_train_vectorized, y_train)

# Predictions on training and test sets
y_train_pred_xgb = xgb_model.predict(X_train_vectorized)
y_valid_pred_xgb = xgb_model.predict(X_valid_vectorized)
y_test_pred_xgb = xgb_model.predict(X_test_vectorized)

# Accuracy scores
train_accuracy = accuracy_score(y_train, y_train_pred_xgb)
valid_accuracy = accuracy_score(y_valid, y_valid_pred_xgb)
test_accuracy = accuracy_score(y_test, y_test_pred_xgb)

# F1 scores
train_f1_score = f1_score(y_train, y_train_pred_xgb, average='weighted')
valid_f1_score = f1_score(y_valid, y_valid_pred_xgb, average='weighted')
test_f1_score = f1_score(y_test, y_test_pred_xgb, average='weighted')

# Precision scores
train_precision = precision_score(y_train, y_train_pred_xgb, average='weighted')
valid_precision = precision_score(y_valid, y_valid_pred_xgb, average='weighted')
test_precision = precision_score(y_test, y_test_pred_xgb, average='weighted')

# Recall scores
train_recall = recall_score(y_train, y_train_pred_xgb, average='weighted')
valid_recall = recall_score(y_valid, y_valid_pred_xgb, average='weighted')
test_recall = recall_score(y_test, y_test_pred_xgb, average='weighted')

# Output
print(f"Train Accuracy: {train_accuracy:.4f}")
print(f"Valid Accuracy:  {valid_accuracy:.4f}")
print(f"Test Accuracy:  {test_accuracy:.4f}\n")

print(f"Train F1-score: {train_f1_score:.4f}")
print(f"Valid F1-score:  {valid_f1_score:.4f}")
print(f"Test F1-score:  {test_f1_score:.4f}\n")

print(f"Train Precision: {train_precision:.4f}")
print(f"Valid Precision:  {valid_precision:.4f}")
print(f"Test Precision:  {test_precision:.4f}\n")

print(f"Train Recall: {train_recall:.4f}")
print(f"Valid Recall:  {valid_recall:.4f}")
print(f"Test Recall:  {test_recall:.4f}\n")

print("Classification Report (Test):")
print(classification_report(y_test, y_test_pred_xgb))

print("Confusion Matrix (Test):")
print(confusion_matrix(y_test, y_test_pred_xgb))


Train Accuracy: 1.0000
Valid Accuracy:  0.9278
Test Accuracy:  0.9284

Train F1-score: 1.0000
Valid F1-score:  0.9276
Test F1-score:  0.9282

Train Precision: 1.0000
Valid Precision:  0.9286
Test Precision:  0.9287

Train Recall: 1.0000
Valid Recall:  0.9278
Test Recall:  0.9284

Classification Report (Test):
              precision    recall  f1-score   support

           0       0.94      0.90      0.92      3396
           1       0.92      0.95      0.93      4003

    accuracy                           0.93      7399
   macro avg       0.93      0.93      0.93      7399
weighted avg       0.93      0.93      0.93      7399

Confusion Matrix (Test):
[[3068  328]
 [ 202 3801]]


## Part 2: Generate SHAP explanations

In [14]:
# note: add 1 more sequential chain which interpret sentiment from textual data

api_key = os.getenv('OPENAI_API_KEY')

In [15]:
import shap
import pandas as pd

# Extract the model from the pipeline
best_model = xgb_model

# Create SHAP TreeExplainer using the extracted model
explainer = shap.TreeExplainer(xgb_model)

In [16]:
# Calculate SHAP values for the test data (excluding CustomerId)
# This took more than 1 hour to run...
import os
import joblib

shap_output_path = '../../models/shap_output.pkl'

if os.path.exists(shap_output_path):
    # Load precomputed SHAP values
    shap_values, expected_value = joblib.load(shap_output_path)
    print("Loaded SHAP values from cache.")
else:
    # Calculate SHAP values
    shap_values = explainer.shap_values(X_test_vectorized_with_id.drop(columns=['id']))
    expected_value = explainer.expected_value

    # Save SHAP values and base value for reuse
    joblib.dump((shap_values, expected_value), shap_output_path)
    print("SHAP values calculated and saved.")


Loaded SHAP values from cache.


In [17]:
# Convert SHAP values to DataFrame for easier manipulation
## [:,:,1] means i want to get shap values for positive class...
shap_df = pd.DataFrame(shap_values, columns=X_test_vectorized_with_id.drop(columns=['id']).columns)

# Add 'CustomerId' column to shap_df for alignment
shap_df['id'] = X_test_vectorized_with_id['id'].values

# Initialize a dictionary to store the JSON structures
json_structures = {}

# Generate a JSON structure for each row in shap_df
for index, row in shap_df.iterrows():
    # Create a dictionary for the current row
    row_dict = row.to_dict()

    # Use CustomerId as the key for the JSON structure and remove it from the values
    customer_id = row_dict.pop('id')
    json_structures[customer_id] = row_dict


In [18]:
from langchain.llms import OpenAI
import time
from langchain_openai.chat_models import ChatOpenAI
from langchain.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.chains import LLMChain, SequentialChain

In [19]:
def sort_and_get_top_features(features):
    sorted_features = sorted(features.items(), key=lambda item: abs(item[1]), reverse=True)
    top_features = sorted_features[:10]
    return top_features 

# Create an empty DataFrame
features_shap_values = pd.DataFrame()

# Iterate over each ID key
for id_key, features in json_structures.items():
    sorted_features_df = sort_and_get_top_features(features)
    keys = [key for key, _ in sorted_features_df]
    values = [value for _, value in sorted_features_df]
    features_shap_values = pd.concat([features_shap_values, pd.DataFrame({"ID": id_key,
                                                                          "top10_feature": [keys],
                                                                          "top10_shap_values":[values]})])
                                     
features_shap_values = features_shap_values.reset_index(drop=True)

In [20]:
# Combine predict and predict_proba in a DataFrame
import pandas as pd
import json

# Predict labels and probabilities
labels = rf_model.predict(X_test_vectorized)
proba = rf_model.predict_proba(X_test_vectorized)

# Create predictions DataFrame
predictions = pd.DataFrame(proba, columns=[f"prediction_score_{cls}" for cls in rf_model.classes_])
predictions.insert(0, "prediction_label", labels)

# Reset index if necessary (to ensure alignment during concat)
X_test_vectorized_with_id = X_test_vectorized_with_id.reset_index(drop=True)
predictions = predictions.reset_index(drop=True)

# Combine features with predictions
combined_df = pd.concat([X_test_vectorized_with_id, predictions], axis=1)

# Convert to JSON (list of dicts)
parsed_json = json.loads(combined_df.to_json(orient='records'))

# Example output
print(parsed_json[:1])

NameError: name 'rf_model' is not defined

In [None]:
def prepare_input_data(user_id, parsed_json, json_structures):
    desired_data = next(item for item in parsed_json if item['id'] == int(user_id))

    # get features from parsed_json, but exclude the text features
    meta_data = {
        'id': desired_data['id'],
        'gender_M': desired_data['gender_M'],
        'region_category_Town': desired_data['region_category_Town'],
        'region_category_Village': desired_data['region_category_Village'],
        'joined_through_referral_Yes': desired_data['joined_through_referral_Yes'],
        'preferred_offer_types_Gift Vouchers/Coupons': desired_data['preferred_offer_types_Gift Vouchers/Coupons'],
        'preferred_offer_types_Without Offers': desired_data['preferred_offer_types_Without Offers'],
        'medium_of_operation_Desktop': desired_data['medium_of_operation_Desktop'],
        'medium_of_operation_Smartphone': desired_data['medium_of_operation_Smartphone'],
        'internet_option_Mobile_Data': desired_data['internet_option_Mobile_Data'],
        'internet_option_Wi-Fi': desired_data['internet_option_Wi-Fi'],
        'used_special_discount_Yes': desired_data['used_special_discount_Yes'],
        'offer_application_preference_Yes': desired_data['offer_application_preference_Yes'],
        'past_complaint_Yes': desired_data['past_complaint_Yes'],
        'year': desired_data['year'],
        'membership_category': desired_data['membership_category'],
        'complaint_status': desired_data['complaint_status'],
        'age': desired_data['age'],
        'days_since_last_login': desired_data['days_since_last_login'],
        'avg_time_spent': desired_data['avg_time_spent'],
        'avg_transaction_value': desired_data['avg_transaction_value'],
        'avg_frequency_login_days': desired_data['avg_frequency_login_days'],
        'points_in_wallet': desired_data['points_in_wallet'],
    }

    print('metadata:')
    print(meta_data)
    print('\n')

    # get predicted_label from parsed_json
    result = desired_data['prediction_label']
    
    shap_values = json_structures[int(user_id)]
    
    return meta_data, result, shap_values

In [None]:
def generate_churn_explainability(meta_data: dict, 
                                  result: int, shap_values: dict, 
                                  ## input the customer feedback if you want to perform text analysis on customer feedback via LLM
                                  customer_feedback: str):

    # Create Model
    llm = ChatOpenAI(model ="gpt-4.1-mini", temperature=0.4, openai_api_key = api_key)
    
    columns = ['gender_M', 'region_category_Town', 'region_category_Village', 'joined_through_referral_Yes', 
            'preferred_offer_types_Gift Vouchers/Coupons', 'preferred_offer_types_Without Offers',
            'medium_of_operation_Desktop', 'medium_of_operation_Smartphone', 'internet_option_Mobile_Data', 
            'internet_option_Wi-Fi', 'used_special_discount_Yes', 'offer_application_preference_Yes',
            'past_complaint_Yes', 'year', 'membership_category', 'complaint_status', 'age', 'days_since_last_login', 
            'avg_time_spent', 'avg_transaction_value', 'avg_frequency_login_days', 'points_in_wallet'
    ]

    # Get columns  for user 
    template0 = "Explain the significance and potential usage of the following data columns in a dataset: {columns}. \
                  Provide a simple description for each column."
    prompt0 = ChatPromptTemplate.from_template(template0)
    chain_0 = LLMChain(llm=llm,
                   prompt=prompt0,
                   output_key="column_descriptions")


    # Template 1: Get meta data description for user 
    template1 = "Explain {meta_data} based on the information provided by {column_descriptions}."
    prompt1 = ChatPromptTemplate.from_template(template1)
    chain_1 = LLMChain(llm=llm,
                       prompt=prompt1,
                       output_key="meta_data_description")

    # Template 2: Get explainability for the user
    system_template = "You are a churn problem explainability assistant. Your goal is to understand the  complexities\
                       of customer loss and uncover why customers are leaving a service or product in business models.\
                       The explanations should be simple and understandable for users to analyze."
    system_message_prompt = SystemMessagePromptTemplate.from_template(system_template)

    human_template = "The churn result of a customer, which is {result} value has been obtained in a machine\
                     learning model using the {meta_data_description}. The {shap_values} represent the Shapley\
                     values for each feature, indicating why the model produced this particular result. Shapley\
                     values provide insights into the contribution of each feature to the final prediction. \
                     Explain why such a result was obtained in a way that the user can understand. Don't explain\
                     what the Shapley value is in a technical manner, using language that a non-technical\
                     person can understand. Maximum text length should be 200 words. Provide a concise explanation\
                     without using the phrase 'based on."
    human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)
    prompt2 = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])
    chain_2 = LLMChain(llm=llm,
                       prompt=prompt2,
                       output_key="explanation")

    ## Template 3: 
    template3 = "Identify the main pain points, complaints, or issues mentioned in this customer feedback: {customer_feedback}. \
            Extract the specific problems the customer is experiencing and categorize them (e.g., product issues, \
            service problems, pricing concerns, usability issues)."
    prompt3 = ChatPromptTemplate.from_template(template3)
    chain_3 = LLMChain(llm=llm,
                           prompt=prompt3,
                           output_key="sentiment")
    
    # Template 4: 
    template4 = "Provide strategies based on {explanation} to prevent user loss. Explain in language that a non-technical \
                 person can understand. Maximum text length should be 200 words. Provide a concise explanation without using the \
                 phrase 'based on."
    prompt4 = ChatPromptTemplate.from_template(template4)
    chain_4 = LLMChain(llm=llm,
                       prompt=prompt4,
                       output_key="recommendation")
    

    # Define the  chain of operations with the new chain
    seq_chain = SequentialChain(chains=[chain_0, chain_1, chain_2, chain_3, chain_4],
                                input_variables=['columns', 'meta_data', 'result', 'shap_values', 'customer_feedback'],
                                output_variables=['column_descriptions', 'meta_data_description', 
                                                  'explanation', 'sentiment', 'recommendation'],
                                verbose=True)

    input_data = {
        'columns': columns,
        'meta_data': meta_data,
        'result': result,
        'shap_values': shap_values,
        'customer_feedback': customer_feedback
    }

    output = seq_chain.invoke(input_data)

    return output

In [None]:
sample_customer_id = 9809
customer_feedback = X_test[X_test.id == sample_customer_id]['feedback'].iloc[0]

meta_datas, results, shap_valuess = prepare_input_data(sample_customer_id, parsed_json, json_structures)
churn_explainability_sample_1 = generate_churn_explainability(meta_datas, results, shap_valuess, customer_feedback)
idx = features_shap_values.loc[features_shap_values.ID == sample_customer_id].index
features_shap_values.loc[idx,'explanation'] = churn_explainability_sample_1['explanation']
features_shap_values.loc[idx, 'sentiment'] = churn_explainability_sample_1['sentiment']
features_shap_values.loc[idx, 'recommendation'] = churn_explainability_sample_1['recommendation']

time.sleep(60)