Global Score for Causal SHAP IBS

In [33]:
import pickle

# Define the path to the pickle file
file_path = '../../../result/Causal_SHAP_IBS_456.pkl'

# Load the data
with open(file_path, 'rb') as f:
    phi_normalized_list = pickle.load(f)

# Print or inspect the loaded data
print(phi_normalized_list)


[{'xylose': np.float64(0.14148265471196314), 'xanthosine': np.float64(0.0018015195116561462), 'uracil': np.float64(-0.0058156159290696325), 'ribulose/xylulose': np.float64(0.03853922308869769), 'valylglutamine': np.float64(-0.00041513023957153106), 'tryptophylglycine': np.float64(0.0005679225155662201), 'succinate': np.float64(-0.03205775976381568), 'valine betaine': np.float64(0.027949149114343524), 'ursodeoxycholate sulfate (1)': np.float64(7.026059695047335e-05), 'tricarballylate': np.float64(0.0006420994000578663), 'succinimide': np.float64(-0.00029781811723972963), 'thymine': np.float64(-0.0029820161614509226), 'syringic acid': np.float64(4.156094724588505e-05), 'serotonin': np.float64(0.10951437016443413), 'ribitol': np.float64(0.005021032124745758)}, {'xylose': np.float64(0.08605359050093922), 'xanthosine': np.float64(0.0022162541307095478), 'uracil': np.float64(-0.0024667580064970562), 'ribulose/xylulose': np.float64(0.10883440294053695), 'valylglutamine': np.float64(-4.8461078

In [34]:
print(phi_normalized_list[0])

{'xylose': np.float64(0.14148265471196314), 'xanthosine': np.float64(0.0018015195116561462), 'uracil': np.float64(-0.0058156159290696325), 'ribulose/xylulose': np.float64(0.03853922308869769), 'valylglutamine': np.float64(-0.00041513023957153106), 'tryptophylglycine': np.float64(0.0005679225155662201), 'succinate': np.float64(-0.03205775976381568), 'valine betaine': np.float64(0.027949149114343524), 'ursodeoxycholate sulfate (1)': np.float64(7.026059695047335e-05), 'tricarballylate': np.float64(0.0006420994000578663), 'succinimide': np.float64(-0.00029781811723972963), 'thymine': np.float64(-0.0029820161614509226), 'syringic acid': np.float64(4.156094724588505e-05), 'serotonin': np.float64(0.10951437016443413), 'ribitol': np.float64(0.005021032124745758)}


In [35]:
sums = {}
counts = {}

for instance in phi_normalized_list:
    for metabolite, value in instance.items():
        sums[metabolite] = sums.get(metabolite, 0) + abs(value)
        counts[metabolite] = counts.get(metabolite, 0) + 1

means = {metabolite: sums[metabolite] / counts[metabolite] for metabolite in sums}

sorted_means = sorted(means.items(), key=lambda item: abs(item[1]), reverse=True)

causal_output_list = [(metabolite, mean_value) for metabolite, mean_value in sorted_means]

print(causal_output_list)

[('xylose', np.float64(0.16149886610084832)), ('ribulose/xylulose', np.float64(0.15125226147977863)), ('serotonin', np.float64(0.05847924581749287)), ('valine betaine', np.float64(0.05025168083788193)), ('thymine', np.float64(0.021627073480483903)), ('uracil', np.float64(0.015860090956949558)), ('succinate', np.float64(0.013363389721575608)), ('ribitol', np.float64(0.008482407060685588)), ('succinimide', np.float64(0.0033349212251750407)), ('xanthosine', np.float64(0.0024575893611374746)), ('tryptophylglycine', np.float64(0.0015227042112552723)), ('valylglutamine', np.float64(0.0010843002457708065)), ('tricarballylate', np.float64(0.0005110006779311907)), ('syringic acid', np.float64(0.0004981860958519981)), ('ursodeoxycholate sulfate (1)', np.float64(0.0003280757279722092))]


Original SHAP

In [36]:
import argparse
from pathlib import Path
from data_processing import DataProcessor
from models import ModelTrainer
from feature_selection import FeatureSelector
from visualization import Visualizer
from causal_inference import CausalInference
import shap
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import matplotlib
from evaluation import iterative_feature_deletion_with_rmse, iterative_feature_addition_with_rmse

import warnings
warnings.filterwarnings("ignore")



In [37]:
print("Starting ML Pipeline...")
base_dir = '../../../'
print(f"Base directory set to: {base_dir}")

data_path = base_dir + 'dataset/' + 'data_full.xlsx'
raw_data_path = base_dir + 'dataset/' + 'result_raw.xlsx'
result_dir = base_dir + 'result/R/'

report_file_path = result_dir + 'report.txt'

print("Loading data...")
data_processor = DataProcessor(data_path=str(data_path))
df = data_processor.load_data_metabolites()
print("Data loaded successfully.")

print("Encoding labels...")
df_encoded, label_encoder = data_processor.encode_labels(df, label_column='Group')
print("Labels encoded successfully.")
X = df_encoded.drop(columns=['Group'])
y = df_encoded['Group']

Starting ML Pipeline...
Base directory set to: ../../../
Loading data...
Data loaded successfully.
Encoding labels...
Labels encoded successfully.


In [38]:
X = X[["xylose", "xanthosine", "uracil", "ribulose/xylulose", "valylglutamine", "tryptophylglycine", "succinate", "valine betaine", "ursodeoxycholate sulfate (1)", "tricarballylate","succinimide", "thymine", "syringic acid", "serotonin", "ribitol" ]]

y = df_encoded['Group']

print("Training Random Forest model...")
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 7],
    'min_samples_leaf': [1, 2, 4]
}
model_trainer = ModelTrainer(X, y, random_state=456)
model, best_params = model_trainer.train_random_forest(param_dist)

Training Random Forest model...
Fitting 3 folds for each of 50 candidates, totalling 150 fits


In [39]:
print("Applying SHAP for explainability...")

explainer = shap.TreeExplainer(model, model_trainer.X_train)

shap_values = explainer.shap_values(model_trainer.X_test)

feature_names = model_trainer.X_test.columns

mean_abs_shap_values = np.mean(np.abs(shap_values[1]), axis=1)

shap_importance = list(zip(feature_names, mean_abs_shap_values))

shap_importance_sorted = sorted(shap_importance, key=lambda x: x[1], reverse=True)

shap_output_list = [(feature, importance) for feature, importance in shap_importance_sorted]

print(shap_output_list)
    


Applying SHAP for explainability...
[('xylose', np.float64(0.08958445512378603)), ('ribulose/xylulose', np.float64(0.08626370182586471)), ('serotonin', np.float64(0.04826839788389861)), ('xanthosine', np.float64(0.02587966803306699)), ('succinate', np.float64(0.021902265883809377)), ('valine betaine', np.float64(0.01638332578043901)), ('succinimide', np.float64(0.014574887492840388)), ('tryptophylglycine', np.float64(0.01447617734898813)), ('ribitol', np.float64(0.012748171891676065)), ('tricarballylate', np.float64(0.010856002873288162)), ('syringic acid', np.float64(0.009819061418111232)), ('valylglutamine', np.float64(0.009290180740754295)), ('uracil', np.float64(0.007322421847802615)), ('thymine', np.float64(0.0048281132448028076)), ('ursodeoxycholate sulfate (1)', np.float64(0.0018655916490479285))]


In [40]:
print(causal_output_list)
print(shap_output_list)

[('xylose', np.float64(0.16149886610084832)), ('ribulose/xylulose', np.float64(0.15125226147977863)), ('serotonin', np.float64(0.05847924581749287)), ('valine betaine', np.float64(0.05025168083788193)), ('thymine', np.float64(0.021627073480483903)), ('uracil', np.float64(0.015860090956949558)), ('succinate', np.float64(0.013363389721575608)), ('ribitol', np.float64(0.008482407060685588)), ('succinimide', np.float64(0.0033349212251750407)), ('xanthosine', np.float64(0.0024575893611374746)), ('tryptophylglycine', np.float64(0.0015227042112552723)), ('valylglutamine', np.float64(0.0010843002457708065)), ('tricarballylate', np.float64(0.0005110006779311907)), ('syringic acid', np.float64(0.0004981860958519981)), ('ursodeoxycholate sulfate (1)', np.float64(0.0003280757279722092))]
[('xylose', np.float64(0.08958445512378603)), ('ribulose/xylulose', np.float64(0.08626370182586471)), ('serotonin', np.float64(0.04826839788389861)), ('xanthosine', np.float64(0.02587966803306699)), ('succinate', 

In [41]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

def iterative_feature_deletion_with_rmse_over_testset(model, X_test, attribution_scores, top_k=None):
    """
    Iteratively deletes features based on attribution scores and computes the average RMSE over all instances in the test set.

    Parameters:
    - model: Trained Scikit-learn classifier.
    - X_test (pd.DataFrame): Test set features.
    - attribution_scores (dict): Feature attributions (e.g., SHAP values).
    - top_k (int): Number of top features to delete. If None, deletes all features.

    Returns:
    - avg_rmse (float): Average RMSE after deletions over the test set.
    """
    if top_k is None:
        top_k = X_test.shape[1]

    # Ensure attribution_scores is a dict
    if not isinstance(attribution_scores, dict):
        raise TypeError("Attribution scores must be a dictionary with feature names as keys and scores as values.")

    # Sort features by absolute attribution scores in descending order
    sorted_features = sorted(X_test.columns, key=lambda x: -abs(attribution_scores.get(x, 0)))

    # List to store RMSE values for each instance
    rmse_values = []

    # Iterate over each instance in the test set
    for index, input_features in X_test.iterrows():
        # Original prediction probability for class 1
        y_predicted = model.predict_proba([input_features.values])[0][1]

        # Copy the input features to modify
        modified_input = input_features.copy().astype(float)

        # List to store RMSE values at each step for this instance
        instance_rmse_values = []

        # Iteratively delete features and compute RMSE
        for i in range(top_k):
            feature_to_delete = sorted_features[i]
            modified_input[feature_to_delete] = 0.0  # Set feature to zero
            prediction = model.predict_proba([modified_input.values])[0][1]
            rmse = np.sqrt(mean_squared_error([y_predicted], [prediction]))
            instance_rmse_values.append(rmse)

        # Average RMSE for this instance
        instance_avg_rmse = np.mean(instance_rmse_values)
        rmse_values.append(instance_avg_rmse)

    # Compute overall average RMSE across all instances
    overall_avg_rmse = np.mean(rmse_values)
    return overall_avg_rmse


def iterative_feature_addition_with_rmse_over_testset(model, X_test, attribution_scores, top_k=None):
    """
    Iteratively adds features based on attribution scores and computes the average RMSE over all instances in the test set.

    Parameters:
    - model: Trained Scikit-learn classifier.
    - X_test (pd.DataFrame): Test set features.
    - attribution_scores (dict): Feature attributions (e.g., SHAP values).
    - top_k (int): Number of top features to add. If None, adds all features.

    Returns:
    - avg_rmse (float): Average RMSE after additions over the test set.
    """
    if top_k is None:
        top_k = X_test.shape[1]

    # Ensure attribution_scores is a dict
    if not isinstance(attribution_scores, dict):
        raise TypeError("Attribution scores must be a dictionary with feature names as keys and scores as values.")

    # Sort features by absolute attribution scores in descending order
    sorted_features = sorted(X_test.columns, key=lambda x: -abs(attribution_scores.get(x, 0)))

    # List to store RMSE values for each instance
    rmse_values = []

    # Iterate over each instance in the test set
    for index, input_features in X_test.iterrows():
        # Original prediction probability for class 1
        y_predicted = model.predict_proba([input_features.values])[0][1]

        # Initialize modified input with zeros
        modified_input = pd.Series(0.0, index=input_features.index)

        # List to store RMSE values at each step for this instance
        instance_rmse_values = []

        # Iteratively add features and compute RMSE
        for i in range(top_k):
            feature_to_add = sorted_features[i]
            modified_input[feature_to_add] = input_features[feature_to_add].astype(float)
            prediction = model.predict_proba([modified_input.values])[0][1]
            rmse = np.sqrt(mean_squared_error([y_predicted], [prediction]))
            instance_rmse_values.append(rmse)

        # Average RMSE for this instance
        instance_avg_rmse = np.mean(instance_rmse_values)
        rmse_values.append(instance_avg_rmse)

    # Compute overall average RMSE across all instances
    overall_avg_rmse = np.mean(rmse_values)
    return overall_avg_rmse



In [42]:
# Convert causal_output_list to a dictionary
causal_attribution_scores = dict(causal_output_list)

# Convert shap_output_list to a dictionary
shap_attribution_scores = dict(shap_output_list)


In [43]:
# Compute RMSE over the test set using causal attributions
causal_rmse_deletion = iterative_feature_deletion_with_rmse_over_testset(
    model, model_trainer.X_test, causal_attribution_scores)

causal_rmse_addition = iterative_feature_addition_with_rmse_over_testset(
    model, model_trainer.X_test, causal_attribution_scores)

# Compute RMSE over the test set using SHAP attributions
shap_rmse_deletion = iterative_feature_deletion_with_rmse_over_testset(
    model, model_trainer.X_test, shap_attribution_scores)

shap_rmse_addition = iterative_feature_addition_with_rmse_over_testset(
    model, model_trainer.X_test, shap_attribution_scores)



In [44]:
print("Using Global Score to sort Features")
print("SHAP RMSE after feature deletion over test set:", shap_rmse_deletion)
print("Causal RMSE after feature deletion over test set:", causal_rmse_deletion)
print("SHAP RMSE after feature addition over test set:", shap_rmse_addition)
print("Causal RMSE after feature addition over test set:", causal_rmse_addition)

Using Global Score to sort Features
SHAP RMSE after feature deletion over test set: 0.22010251647751647
Causal RMSE after feature deletion over test set: 0.2401425661050661
SHAP RMSE after feature addition over test set: 0.11379787906037905
Causal RMSE after feature addition over test set: 0.0999792409167409
