In [1]:
import argparse
from pathlib import Path
from data_processing import DataProcessor
from models import ModelTrainer
from feature_selection import FeatureSelector
from visualization import Visualizer
from causal_inference import CausalInference
import shap
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import matplotlib

import warnings
warnings.filterwarnings("ignore")

import logging 
log = logging.getLogger('shap')
log.setLevel(logging.WARNING)


  from .autonotebook import tqdm as notebook_tqdm
DEBUG:matplotlib:matplotlib data path: c:\Users\snorl\Desktop\FYP\venv\Lib\site-packages\matplotlib\mpl-data
DEBUG:matplotlib:CONFIGDIR=C:\Users\snorl\.matplotlib
DEBUG:matplotlib:interactive is False
DEBUG:matplotlib:platform is win32
DEBUG:matplotlib:CACHEDIR=C:\Users\snorl\.matplotlib
DEBUG:matplotlib.font_manager:Using fontManager instance from C:\Users\snorl\.matplotlib\fontlist-v390.json
DEBUG:pydot:pydot initializing
DEBUG:pydot:pydot 3.0.2
DEBUG:pydot.dot_parser:pydot dot_parser module initializing
DEBUG:pydot.core:pydot core module initializing


In [2]:
print("Starting ML Pipeline...")
base_dir = '../../../'
print(f"Base directory set to: {base_dir}")

data_path = base_dir + 'dataset/' + 'data_full.xlsx'
raw_data_path = base_dir + 'dataset/' + 'result_raw.xlsx'
result_dir = base_dir + 'result/R/'

report_file_path = result_dir + 'report.txt'

print("Loading data...")
data_processor = DataProcessor(data_path=str(data_path))
df = data_processor.load_data_metabolites()
print("Data loaded successfully.")
print("Preprocessing raw data...")
raw_df = data_processor.preprocess_raw_data(raw_data_path=str(raw_data_path))
print("Raw data preprocessed successfully.")

print("Encoding labels...")
df_encoded, label_encoder = data_processor.encode_labels(df, label_column='Group')
print("Labels encoded successfully.")

X = df_encoded.drop(columns=['Group'])
y = df_encoded['Group']


Starting ML Pipeline...
Base directory set to: ../../../
Loading data...
Data loaded successfully.
Preprocessing raw data...
Raw data preprocessed successfully.
Encoding labels...
Labels encoded successfully.


In [3]:
X = X[["xylose", "xanthosine", "uracil", "ribulose/xylulose", "valylglutamine", "tryptophylglycine", "succinate", "valine betaine", "ursodeoxycholate sulfate (1)", "tricarballylate","succinimide", "thymine", "syringic acid", "serotonin", "ribitol" ]]

y = df_encoded['Group']

print("Training Random Forest model...")
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 7],
    'min_samples_leaf': [1, 2, 4]
}
model_trainer = ModelTrainer(X, y)
model, best_params = model_trainer.train_random_forest(param_dist)

Training Random Forest model...
Fitting 3 folds for each of 50 candidates, totalling 150 fits


In [4]:
print("Evaluating model...")
accuracy, report = model_trainer.evaluate_model()
print(accuracy)
print(report)
print("Model evaluation completed.")

print("Saving trained model...")
model_trainer.save_model(str(result_dir + 'best_random_forest_model.pkl'))
print("Model saved successfully.")



Evaluating model...
0.8108108108108109
              precision    recall  f1-score   support

           0       0.77      0.65      0.71        26
           1       0.83      0.90      0.86        48

    accuracy                           0.81        74
   macro avg       0.80      0.77      0.78        74
weighted avg       0.81      0.81      0.81        74

Model evaluation completed.
Saving trained model...
Model saved successfully.


In [5]:
from causal_inference import CausalInference
from evaluation import iterative_feature_addition_with_rmse, iterative_feature_deletion_with_rmse

In [6]:
print("Applying causal SHAP values...")
ci = CausalInference(data=model_trainer.X_train, model=model, target_variable='Prob_Class_1')
ci.load_causal_strengths(result_dir + 'Mean_Causal_Effect_IBS.json')
print(ci.gamma)
X_train_scaled_df = pd.DataFrame(model_trainer.X_train, columns=model_trainer.X_train.columns)
X_test_scaled_df = pd.DataFrame(model_trainer.X_test, columns=model_trainer.X_test.columns)

Applying causal SHAP values...
{'xylose': 0.21286483817631863, 'xanthosine': 0.02394692431616487, 'uracil': 0.09473882478767023, 'ribulose/xylulose': 0.1846557343833871, 'valylglutamine': 0.006014187808621013, 'tryptophylglycine': 0.010263874030019977, 'succinate': 0.05945970043865379, 'valine betaine': 0.11760414785893997, 'ursodeoxycholate sulfate (1)': 0.0019060014073747102, 'tricarballylate': 0.0036239283407113083, 'succinimide': 0.018407825414330586, 'thymine': 0.08182038074786072, 'syringic acid': 0.004553557947259814, 'serotonin': 0.1350346545925681, 'ribitol': 0.0451054197501192}


In [7]:
print("Applying Causal SHAP for explainability...")
import shap

background_data = shap.kmeans(model_trainer.X_train, 100)
explainer = shap.CausalKernelExplainer(model.predict_proba, background_data, ci.ida_graph, ci.gamma, feature_names=model_trainer.X_train.columns.tolist())

causal_shap_values = explainer.shap_values(model_trainer.X_test.iloc[10])

print("Applying Kernel SHAP for explainability...")

# Use KMeans background data for SHAP Kernel Explainer
explainer = shap.KernelExplainer(model.predict_proba, background_data)

# Calculate SHAP values
shap_values = explainer.shap_values(model_trainer.X_test.iloc[10])

Applying Causal SHAP for explainability...
Applying Kernel SHAP for explainability...


In [8]:
print(causal_shap_values)

[[-0.08872605  0.08872605]
 [ 0.02983786 -0.02983786]
 [ 0.          0.        ]
 [-0.17084405  0.17084405]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [-0.04535461  0.04535461]
 [ 0.00796015 -0.00796015]
 [ 0.01311511 -0.01311511]
 [ 0.02841278 -0.02841278]
 [-0.04633111  0.04633111]
 [-0.02035613  0.02035613]
 [-0.00575002  0.00575002]
 [ 0.10378905 -0.10378905]
 [ 0.04569806 -0.04569806]]


In [9]:
import numpy as np
import pandas as pd

# Compute mean absolute SHAP values for each feature across all samples
global_shap_causal = np.abs(causal_shap_values).mean(axis=0)

# Create a DataFrame for better readability and sorting
feature_importance_causal = pd.DataFrame({
    'Feature': model_trainer.X_train.columns,
    'Mean Absolute SHAP Value': global_shap_causal[:, 1]
})

# Sort features by importance in descending order
feature_importance_causal = feature_importance_causal.sort_values(
    by='Mean Absolute SHAP Value', ascending=False
).reset_index(drop=True)

# Print the global SHAP values
print("Global SHAP Values from CausalKernelExplainer:")
print(feature_importance_causal)


IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [None]:
import numpy as np
import pandas as pd

# Compute mean absolute SHAP values for each feature across all samples
global_shap = np.abs(shap_values).mean(axis=0)

# Create a DataFrame for better readability and sorting
feature_importance = pd.DataFrame({
    'Feature': model_trainer.X_train.columns,
    'Mean Absolute SHAP Value': global_shap[:, 1]
})

# Sort features by importance in descending order
feature_importance = feature_importance.sort_values(
    by='Mean Absolute SHAP Value', ascending=False
).reset_index(drop=True)

# Print the global SHAP values
print("Global SHAP Values from KernelExplainer:")
print(feature_importance)


Global SHAP Values from KernelExplainer:
                         Feature  Mean Absolute SHAP Value
0                         xylose                  0.124013
1                      serotonin                  0.064839
2              ribulose/xylulose                  0.050595
3                 valine betaine                  0.034927
4                      succinate                  0.024271
5                        ribitol                  0.015648
6              tryptophylglycine                  0.013389
7                tricarballylate                  0.012819
8   ursodeoxycholate sulfate (1)                  0.012815
9                     xanthosine                  0.012612
10                   succinimide                  0.012147
11                       thymine                  0.009973
12                valylglutamine                  0.008555
13                        uracil                  0.005946
14                 syringic acid                  0.005843


In [10]:
i = 10
print(sum(causal_shap_values[:,1]))
print(sum(shap_values[:,1]))


x_instance = pd.Series(X_test_scaled_df.iloc[i], index=X_test_scaled_df.columns)
# Compare standard SHAP and causal SHAP values
standard_shap_series = pd.Series(shap_values[:,1], index=X_test_scaled_df.columns)
causal_shap_series = pd.Series(causal_shap_values[:,1], index=X_test_scaled_df.columns)
y_predicted = model.predict(X_test_scaled_df.iloc[i].values.reshape(1, -1))
print(f"Instance: {i}")
print(f"Instance Predicted Value: {y_predicted}")

# Apply iterative feature deletion and addition to compare SHAP values
print("Comparing SHAP and Causal SHAP using feature deletion and addition...")
avg_output_standard_deletion = iterative_feature_deletion_with_rmse(
    model=model, input_features=x_instance, attribution_scores=standard_shap_series, y_predicted=y_predicted)

avg_output_causal_deletion = iterative_feature_deletion_with_rmse(
    model=model, input_features=x_instance, attribution_scores=causal_shap_series, y_predicted=y_predicted)

avg_output_standard_addition = iterative_feature_addition_with_rmse(
    model=model, input_features=x_instance, attribution_scores=standard_shap_series, y_predicted=y_predicted)

avg_output_causal_addition = iterative_feature_addition_with_rmse(
    model=model, input_features=x_instance, attribution_scores=causal_shap_series, y_predicted=y_predicted)

print(f"Average RMSE (Standard SHAP - Deletion): {avg_output_standard_deletion}")
print(f"Average RMSE (Causal SHAP - Deletion): {avg_output_causal_deletion}")
print(f"Average RMSE (Standard SHAP - Addition): {avg_output_standard_addition}")
print(f"Average RMSE (Causal SHAP - Addition): {avg_output_causal_addition}")


0.14854895456343487
0.14854895456343487
Instance: 10
Instance Predicted Value: [1]
Comparing SHAP and Causal SHAP using feature deletion and addition...
Average RMSE (Standard SHAP - Deletion): 0.1976272264772264
Average RMSE (Causal SHAP - Deletion): 0.1795235028062613
Average RMSE (Standard SHAP - Addition): 0.23733154367981962
Average RMSE (Causal SHAP - Addition): 0.2495047826375412
