[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Imaging-AI-for-Health-virtual-lab/SHAP-in-repeated-nested-CV/blob/main/regression_ICBM.ipynb)

# Tree-based feature selection: testing on the glucose dataset

Import modules and set the carbon-source condition

In [None]:
####### Import packages ########################
import shap 
import sklearn 
import pandas as pd 
import numpy as np 
import tensorflow as tf 
from sklearn import preprocessing
import warnings
import random
warnings.filterwarnings(action='ignore')
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from h2o4gpu.solvers.elastic_net import ElasticNet 
import h2o4gpu.util.import_data as io
import h2o4gpu.util.metrics as metrics

# Glucose ("glc") condition. carbon_source selects the simulated-flux file and
# the biomass column that the data-loading cell reads.
carbon_source = output_name = "glc"

Define training dataset

In [None]:

# Load the simulated flux features of the selected condition; rows are keyed
# by the "index" column stored in the feather file.
X_data_raw = pd.read_feather("simulated_fluxes(" + carbon_source + ").feather").set_index("index")

# Standardize every feature column. StandardScaler returns a bare NumPy array,
# so the column labels are restored here while the row index becomes positional.
X_train_scaled = sklearn.preprocessing.StandardScaler().fit_transform(X_data_raw)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_data_raw.columns)

# Extract the growth data of the same condition as the regression target.
growth_data = pd.read_feather("biomass_data.feather").set_index("index")
y_data_raw = growth_data[carbon_source]

# Select the targets by the feature table's row labels, in the feature table's
# order. Filtering with isin() would keep the biomass table's own order, and
# because X_train_scaled is positional, any order difference would silently
# pair features with the wrong target. A label missing from the biomass table
# raises KeyError here instead of surfacing later as a length mismatch.
y_data = y_data_raw.loc[X_data_raw.index]
y_train = y_data

Feature selection with a random forest

In [None]:
# Fit a random forest on the standardized features and wrap the fitted
# estimator in SelectFromModel (prefit=True, so no refit happens here).
rf = RandomForestRegressor(random_state=0).fit(X_train_scaled, y_train)
rf_model = SelectFromModel(rf, prefit=True)

# Boolean mask over the input columns: True for every retained feature.
support_mask = rf_model.get_support()
selector_true = np.flatnonzero(support_mask).tolist()       # positions of kept columns
selector_rxn = X_data_raw.columns[support_mask].tolist()    # labels of kept columns

# Reduced feature matrix, relabelled with the retained column names.
X_train_new = rf_model.transform(X_train_scaled)
X_data = pd.DataFrame(X_train_new, columns=selector_rxn)

Model training and reaction prediction with ElasticNet regression

In [None]:
random_seed = 0      # random seed
n_alphas    = 100    # number of alphas along the regularization path
max_iter    = 10000  # maximum number of iterations (an integer count, not the float 1e4)
tol         = 1e-6   # tolerance for the optimization
cv_folds    = 300    # number of cross validation folds
l1_ratio    = 1e-2   # scaling between l1 and l2 penalties

# X_data is derived from the already standardized feature table, so no further
# scaling is applied in this cell.

# Shuffle features and targets together (positionally). The shuffled copies
# get their own names so that y_train from the data-loading cell is not
# overwritten; re-running the feature-selection cell afterwards would otherwise
# fit on shuffled targets paired with unshuffled features.
X_enlr, y_enlr = sklearn.utils.shuffle(X_data, y_data, random_state=random_seed)

# Train the model
enlr = ElasticNet(max_iter=max_iter,
                  n_alphas=n_alphas,
                  tol=tol,
                  n_folds=cv_folds,
                  l1_ratio=l1_ratio,
                  random_state=random_seed
                  )

enlr.fit(X_enlr, y_enlr)


# Extract each reaction's coefficient
raw_coefs_data = pd.Series(enlr.coef_, index=X_data.columns, name="Coefficient").to_frame()

# Reactions allowed in the final table (used to filter out transport and
# external reactions). The file is one line of "a","b",... entries; surrounding
# whitespace is stripped first so a trailing newline does not leave a stray
# quote on the last entry, and the handle is closed by the context manager.
with open("util/memote_pure_rxns.txt", 'r') as rxn_file:
    memote_pure_rxn = rxn_file.read().strip().strip('"').split('","')

# Separate beneficial(+) and detrimental(-) reactions based on coefficient value
coefs_pos = raw_coefs_data[raw_coefs_data["Coefficient"] > 0]
coefs_neg = raw_coefs_data[raw_coefs_data["Coefficient"] < 0]

# Filter out reactions with negligible coefficient value: keep those whose
# magnitude is at least 10% of the mean of their sign group.
avg_coefs_pos = coefs_pos["Coefficient"].mean()
avg_coefs_neg = coefs_neg["Coefficient"].mean()

final_pos_coefs = coefs_pos[coefs_pos["Coefficient"] >= 0.1 * avg_coefs_pos]
final_pos_coefs = final_pos_coefs[final_pos_coefs.index.isin(memote_pure_rxn)]
final_neg_coefs = coefs_neg[coefs_neg["Coefficient"].abs() >= abs(0.1 * avg_coefs_neg)]
final_neg_coefs = final_neg_coefs[final_neg_coefs.index.isin(memote_pure_rxn)]

# Sort and extract to csv. pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0. The file name is built from output_name, which gives
# "output/glc_en_tree.csv" for the glucose condition.
filtered_coefs = pd.concat([final_pos_coefs, final_neg_coefs])
filtered_coefs = filtered_coefs.sort_values(by="Coefficient", ascending=True)
filtered_coefs.to_csv("output/" + output_name + "_en_tree.csv")

Model training and reaction prediction with MLP

In [None]:
def build_mlp(n_features, learning_rate=0.005):
    """Build and compile the fully connected regression network.

    The network has four hidden blocks, each a 1000-unit ReLU Dense layer with
    a max-norm(3) kernel constraint followed by Dropout(0.6), and a single
    linear output unit. It is compiled with RMSprop and mean squared error.

    Parameters
    ----------
    n_features : int
        Number of input features (width of the input layer).
    learning_rate : float, optional
        RMSprop learning rate.

    Returns
    -------
    tf.keras.Model
        The compiled, untrained model.
    """
    layers = [tf.keras.layers.Input(shape=(n_features,))]
    for _ in range(4):
        layers.append(
            tf.keras.layers.Dense(
                units=1000,
                activation="relu",
                kernel_constraint=tf.keras.constraints.max_norm(3),
            )
        )
        layers.append(tf.keras.layers.Dropout(rate=0.6))
    layers.append(tf.keras.layers.Dense(1, activation="linear"))

    network = tf.keras.models.Sequential(layers)
    # `learning_rate` is the supported keyword; the old `lr` alias is
    # deprecated and rejected by recent Keras releases.
    network.compile(
        optimizer=tf.keras.optimizers.RMSprop(learning_rate=learning_rate),
        loss="mse",
        metrics=["mse"],
    )
    return network


# Set the list of random seeds for MLP training & SHAP values
seed_num_list = list(range(10))

# Standardize the selected features once; the result does not depend on the
# seed, so it is computed outside the loop. A dedicated name is used so that
# X_train_scaled from the data-loading cell is not overwritten.
X_mlp_scaled = sklearn.preprocessing.StandardScaler().fit_transform(X_data)

# Median SHAP value per feature, keyed by seed.
median_shap_by_seed = {}

for seed_num in seed_num_list:
    # Seed every random number generator that may be involved.
    tf.random.set_seed(seed_num)
    random.seed(seed_num)
    np.random.seed(seed_num)

    # Shuffle features and targets together (positionally).
    X_mlp, y_mlp = sklearn.utils.shuffle(X_mlp_scaled, y_data, random_state=seed_num)

    # Build, compile and train a fresh network for this seed on the CPU.
    with tf.device("cpu:0"):
        model = build_mlp(len(X_data.columns))
        model.fit(x=X_mlp, y=y_mlp, epochs=40, validation_split=0.1)

    # SHAP computation: the full training matrix serves as the background set
    # and is also the set of samples being explained.
    background = X_mlp
    explainer = shap.DeepExplainer(model, background)
    shap_values = explainer.shap_values(X_mlp)

    # NOTE(review): the return format appears to differ between shap releases
    # (a list with one array per model output vs. a single array with a
    # trailing output axis) - confirm against the installed version. Both are
    # reduced to a (n_samples, n_features) matrix here.
    shap_matrix = shap_values[0] if isinstance(shap_values, list) else shap_values
    shap_matrix = np.asarray(shap_matrix).reshape(X_mlp.shape[0], -1)

    shap_df = pd.DataFrame(shap_matrix, columns=X_data.columns)
    median_shap_by_seed[seed_num] = shap_df.median()

# One column per seed. Building the frame from the dict avoids the repeated
# merges of identically named columns, which generate duplicate "0_x"/"0_y"
# column names from the fourth seed onward.
total_shap_df = pd.DataFrame(median_shap_by_seed)

# The average SHAP values will be the representative for each feature
total_shap_df_mean = total_shap_df.mean(axis=1)

total_shap_df_mean = total_shap_df_mean.sort_values(ascending=False)

# Extract each reaction's SHAP value (single column labelled 0)
raw_SHAP_values = total_shap_df_mean.to_frame()

# Reactions allowed in the final table (used to filter out transport and
# external reactions). Surrounding whitespace is stripped first so a trailing
# newline does not leave a stray quote on the last entry, and the handle is
# closed by the context manager.
with open("util/memote_pure_rxns.txt", 'r') as rxn_file:
    memote_pure_rxn = rxn_file.read().strip().strip('"').split('","')

# Separate beneficial(+) and detrimental(-) reactions based on SHAP value
SHAP_pos = raw_SHAP_values[raw_SHAP_values.iloc[:, 0] > 0]
SHAP_neg = raw_SHAP_values[raw_SHAP_values.iloc[:, 0] < 0]

# Filter out reactions with negligible SHAP value: keep those whose magnitude
# is at least 10% of the mean of their sign group.
avg_coefs_pos = SHAP_pos.iloc[:, 0].mean()
avg_coefs_neg = SHAP_neg.iloc[:, 0].mean()

final_pos_SHAPs = SHAP_pos[SHAP_pos.iloc[:, 0] >= 0.1 * avg_coefs_pos]
final_pos_SHAPs = final_pos_SHAPs[final_pos_SHAPs.index.isin(memote_pure_rxn)]
final_neg_SHAPs = SHAP_neg[SHAP_neg.iloc[:, 0].abs() >= abs(0.1 * avg_coefs_neg)]
final_neg_SHAPs = final_neg_SHAPs[final_neg_SHAPs.index.isin(memote_pure_rxn)]

# Sort and extract to csv. pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0. The file name is built from output_name, which gives
# "output/glc_mlp_tree.csv" for the glucose condition.
filtered_SHAPs = pd.concat([final_pos_SHAPs, final_neg_SHAPs])
filtered_SHAPs = filtered_SHAPs.sort_values(by=0, ascending=False)
filtered_SHAPs.columns = ["SHAP value"]
filtered_SHAPs.to_csv("output/" + output_name + "_mlp_tree.csv")