In [1]:
import pandas as pd
import numpy as np
import shap

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

  from .autonotebook import tqdm as notebook_tqdm


# SHAP importance can differ depending on whether or not we calculate it with the training data (even if the data comes from the same distribution)

In [2]:
N_FEATURES = 20

# Synthetic two-class problem: of the N_FEATURES columns only 2 are
# informative and 2 are redundant; flip_y adds label noise, and fixed seeds
# keep the whole experiment reproducible.
X, y = make_classification(
    n_samples=1000,
    n_features=N_FEATURES,
    n_informative=2,
    n_redundant=2,
    n_classes=2,
    flip_y=0.1,
    shuffle=False,
    random_state=42,
)

# Wrap the raw matrix in a DataFrame with 1-based names: column_1 ... column_20.
X = pd.DataFrame(X, columns=[f'column_{i}' for i in range(1, N_FEATURES + 1)])

# Hold out a third of the rows as the test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Fit the forest on the training split only.
rfc = (
    RandomForestClassifier(random_state=42)
    .fit(X_train, y_train)
)

In [3]:
def return_df_imp_shap(X, explainer):
    """Rank the columns of ``X`` by their mean absolute SHAP value.

    Parameters
    ----------
    X : pandas.DataFrame
        Data to explain; its column names are reported as ``feature_name``.
    explainer : object
        Anything exposing ``shap_values(X)`` (e.g. ``shap.TreeExplainer``).

    Returns
    -------
    pandas.DataFrame
        Two columns, ``feature_name`` and ``shap_importance``, sorted by
        importance in descending order with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the SHAP values cannot be reduced to a 2-D (rows, features)
        matrix with one column per column of ``X``.

    Notes
    -----
    ``shap_values`` has been seen to return three layouts, all handled here:

    * a list with one (rows, features) array per class -- the first class
      is used, exactly as before;
    * a single 3-D array (rows, features, outputs), as returned by newer
      shap releases -- output 0 is used. Indexing such an array with ``[0]``
      would select the first *row* instead of the first class and silently
      produce wrong importances;
    * a single 2-D array (rows, features) for single-output models.
    """
    shap_vals = explainer.shap_values(X)

    if isinstance(shap_vals, (list, tuple)):
        # Legacy layout: one matrix per class; keep using the first class.
        shap_matrix = np.asarray(shap_vals[0])
    else:
        shap_arr = np.asarray(shap_vals)
        # 3-D means the outputs live on the last axis; pick output 0 there.
        shap_matrix = shap_arr[..., 0] if shap_arr.ndim == 3 else shap_arr

    n_features = len(X.columns)
    if shap_matrix.ndim != 2 or shap_matrix.shape[1] != n_features:
        raise ValueError(
            "Unexpected SHAP values shape "
            f"{shap_matrix.shape}; expected (n_samples, {n_features})."
        )

    # Global importance = mean over rows of the absolute per-row attributions.
    importances = np.abs(shap_matrix).mean(axis=0)

    df_imp_shap = (
        pd.DataFrame({'feature_name': list(X.columns),
                      'shap_importance': importances})
        .sort_values(by='shap_importance', ascending=False)
        .reset_index(drop=True)
    )

    return df_imp_shap

In [4]:
# A single explainer built from the fitted forest is reused for both splits,
# so the two rankings differ only in the data being explained.
explainer = shap.TreeExplainer(rfc)

df_imp_shap_train = return_df_imp_shap(X_train, explainer)
df_imp_shap_test = return_df_imp_shap(X_test, explainer)

# Join on the feature name to put train and test importances side by side.
df_imp_shap_train_test = pd.merge(
    df_imp_shap_train,
    df_imp_shap_test,
    on='feature_name',
    suffixes=('_train', '_test'),
)

df_imp_shap_train_test

Unnamed: 0,feature_name,shap_importance_train,shap_importance_test
0,column_2,0.208237,0.20866
1,column_3,0.097596,0.096554
2,column_4,0.038523,0.034153
3,column_1,0.033912,0.035134
4,column_6,0.012097,0.009675
5,column_10,0.011362,0.009502
6,column_9,0.010714,0.010362
7,column_8,0.010399,0.008021
8,column_11,0.010168,0.01004
9,column_12,0.009237,0.008552


# It's close, but different enough to lead to different sorting!

In [5]:
# Feature names ordered from least to most important, first for the train
# split and then for the test split, shown as a pair of arrays.
tuple(
    np.array(
        df_imp.sort_values(by='shap_importance').feature_name.to_list()
    )
    for df_imp in (df_imp_shap_train, df_imp_shap_test)
)

(array(['column_18', 'column_5', 'column_13', 'column_15', 'column_14',
        'column_20', 'column_17', 'column_19', 'column_16', 'column_7',
        'column_12', 'column_11', 'column_8', 'column_9', 'column_10',
        'column_6', 'column_1', 'column_4', 'column_3', 'column_2'],
       dtype='<U9'),
 array(['column_18', 'column_5', 'column_13', 'column_20', 'column_14',
        'column_15', 'column_17', 'column_16', 'column_19', 'column_8',
        'column_12', 'column_7', 'column_10', 'column_6', 'column_11',
        'column_9', 'column_4', 'column_1', 'column_3', 'column_2'],
       dtype='<U9'))

# Reserving a slice of your dataset to calculate SHAP importance

For instance, we could calculate the SHAP importance using the test data. The [`XSHAPImportanceRandomForestClassifier`](?) classifier I implemented stores the dataset `X_shap` passed when we instantiate the class and uses it when we ask for its `feature_importances_`, as you can see from the [source code here](?).

In [6]:
from shap_feature_importances_ import XSHAPImportanceRandomForestClassifier

# The held-out test split is passed as X_shap; the model itself is still
# fitted on the training split.
rfc_shap = (
    XSHAPImportanceRandomForestClassifier(X_shap=X_test, random_state=42)
    .fit(X_train, y_train)
)
rfc_shap.feature_importances_

array([0.035134  , 0.20866036, 0.09655425, 0.03415335, 0.00440922,
       0.00967484, 0.00875638, 0.00802079, 0.0103625 , 0.00950239,
       0.01004044, 0.00855213, 0.00504655, 0.0053958 , 0.00547095,
       0.00647194, 0.00557922, 0.00428836, 0.00653862, 0.00538897])