In [1]:
# Modify the file A00_setup.  Note the working directory is changed
import importlib.util
import os
import sys
from importlib.machinery import SourceFileLoader

# Change to the 'code' directory derived from the current working directory,
# so the relative path to the setup file below resolves
os.chdir(os.getcwd().rsplit(os.path.sep + 'code')[0] + os.path.sep + 'code')

# Load the setup module by file path.  Loader.load_module() is deprecated, so
# build the module from a spec and execute it explicitly instead.
_setup_loader = SourceFileLoader("setup", "./_A_embeddings/A00_setup.py")
_setup_spec = importlib.util.spec_from_loader("setup", _setup_loader)
setup = importlib.util.module_from_spec(_setup_spec)
sys.modules["setup"] = setup  # load_module() registered the module here as well
_setup_loader.exec_module(setup)

## SHAP Values
Get SHAP values for the models, focusing on the k=3 cluster, and also NULLs.  Note that to do this I need to re-fit models with embeddings as features

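I ran into an issue using the embeddings with SHAP DeepExplainer: https://webapps.stackexchange.com/questions/103374/where-can-i-see-all-my-comments-on-issues-on-github (**TODO:** this link points to an unrelated StackExchange question; replace it with the intended GitHub issue.)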

To work around this, I will use the model starting from the concatenation of the numeric and entity embedding inputs.  

*This script takes about 10 minutes on my MacBook Air*

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [3]:
from pathlib import Path
import importlib, pickle

In [4]:
import os
import re

import keras
from tensorflow.keras import layers, optimizers, losses, metrics, Model
from sklearn import preprocessing, feature_extraction, model_selection
from IPython.display import display, HTML
import matplotlib.pyplot as plt
from keras.utils import plot_model
%matplotlib inline

In [5]:
import shap

In [6]:
from sba_nn.sba_nn import sbnn_model

## Input Data

##### Base data 

In [7]:
# Base loans table, read from the parent data directory
sba_loans = pd.read_parquet(Path(setup.parent_path) / '01_DATA_transformed_nomiss.parquet')

##### Feature info

In [8]:
# Unpickle the saved list of model feature names
imputer_features = pickle.loads(
    Path(setup.parent_path).joinpath('01_DATA_features.pkl').read_bytes()
)

In [9]:
# Numeric features: every saved feature whose name does not contain 'NAICS'
features_numeric = [feat for feat in imputer_features if 'NAICS' not in feat]
features_numeric_len = len(features_numeric)
print(features_numeric)
print(features_numeric_len)

['NoEmp', 'CreateJob', 'LowDoc', 'DisbursementGross', 'new_business', 'urban_flag', 'franchise_flag', 'missingindicator_LowDoc', 'missingindicator_new_business', 'missingindicator_urban_flag']
10


In [10]:
# '_orig' counterparts of the numeric features, kept only where the column
# actually exists in the loans table
features_orig = [
    orig_name
    for orig_name in (feat + '_orig' for feat in features_numeric)
    if orig_name in sba_loans.columns
]
print(features_orig)

['NoEmp_orig', 'CreateJob_orig', 'LowDoc_orig', 'DisbursementGross_orig', 'new_business_orig', 'urban_flag_orig', 'franchise_flag_orig']


##### NAICS info - from A12
Get the cluster information

In [11]:
# Per-NAICS cluster information written by the A12 step
cluster_sum_df = pd.read_parquet(Path(setup.temp_path) / 'A12_DATA_tsne_info.parquet')

In [12]:
# NAICS feature columns and the names of their '_orig' counterparts
features_naics = ['NAICS', 'NAICS_4', 'NAICS_3', 'NAICS_sector']
features_naics_orig = [f'{naics_col}_orig' for naics_col in features_naics]

##### Model Specs
Get dictionaries containing relevant info for selected models

In [13]:
# Model metadata table.  The model identifier is read as a string so ids such
# as '03' keep their leading zero regardless of the other rows; the ids are
# later used as dictionary keys and concatenated into output file prefixes.
model_info = pd.read_csv(Path(setup.temp_path).joinpath('A10_REPORT_model_info.csv'),
                         dtype={'model': str})
model_info

Unnamed: 0,model,model_path,model_hier,model_rand,model_layer_name
0,03,../data/2024_05_16/03_DATA_model.keras,False,False,NAICS
1,11,../data/2024_05_16/11_DATA_model.keras,False,True,NAICS
2,04,../data/2024_05_16/04_DATA_model.keras,True,False,NAICS
3,12,../data/2024_05_16/12_DATA_model.keras,True,True,NAICS
4,A01,../data/2024_06_04/A01_DATA_model.keras,True,True,embedding_int
5,A02,../data/2024_06_04/A02_DATA_model.keras,True,False,embedding_int


In [14]:
# Map each model id to the path of its saved model file
model_path_dict = model_info.set_index('model')['model_path'].to_dict()

In [15]:
# Map each model id to its 'model_hier' flag
model_hier_dict = model_info.set_index('model')['model_hier'].to_dict()

In [16]:
# NAICS input columns per model: all NAICS features when the model's
# 'model_hier' flag is set, otherwise only the base 'NAICS' column
model_naics_feat_dict = {
    model_id: (features_naics if is_hier else ['NAICS'])
    for model_id, is_hier in model_hier_dict.items()
}
model_naics_feat_dict

{'03': ['NAICS'],
 '11': ['NAICS'],
 '04': ['NAICS', 'NAICS_4', 'NAICS_3', 'NAICS_sector'],
 '12': ['NAICS', 'NAICS_4', 'NAICS_3', 'NAICS_sector'],
 'A01': ['NAICS', 'NAICS_4', 'NAICS_3', 'NAICS_sector'],
 'A02': ['NAICS', 'NAICS_4', 'NAICS_3', 'NAICS_sector']}

## Data for SHAP

##### Get data for creating the explainer
Use a simple random sample from the training set

In [17]:
# Background sample for the explainer: 1000 training rows, fixed seed
X_for_explainer = (
    sba_loans
    .loc[sba_loans['dset'].eq('train')]
    .sample(n=1000, random_state=24243)
)

##### Get data for explanations
  * Random, stratify by NAICS (higher volume NAICS)
  * Totally random 
  * Random with nullified NAICS (all set to 1)

In [18]:
# Loan counts per NAICS code (the holdout flag is carried along as a key)
naics_count = (
    sba_loans
    .groupby(['NAICS_orig', 'dset_naics_holdout'])['LoanNr_ChkDgt']
    .count()
    .reset_index(name='count')
)
# Keep only the codes with at least setup.plot_thresh loans
naics_high = naics_count.loc[naics_count['count'].ge(setup.plot_thresh)]
naics_high.shape

(280, 3)

In [19]:
# Stratified sample: 50 loans for each high-volume NAICS code, with the
# 'cluster_003' column attached from the cluster table
X_to_explain = (
    sba_loans
    .merge(naics_high[['NAICS_orig']], on='NAICS_orig')
    .groupby('NAICS_orig')
    .sample(n=50, random_state=2343)
    .merge(cluster_sum_df[['NAICS_orig', 'cluster_003']], on='NAICS_orig')
    .reset_index(drop=True)
    .sort_index()
)

In [20]:
# Sanity check: expected to be 50 rows for each NAICS code that survives
# both merges in the sampling step
X_to_explain.shape

(14000, 56)

##### Full random

In [21]:
# Simple random sample of 5000 test rows (fixed seed), re-indexed from zero
X_to_explain_rand = (
    sba_loans
    .loc[sba_loans['dset'].eq('test')]
    .sample(n=5000, random_state=24243)
    .reset_index(drop=True)
    .sort_index()
)
# Show how the sample splits across the NAICS holdout flag
X_to_explain_rand['dset_naics_holdout'].value_counts(dropna=False)

dset_naics_holdout
0.0    3151
1.0    1849
Name: count, dtype: int64

##### Random with null NAICS

In [22]:
# Same random sample, but with every NAICS feature column overwritten by 1
X_to_explain_null = X_to_explain_rand.assign(**dict.fromkeys(features_naics, 1))

## Create subdirectory for SHAP value output

In [23]:
# Output folder for the SHAP results; created (with parents) if missing
this_out = Path(setup.temp_path) / 'A20_SHAP_data'
this_out.mkdir(parents=True, exist_ok=True)

## Function to explain model
See https://webapps.stackexchange.com/questions/103374/where-can-i-see-all-my-comments-on-issues-on-github for more information about why a workaround is needed for the explainer

In [24]:
def model_expl(full_path, data_for_explainer,
               data_to_explain,
               out_path = None,
               out_prefix = 'XX', 
               layer_name = 'NAICS',
               features_numeric = features_numeric,
               features_naics = features_naics,
               keep_cols = ['LoanNr_ChkDgt', 'cluster_003',
                           'dset', 'dset_naics_holdout', 'NAICS_orig']):
    """Compute SHAP values for a saved Keras model and optionally save them.

    The model is split at its 'input_concat' layer.  The first part turns the
    raw feature columns into the concatenated input vector; DeepExplainer is
    then run on the remaining part, so the SHAP values refer to positions of
    that concatenated vector rather than to the raw NAICS columns.

    Parameters
    ----------
    full_path : path of the saved model, passed to keras.models.load_model.
        The model must contain a layer named 'input_concat'.
    data_for_explainer : DataFrame holding the feature columns; used as the
        background data for the explainer.
    data_to_explain : DataFrame holding the feature columns; the rows that
        get explained.  Its index is reused for the output frame.
    out_path : directory (Path or str) to write results to, or None to skip
        saving.
    out_prefix : file name prefix for the saved outputs.
    layer_name : currently unused; kept so existing calls keep working.
    features_numeric : list of numeric feature columns (first model input).
    features_naics : list of NAICS feature columns (one model input each).
    keep_cols : info columns copied into the saved frame when they are
        present in data_to_explain.

    Returns
    -------
    None.  When out_path is given, writes '<out_prefix>_shap_values.pkl' (the
    raw explainer output) and '<out_prefix>_shap_values.parquet' (wide frame).
    """

    # Load the saved model
    model = keras.models.load_model(full_path)
    concat_layer = model.get_layer('input_concat')

    # Create a new model object to access the inputs after concatenation
    # Note that for the model with a hidden layer for all the NAICS, we lose
    # the ability to explain separate levels of the NAICS hierarchy
    sub_imp_model = Model(inputs=model.input, outputs=concat_layer.output)

    def _concat_inputs(data):
        """Run the raw feature columns through the model up to 'input_concat'."""
        return sub_imp_model.predict([data[features_numeric]] +
                                     [data[f] for f in features_naics])

    # Concatenated inputs for the background data and the cases to explain
    sub_data_for_explainer = _concat_inputs(data_for_explainer)
    sub_data_to_explain = _concat_inputs(data_to_explain)

    # Explain the model below the concatenated inputs
    rem_model = Model(inputs=concat_layer.output,
                      outputs=model.output)
    explainer = shap.DeepExplainer(rem_model, sub_data_for_explainer)
    shap_vals = explainer.shap_values(sub_data_to_explain)

    # Reduce to a 2-D (rows x concatenated inputs) array for the first model
    # output.  Depending on the SHAP release, the explainer returns either a
    # list with one array per output or one array with a trailing output axis.
    if isinstance(shap_vals, list):
        shap_arr = np.asarray(shap_vals[0])
    else:
        shap_arr = np.asarray(shap_vals)
    if shap_arr.ndim == 3:
        shap_arr = shap_arr[..., 0]

    # Get Pandas version (wide format)
    shap_df = pd.DataFrame(shap_arr, index=data_to_explain.index)

    # Name columns - NAICS at intermediate / embedding layer so generic names
    # NOTE(review): this assumes 'input_concat' places the embedding values
    # before the numeric features - confirm against the model definition.
    num_len = len(features_numeric)
    cat_len = len(shap_df.columns) - num_len
    shap_df.columns = [f'NAICS_emb_{i:03d}' for i in range(cat_len)] + \
        features_numeric

    # Append some info columns
    info_cols = [c for c in keep_cols if c in data_to_explain.columns]
    shap_df = pd.concat([data_to_explain[info_cols],
                        shap_df], axis=1)

    # Save raw SHAP and pandas data, if applicable
    if out_path is not None:
        out_dir = Path(out_path)  # accept plain string paths as well
        with open(out_dir.joinpath(out_prefix + '_shap_values.pkl'), 'wb') as fo:
            pickle.dump(shap_vals, fo)
        shap_df.to_parquet(out_dir.joinpath(out_prefix + '_shap_values.parquet'))

    return None

## Get SHAP Values for the Models

##### Full random cases

In [25]:
# Explain the fully random test sample with every model.  A plain loop is used
# because model_expl is called only for its side effect of writing files.
for m in model_info['model'].to_list():
    model_expl(model_path_dict[m], X_for_explainer, X_to_explain_rand,
               features_naics = model_naics_feat_dict[m],
               out_path = this_out,
               out_prefix = m + '_DATA_random')

2024-07-22 12:11:13.287992: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2024-07-22 12:11:13.288015: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-07-22 12:11:13.288018: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-07-22 12:11:13.288238: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-07-22 12:11:13.288617: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)




2024-07-22 12:11:13.637355: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




Your TensorFlow version is newer than 2.4.0 and so graph support has been removed in eager mode and some static graphs may not be supported. See PR #1483 for discussion.
`tf.keras.backend.set_learning_phase` is deprecated and will be removed after 2020-10-11. To update it, simply pass a True/False value to the `training` argument of the `__call__` method of your layer or model.
2024-07-22 12:11:14.191240: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2024-07-22 12:11:31.475005: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
Your TensorFlow version is newer than 2.4.0 and so graph support has been removed in eager mode and some static graphs may not be supported. See PR #1483 for discussion.
`tf.keras.backend.set_learning_phase` is deprecated and will be removed after 2020-10-11. To update it, simply pass a True/False value to the `training` argument of the `__call__` method of your layer or model.
2024-07-22 12:11:31.707218: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2024-07-22 12:11:49.753404: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




Your TensorFlow version is newer than 2.4.0 and so graph support has been removed in eager mode and some static graphs may not be supported. See PR #1483 for discussion.
`tf.keras.backend.set_learning_phase` is deprecated and will be removed after 2020-10-11. To update it, simply pass a True/False value to the `training` argument of the `__call__` method of your layer or model.
2024-07-22 12:11:50.096074: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2024-07-22 12:12:09.212253: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




Your TensorFlow version is newer than 2.4.0 and so graph support has been removed in eager mode and some static graphs may not be supported. See PR #1483 for discussion.
`tf.keras.backend.set_learning_phase` is deprecated and will be removed after 2020-10-11. To update it, simply pass a True/False value to the `training` argument of the `__call__` method of your layer or model.
2024-07-22 12:12:09.535380: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2024-07-22 12:12:28.016650: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




Your TensorFlow version is newer than 2.4.0 and so graph support has been removed in eager mode and some static graphs may not be supported. See PR #1483 for discussion.
`tf.keras.backend.set_learning_phase` is deprecated and will be removed after 2020-10-11. To update it, simply pass a True/False value to the `training` argument of the `__call__` method of your layer or model.
2024-07-22 12:12:28.367707: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2024-07-22 12:12:46.521335: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




Your TensorFlow version is newer than 2.4.0 and so graph support has been removed in eager mode and some static graphs may not be supported. See PR #1483 for discussion.
`tf.keras.backend.set_learning_phase` is deprecated and will be removed after 2020-10-11. To update it, simply pass a True/False value to the `training` argument of the `__call__` method of your layer or model.
2024-07-22 12:12:46.871769: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


[None, None, None, None, None, None]

##### Nullified cases

In [26]:
# Explain the NAICS-nullified sample with every model.  A plain loop is used
# because model_expl is called only for its side effect of writing files.
for m in model_info['model'].to_list():
    model_expl(model_path_dict[m], X_for_explainer, X_to_explain_null,
               features_naics = model_naics_feat_dict[m],
               out_path = this_out,
               out_prefix = m + '_DATA_naics_null')



2024-07-22 12:13:05.521353: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2024-07-22 12:13:05.784952: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
Your TensorFlow version is newer than 2.4.0 and so graph support has been removed in eager mode and some static graphs may not be supported. See PR #1483 for discussion.
`tf.keras.backend.set_learning_phase` is deprecated and will be removed after 2020-10-11. To update it, simply pass a True/False value to the `training` argument of the `__call__` method of your layer or model.
2024-07-22 12:13:05.967093: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2024-07-22 12:13:23.794733: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-07-22 12:13:23.867998: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




Your TensorFlow version is newer than 2.4.0 and so graph support has been removed in eager mode and some static graphs may not be supported. See PR #1483 for discussion.
`tf.keras.backend.set_learning_phase` is deprecated and will be removed after 2020-10-11. To update it, simply pass a True/False value to the `training` argument of the `__call__` method of your layer or model.
2024-07-22 12:13:24.055502: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2024-07-22 12:13:41.989031: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-07-22 12:13:42.107392: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




Your TensorFlow version is newer than 2.4.0 and so graph support has been removed in eager mode and some static graphs may not be supported. See PR #1483 for discussion.
`tf.keras.backend.set_learning_phase` is deprecated and will be removed after 2020-10-11. To update it, simply pass a True/False value to the `training` argument of the `__call__` method of your layer or model.
2024-07-22 12:13:42.392411: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2024-07-22 12:14:00.990021: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-07-22 12:14:01.106715: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




Your TensorFlow version is newer than 2.4.0 and so graph support has been removed in eager mode and some static graphs may not be supported. See PR #1483 for discussion.
`tf.keras.backend.set_learning_phase` is deprecated and will be removed after 2020-10-11. To update it, simply pass a True/False value to the `training` argument of the `__call__` method of your layer or model.
2024-07-22 12:14:01.392127: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


  1/157 [..............................] - ETA: 8s

2024-07-22 12:14:19.928358: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-07-22 12:14:20.056957: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




Your TensorFlow version is newer than 2.4.0 and so graph support has been removed in eager mode and some static graphs may not be supported. See PR #1483 for discussion.
`tf.keras.backend.set_learning_phase` is deprecated and will be removed after 2020-10-11. To update it, simply pass a True/False value to the `training` argument of the `__call__` method of your layer or model.
2024-07-22 12:14:20.357419: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


  1/157 [..............................] - ETA: 8s

2024-07-22 12:14:38.988507: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-07-22 12:14:39.118452: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




Your TensorFlow version is newer than 2.4.0 and so graph support has been removed in eager mode and some static graphs may not be supported. See PR #1483 for discussion.
`tf.keras.backend.set_learning_phase` is deprecated and will be removed after 2020-10-11. To update it, simply pass a True/False value to the `training` argument of the `__call__` method of your layer or model.
2024-07-22 12:14:39.429003: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


[None, None, None, None, None, None]

##### Random NAICS

In [27]:
# Explain the NAICS-stratified sample with every model.  A plain loop is used
# because model_expl is called only for its side effect of writing files.
for m in model_info['model'].to_list():
    model_expl(model_path_dict[m], X_for_explainer, X_to_explain,
               features_naics = model_naics_feat_dict[m],
               out_path = this_out,
               out_prefix = m + '_DATA_naics_samp')



2024-07-22 12:14:57.521551: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




Your TensorFlow version is newer than 2.4.0 and so graph support has been removed in eager mode and some static graphs may not be supported. See PR #1483 for discussion.
`tf.keras.backend.set_learning_phase` is deprecated and will be removed after 2020-10-11. To update it, simply pass a True/False value to the `training` argument of the `__call__` method of your layer or model.
2024-07-22 12:14:58.053129: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2024-07-22 12:15:49.442294: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




Your TensorFlow version is newer than 2.4.0 and so graph support has been removed in eager mode and some static graphs may not be supported. See PR #1483 for discussion.
`tf.keras.backend.set_learning_phase` is deprecated and will be removed after 2020-10-11. To update it, simply pass a True/False value to the `training` argument of the `__call__` method of your layer or model.
2024-07-22 12:15:49.922274: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


 46/438 [==>...........................] - ETA: 0s

2024-07-22 12:16:42.829201: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




Your TensorFlow version is newer than 2.4.0 and so graph support has been removed in eager mode and some static graphs may not be supported. See PR #1483 for discussion.
`tf.keras.backend.set_learning_phase` is deprecated and will be removed after 2020-10-11. To update it, simply pass a True/False value to the `training` argument of the `__call__` method of your layer or model.
2024-07-22 12:16:43.532689: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


 47/438 [==>...........................] - ETA: 0s

2024-07-22 12:17:35.369784: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




Your TensorFlow version is newer than 2.4.0 and so graph support has been removed in eager mode and some static graphs may not be supported. See PR #1483 for discussion.
`tf.keras.backend.set_learning_phase` is deprecated and will be removed after 2020-10-11. To update it, simply pass a True/False value to the `training` argument of the `__call__` method of your layer or model.
2024-07-22 12:17:36.060781: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


 42/438 [=>............................] - ETA: 0s

2024-07-22 12:18:27.204183: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




Your TensorFlow version is newer than 2.4.0 and so graph support has been removed in eager mode and some static graphs may not be supported. See PR #1483 for discussion.
`tf.keras.backend.set_learning_phase` is deprecated and will be removed after 2020-10-11. To update it, simply pass a True/False value to the `training` argument of the `__call__` method of your layer or model.
2024-07-22 12:18:27.947903: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


 43/438 [=>............................] - ETA: 0s

2024-07-22 12:19:19.954235: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




Your TensorFlow version is newer than 2.4.0 and so graph support has been removed in eager mode and some static graphs may not be supported. See PR #1483 for discussion.
`tf.keras.backend.set_learning_phase` is deprecated and will be removed after 2020-10-11. To update it, simply pass a True/False value to the `training` argument of the `__call__` method of your layer or model.
2024-07-22 12:19:20.695531: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


[None, None, None, None, None, None]