In [5]:
# Load the shared A00_setup module.  Note the working directory is changed
# to the project's `code` folder so the relative path below resolves.
import importlib.util
import os
import sys
from importlib.machinery import SourceFileLoader
from pathlib import Path

# Walk up from the current directory to the nearest folder literally named
# `code`.  Matching whole path components (instead of the substring "/code")
# keeps paths such as ".../codebase/..." from being cut at the wrong place.
# If no such folder exists, fall back to "<cwd>/code", as the original cell did.
_cwd = Path.cwd()
_code_root = next((p for p in (_cwd, *_cwd.parents) if p.name == 'code'),
                  _cwd / 'code')
os.chdir(_code_root)

# Execute the setup script as a module named "setup".  module_from_spec +
# exec_module is the supported replacement for the deprecated load_module().
_setup_loader = SourceFileLoader("setup", "./A_target_count_encoding/A00_setup.py")
_setup_spec = importlib.util.spec_from_loader("setup", _setup_loader)
setup = importlib.util.module_from_spec(_setup_spec)
sys.modules["setup"] = setup  # load_module() also registered the module here
_setup_loader.exec_module(setup)
os.getcwd()

'/Users/valeriecarey/Documents/projects/2023_10_blog_gnn_sba/code'

# SHAP: NAICS hierarchy
Shapley (SHAP) comparisons for the NAICS hierarchy models.  Look at the importance of the NAICS features.

*This script takes about 5 minutes on my MacBook Air*

In [6]:
import pandas as pd
import numpy as np
import xgboost as xgb
import warnings
import matplotlib.pyplot as plt
import pickle, importlib

In [7]:
from pathlib import Path

In [8]:
import shap
import xgboost as xgb

## Data

In [9]:
# Read the transformed loans table stored under setup.parent_path
sba_loans = pd.read_parquet(Path(setup.parent_path) / '01_DATA_transformed.parquet')

In [10]:
# Keep only the rows whose 'dset' value is 'train'
train_df = sba_loans.loc[lambda loans: loans['dset'] == 'train']

## Models 
Load several models into a dictionary.  Also collect the model-related data (encoding columns and predictor lists) into dictionaries.

In [11]:
# Function to load the required information from models
def model_load(prefix = 'X01', path=setup.temp_path):
    """Load a saved XGBoost model and its companion files from `path`.

    Three files are read, each named with `prefix`:
      * `<prefix>_MODEL_xgboost.json`        - loaded into an `xgb.Booster`
      * `<prefix>_DATA_encodings.parquet`    - read into a DataFrame
      * `<prefix>_DATA_features_predict.pkl` - unpickled object

    Returns:
        Tuple `(model, features_values, features_columns, features_predict)`.
        `features_columns` lists the columns of the encodings table other
        than LoanNr_ChkDgt, dset, dset_naics_holdout and NAICS.
    """
    base_dir = Path(path)
    non_feature_cols = ['LoanNr_ChkDgt', 'dset', 'dset_naics_holdout', 'NAICS']

    booster = xgb.Booster()
    booster.load_model(base_dir.joinpath(prefix + '_MODEL_xgboost.json'))

    encodings = pd.read_parquet(base_dir.joinpath(prefix + '_DATA_encodings.parquet'))
    encoding_cols = [col for col in encodings.columns if col not in non_feature_cols]

    # NOTE: pickle.load can execute arbitrary code from the file; only point
    # this at files produced by this project.
    with open(base_dir.joinpath(prefix + '_DATA_features_predict.pkl'), 'rb') as fin:
        predictors = pickle.load(fin)

    return booster, encodings, encoding_cols, predictors

##### Models

In [12]:
# Standard target encoding of NAICS (files prefixed X04 under setup.parent_path)
(xgb_model_menc,
 features_menc_values,
 features_menc_columns,
 features_predict_menc) = model_load('X04', setup.parent_path)

In [13]:
# Hierarchical target encoding by NAICS (files prefixed X10 under setup.parent_path)
(xgb_model_mhier,
 features_mhier_values,
 features_mhier_columns,
 features_predict_mhier) = model_load('X10', setup.parent_path)

In [14]:
# Standard target encoding of all levels NAICS, separately
# (files prefixed A01 under setup.temp_path)
(xgb_model_menc_all,
 features_menc_all_values,
 features_menc_all_columns,
 features_predict_menc_all) = model_load('A01', setup.temp_path)

In [15]:
# Target+Count encoding, all levels NAICS (files prefixed A20 under setup.temp_path)
(xgb_model_tc,
 features_tc_values,
 features_tc_columns,
 features_predict_tc) = model_load('A20', setup.temp_path)

In [16]:
# Target-Thresh encoding, all levels NAICS (files prefixed A21 under setup.temp_path)
(xgb_model_tt,
 features_tt_values,
 features_tt_columns,
 features_predict_tt) = model_load('A21', setup.temp_path)

In [17]:
# Encoding columns of each model, keyed by model name
feature_dict = dict(
    menc=features_menc_columns,
    mhier=features_mhier_columns,
    menc_all=features_menc_all_columns,
    tc=features_tc_columns,
    tt=features_tt_columns,
)

In [18]:
# Model ordering used for plots and tables, plus the number of models
feature_order = [
    'menc',
    'mhier',
    'menc_all',
    'tc',
    'tt',
]
num_models = len(feature_order)

In [19]:
# Loaded boosters, keyed by model name
model_dict = dict(
    menc=xgb_model_menc,
    mhier=xgb_model_mhier,
    menc_all=xgb_model_menc_all,
    tc=xgb_model_tc,
    tt=xgb_model_tt,
)

In [20]:
# Predictor lists of each model, keyed by model name (for compactness)
predictor_dict = dict(
    menc=features_predict_menc,
    mhier=features_predict_mhier,
    menc_all=features_predict_menc_all,
    tc=features_predict_tc,
    tt=features_predict_tt,
)

In [21]:
# Display the model keys stored in predictor_dict
predictor_dict.keys()

dict_keys(['menc', 'mhier', 'menc_all', 'tc', 'tt'])

## Datasets for Explanations
Want to use the same cases for all models.  Use the test holdout, and regular test data.  I need to append different features for each.

In [26]:
# Rows with dset_naics_holdout == 1.
# NOTE(review): the same filter is assigned to holdout_df in a later cell, and
# holdout_base is not referenced again in the cells shown - confirm it is still needed.
holdout_base = sba_loans[sba_loans['dset_naics_holdout'] == 1]

## SHAP Explainers, Values
For each of the 5 models, generate explainers and then explanations.

##### Explainers

In [27]:
# One TreeExplainer per model, given that model's predictor names as feature names
explainer_dict = {
    model_key: shap.TreeExplainer(model_dict[model_key],
                                  feature_names=predictor_dict[model_key])
    for model_key in predictor_dict
}

In [28]:
# Display the explainer objects, keyed by model name
explainer_dict

{'menc': <shap.explainers._tree.TreeExplainer at 0x335fd5490>,
 'mhier': <shap.explainers._tree.TreeExplainer at 0x17f06e400>,
 'menc_all': <shap.explainers._tree.TreeExplainer at 0x335fd5550>,
 'tc': <shap.explainers._tree.TreeExplainer at 0x335fd5640>,
 'tt': <shap.explainers._tree.TreeExplainer at 0x335fd5700>}

##### Holdout values

In [23]:
# Rows with dset_naics_holdout == 1
holdout_df = sba_loans.loc[lambda loans: loans['dset_naics_holdout'] == 1]

In [24]:
# SHAP values on the holdout rows, computed per model from that model's predictors
shap_holdout_dict = {
    model_key: explainer_dict[model_key].shap_values(
        holdout_df[predictor_dict[model_key]])
    for model_key in feature_dict
}

##### Random test values

In [25]:
# Rows with dset_naics_holdout == 0.
# NOTE(review): no filter on 'dset' is applied here, so every row with
# dset_naics_holdout == 0 is selected rather than only test rows - confirm
# that is intended given the "test" naming.
test_nh_df = sba_loans[sba_loans['dset_naics_holdout'] == 0]

In [26]:
# SHAP values on the test_nh_df rows, computed per model from that model's predictors
shap_test_dict = {
    model_key: explainer_dict[model_key].shap_values(
        test_nh_df[predictor_dict[model_key]])
    for model_key in feature_dict
}

## SHAP Dataframes

##### Wide version

In [27]:
# Wide dataframe for a single model
def get_shap_df(data, shap_values, predictor_features, enc_feature):
    """Combine identifying columns with one model's SHAP values, row by row.

    Args:
        data: DataFrame holding 'LoanNr_ChkDgt', 'target', 'NAICS' and the
            `enc_feature` column; its index is reused for the SHAP values.
        shap_values: 2-D values, one column per entry of `predictor_features`.
        predictor_features: column labels for `shap_values`.
        enc_feature: a single column name (it is used as a dict key).  Its
            value column becomes 'enc_val' and its SHAP column 'enc_shap'.

    Returns:
        DataFrame with the id columns, 'enc_val', then the SHAP columns.
    """
    # SHAP values aligned to the input rows; the encoded feature is relabelled
    shap_part = pd.DataFrame(shap_values, index=data.index,
                             columns=predictor_features)
    shap_part = shap_part.rename(columns={enc_feature: 'enc_shap'})

    # Identifying columns plus the raw value of the encoded feature
    id_part = data[['LoanNr_ChkDgt', 'target', 'NAICS', enc_feature]]
    id_part = id_part.rename(columns={enc_feature: 'enc_val'})

    return pd.concat([id_part, shap_part], axis=1)

In [28]:
# Dataframe for multiple models
def get_wide_df(data, shap_dict,
               predictor_dict = predictor_dict,
               feature_dict = feature_dict,
               model_categories = feature_order):
    """Stack the per-model SHAP frames for `data` into one wide DataFrame.

    Args:
        data: rows the SHAP values were computed on (passed to get_shap_df).
        shap_dict: SHAP values keyed by model name.
        predictor_dict: predictor column names keyed by model name.
        feature_dict: encoded-feature entry keyed by model name; its keys
            decide which models are included.
        model_categories: ordered category labels for the 'model' column.

    Returns:
        DataFrame with a fresh integer index and an ordered categorical
        'model' column, followed by the columns produced by get_shap_df.

    NOTE(review): get_shap_df uses `feature_dict[k]` as a dict key and column
    label, so it must be a single column name.  model_load returns its
    `features_columns` as a list - confirm the feature_dict values passed
    here are compatible.
    """
    model_keys = list(feature_dict.keys())
    frames = [get_shap_df(data, shap_dict[k], predictor_dict[k], feature_dict[k])
              for k in model_keys]

    # The concat keys become the unnamed outer index level, which
    # reset_index exposes as 'level_0' before it is renamed to 'model'.
    df = pd.concat(frames, keys=model_keys) \
        .reset_index(level=0) \
        .rename(columns={'level_0':'model'}) \
        .reset_index(drop=True)

    # Use the caller-supplied ordering; previously this parameter was ignored
    # and the global feature_order was always used.
    cat_type = pd.CategoricalDtype(categories=model_categories, ordered=True)
    df['model'] = df['model'].astype(cat_type)

    return df

In [29]:
# Wide SHAP table for the holdout rows, all models stacked
holdout_wide_df = get_wide_df(data=holdout_df, shap_dict=shap_holdout_dict)

In [30]:
# Wide SHAP table for the test_nh_df rows, all models stacked
test_wide_df = get_wide_df(data=test_nh_df, shap_dict=shap_test_dict)

In [31]:
# Stack both tables; the concat key (1 for the holdout table, 0 for the other)
# becomes the 'dset_naics_holdout' column and the index is renumbered
wide_df = (
    pd.concat([holdout_wide_df, test_wide_df], keys=[1, 0])
    .reset_index(level=0)
    .rename(columns={'level_0':'dset_naics_holdout'})
    .reset_index(drop=True)
)

In [32]:
# Preview the first rows of the combined wide table
wide_df.head(3)

Unnamed: 0,dset_naics_holdout,model,LoanNr_ChkDgt,target,NAICS,enc_val,NoEmp,CreateJob,LowDoc,DisbursementGross,new_business,urban_flag,franchise_flag,enc_shap
0,1,menc,1000146010,0,445299,0.204655,0.088108,-0.00188,0.017768,-0.149431,0.193137,0.170542,0.029723,0.091684
1,1,menc,1000734007,0,512110,0.204655,0.044254,-0.010897,-0.250198,0.049062,-0.053356,-0.995948,-0.003018,0.078733
2,1,menc,1000745006,0,541810,0.204655,-0.000311,-0.352729,0.042809,0.397,-0.010701,-0.865773,7e-05,-0.021616


In [33]:
# Row count per model label in the combined wide table
wide_df['model'].value_counts()

model
menc     688081
naics    688081
dgi      688081
Name: count, dtype: int64

In [34]:
# Save the wide table under setup.temp_path
wide_df.to_parquet(Path(setup.temp_path) / 'A30_DATA_shap_wide.parquet')

##### Long version

In [35]:
# Columns that hold SHAP values: setup.predictor_features plus 'enc_shap'.
# Every other wide_df column is treated as an identifier column.
shap_cols = setup.predictor_features + ['enc_shap']
id_cols = [col for col in wide_df.columns if col not in shap_cols]

In [36]:
# Display the identifier columns kept fixed when melting
id_cols

['dset_naics_holdout', 'model', 'LoanNr_ChkDgt', 'target', 'NAICS', 'enc_val']

In [37]:
# Melt to one row per SHAP variable; enc_feat is 1 where the variable is
# 'enc_shap' and 0 otherwise
long_df = (
    pd.melt(wide_df, id_vars=id_cols)
    .assign(enc_feat=lambda melted: np.where(melted['variable'] == 'enc_shap', 1, 0))
)

In [38]:
# Preview the first rows of the long table
long_df.head(3)

Unnamed: 0,dset_naics_holdout,model,LoanNr_ChkDgt,target,NAICS,enc_val,variable,value,enc_feat
0,1,menc,1000146010,0,445299,0.204655,NoEmp,0.088108,0
1,1,menc,1000734007,0,512110,0.204655,NoEmp,0.044254,0
2,1,menc,1000745006,0,541810,0.204655,NoEmp,-0.000311,0


In [39]:
# Save the long table under setup.temp_path
long_df.to_parquet(Path(setup.temp_path) / 'A30_DATA_shap_long.parquet')