In [1]:
# Modify the file A00_setup.  Note the working directory is changed
import os
os.chdir(os.getcwd().rsplit(os.path.sep + 'code')[0] + os.path.sep + 'code')
from importlib.machinery import SourceFileLoader
setup = SourceFileLoader("setup", "./A_target_count_encoding/A00_setup.py").load_module()
os.getcwd()

'/Users/valeriecarey/Documents/projects/2023_10_blog_gnn_sba/code'

# XGBoost, DGI-Based Hierarchical Encoding, Threshold Target Encoding
Mean encode NAICS using DGI blending. Use levels similar to NAICS bas

*This script takes about 5 minutes on my MacBook Air*

In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
import warnings
import matplotlib.pyplot as plt
import pickle, importlib

In [3]:
from pathlib import Path

In [4]:
from sklearn.preprocessing import TargetEncoder

In [5]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [6]:
import sklearn as sk
from sklearn.inspection import permutation_importance, partial_dependence
from scikitplot.metrics import plot_lift_curve

In [7]:
from sba_gnn.sba_gnn import sg_plot, sg_thresh_encoder
from sba_gnn.sba_gnn.sg_thresh_encoder import TargetThreshEncoder

## Import Processed Datasets

##### Get loans data, append clusters

In [8]:
sba_loans = pd.read_parquet(Path(setup.parent_path).joinpath('01_DATA_transformed.parquet'))

In [9]:
sba_loans.filter(like='NAICS').columns

Index(['NAICS', 'NAICS_2', 'NAICS_sector', 'NAICS_sector_desc', 'NAICS_num',
       'NAICS_5', 'NAICS_4', 'NAICS_3', 'menc_NAICS', 'menc_NAICS_5',
       'menc_NAICS_4', 'menc_NAICS_3', 'menc_NAICS_sector', 'mhier_NAICS',
       'cenc_NAICS'],
      dtype='object')

In [10]:
embed_df = pd.read_parquet(Path(setup.parent_path).joinpath('63_DATA_embeddings_tsne_naics.parquet'))

##### NAICS levels to select NAICS-like clusters

In [11]:
naics_grp_stats = pd.read_csv(Path(setup.parent_path).joinpath('02_REPORT_naics_grp_stats_all.csv'))
naics_grp_k = naics_grp_stats['count_grp'].drop_duplicates().sort_values().to_list()
naics_grp_k

[20.0, 106.0, 354.0, 834.0]

In [12]:
cluster_cols = [c for c in embed_df if c.startswith('cluster_')]
cluster_cols

['cluster_003',
 'cluster_010',
 'cluster_020',
 'cluster_106',
 'cluster_354',
 'cluster_834']

In [13]:
cluster_cols_sel = sorted([c for c in cluster_cols if int(c.split('_')[1]) in naics_grp_k],
                         reverse=True)
cluster_cols_sel

['cluster_834', 'cluster_354', 'cluster_106', 'cluster_020']

In [14]:
cluster_cols_sel = ['cluster_020']

In [15]:
sba_loans = sba_loans.drop(columns = cluster_cols_sel, errors='ignore') \
    .merge(embed_df[['NAICS_orig'] + cluster_cols_sel] \
               .rename(columns={'NAICS_orig':'NAICS'}),
           on='NAICS', how='left')

In [16]:
sba_loans[cluster_cols_sel[0]].value_counts().head(10)

cluster_020
16    95480
2     78009
13    62578
14    61439
3     53243
8     48990
7     45313
6     38803
1     32257
17    31951
Name: count, dtype: int64

In [17]:
train_df = sba_loans[sba_loans['dset'] == 'train']

## Fit the Encoder
Also save it for later

In [18]:
importlib.reload(sg_thresh_encoder)
from sba_gnn.sba_gnn import sg_plot, sg_thresh_encoder
from sba_gnn.sba_gnn.sg_thresh_encoder import TargetThreshEncoder

In [19]:
naics_features = ['NAICS'] + cluster_cols_sel

In [20]:
# Create encoders
enc_dict = {f:TargetThreshEncoder(threshold = 100) for f in naics_features}

In [21]:
# fit encoders
[enc_dict[f].fit(train_df[f], train_df['target']) for f in naics_features]

[None, None]

In [22]:
with open(Path(setup.temp_path).joinpath('A10_DATA_encoder.pkl'), 'wb') as fout:
    pickle.dump(enc_dict, fout)

In [23]:
# Transform the data
enc_val = pd.concat([enc_dict[f].transform(sba_loans[f]) for f in naics_features],
                    axis=1)
enc_val.columns = ['tt_' + f for f in enc_val.columns]

In [24]:
enc_val.head(3)

Unnamed: 0_level_0,tt_NAICS,tt_cluster_020
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.229316,0.271491
1,0.141762,0.136235
2,0.040818,0.085137


In [25]:
enc_val.isna().sum()

tt_NAICS          95669
tt_cluster_020      123
dtype: int64

In [26]:
# Save the values
pd.concat([sba_loans[['LoanNr_ChkDgt', 'dset', 'dset_naics_holdout', 'NAICS']] \
               .drop(columns=['count', 'response'], errors='ignore'), enc_val], axis = 1) \
    .to_parquet(Path(setup.temp_path).joinpath('A05_DATA_encodings.parquet'))

In [27]:
sba_loans = pd.concat([sba_loans.drop(columns=enc_val.columns, errors='ignore'), enc_val], 
                      axis=1)
print(sba_loans.shape)

(688081, 61)


In [28]:
sba_loans[['target'] + list(enc_val.columns)].corr()

Unnamed: 0,target,tt_NAICS,tt_cluster_020
target,1.0,0.23591,0.178585
tt_NAICS,0.23591,1.0,0.741542
tt_cluster_020,0.178585,0.741542,1.0


In [29]:
train_df = sba_loans[sba_loans['dset'] == 'train']
test_df = sba_loans[sba_loans['dset'] == 'test']
val_df = sba_loans[sba_loans['dset'] == 'val']

In [30]:
predictor_features = setup.predictor_features + list(enc_val.columns)
print(predictor_features)

['NoEmp', 'CreateJob', 'LowDoc', 'DisbursementGross', 'new_business', 'urban_flag', 'franchise_flag', 'tt_NAICS', 'tt_cluster_020']


In [31]:
with open(Path(setup.temp_path).joinpath('A10_DATA_features_predict.pkl'), 'wb') as fout:
    pickle.dump(predictor_features, fout)

## Model Fit

In [32]:
best_params_df = pd.read_csv(Path(setup.parent_path).joinpath('03_REPORT_fit_parameter_selected.csv'))
best_params = best_params_df.to_dict(orient='records')[0]
best_params

{'subsample': 0.8,
 'scale_pos_weight': 1.0,
 'reg_lambda': 0.01,
 'reg_alpha': 0.001,
 'min_child_weight': 50,
 'max_depth': 8,
 'learning_rate': 0.1,
 'gamma': 2}

##### Fit using pre-determined parameters

In [33]:
xgb_params = dict(best_params, 
                   **{'objective':"binary:logistic", 'n_estimators':setup.xgb_n_estimators})

In [34]:
xgb_model = xgb.XGBClassifier(objective="binary:logistic",
                             n_estimators =  setup.xgb_n_estimators)

In [35]:
xgb_model = xgb_model.set_params(**xgb_params)

In [36]:
xgb_model.fit(train_df[predictor_features], train_df['target'],
              eval_set = [(train_df[predictor_features], train_df['target']),
                         (val_df[predictor_features], val_df['target'])])

[0]	validation_0-logloss:0.50045	validation_1-logloss:0.50397
[1]	validation_0-logloss:0.49295	validation_1-logloss:0.49668
[2]	validation_0-logloss:0.48664	validation_1-logloss:0.49056
[3]	validation_0-logloss:0.48125	validation_1-logloss:0.48533
[4]	validation_0-logloss:0.47672	validation_1-logloss:0.48095
[5]	validation_0-logloss:0.47279	validation_1-logloss:0.47714
[6]	validation_0-logloss:0.46951	validation_1-logloss:0.47404
[7]	validation_0-logloss:0.46668	validation_1-logloss:0.47136
[8]	validation_0-logloss:0.46417	validation_1-logloss:0.46898
[9]	validation_0-logloss:0.46203	validation_1-logloss:0.46697
[10]	validation_0-logloss:0.46013	validation_1-logloss:0.46522
[11]	validation_0-logloss:0.45850	validation_1-logloss:0.46372
[12]	validation_0-logloss:0.45701	validation_1-logloss:0.46234
[13]	validation_0-logloss:0.45579	validation_1-logloss:0.46123
[14]	validation_0-logloss:0.45471	validation_1-logloss:0.46027
[15]	validation_0-logloss:0.45376	validation_1-logloss:0.45944
[1

In [37]:
train_df['target'].value_counts()

target
0    346866
1     89254
Name: count, dtype: int64

##### Save the model

In [38]:
xgb_model.save_model(Path(setup.temp_path).joinpath('A10_MODEL_xgboost.json'))

In [39]:
with open(Path(setup.temp_path).joinpath('A10_MODEL_xgboost.pkl'), 'wb') as po:
    pickle.dump(xgb_model, po)

## Probability Predictions

In [40]:
pred_df = pd.concat([sba_loans[['LoanNr_ChkDgt', 'dset', 'target', 'dset_naics_holdout']].reset_index(),
                     pd.DataFrame({'predict_prob':
                                   xgb_model.predict_proba(sba_loans[predictor_features])[:,1]})],
                    axis=1) \
    .set_index('index')

In [41]:
pred_df.groupby('target')['predict_prob'].agg(['mean', 'std', 'min', 'max', 'median'])

Unnamed: 0_level_0,mean,std,min,max,median
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.183323,0.115077,0.016432,0.642858,0.16845
1,0.28321,0.111203,0.016491,0.657774,0.286491


## Decision Threshold
Max f1, on training data

In [42]:
pred_train_df = pred_df[pred_df['dset'] == 'train']

In [43]:
thresh_tune_data = sg_plot.get_f1_frame(pred_train_df['target'], 
                                        pred_train_df['predict_prob'])

In [44]:
thresh_tune_data.to_csv(Path(setup.temp_path).joinpath('A10_REPORT_thresh_tune.csv'), index=False)

In [45]:
thresh_head = thresh_tune_data.sort_values('f1', ascending=False).head(2)
best_thresh = thresh_head['thresh'].iloc[0]
print(best_thresh)
thresh_head

0.24


Unnamed: 0,thresh,f1
12,0.24,0.455831
11,0.22,0.454567


##### Apply decision threshold
All datasets

In [46]:
pred_df['predict_bin'] = np.where(pred_df['predict_prob'] >= best_thresh, 1, 0)

In [47]:
pred_df['predict_bin'].value_counts()

predict_bin
0    410069
1    278012
Name: count, dtype: int64

In [48]:
pred_df.to_parquet(Path(setup.temp_path).joinpath('A10_DATA_combined_predictions.parquet'))

## Metrics

In [49]:
metrics_dset_df = pred_df.groupby('dset') \
    .apply(lambda x: sg_plot.dset_metrics(x.target, x.predict_bin, x.predict_prob)) \
    .reset_index()
metrics_test_df = pred_df[pred_df['dset'] == 'test'] \
    .groupby(['dset', 'dset_naics_holdout']) \
    .apply(lambda x: sg_plot.dset_metrics(x.target, x.predict_bin, x.predict_prob)) \
    .reset_index()
metrics_df = pd.concat([metrics_dset_df, metrics_test_df])
metrics_df.to_csv(Path(setup.temp_path).joinpath('A10_REPORT_metrics.csv'), index=True)
metrics_df

Unnamed: 0,dset,accuracy_score,f1_score,precision_score,recall_score,average_precision_score,roc_auc_score,dset_naics_holdout
0,test,0.673806,0.436935,0.330048,0.646212,0.369375,0.729286,
1,train,0.665315,0.455831,0.341575,0.684944,0.394593,0.736426,
2,val,0.661598,0.451364,0.340078,0.670913,0.388422,0.728154,
0,test,0.660232,0.449945,0.33729,0.675597,0.384056,0.729485,0.0
1,test,0.693307,0.41491,0.317533,0.598427,0.34564,0.727682,1.0


100: 0.383476 / 0.349328  50 : 0.385128 / 0.348482. 10: 0.384807/0.384807.  1000: 0.379283/0.348526

dgi 3 only 0.377552/0.355, dgi 20 0.384056 / 0.345640

## Lift
Test data

In [None]:
pred_test_df = pred_df[pred_df['dset'] == 'test']

In [None]:
plot_lift_curve(pred_test_df['target'], 
                pd.concat([1-pred_test_df['predict_prob'], pred_test_df['predict_prob']], axis=1), 
                title='Lift Curve', ax=None, figsize=(4,3), 
                title_fontsize='large', text_fontsize='medium')
plt.gca().legend(loc='upper right')
plt.gcf().savefig(Path(setup.temp_path).joinpath('A10_PLOT_lift_curve.png'),bbox_inches='tight')

## Feature Importance 

##### XGBoost gain

In [None]:
feat_impgain = pd.DataFrame(xgb_model.get_booster().get_score(importance_type='gain'),
                                     index=['importance']).T \
    .sort_values('importance', ascending=False)

In [None]:
feat_impgain.to_csv(Path(setup.temp_path).joinpath('A10_REPORT_importance_gain.csv'), index=True)

In [None]:
feat_impgain.head()

In [None]:
feat_impgain.plot(kind='barh', legend=None)
plt.gca().invert_yaxis()
plt.xlabel('gain importance')
plt.gcf().savefig(Path(setup.temp_path).joinpath('A10_PLOT_importance_gain.png'))

##### Permutation Importance

In [None]:
perm_importance = permutation_importance(xgb_model, 
                                         train_df[predictor_features],  train_df['target'],
                                         n_repeats=30, scoring = 'average_precision')

In [None]:
perm_importance_df = pd.DataFrame({'features':predictor_features,
                                   'importance': perm_importance['importances_mean']}) 
perm_importance_df['abs_importance'] = np.abs(perm_importance_df['importance'])
perm_importance_df = perm_importance_df.sort_values('abs_importance', ascending=False)

In [None]:
perm_importance_df.head(5)

In [None]:
perm_importance_df \
    .to_csv(Path(setup.temp_path).joinpath('A10_DATA_importance_perturb.csv'),
            index=False)

In [None]:
perm_importance_df.head(10).plot(kind='barh', x='features', y='importance', legend=None)
plt.gca().invert_yaxis()
plt.xlabel('permutation importance')
plt.ylabel(None)
plt.gcf().savefig(Path(setup.temp_path).joinpath('A10_PLOT_importance_perturb.png'))

## PDP Plot - NAICS mean encoded

In [None]:
results = partial_dependence(xgb_model, train_df[predictor_features], ['menc_dgi'])
pdp_df = pd.DataFrame({'grid_values':results["grid_values"][0],
                       'pdp':results["average"][0]})

In [None]:
plt.close()

In [None]:
fig, ax = plt.subplots(figsize = (8, 3))
pdp_df.plot(x='grid_values', y='pdp', legend=None, ax=ax)
ax.set_ylabel('effect')
fig.savefig(Path(setup.temp_path).joinpath('A10_PLOT_pdp_naics_meanenc.png'),
               bbox_inches='tight')

## NAICS variation plot

In [None]:
embed_df.filter(like='cluster').columns

In [None]:
sba_loans = sba_loans.drop(columns='cluster', errors='ignore') \
    .merge(embed_df[['NAICS_orig', 'cluster']] \
               .rename(columns={'NAICS_orig':'NAICS'}),
           on='NAICS', how='left')

In [None]:
sba_loans.filter(like='cluster').columns

In [None]:
sba_loans['cluster_desc'] = ''
naics_info = sg_plot.naics_variance_data(sba_loans,
                                        naics_sector_feat='cluster',
                                        naics_sector_desc_feat='cluster_desc')

In [None]:
sba_loans['cluster_desc'] = ''
naics_info_020 = sg_plot.naics_variance_data(sba_loans,
                                        naics_sector_feat='cluster_020',
                                        naics_sector_desc_feat='cluster_desc')

In [None]:
# Plot higher volume NAICS (k=10 cluster)
plt.close()
fig = sg_plot.naics_variance_plot(naics_info[naics_info['count_naics'] >= 100],
                                  naics_sector_feat='cluster',
                                  naics_sector_desc_feat='cluster_desc',
                                  num_sectors=5, xlabel='loan default rate')
fig.savefig(Path(setup.temp_path).joinpath('A10_PLOT_naics_counts_rates_100.png'),
           bbox_inches='tight')
plt.close()
fig = sg_plot.naics_variance_plot(naics_info[naics_info['count_naics'] >= 150],
                                  naics_sector_feat='cluster',
                                  naics_sector_desc_feat='cluster_desc',
                                  num_sectors=5, xlabel='loan default rate')
fig.savefig(Path(setup.temp_path).joinpath('A10_PLOT_naics_counts_rates_150.png'),
           bbox_inches='tight')
plt.close()

In [None]:
# Plot higher volume NAICS, 20 level cluster
plt.close()
fig = sg_plot.naics_variance_plot(naics_info_020[naics_info_020['count_naics'] >= 100],
                                  naics_sector_feat='cluster_020',
                                  naics_sector_desc_feat='cluster_desc',
                                  num_sectors=5, xlabel='loan default rate')
fig.savefig(Path(setup.temp_path).joinpath('A10_PLOT_naics_counts_rates_clus020_100.png'),
           bbox_inches='tight')
plt.close()
fig = sg_plot.naics_variance_plot(naics_info_020[naics_info_020['count_naics'] >= 150],
                                  naics_sector_feat='cluster_020',
                                  naics_sector_desc_feat='cluster_desc',
                                  num_sectors=5, xlabel='loan default rate')
fig.savefig(Path(setup.temp_path).joinpath('A10_PLOT_naics_counts_rates_clus020_150.png'),
           bbox_inches='tight')
plt.close()

In [None]:
# Plot top 8 volume NAICS, 20 level cluster
plt.close()
fig = sg_plot.naics_variance_plot(naics_info_020[naics_info_020['count_naics'] >= 100],
                                  naics_sector_feat='cluster_020',
                                  naics_sector_desc_feat='cluster_desc',
                                  num_sectors=8, xlabel='loan default rate')
fig.savefig(Path(setup.temp_path).joinpath('A10_PLOT_naics_counts_rates_clus020_100_top_8.png'),
           bbox_inches='tight')
plt.close()
fig = sg_plot.naics_variance_plot(naics_info_020[naics_info_020['count_naics'] >= 150],
                                  naics_sector_feat='cluster_020',
                                  naics_sector_desc_feat='cluster_desc',
                                  num_sectors=8, xlabel='loan default rate')
fig.savefig(Path(setup.temp_path).joinpath('A10_PLOT_naics_counts_rates_clus020_150_top_8.png'),
           bbox_inches='tight')

In [51]:
TO DO
Start at different levels, pairs for NAICS and DGI
Code cleanup 
Renaming
Neural network thresholding (for hier info)
Neural network embedding
Try different code combinations (pairs, parts of hierarchy)

SyntaxError: invalid syntax (1690945089.py, line 1)