In [1]:
# Modify the file A00_setup.  Note the working directory is changed
import os
os.chdir(os.getcwd().rsplit(os.path.sep + 'code')[0] + os.path.sep + 'code')
from importlib.machinery import SourceFileLoader
setup = SourceFileLoader("setup", "./A_target_count_encoding/A00_setup.py").load_module()
os.getcwd()

'/Users/valeriecarey/Documents/projects/2023_10_blog_gnn_sba/code'

# Random Groups, Hierarchical Blending.
Make random groups to match NAICS counts.   Then do the hierarchical blend

*This script takes about 5 minutes on my MacBook Air*

In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
import warnings
import matplotlib.pyplot as plt
import pickle

In [3]:
from pathlib import Path

In [4]:
import sklearn as sk
from sklearn.model_selection import RandomizedSearchCV
from sklearn.inspection import permutation_importance, partial_dependence
from scikitplot.metrics import plot_lift_curve

In [5]:
from sba_gnn.sba_gnn import sg_plot, sg_blender
from sba_gnn.sba_gnn.sg_blender import HierarchicalEncoder # Hierarchical blending

## Import Processed Datasets

In [6]:
sba_loans = pd.read_parquet(Path(setup.parent_path).joinpath('01_DATA_transformed.parquet')) 

In [7]:
best_params_df = pd.read_csv(Path(setup.parent_path).joinpath('03_REPORT_fit_parameter_selected.csv'))
best_params = best_params_df.to_dict(orient='records')[0]
best_params

{'subsample': 0.8,
 'scale_pos_weight': 1.0,
 'reg_lambda': 0.01,
 'reg_alpha': 0.001,
 'min_child_weight': 50,
 'max_depth': 8,
 'learning_rate': 0.1,
 'gamma': 2}

In [8]:
xgb_params = dict(best_params, 
                   **{'objective':"binary:logistic", 'n_estimators':setup.xgb_n_estimators})

In [9]:
predictor_features = setup.predictor_features + ['mhier_NAICS_alt']

## Make NAICS groups

In [10]:
# Group counts
naics_grp_counts = pd.read_csv(Path(setup.parent_path).joinpath('02_REPORT_naics_grp_stats_all.csv')) \
    [['level_0', 'count_grp']] \
    .drop_duplicates() \
    .set_axis(['naics_type', 'k'], axis=1)

In [11]:
naics_grp_counts

Unnamed: 0,naics_type,k
0,NAICS_5,834.0
5,NAICS_4,354.0
10,NAICS_3,106.0
15,NAICS_sector,20.0


In [12]:
naics_grp_k = naics_grp_counts['k'].drop_duplicates().sort_values().astype('int').to_list()
naics_grp_k

[20, 106, 354, 834]

In [13]:
naics_df = sba_loans[['NAICS']].drop_duplicates() 

In [14]:
def make_grp(data, k, grp_name = 'rgrp', seed=34535):
    np.random.seed(seed+k)
    return pd.Series(np.random.randint(0,k,size=len(data)),
                    name = grp_name)

In [15]:
naics_k_assign = pd.concat([make_grp(naics_df, k, f'rgrp_{k:03d}') for k in naics_grp_k], axis=1)

In [16]:
# Save the new columns
naics_rand_map = pd.concat([naics_df, naics_k_assign], axis=1) 
naics_rand_map.to_parquet(Path(setup.temp_path).joinpath('A42_DATA_naics_groups.parquet'))

In [17]:
# Apply to SBA loans

In [18]:
sba_loans = sba_loans.drop(columns=naics_k_assign.columns, errors='ignore') \
    .merge(naics_rand_map, how='left', on='NAICS')

In [19]:
# Save data
sba_loans[['LoanNr_ChkDgt', 'dset', 'dset_naics_holdout', 'NAICS'] + list(naics_k_assign.columns)] \
     .to_parquet(Path(setup.temp_path).joinpath('42_DATA_naics_groups_loans.parquet'))

## Encode NAICS and Groups

In [30]:
def get_metrics(data, enc_features = ['NAICS', 'NAICS_sector'],
               save_info = False):
    
    # Alternative encoding
    train_df = data[data['dset'] == 'train']
    hier_enc = HierarchicalEncoder(lambda_k = setup.selected_lambda_k)
    hier_enc.fit(train_df[enc_features],train_df['target'])
    hier_col = hier_enc.transform(data[enc_features]) \
        .rename('mhier_NAICS_alt')
    
    # Append to data
    data = pd.concat([data.drop(columns='mhier_NAICS_alt', errors='ignore'), 
                       hier_col], axis=1)
    train_df = data[data['dset'] == 'train']
    test_df = data[data['dset'] == 'test']
    val_df = data[data['dset'] == 'val']
    
    # Model fit
    xgb_model = xgb.XGBClassifier(objective="binary:logistic",
                             n_estimators =  setup.xgb_n_estimators)
    xgb_model = xgb_model.set_params(**xgb_params)
    xgb_model.fit(train_df[predictor_features], train_df['target'],
              eval_set = [(train_df[predictor_features], train_df['target']),
                         (val_df[predictor_features], val_df['target'])])
    
    # Save info for Shapley (optional)
    if save_info:
        pd.concat([data[['LoanNr_ChkDgt', 'dset', 'dset_naics_holdout', 'NAICS']] \
               .drop(columns=hier_col.name, errors='ignore'), hier_col], axis = 1) \
            .to_parquet(Path(setup.temp_path).joinpath('A42_DATA_encodings.parquet'))
        with open(Path(setup.temp_path).joinpath('A42_DATA_features_predict.pkl'), 'wb') as fout:
            pickle.dump(predictor_features, fout)
        xgb_model.save_model(Path(setup.temp_path).joinpath('A42_MODEL_xgboost.json'))
    
    # Predict
    pred_df = pd.concat([data[['LoanNr_ChkDgt', 'dset', 'target', 'dset_naics_holdout']].reset_index(),
                     pd.DataFrame({'predict_prob':
                                   xgb_model.predict_proba(data[predictor_features])[:,1]})],
                    axis=1) \
    .set_index('index')
    
    # Decision threshold
    pred_train_df = pred_df[pred_df['dset'] == 'train']
    thresh_tune_data = sg_plot.get_f1_frame(pred_train_df['target'], 
                                        pred_train_df['predict_prob'])
    thresh_head = thresh_tune_data.sort_values('f1', ascending=False).head(2)
    best_thresh = thresh_head['thresh'].iloc[0]
    pred_df['predict_bin'] = np.where(pred_df['predict_prob'] >= best_thresh, 1, 0)
    
    # Metrics
    metrics_dset_df = pred_df.groupby('dset') \
        .apply(lambda x: sg_plot.dset_metrics(x.target, x.predict_bin, x.predict_prob)) \
        .reset_index()
    metrics_test_df = pred_df[pred_df['dset'] == 'test'] \
        .groupby(['dset', 'dset_naics_holdout']) \
        .apply(lambda x: sg_plot.dset_metrics(x.target, x.predict_bin, x.predict_prob)) \
        .reset_index()
    metrics_df = pd.concat([metrics_dset_df, metrics_test_df])
    
    return pred_df, metrics_df

## Loop to get predictions, metrics

In [31]:
# Hierarchical encode from different levels
enc_features = sorted(list(naics_k_assign.columns), reverse=True)
enc_features

['rgrp_834', 'rgrp_354', 'rgrp_106', 'rgrp_020']

In [32]:
# Hierarchical encode from different levels
feature_sets = [['NAICS'] + enc_features[i:] \
              for i in range(len(enc_features))]
feature_sets

[['NAICS', 'rgrp_834', 'rgrp_354', 'rgrp_106', 'rgrp_020'],
 ['NAICS', 'rgrp_354', 'rgrp_106', 'rgrp_020'],
 ['NAICS', 'rgrp_106', 'rgrp_020'],
 ['NAICS', 'rgrp_020']]

In [33]:
%%capture
pred_df = pd.DataFrame()
metrics_df = pd.DataFrame()
save_info = True # Save first model data
for this_set in feature_sets:
    this_pred, this_met = get_metrics(sba_loans, this_set, save_info = save_info)
    this_pred['c'] = this_set[1]
    pred_df = pd.concat([pred_df, this_pred])
    this_met['c'] = this_set[1]
    metrics_df = pd.concat([metrics_df, this_met])
    save_info = False

In [34]:
metrics_df[metrics_df['dset_naics_holdout'] == 0]

Unnamed: 0,dset,accuracy_score,f1_score,precision_score,recall_score,average_precision_score,roc_auc_score,dset_naics_holdout,c
0,test,0.659291,0.44928,0.336529,0.675649,0.383335,0.728239,0.0,rgrp_834
0,test,0.659419,0.449278,0.336591,0.675389,0.38316,0.728418,0.0,rgrp_354
0,test,0.659141,0.450046,0.336796,0.678042,0.383442,0.728587,0.0,rgrp_106
0,test,0.633781,0.44975,0.325461,0.727618,0.383186,0.728328,0.0,rgrp_020


In [35]:
metrics_df[metrics_df['dset_naics_holdout'] == 1]

Unnamed: 0,dset,accuracy_score,f1_score,precision_score,recall_score,average_precision_score,roc_auc_score,dset_naics_holdout,c
1,test,0.702915,0.355628,0.29349,0.451146,0.291826,0.688923,1.0,rgrp_834
1,test,0.680456,0.373942,0.290338,0.525167,0.291951,0.691566,1.0,rgrp_354
1,test,0.672539,0.387416,0.293469,0.569833,0.295002,0.696876,1.0,rgrp_106
1,test,0.641948,0.41495,0.295095,0.698756,0.323272,0.71383,1.0,rgrp_020


In [36]:
pred_df.to_parquet(Path(setup.temp_path).joinpath('A42_DATA_combined_predictions.parquet'))
metrics_df.to_csv(Path(setup.temp_path).joinpath('A42_REPORT_metrics.csv'), index=True)