In [1]:
# Load the project settings module A00_setup.py as `setup` (edit that file to
# change settings).  Side effect: the working directory is changed to the
# enclosing 'code' directory so the relative path below resolves.
import os
# NOTE(review): rsplit() is called without maxsplit, so [0] is the text before
# the FIRST occurrence of '<sep>code' in the cwd (same result as split()[0]).
# Confirm this is intended if 'code' can appear more than once in the path.
os.chdir(os.getcwd().rsplit(os.path.sep + 'code')[0] + os.path.sep + 'code')
from importlib.machinery import SourceFileLoader
# NOTE(review): the importlib docs mark load_module() as deprecated in favor of
# exec_module(); left unchanged here.
setup = SourceFileLoader("setup", "./A_target_count_encoding/A00_setup.py").load_module()

# XGBoost, Regular Target Encoding: Loop, Remove Low-Volume from Training Data
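This notebook returns to standard target encoding and loops over different amounts of blending (`lambda_k`). For each value, NAICS codes with `lambda_k` or fewer training loans are removed from the training data before the model is fit.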

*This script takes about 5 minutes on my MacBook Air*

In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
import warnings
import matplotlib.pyplot as plt
import pickle

In [3]:
from pathlib import Path

In [4]:
import sklearn as sk
from sklearn.model_selection import RandomizedSearchCV
from sklearn.inspection import permutation_importance, partial_dependence
from scikitplot.metrics import plot_lift_curve

In [5]:
from sba_gnn.sba_gnn import sg_plot, sg_blender
from sba_gnn.sba_gnn.sg_blender import HierarchicalEncoder # Target encoder

## Import Processed Datasets

In [6]:
# Read the transformed loans table stored under setup.parent_path.
sba_loans = pd.read_parquet(Path(setup.parent_path) / '01_DATA_transformed.parquet')

In [7]:
# Sanity check: (rows, columns) of the loaded loans table.
sba_loans.shape

(688081, 58)

In [8]:
# Model inputs: the shared predictor list plus the target-encoded NAICS column.
predictor_features = [*setup.predictor_features, 'menc_NAICS']
print(predictor_features)
# Pickle the feature list into the temp directory.
# NOTE(review): this file name uses a '14_' prefix while the other outputs of
# this notebook use 'A60_' -- confirm it is not overwriting another notebook's file.
with Path(setup.temp_path).joinpath('14_DATA_features_predict.pkl').open('wb') as fout:
    pickle.dump(predictor_features, fout)

['NoEmp', 'CreateJob', 'LowDoc', 'DisbursementGross', 'new_business', 'urban_flag', 'franchise_flag', 'menc_NAICS']


In [9]:
# Load the previously selected hyperparameters; the first CSV row is used.
best_params_df = pd.read_csv(
    Path(setup.parent_path) / '03_REPORT_fit_parameter_selected.csv'
)
# Go through to_dict(orient='records') rather than a row Series so that each
# value keeps its own column dtype (e.g. integer max_depth stays an integer).
best_params = best_params_df.to_dict(orient='records')[0]
best_params

{'subsample': 0.8,
 'scale_pos_weight': 1.0,
 'reg_lambda': 0.01,
 'reg_alpha': 0.001,
 'min_child_weight': 50,
 'max_depth': 8,
 'learning_rate': 0.1,
 'gamma': 2}

In [10]:
# Full XGBoost parameter set: tuned values plus the fixed objective and tree
# count.  Keys listed after the unpacking override any duplicates in best_params.
xgb_params = {
    **best_params,
    'objective': "binary:logistic",
    'n_estimators': setup.xgb_n_estimators,
}

## NAICS Count Dataset

In [11]:
# Number of training loans per NAICS code (non-null LoanNr_ChkDgt values),
# returned as a two-column frame: 'NAICS', 'count'.
train_counts = (
    sba_loans.loc[sba_loans['dset'] == 'train']
    .groupby('NAICS')['LoanNr_ChkDgt']
    .count()
    .rename('count')
    .reset_index()
)

In [12]:
# Sanity check: one row per NAICS code seen in the training split.
train_counts.shape

(1166, 2)

In [20]:
# Save the per-NAICS training counts to the temp directory.
train_counts.to_parquet(Path(setup.temp_path) / 'A60_DATA_train_counting.parquet')

## Function to get metrics

In [21]:
def get_metrics(data, lambda_k = 100, enc_features = None):
    """Target-encode, fit XGBoost on volume-filtered training rows, and score.

    Steps:
      1. Fit ``HierarchicalEncoder(lambda_k=lambda_k)`` on the training rows
         and store its transform of every row in the ``menc_NAICS`` column.
      2. Keep only training rows whose NAICS code has more than ``lambda_k``
         loans in the module-level ``train_counts`` table.  Validation and
         test rows are never removed.
      3. Fit an ``XGBClassifier`` using the module-level ``xgb_params`` on the
         module-level ``predictor_features`` columns.
      4. Score every row of ``data``, pick the decision threshold with the
         highest F1 on the training predictions, and compute metrics.

    Note that the threshold in step 4 is tuned on *all* rows with
    ``dset == 'train'``, including those excluded from the fit in step 2.

    Parameters
    ----------
    data : pandas.DataFrame
        Loans table.  Must contain 'dset' (rows are selected by the values
        'train', 'val' and 'test'), 'target', 'NAICS', 'LoanNr_ChkDgt',
        'dset_naics_holdout', the ``enc_features`` columns, and every
        ``predictor_features`` column other than 'menc_NAICS'.  The caller's
        frame is not modified.
    lambda_k : number, default 100
        Passed to the encoder as ``lambda_k`` and also used as the exclusive
        lower bound on the per-NAICS training count.
    enc_features : list of str, optional
        Columns handed to the encoder.  Defaults to ``['NAICS']``.

    Returns
    -------
    pred_df : pandas.DataFrame
        One row per row of ``data`` (same index values, index named 'index')
        with identifier columns, 'predict_prob' and 'predict_bin'.
    metrics_df : pandas.DataFrame
        Output of ``sg_plot.dset_metrics`` per 'dset', followed by the same
        metrics for the test rows split by 'dset_naics_holdout'.

    Raises
    ------
    ValueError
        If no training rows remain after the low-volume filter.
    """
    # Avoid a shared mutable default; None stands for the original ['NAICS'].
    if enc_features is None:
        enc_features = ['NAICS']

    # Target encoding, fit on the training split only
    train_df = data[data['dset'] == 'train']
    hier_enc = HierarchicalEncoder(lambda_k = lambda_k)
    hier_enc.fit(train_df[enc_features], train_df['target'])
    hier_col = hier_enc.transform(data[enc_features]) \
        .rename('menc_NAICS')

    # Replace any existing encoding column with the fresh one
    data = pd.concat([data.drop(columns='menc_NAICS', errors='ignore'),
                      hier_col], axis=1)
    train_df = data[data['dset'] == 'train']
    val_df = data[data['dset'] == 'val']

    # Remove low volume codes from train only (inner join on the kept codes)
    counts_thresh = train_counts.loc[train_counts['count'] > lambda_k, ['NAICS']]
    train_df = train_df.merge(counts_thresh, on='NAICS')
    if train_df.empty:
        raise ValueError(
            f"No training rows left after removing NAICS codes with "
            f"{lambda_k} or fewer training loans")

    # Model fit.  xgb_params takes precedence over the two explicit defaults,
    # matching the previous construct-then-set_params sequence.
    model_params = {'objective': "binary:logistic",
                    'n_estimators': setup.xgb_n_estimators,
                    **xgb_params}
    xgb_model = xgb.XGBClassifier(**model_params)
    xgb_model.fit(train_df[predictor_features], train_df['target'],
                  eval_set = [(train_df[predictor_features], train_df['target']),
                              (val_df[predictor_features], val_df['target'])])

    # Predict for every row.  assign() attaches the probabilities by position
    # to the existing index, so this also works when the input index is named
    # (the earlier reset_index/set_index('index') round trip did not).
    pred_df = data[['LoanNr_ChkDgt', 'dset', 'target', 'dset_naics_holdout']] \
        .assign(predict_prob = xgb_model.predict_proba(data[predictor_features])[:,1]) \
        .rename_axis('index')

    # Decision threshold: the candidate with the highest training F1
    pred_train_df = pred_df[pred_df['dset'] == 'train']
    thresh_tune_data = sg_plot.get_f1_frame(pred_train_df['target'],
                                            pred_train_df['predict_prob'])
    thresh_head = thresh_tune_data.sort_values('f1', ascending=False).head(2)
    best_thresh = thresh_head['thresh'].iloc[0]
    pred_df['predict_bin'] = np.where(pred_df['predict_prob'] >= best_thresh, 1, 0)

    # Metrics per dataset split, plus test metrics split by the holdout flag
    metrics_dset_df = pred_df.groupby('dset') \
        .apply(lambda x: sg_plot.dset_metrics(x.target, x.predict_bin, x.predict_prob)) \
        .reset_index()
    metrics_test_df = pred_df[pred_df['dset'] == 'test'] \
        .groupby(['dset', 'dset_naics_holdout']) \
        .apply(lambda x: sg_plot.dset_metrics(x.target, x.predict_bin, x.predict_prob)) \
        .reset_index()
    metrics_df = pd.concat([metrics_dset_df, metrics_test_df])

    return pred_df, metrics_df

## Loop to get predictions, metrics

In [22]:
# Values to loop over.  Each is passed to get_metrics as lambda_k, where it
# sets both the encoder's lambda_k and the minimum per-NAICS training count.
count_sets = [5, 10, 20, 50, 100, 200, 500, 1000, 2000]

In [23]:
%%capture
pred_df = pd.DataFrame()
metrics_df = pd.DataFrame()
for c in count_sets:
    this_pred, this_met = get_metrics(sba_loans, c)
    this_pred['c'] = c
    pred_df = pd.concat([pred_df, this_pred])
    this_met['c'] = c
    metrics_df = pd.concat([metrics_df, this_met])

In [17]:
# Test-set metrics for rows flagged dset_naics_holdout == 1, one row per c.
metrics_df[metrics_df['dset_naics_holdout'] == 1]

Unnamed: 0,dset,accuracy_score,f1_score,precision_score,recall_score,average_precision_score,roc_auc_score,dset_naics_holdout,c
1,test,0.640549,0.419705,0.296973,0.715337,0.347333,0.729054,1.0,5
1,test,0.644807,0.419972,0.29859,0.707639,0.345349,0.727926,1.0,10
1,test,0.654507,0.420195,0.302279,0.688943,0.346476,0.728818,1.0,20
1,test,0.653416,0.419456,0.301499,0.689028,0.346226,0.728723,1.0,50
1,test,0.654799,0.419802,0.302198,0.687252,0.346062,0.728232,1.0,100
1,test,0.654907,0.418194,0.30145,0.682514,0.347391,0.729256,1.0,200
1,test,0.653093,0.418721,0.301015,0.68759,0.345751,0.728242,1.0,500
1,test,0.63191,0.420204,0.294355,0.734033,0.344108,0.726623,1.0,1000
1,test,0.636306,0.416677,0.294036,0.71483,0.340408,0.72396,1.0,2000


In [18]:
# Test-set metrics for rows flagged dset_naics_holdout == 0, one row per c.
metrics_df[metrics_df['dset_naics_holdout'] == 0]

Unnamed: 0,dset,accuracy_score,f1_score,precision_score,recall_score,average_precision_score,roc_auc_score,dset_naics_holdout,c
0,test,0.659784,0.450008,0.337138,0.676487,0.383901,0.728634,0.0,5
0,test,0.660297,0.450122,0.337498,0.675559,0.38246,0.7282,0.0,10
0,test,0.659326,0.450044,0.337088,0.676853,0.383483,0.728433,0.0,20
0,test,0.658596,0.451102,0.337766,0.678905,0.384025,0.72803,0.0,50
0,test,0.65496,0.451296,0.33706,0.682665,0.384473,0.727334,0.0,100
0,test,0.653215,0.452797,0.338584,0.683287,0.387052,0.726511,0.0,200
0,test,0.648925,0.453166,0.337357,0.690051,0.38574,0.724471,0.0,500
0,test,0.651013,0.449397,0.335355,0.680969,0.38254,0.724788,0.0,1000
0,test,0.628525,0.438504,0.315242,0.720051,0.369373,0.723,0.0,2000


In [19]:
# Write the stacked predictions and metrics for all thresholds to the temp directory.
pred_df.to_parquet(Path(setup.temp_path) / 'A60_DATA_combined_predictions.parquet')
metrics_df.to_csv(Path(setup.temp_path) / 'A60_REPORT_metrics.csv', index=True)