In [2]:
# Modify the file A00_setup.  Note the working directory is changed
# The loaded module is used below as `setup` (paths, predictor features, model settings).
import os
# Move to the project's 'code' directory: keep the part of the current path before
# the first occurrence of os.sep + 'code', then re-append 'code'.
os.chdir(os.getcwd().rsplit(os.path.sep + 'code')[0] + os.path.sep + 'code')
from importlib.machinery import SourceFileLoader
# The path is relative, so it depends on the chdir above.
# NOTE(review): load_module() is deprecated in favor of exec_module().
setup = SourceFileLoader("setup", "./A_target_count_encoding/A00_setup.py").load_module()
os.getcwd()

'/Users/valeriecarey/Documents/projects/2023_10_blog_gnn_sba/code'

# XGBoost, Target-Thresh: Try Some Alternate Fill Values
Instead of leaving low-volume and unknown codes as NAs, try filling them with a few alternative values.  This could be relevant for models that do not accept NAs.

*This script takes about 5 minutes on my MacBook Air*

In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
import warnings
import matplotlib.pyplot as plt
import pickle

In [4]:
from pathlib import Path

In [5]:
import sklearn as sk
from sklearn.model_selection import RandomizedSearchCV
from sklearn.inspection import permutation_importance, partial_dependence
from scikitplot.metrics import plot_lift_curve

In [6]:
from sba_gnn.sba_gnn import sg_plot, sg_target_count_encoder
from sba_gnn.sba_gnn.sg_target_thresh_encoder import TargetThreshEncoder

## Import Processed Datasets

In [7]:
# Transformed loan data, read from the project's parent data directory
sba_loans = pd.read_parquet(Path(setup.parent_path) / '01_DATA_transformed.parquet')

In [8]:
# Previously selected hyperparameters; only the first row of the report is used
best_params_df = pd.read_csv(Path(setup.parent_path) / '03_REPORT_fit_parameter_selected.csv')
best_params = best_params_df.to_dict(orient='records')[0]
best_params

{'subsample': 0.8,
 'scale_pos_weight': 1.0,
 'reg_lambda': 0.01,
 'reg_alpha': 0.001,
 'min_child_weight': 50,
 'max_depth': 8,
 'learning_rate': 0.1,
 'gamma': 2}

In [9]:
# Model settings: the selected hyperparameters plus a fixed objective and tree count
xgb_params = {**best_params,
              'objective': "binary:logistic",
              'n_estimators': setup.xgb_n_estimators}

##### Comparison metrics

In [10]:
# Metrics report from the parent data directory, loaded for comparison
# (not referenced again in the cells shown here)
metrics_base = pd.read_csv(Path(setup.parent_path) / '14_REPORT_metrics.csv')

In [11]:
# Metrics report A02 from the temp directory, loaded for comparison
# (not referenced again in the cells shown here)
metrics_tcenc = pd.read_csv(Path(setup.temp_path) / 'A02_REPORT_metrics.csv')

## Function to get metrics

In [19]:
def get_metrics(data, fill_value = None, enc_features = None):
    """Encode features with TargetThreshEncoder, fit XGBoost, and score each split.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'dset' (with values 'train', 'val', 'test'),
        'target', 'LoanNr_ChkDgt', 'dset_naics_holdout', every column in
        `enc_features`, and every column in `setup.predictor_features`.
        The input frame is not modified.
    fill_value : optional
        Passed through to `TargetThreshEncoder(fill_value=...)`.
        NOTE(review): presumably the value used for low-volume / unknown
        categories, per the notebook narrative -- confirm against the encoder.
    enc_features : list of str, optional
        Columns to encode.  Defaults to ['NAICS', 'NAICS_sector'].

    Returns
    -------
    pred_df : pandas.DataFrame
        One row per input row (same index values, index named 'index') with
        'LoanNr_ChkDgt', 'dset', 'target', 'dset_naics_holdout',
        'predict_prob' and the thresholded 'predict_bin'.
    metrics_df : pandas.DataFrame
        `sg_plot.dset_metrics` results per 'dset', followed by the test set
        broken out by 'dset_naics_holdout'.

    Notes
    -----
    Relies on the notebook-level names `setup`, `xgb_params`, `sg_plot`,
    `TargetThreshEncoder` and `xgb`.
    """
    # Build the default per call rather than sharing one mutable list object
    if enc_features is None:
        enc_features = ['NAICS', 'NAICS_sector']
    enc_features = list(enc_features)

    # Alternative encoding: one encoder per feature, fit on training rows only
    train_df = data[data['dset'] == 'train']
    enc_dict = {}
    for f in enc_features:
        enc_dict[f] = TargetThreshEncoder(threshold = setup.selected_lambda_k,
                                          fill_value = fill_value)
        enc_dict[f].fit(train_df[f], train_df['target'])
    enc_val = pd.concat([enc_dict[f].transform(data[f]) for f in enc_features],
                        axis=1, keys=enc_features)
    # Flatten the (feature, encoder column) pairs to '<feature>_<encoder column>'
    enc_val.columns = [c[0] + '_' + c[1] for c in enc_val.columns]

    # Append to data, replacing any same-named columns already present
    model_df = pd.concat([data.drop(columns=enc_val.columns, errors='ignore'), enc_val],
                         axis=1)
    train_df = model_df[model_df['dset'] == 'train']
    val_df = model_df[model_df['dset'] == 'val']

    predictor_features = setup.predictor_features + list(enc_val.columns)

    # Model fit; the eval_set makes XGBoost evaluate on train and validation data
    xgb_model = xgb.XGBClassifier(objective="binary:logistic",
                                  n_estimators = setup.xgb_n_estimators)
    xgb_model = xgb_model.set_params(**xgb_params)
    xgb_model.fit(train_df[predictor_features], train_df['target'],
                  eval_set = [(train_df[predictor_features], train_df['target']),
                              (val_df[predictor_features], val_df['target'])])

    # Predict for every row.  Assigning the probability array directly keeps the
    # rows aligned without a reset_index / set_index round trip, which failed
    # when the input index had a name.  The index is still named 'index'.
    id_columns = ['LoanNr_ChkDgt', 'dset', 'target', 'dset_naics_holdout']
    pred_df = model_df[id_columns] \
        .assign(predict_prob = xgb_model.predict_proba(model_df[predictor_features])[:,1]) \
        .rename_axis('index')

    # Decision threshold: the threshold with the highest F1 on the training rows
    pred_train_df = pred_df[pred_df['dset'] == 'train']
    thresh_tune_data = sg_plot.get_f1_frame(pred_train_df['target'],
                                            pred_train_df['predict_prob'])
    best_thresh = thresh_tune_data.sort_values('f1', ascending=False)['thresh'].iloc[0]
    pred_df['predict_bin'] = np.where(pred_df['predict_prob'] >= best_thresh, 1, 0)

    # Metrics per split, then the test split broken out by the holdout flag
    metrics_dset_df = pred_df.groupby('dset') \
        .apply(lambda x: sg_plot.dset_metrics(x.target, x.predict_bin, x.predict_prob)) \
        .reset_index()
    metrics_test_df = pred_df[pred_df['dset'] == 'test'] \
        .groupby(['dset', 'dset_naics_holdout']) \
        .apply(lambda x: sg_plot.dset_metrics(x.target, x.predict_bin, x.predict_prob)) \
        .reset_index()
    metrics_df = pd.concat([metrics_dset_df, metrics_test_df])

    return pred_df, metrics_df

## Loop to get predictions, metrics

In [14]:
# Mean of the target over the training rows
target_mean = sba_loans.loc[sba_loans['dset'] == 'train', 'target'].mean()
target_mean

0.2046546821975603

In [18]:
# Fill values to try: None first, then constants (some scaled from the
# training target mean) in ascending order
count_sets = [None, *sorted([-1, 0, target_mean/2, target_mean, target_mean*1.5, 1])]
count_sets

[None, -1, 0, 0.10232734109878015, 0.2046546821975603, 0.30698202329634044, 1]

In [20]:
%%capture
# Fit one model per fill value, encoding NAICS only.  Per-run frames are collected
# in lists and concatenated once, instead of growing a DataFrame inside the loop
# (which re-copies all earlier rows each pass and starts from an empty frame).
pred_list = []
metrics_list = []
for c in count_sets:
    this_pred, this_met = get_metrics(sba_loans, c, ['NAICS'])
    # Tag each result with the fill value that produced it
    this_pred['c'] = c
    this_met['c'] = c
    pred_list.append(this_pred)
    metrics_list.append(this_met)
pred_df = pd.concat(pred_list)
metrics_df = pd.concat(metrics_list)

In [21]:
# Metric rows where the holdout flag equals 1
metrics_df.loc[metrics_df['dset_naics_holdout'].eq(1)]

Unnamed: 0,dset,accuracy_score,f1_score,precision_score,recall_score,average_precision_score,roc_auc_score,dset_naics_holdout,c
1,test,0.689233,0.416768,0.31623,0.611031,0.34569,0.727347,1.0,
1,test,0.693107,0.412409,0.316227,0.592674,0.342408,0.725531,1.0,-1.0
1,test,0.693107,0.412409,0.316227,0.592674,0.342408,0.725531,1.0,0.0
1,test,0.70436,0.411037,0.322133,0.567718,0.34414,0.726095,1.0,0.102327
1,test,0.68931,0.416929,0.316347,0.611285,0.346063,0.72915,1.0,0.204655
1,test,0.655645,0.418171,0.301724,0.680991,0.346727,0.728946,1.0,0.306982
1,test,0.674091,0.417827,0.309319,0.6436,0.343602,0.727716,1.0,1.0


In [22]:
# Metric rows where the holdout flag equals 0
metrics_df.loc[metrics_df['dset_naics_holdout'].eq(0)]

Unnamed: 0,dset,accuracy_score,f1_score,precision_score,recall_score,average_precision_score,roc_auc_score,dset_naics_holdout,c
0,test,0.659483,0.448981,0.336491,0.674452,0.382759,0.728253,0.0,
0,test,0.660072,0.449008,0.336794,0.67336,0.382155,0.728078,0.0,-1.0
0,test,0.660072,0.449008,0.336794,0.67336,0.382155,0.728078,0.0,0.0
0,test,0.658221,0.44858,0.335693,0.675857,0.382431,0.727874,0.0,0.102327
0,test,0.660371,0.449761,0.337277,0.674817,0.382314,0.72844,0.0,0.204655
0,test,0.660286,0.449221,0.336995,0.673516,0.382816,0.728442,0.0,0.306982
0,test,0.657814,0.448818,0.335602,0.677314,0.382596,0.728287,0.0,1.0


In [23]:
# Save the combined predictions and metrics to the temp directory
pred_df.to_parquet(Path(setup.temp_path) / 'A17_DATA_combined_predictions.parquet')
metrics_df.to_csv(Path(setup.temp_path) / 'A17_REPORT_metrics.csv', index=True)

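This isn't tragic: the test metrics above are close across all fill values, for both the holdout and non-holdout NAICS groups.  Using a dummy value could be OK.  I would still want to try the hierarchy, or add an indicator.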