In [1]:
# Modify the file 00_setup.py to define input/output file paths on your system
# The information in 00_setup.py will be used across notebooks
from importlib.machinery import SourceFileLoader
# Load 00_setup.py as a module named "setup".  A plain `import` statement
# cannot be used because the file name starts with digits.
# SourceFileLoader.load_module() is deprecated, so the module is created from
# a spec and executed explicitly instead.
import importlib.util
import sys

_setup_loader = SourceFileLoader("setup", "./00_setup.py")
_setup_spec = importlib.util.spec_from_loader("setup", _setup_loader)
setup = importlib.util.module_from_spec(_setup_spec)
# load_module() registered the module in sys.modules; keep doing so in order
# that any lookup of sys.modules["setup"] continues to work.
sys.modules["setup"] = setup
_setup_loader.exec_module(setup)

# Data Imputation and Scaling for Neural Networks
Convert features so they can be used in a neural network (or GNN).  
* Impute missing features
* Scale values

For continuous features, I do quantile encoding then min/max scaling to -1, 1.  For binary fields (and fields with a small number of levels) I just min/max rescale

*This script takes about 5 minutes on my MacBook Air*

In [2]:
import pandas as pd
import numpy as np

In [53]:
from pathlib import Path
import importlib, pickle
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder

In [4]:
# Imputer object for easy dataset conversion to GNN friendly format
from sba_gnn.sba_gnn import sg_imputer 
from sba_gnn.sba_gnn.sg_imputer import GNNImputer

## Input training data, descriptives, fit imputer

For a neural network, I need to handle missing values and also scale features.  For missing values, I will do a simple median fill for all features, but add missing indicators.  

To scale the data, I will do a quantile transform for features with > 5 levels.  The threshold is there to avoid quantile-transforming binary features.  Then I do a Min/Max scaling on all features, so they are in the [-1, 1] range.

I define a class to do this in sg_imputer.  This class wraps several scikit-learn imputers/scalers so that I can easily fit the objects and then transform the data.  In this section, I show some descriptives to justify my imputation/scaling choices, and also fit the scaler

##### Import train data, show missing

In [5]:
# Read the transformed loan data from the configured temp directory
transformed_path = Path(setup.temp_path) / '01_DATA_transformed.parquet'
sba_loans = pd.read_parquet(transformed_path)

In [85]:
# Keep only the rows labelled as the training split
is_train = sba_loans['dset'] == 'train'
train_df = sba_loans[is_train]

In [87]:
# Number of missing values in each predictor feature of the training data
missing_frames = []
for feat in setup.predictor_features:
    missing_frames.append(pd.DataFrame([train_df[feat].isna().sum()]))
pd.concat(missing_frames, keys=setup.predictor_features) \
    .set_axis(['count_missing'], axis=1)

Unnamed: 0,Unnamed: 1,count_missing
NoEmp,0,0
CreateJob,0,0
LowDoc,0,3117
DisbursementGross,0,0
new_business,0,544
urban_flag,0,83339
franchise_flag,0,0


Most fields have few missing values.  However, I must handle those that do.  I will use a median fill, especially since many of the fields with missingness seem to be binary.

##### Explore scaling

In [88]:
# Level counts
feature_level_df = pd.concat([pd.DataFrame([train_df[c].value_counts().count()]) for c in setup.predictor_features],
         keys=setup.predictor_features)
feature_level_df

Unnamed: 0,Unnamed: 1,0
NoEmp,0,425
CreateJob,0,184
LowDoc,0,2
DisbursementGross,0,78795
new_business,0,2
urban_flag,0,2
franchise_flag,0,2


A number of features are binary.  I don't need to scale these.  The others must be scaled.  

What I will do is scale features with more than a threshold number of levels using a quantile scaler.  Then I'll use a MinMax scaler on everything to be safe (as a 2 level feature could have values other than 0,1).

##### Create imputer to do the median fill and scaling
This imputer fills missing values with the median, adding missingness indicator features.  Then it quantile scales features with over 5 levels, and finally MinMax scales all features.

In [306]:
# Reload the sg_imputer module, then rebind GNNImputer from the reloaded module.
# NOTE(review): presumably a development convenience for picking up edits to
# sg_imputer without restarting the kernel; on a fresh kernel this repeats the
# earlier import of GNNImputer.
importlib.reload(sg_imputer)
from sba_gnn.sba_gnn.sg_imputer import GNNImputer

In [307]:
# Build the imputer/scaler for the predictor features listed in 00_setup.py
predictor_cols = setup.predictor_features
imputer = GNNImputer(features=predictor_cols)

In [308]:
# Fit the imputer on the training rows and transform them in the same call
train_out = imputer.fit_transform(train_df)

In [309]:
# Summary statistics of the transformed training data (check ranges after scaling)
train_out.describe()

Unnamed: 0,NoEmp,CreateJob,LowDoc,DisbursementGross,new_business,urban_flag,franchise_flag,missingindicator_LowDoc,missingindicator_new_business,missingindicator_urban_flag,NAICS
count,425594.0,425594.0,425594.0,425594.0,425594.0,425594.0,425594.0,425594.0,425594.0,425594.0,425594.0
mean,0.000952,-0.425126,-0.80582,-0.008115,-0.440904,0.702341,-0.898283,-0.985352,-0.997444,-0.608364,816.985373
std,0.5753,0.794153,0.592162,0.57931,0.897555,0.711842,0.439417,0.170532,0.071459,0.793659,342.255738
min,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0
25%,-0.435435,-1.0,-1.0,-0.510511,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,687.0
50%,0.029029,-1.0,-1.0,-0.011011,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,897.0
75%,0.505506,0.557558,-1.0,0.492997,1.0,1.0,-1.0,-1.0,-1.0,-1.0,1094.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1232.0


In [311]:
# Pickle the fitted imputer into the temp directory
imputer_path = Path(setup.temp_path) / '10_DATA_imputer.pkl'
with imputer_path.open('wb') as fout:
    pickle.dump(imputer, fout)

In [276]:
# Pickle the imputer's output feature names into the temp directory
features_path = Path(setup.temp_path) / '10_DATA_features.pkl'
with features_path.open('wb') as fout:
    pickle.dump(imputer.features_out, fout)

In [312]:
# Report the NAICS encoder level count from the imputer and pickle it
naics_max_levels = imputer.get_naics_encoder_levels()
print(f'NAICS encoder max num: {naics_max_levels}')

naics_levels_path = Path(setup.temp_path) / '10_DATA_naics_max_levels.pkl'
with naics_levels_path.open('wb') as fout:
    pickle.dump(naics_max_levels, fout)

NAICS encoder max num: 1232


## Transform Test, Validation Data

In [313]:
# Select the test and validation rows by their 'dset' label
dset_labels = sba_loans['dset']
test_df = sba_loans[dset_labels == 'test']
val_df = sba_loans[dset_labels == 'val']

In [314]:
# Apply the already-fitted imputer to the test rows (transform only, no refit)
test_out = imputer.transform(test_df)

In [315]:
# Apply the already-fitted imputer to the validation rows (transform only, no refit)
val_out = imputer.transform(val_df)

In [316]:
# Compare scaled and raw columns by rank correlation; a value of 1 between a
# scaled column and its raw counterpart means the ordering was preserved
check_cols = ['DisbursementGross', 'NoEmp']
scaled_vs_raw = pd.concat([test_out[check_cols], test_df[check_cols]], axis=1)
scaled_vs_raw.corr(method='spearman')

Unnamed: 0,DisbursementGross,NoEmp,DisbursementGross.1,NoEmp.1
DisbursementGross,1.0,0.442105,1.0,0.442105
NoEmp,0.442105,1.0,0.442105,1.0
DisbursementGross,1.0,0.442105,1.0,0.442105
NoEmp,0.442105,1.0,0.442105,1.0


In [317]:
# Summary statistics of the transformed validation data
val_out.describe()

Unnamed: 0,NoEmp,CreateJob,LowDoc,DisbursementGross,new_business,urban_flag,franchise_flag,missingindicator_LowDoc,missingindicator_new_business,missingindicator_urban_flag,NAICS
count,126041.0,126041.0,126041.0,126041.0,126041.0,126041.0,126041.0,126041.0,126041.0,126041.0,126041.0
mean,-0.000669,-0.4215,-0.802905,-0.008551,-0.442769,0.70416,-0.898049,-0.985402,-0.997937,-0.606557,814.446014
std,0.575287,0.795284,0.596109,0.579318,0.896639,0.710044,0.439897,0.170247,0.064198,0.795043,343.605139
min,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0
25%,-0.435435,-1.0,-1.0,-0.513514,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,680.0
50%,0.029029,-1.0,-1.0,-0.011011,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,890.0
75%,0.505506,0.557558,-1.0,0.491491,1.0,1.0,-1.0,-1.0,-1.0,-1.0,1093.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1232.0


In [318]:
# Summary statistics of the transformed test data
test_out.describe()

Unnamed: 0,NoEmp,CreateJob,LowDoc,DisbursementGross,new_business,urban_flag,franchise_flag,missingindicator_LowDoc,missingindicator_new_business,missingindicator_urban_flag,NAICS
count,136446.0,136446.0,136446.0,136446.0,136446.0,136446.0,136446.0,136446.0,136446.0,136446.0,136446.0
mean,0.014605,-0.424661,-0.802134,-0.01071,-0.455448,0.715477,-0.903889,-0.985122,-0.99742,-0.582223,616.290606
std,0.577628,0.794512,0.597146,0.578379,0.890266,0.698639,0.427769,0.171855,0.071784,0.813032,460.186502
min,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0
25%,-0.435435,-1.0,-1.0,-0.515055,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,31.25
50%,0.029029,-1.0,-1.0,-0.011011,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,756.0
75%,0.505506,0.557558,-1.0,0.489428,1.0,1.0,-1.0,-1.0,-1.0,-1.0,1057.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1232.0


## Combine data
Combine transformed datasets, with key information.  Include the business ID, target status, NAICS features, as well as transformed features from above. Save this data for later use (especially the indices which indicate train/test status).

In [319]:
# Columns carried through untransformed: fixed identifier/target/NAICS columns
# plus every training column whose name starts with 'NS__'
id_and_naics_cols = ['LoanNr_ChkDgt', 'target', 'dset_naics_holdout', 'NAICS', 'NAICS_sector',
                     'menc_NAICS', 'menc_NAICS_sector']
ns_prefixed_cols = [col for col in train_df.columns if col.startswith('NS__')]
naics_features = id_and_naics_cols + ns_prefixed_cols
print(naics_features)

['LoanNr_ChkDgt', 'target', 'dset_naics_holdout', 'NAICS', 'NAICS_sector', 'menc_NAICS', 'menc_NAICS_sector', 'NS___Accommodation and Food Services', 'NS___Construction', 'NS___Health Care and Social Assistance', 'NS___Manufacturing', 'NS___Other Services (except Public Administration)', 'NS___Professional, Scientific, and Technical Services', 'NS___Retail Trade', 'NS___Wholesale Trade', 'NS___infrequent_sklearn']


In [320]:
# Stack the untransformed columns for all three splits; the outer index level
# records which split each row came from
split_names = ['train', 'test', 'val']
split_frames = [train_df[naics_features],
                test_df[naics_features],
                val_df[naics_features]]
comb_naics = pd.concat(split_frames, axis=0, keys=split_names)
# The transformed output also has a column called NAICS, so rename the raw one
comb_naics = comb_naics.rename(columns={'NAICS': 'NAICS_orig'})

In [321]:
# Stack the transformed features with the same split keys, then join them
# column-wise to the untransformed columns on the shared index
scaled_all = pd.concat([train_out, test_out, val_out],
                       axis=0, keys=['train', 'test', 'val'])
comb_df = pd.concat([comb_naics, scaled_all], axis=1)
# Move the split label out of the index into a regular column
comb_df = comb_df.reset_index(level=0)
comb_df = comb_df.rename(columns={'level_0': 'dset'}, errors='ignore')
print(comb_df.shape)

(688081, 28)


In [322]:
# List the combined frame's columns to confirm what will be written out
print(comb_df.columns)

Index(['dset', 'LoanNr_ChkDgt', 'target', 'dset_naics_holdout', 'NAICS_orig',
       'NAICS_sector', 'menc_NAICS', 'menc_NAICS_sector',
       'NS___Accommodation and Food Services', 'NS___Construction',
       'NS___Health Care and Social Assistance', 'NS___Manufacturing',
       'NS___Other Services (except Public Administration)',
       'NS___Professional, Scientific, and Technical Services',
       'NS___Retail Trade', 'NS___Wholesale Trade', 'NS___infrequent_sklearn',
       'NoEmp', 'CreateJob', 'LowDoc', 'DisbursementGross', 'new_business',
       'urban_flag', 'franchise_flag', 'missingindicator_LowDoc',
       'missingindicator_new_business', 'missingindicator_urban_flag',
       'NAICS'],
      dtype='object')


In [323]:
# Write the combined, scaled data for all splits to the temp directory
combined_path = Path(setup.temp_path) / '10_DATA_combined_scaled_all.parquet'
comb_df.to_parquet(combined_path)

In [324]:
# Pairwise correlations among the target and a few selected features
corr_cols = ['target', 'urban_flag', 'menc_NAICS', 'menc_NAICS_sector', 'DisbursementGross']
comb_df[corr_cols].corr()

Unnamed: 0,target,urban_flag,menc_NAICS,menc_NAICS_sector,DisbursementGross
target,1.0,0.013001,0.23299,0.108193,-0.166215
urban_flag,0.013001,1.0,-0.018373,-0.013307,-0.00415
menc_NAICS,0.23299,-0.018373,1.0,0.451235,-0.257954
menc_NAICS_sector,0.108193,-0.013307,0.451235,1.0,-0.130283
DisbursementGross,-0.166215,-0.00415,-0.257954,-0.130283,1.0
