In [1]:
# Modify the file 00_setup.py to define input/output file paths on your system
# The information in 00_setup.py will be used across notebooks
#
# The file name starts with a digit, so it cannot be imported with a plain
# `import` statement and is loaded from its path instead.  exec_module() is
# used because SourceFileLoader.load_module() is deprecated.
import importlib.util
import sys
from importlib.machinery import SourceFileLoader

_setup_loader = SourceFileLoader("setup", "./00_setup.py")
_setup_spec = importlib.util.spec_from_loader("setup", _setup_loader)
setup = importlib.util.module_from_spec(_setup_spec)
sys.modules["setup"] = setup  # load_module() also registered the module under this name
_setup_loader.exec_module(setup)

# Data Imputation and Scaling for Neural Networks
Convert features so they can be used in a neural network (or GNN).  
* Impute missing features
* Scale values
* Set an "alternative" NAICS for encoding
  * For NAICS not in the training data, use the highest-volume NAICS within the sector

For continuous features, I apply a quantile transform and then min/max scale to the range [-1, 1].  For binary fields (and fields with a small number of levels) I only min/max rescale.

*This script takes about 5 minutes on my MacBook Air*

In [2]:
import pandas as pd
import numpy as np

In [3]:
from pathlib import Path
import importlib, pickle
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder

In [4]:
# Imputer object for easy dataset conversion to GNN friendly format
from sba_gnn.sba_gnn import sg_imputer 
from sba_gnn.sba_gnn.sg_imputer import GNNImputer

## Input training data, set alternate NAICS

Import the data, and set an alternate NAICS for test and validation, such that NAICS that are not in the training data are set to the most common NAICS in the same sector in the training data

##### Import train data

In [5]:
sba_loans = pd.read_parquet(Path(setup.temp_path).joinpath('01_DATA_transformed.parquet'))

## Get alternative NAICS (test and validation)
Map unseen NAICS using hierarchical information.  Use the highest-volume code in a grouping

In [6]:
def get_common_naics(data, group_var= 'NAICS_sector', alt_var = 'NAICS_alt'):
    """Get an alternative NAICS as the most common value in a larger group.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'NAICS' column and the column named by `group_var`.
    group_var : str
        Column that defines the larger grouping.
    alt_var : str
        Name of the output column holding the most common NAICS per group.

    Returns
    -------
    pandas.DataFrame
        One row per group, columns [group_var, alt_var], ordered by group,
        with a fresh 0..n-1 index.  Rows with a missing group or a missing
        NAICS are not counted.  Ties in frequency are broken by taking the
        smallest NAICS value, so the result is deterministic.  Empty input
        returns an empty frame with the two output columns.
    """
    if data.empty:
        # Nothing to count; return the expected schema instead of raising
        return pd.DataFrame(columns=[group_var, alt_var])

    count_col = '__naics_count__'

    # One pass: number of rows for every (group, NAICS) pair that occurs
    pair_counts = data.groupby([group_var, 'NAICS'], observed=True) \
        .size() \
        .reset_index(name=count_col)

    # Within each group put the highest count first; the trailing NAICS key
    # makes the order among equal counts explicit
    ranked = pair_counts.sort_values([group_var, count_col, 'NAICS'],
                                     ascending=[True, False, True])

    alt_naics = ranked.drop_duplicates(subset=group_var, keep='first') \
        .rename(columns={'NAICS': alt_var}) \
        .reset_index(drop=True) \
        [[group_var, alt_var]]
    return alt_naics

In [7]:
def remap_naics(data, group_var= 'NAICS_sector', alt_var = 'NAICS_alt'):
    """Add `alt_var`: NAICS with unseen test/validation codes mapped to same-group codes.

    Rows with dset == 'train' keep their own NAICS.  For all other rows, a
    NAICS that also occurs in the training rows is kept; a NAICS that does not
    is replaced by the group-level code from get_common_naics() computed on
    the training rows, or kept unchanged when the group has no such code.

    Returns `data` without any pre-existing `alt_var` column, merged with the
    new `alt_var` values on 'LoanNr_ChkDgt'.
    """
    id_col = 'LoanNr_ChkDgt'
    train_rows = data[data['dset'] == 'train']
    other_rows = data[data['dset'] != 'train']

    # Training rows: the alternative code is simply the original code
    train_part = train_rows[['NAICS', id_col]].rename(columns={'NAICS': alt_var})

    # Fallback code per group, learned from the training rows only
    group_fallback = get_common_naics(train_rows, group_var, alt_var)

    # Flag each non-training row by whether its NAICS occurs in training
    seen_codes = train_rows[['NAICS']].drop_duplicates()
    flagged = other_rows[[id_col, 'NAICS', group_var]].merge(
        seen_codes, how='left', on='NAICS', indicator=True)
    is_seen = flagged['_merge'] == 'both'
    is_unseen = flagged['_merge'] == 'left_only'

    # Seen codes are passed through unchanged
    seen_part = flagged[is_seen] \
        .drop(columns=['_merge', group_var], errors='ignore') \
        .rename(columns={'NAICS': alt_var})

    # Unseen codes take the fallback code of their group
    unseen_part = flagged[is_unseen] \
        .drop(columns=['_merge']) \
        .merge(group_fallback, how='left', on=group_var) \
        .drop(columns=[group_var])

    # No fallback for the group: keep the unseen code itself
    unseen_part[alt_var] = unseen_part[alt_var].fillna(unseen_part['NAICS'])
    unseen_part = unseen_part.drop(columns='NAICS')

    lookup = pd.concat([train_part, seen_part, unseen_part])[[id_col, alt_var]]

    # Drop any earlier alt_var column before attaching the new one
    return data.drop(columns=[alt_var], errors='ignore') \
        .merge(lookup, on=id_col)

##### Sector map

In [8]:
sba_loans = remap_naics(sba_loans)

In [9]:
sba_loans.shape

(688081, 56)

In [10]:
(sba_loans['NAICS_alt'] == sba_loans['NAICS']).value_counts()

True     623012
False     65069
Name: count, dtype: int64

In [11]:
sba_loans['NAICS_alt'].isna().value_counts()

NAICS_alt
False    688081
Name: count, dtype: int64

##### Subsector map

In [12]:
sba_loans['naics3'] = sba_loans['NAICS'].str[0:3]

In [13]:
sba_loans = remap_naics(sba_loans, group_var='naics3', alt_var = 'NAICS_alt3')

In [14]:
(sba_loans['NAICS_alt3'] == sba_loans['NAICS']).value_counts()

True     623013
False     65068
Name: count, dtype: int64

In [15]:
sba_loans['NAICS_alt3'].isna().value_counts()

NAICS_alt3
False    688081
Name: count, dtype: int64

##### Industry group map

In [16]:
sba_loans['naics4'] = sba_loans['NAICS'].str[0:4]

In [17]:
sba_loans = remap_naics(sba_loans, group_var='naics4', alt_var = 'NAICS_alt4')

In [18]:
(sba_loans['NAICS_alt4'] == sba_loans['NAICS']).value_counts()

True     627690
False     60391
Name: count, dtype: int64

In [19]:
sba_loans['NAICS_alt4'].isna().value_counts()

NAICS_alt4
False    688081
Name: count, dtype: int64

## Imputer

In [20]:
# Training rows, and all remaining rows (every dset value other than 'train')
train_df = sba_loans.loc[sba_loans['dset'] == 'train']
test_val_df = sba_loans.loc[sba_loans['dset'] != 'train']

For a neural network, I need to handle missings and also scale features.  For missing values, I will do a simple median fill for all, but add missing indicators.  

To scale the data, I will do a quantile transform for features with > 5 levels.  This is to avoid quantile-transforming binary features.  Then I do a Min/Max scaling on all features, so they are in the [-1, 1] range.

I define a class to do this in sg_imputer.  This class wraps several scikit-learn imputers/scalers so that I can easily fit the objects and then transform the data.  In this section, I show some descriptives to justify my imputation/scaling choices, and also fit the scaler

In [21]:
# NOTE(review): repeats the train_df assignment made in the previous code cell
train_df = sba_loans.loc[sba_loans['dset'] == 'train']

##### Descriptives - explore missingness

In [22]:
# Number of missing values per predictor feature in the training data
pd.concat(
    [pd.DataFrame([train_df[feat].isna().sum()]) for feat in setup.predictor_features],
    keys=setup.predictor_features,
).set_axis(['count_missing'], axis=1)

Unnamed: 0,Unnamed: 1,count_missing
NoEmp,0,0
CreateJob,0,0
LowDoc,0,3227
DisbursementGross,0,0
new_business,0,536
urban_flag,0,84496
franchise_flag,0,0


Most fields have few or no missing values.  However, I must handle those that do.  I will use a median fill, especially since many of the fields with missingness seem to be binary.

##### Explore scaling

In [23]:
# Number of distinct non-missing values (levels) per predictor feature
feature_level_df = pd.concat(
    [pd.DataFrame([train_df[feat].nunique()]) for feat in setup.predictor_features],
    keys=setup.predictor_features,
)
feature_level_df

Unnamed: 0,Unnamed: 1,0
NoEmp,0,426
CreateJob,0,194
LowDoc,0,2
DisbursementGross,0,80190
new_business,0,2
urban_flag,0,2
franchise_flag,0,2


A number of features are binary.  I don't need to scale these.  The others must be scaled.  

What I will do is scale features with more than a threshold number of levels using a quantile scaler.  Then I'll use a MinMax scaler on everything to be safe (as a 2 level feature could have values other than 0,1).

##### Create imputer to do the median fill and scaling
This imputer fills missing values, adding missingness indicator features.  Then it quantile scales features with over 5 levels, and MinMax scales all features.

In [24]:
importlib.reload(sg_imputer)
from sba_gnn.sba_gnn.sg_imputer import GNNImputer

In [25]:
imputer = GNNImputer(features = setup.predictor_features,
                    naics_features = ['NAICS', 'NAICS_alt', 'NAICS_alt3', 'NAICS_alt4'])                               

In [26]:
train_out = imputer.fit_transform(train_df)

In [27]:
train_out.describe()

Unnamed: 0,NoEmp,CreateJob,LowDoc,DisbursementGross,new_business,urban_flag,franchise_flag,missingindicator_LowDoc,missingindicator_new_business,missingindicator_urban_flag,NAICS,NAICS_alt,NAICS_alt3,NAICS_alt4
count,436120.0,436120.0,436120.0,436120.0,436120.0,436120.0,436120.0,436120.0,436120.0,436120.0,436120.0,436120.0,436120.0,436120.0
mean,-0.0088,-0.422097,-0.805893,0.004205,-0.438659,0.701807,-0.894534,-0.985201,-0.997542,-0.61251,774.081945,774.081945,774.081945,774.081945
std,0.575324,0.797169,0.592062,0.574403,0.898655,0.712368,0.447001,0.171401,0.070072,0.790463,324.994681,324.994681,324.994681,324.994681
min,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0,1.0,1.0
25%,-0.447447,-1.0,-1.0,-0.489704,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,654.0,654.0,654.0,654.0
50%,0.024024,-1.0,-1.0,0.002002,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,843.0,843.0,843.0,843.0
75%,0.499499,0.562563,-1.0,0.496796,1.0,1.0,-1.0,-1.0,-1.0,-1.0,1055.0,1055.0,1055.0,1055.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1166.0,1166.0,1166.0,1166.0


In [28]:
with open(Path(setup.temp_path).joinpath('10_DATA_imputer.pkl'), 'wb') as fout:
    pickle.dump(imputer, fout)

In [29]:
with open(Path(setup.temp_path).joinpath('10_DATA_features.pkl'), 'wb') as fout:
    pickle.dump(imputer.features_out, fout)

In [30]:
naics_max_levels = imputer.get_naics_encoder_levels()
print(f'NAICS encoder max num: {naics_max_levels}')
with open(Path(setup.temp_path).joinpath('10_DATA_naics_max_levels.pkl'), 'wb') as fout:
    pickle.dump(naics_max_levels, fout)

NAICS encoder max num: 1166


## Transform Test, Validation Data

In [31]:
test_df = sba_loans[sba_loans['dset'] == 'test']
val_df = sba_loans[sba_loans['dset'] == 'val']

In [32]:
test_out = imputer.transform(test_df)

In [33]:
val_out = imputer.transform(val_df)

In [34]:
pd.concat([test_out[['DisbursementGross', 'NoEmp']], test_df[['DisbursementGross', 'NoEmp']]],
          axis=1).corr(method='spearman')

Unnamed: 0,DisbursementGross,NoEmp,DisbursementGross.1,NoEmp.1
DisbursementGross,1.0,0.452619,1.0,0.452619
NoEmp,0.452619,1.0,0.452619,1.0
DisbursementGross,1.0,0.452619,1.0,0.452619
NoEmp,0.452619,1.0,0.452619,1.0


In [35]:
val_out.describe()

Unnamed: 0,NoEmp,CreateJob,LowDoc,DisbursementGross,new_business,urban_flag,franchise_flag,missingindicator_LowDoc,missingindicator_new_business,missingindicator_urban_flag,NAICS,NAICS_alt,NAICS_alt3,NAICS_alt4
count,93454.0,93454.0,93454.0,93454.0,93454.0,93454.0,93454.0,93454.0,93454.0,93454.0,93454.0,93454.0,93454.0,93454.0
mean,-0.008533,-0.419911,-0.804289,0.000867,-0.437691,0.702656,-0.894066,-0.985255,-0.997517,-0.615576,774.662829,774.693935,774.696546,774.696578
std,0.575003,0.79747,0.594242,0.575508,0.89913,0.711534,0.447939,0.171095,0.070419,0.788082,325.048245,325.011876,325.008575,325.008661
min,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,1.0,1.0,1.0
25%,-0.447447,-1.0,-1.0,-0.512513,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,654.0,654.0,654.0,654.0
50%,0.024024,-1.0,-1.0,-0.005554,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,845.0,845.0,845.0,845.0
75%,0.499499,0.562563,-1.0,0.497497,1.0,1.0,-1.0,-1.0,-1.0,-1.0,1054.0,1054.0,1054.0,1054.0
max,1.0,1.0,1.0,0.999433,1.0,1.0,1.0,1.0,1.0,1.0,1166.0,1166.0,1166.0,1166.0


In [36]:
test_out.describe()

Unnamed: 0,NoEmp,CreateJob,LowDoc,DisbursementGross,new_business,urban_flag,franchise_flag,missingindicator_LowDoc,missingindicator_new_business,missingindicator_urban_flag,NAICS,NAICS_alt,NAICS_alt3,NAICS_alt4
count,158507.0,158507.0,158507.0,158507.0,158507.0,158507.0,158507.0,158507.0,158507.0,158507.0,158507.0,158507.0,158507.0,158507.0
mean,0.009038,-0.424583,-0.801031,0.008279,-0.462976,0.716378,-0.915726,-0.985666,-0.997502,-0.568764,456.688765,746.977364,753.951258,752.672734
std,0.576679,0.795229,0.598625,0.573027,0.886373,0.697714,0.401804,0.168708,0.070643,0.822504,455.236881,329.028393,321.74266,330.737402
min,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,1.0,0.0,0.0
25%,-0.447447,-1.0,-1.0,-0.485726,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,0.0,619.0,603.0,597.0
50%,0.024024,-1.0,-1.0,0.002002,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,345.0,844.0,825.0,825.0
75%,0.499499,0.562563,-1.0,0.507508,1.0,1.0,-1.0,-1.0,-1.0,-1.0,937.0,1029.0,1029.0,1029.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1166.0,1166.0,1166.0,1166.0


In [37]:
test_out[['NAICS', 'NAICS_alt']].corr()

Unnamed: 0,NAICS,NAICS_alt
NAICS,1.0,0.4985
NAICS_alt,0.4985,1.0


In [38]:
val_out[['NAICS', 'NAICS_alt']].corr()

Unnamed: 0,NAICS,NAICS_alt
NAICS,1.0,0.999884
NAICS_alt,0.999884,1.0


In [39]:
train_out[['NAICS', 'NAICS_alt']].corr()

Unnamed: 0,NAICS,NAICS_alt
NAICS,1.0,1.0
NAICS_alt,1.0,1.0


## Combine data
Combine transformed datasets, with key information.  Include the business ID, target status, NAICS features, as well as transformed features from above. Save this data for later use (especially the indices which indicate train/test status).

In [42]:
# Untransformed key columns to carry along: loan ID, target, holdout flag,
# the NAICS code / sector / encoding columns, plus every column starting with 'NS__'
naics_features = [
    'LoanNr_ChkDgt', 'target', 'dset_naics_holdout', 'NAICS', 'NAICS_sector',
    'menc_NAICS', 'cenc_NAICS', 'NAICS_alt', 'NAICS_alt3', 'NAICS_alt4',
]
naics_features.extend(col for col in train_df.columns if col.startswith('NS__'))
print(naics_features)

['LoanNr_ChkDgt', 'target', 'dset_naics_holdout', 'NAICS', 'NAICS_sector', 'menc_NAICS', 'cenc_NAICS', 'NAICS_alt', 'NAICS_alt3', 'NAICS_alt4', 'NS___Accommodation and Food Services', 'NS___Administrative and Support and Waste Management and Remediation Services', 'NS___Construction', 'NS___Health Care and Social Assistance', 'NS___Manufacturing', 'NS___Other Services (except Public Administration)', 'NS___Professional, Scientific, and Technical Services', 'NS___Retail Trade', 'NS___Wholesale Trade', 'NS___infrequent_sklearn']


In [43]:
# Stack the key columns of the three splits under a (split, row label) index.
# The NAICS columns get an '_orig' suffix so they do not clash with the
# same-named columns of the imputer output.
comb_naics = (
    pd.concat(
        [split[naics_features] for split in (train_df, test_df, val_df)],
        axis=0,
        keys=['train', 'test', 'val'],
    )
    .rename(columns={'NAICS':'NAICS_orig', 'NAICS_alt':'NAICS_alt_orig',
                     'NAICS_alt3':'NAICS_alt3_orig', 'NAICS_alt4':'NAICS_alt4_orig'})
    .sort_index()
)

In [44]:
# Put the untransformed key columns next to the transformed features.  Both
# frames carry a (split, row label) index and are aligned on it.
comb_df = pd.concat([comb_naics,
                     pd.concat([train_out, test_out, val_out],
                               axis=0, keys=['train', 'test', 'val']).sort_index()],
                    axis=1)

# concat(axis=1) is an outer join on the index: if the transformed frames did
# not keep the row labels of their inputs, extra NaN-padded rows would appear
# silently.  Fail loudly instead.
assert len(comb_df) == len(comb_naics) == len(train_out) + len(test_out) + len(val_out), \
    'Transformed data and key columns do not align on the index'

# Move the split key out of the index into a 'dset' column
comb_df = comb_df \
    .reset_index(level=0) \
    .rename(columns={'level_0':'dset'}, errors='ignore')
print(comb_df.shape)

(688081, 35)


In [45]:
print(comb_df.columns)

Index(['dset', 'LoanNr_ChkDgt', 'target', 'dset_naics_holdout', 'NAICS_orig',
       'NAICS_sector', 'menc_NAICS', 'cenc_NAICS', 'NAICS_alt_orig',
       'NAICS_alt3_orig', 'NAICS_alt4_orig',
       'NS___Accommodation and Food Services',
       'NS___Administrative and Support and Waste Management and Remediation Services',
       'NS___Construction', 'NS___Health Care and Social Assistance',
       'NS___Manufacturing',
       'NS___Other Services (except Public Administration)',
       'NS___Professional, Scientific, and Technical Services',
       'NS___Retail Trade', 'NS___Wholesale Trade', 'NS___infrequent_sklearn',
       'NoEmp', 'CreateJob', 'LowDoc', 'DisbursementGross', 'new_business',
       'urban_flag', 'franchise_flag', 'missingindicator_LowDoc',
       'missingindicator_new_business', 'missingindicator_urban_flag', 'NAICS',
       'NAICS_alt', 'NAICS_alt3', 'NAICS_alt4'],
      dtype='object')


In [46]:
comb_df.to_parquet(Path(setup.temp_path).joinpath('10_DATA_combined_scaled_all.parquet'))