In [2]:
# Modify the file 00_setup.py to define input/output file paths on your system
# The information in 00_setup.py will be used across notebooks
#
# 00_setup.py starts with a digit, so it cannot be imported with a plain
# `import` statement; it is loaded by path under the module name "setup".
# exec_module() is used because Loader.load_module() is deprecated.
import importlib.util
import sys
from importlib.machinery import SourceFileLoader

_setup_loader = SourceFileLoader("setup", "../00_setup.py")
_setup_spec = importlib.util.spec_from_loader("setup", _setup_loader)
setup = importlib.util.module_from_spec(_setup_spec)
# load_module() registered the module in sys.modules; keep doing so, so the
# module stays reachable as sys.modules["setup"].
sys.modules["setup"] = setup
_setup_loader.exec_module(setup)

# Neural Network Data Prep
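Here, I run code to process the tabular data. I use a test imputer (`TestImputer`, from `test_imputer.py`): it is fit on the training data and then applied to the test and validation data.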

*This script takes about 5 minutes on my MacBook Air*

In [3]:
import pandas as pd
import numpy as np

In [22]:
from pathlib import Path
import importlib, pickle
import sklearn
from sklearn import model_selection

In [17]:
import os

In [45]:
# Imputer object for easy dataset conversion to GNN friendly format
import test_imputer
from test_imputer import TestImputer 

In [44]:
# Pick up edits to test_imputer.py without restarting the kernel.
# reload() does not refresh names bound with `from test_imputer import TestImputer`,
# so rebind the class to the reloaded module's definition; otherwise the
# notebook would keep using the class object from before the reload.
test_imputer = importlib.reload(test_imputer)
TestImputer = test_imputer.TestImputer
test_imputer  # display the reloaded module (and the path it was loaded from)

<module 'test_imputer' from '/Users/valeriecarey/Documents/projects/2023_10_blog_gnn_sba/code/test/test_imputer.py'>

## Input data, fit imputer

In [18]:
# Move the working directory up one level from where the notebook started.
# The starting directory is remembered the first time this cell runs, so
# re-running the cell does not keep climbing the directory tree.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.dirname(_NOTEBOOK_START_DIR))

In [19]:
# Load the training split from the directory configured as setup.temp_path.
train_path = Path(setup.temp_path) / '01_DATA_combined_base_train.parquet'
train_df = pd.read_parquet(train_path)

In [46]:
# Build the imputer over the predictor features listed in the setup module.
imputer = TestImputer(features=setup.predictor_features)

In [47]:
# Fit the imputer on the training data and transform that same data in one call.
# The fitted `imputer` is reused below for the test and validation splits.
train_out = imputer.fit_transform(train_df)

In [48]:
# Spot check: find the training row(s) with the largest NoEmp value.
# Index of this is 76763, LoanNr_ChkDgt 1579734005.  NoEmp = 8000
max_noemp_mask = train_df['NoEmp'] == train_df['NoEmp'].max()
train_df.loc[max_noemp_mask, ['LoanNr_ChkDgt', 'NoEmp', 'urban_flag']]

Unnamed: 0_level_0,LoanNr_ChkDgt,NoEmp,urban_flag
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
76763,1579734005,8000,


In [49]:
# Transformed values for index 76763, the training row found above with the maximum NoEmp.
# Its urban_flag is missing in train_df, so this shows how the imputer filled and flagged it.
train_out.loc[76763]

new_business                       1.000000
urban_flag                         1.000000
franchise_flag                     0.000000
missingindicator_LowDoc            0.000000
missingindicator_new_business      0.000000
missingindicator_urban_flag        1.000000
NoEmp                            151.016280
CreateJob                         -0.132311
LowDoc                             3.041835
DisbursementGross                 -0.316339
Name: 76763, dtype: float64

In [50]:
# Check which transformed row holds the largest NoEmp value after the imputer ran.
is_max_noemp = train_out['NoEmp'].eq(train_out['NoEmp'].max())
train_out.loc[is_max_noemp]

Unnamed: 0_level_0,new_business,urban_flag,franchise_flag,missingindicator_LowDoc,missingindicator_new_business,missingindicator_urban_flag,NoEmp,CreateJob,LowDoc,DisbursementGross
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
76763,1.0,1.0,0.0,0.0,0.0,1.0,151.01628,-0.132311,3.041835,-0.316339


In [60]:
# Counts of each LowDoc value in the input training frame (train_df, before the imputer is applied).
train_df['LowDoc'].value_counts()

LowDoc
0.0    369518
1.0     40267
Name: count, dtype: int64

In [62]:
# Spot check: training rows at the minimum DisbursementGross value.
# One index is 23135
min_gross_mask = train_df['DisbursementGross'] == train_df['DisbursementGross'].min()
train_df.loc[min_gross_mask, ['LoanNr_ChkDgt', 'NoEmp', 'urban_flag', 'DisbursementGross']]

Unnamed: 0_level_0,LoanNr_ChkDgt,NoEmp,urban_flag,DisbursementGross
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
23135,1174365010,3,1.0,4000.0
716027,7471664003,5,1.0,4000.0
71365,1541105005,1,1.0,4000.0
281056,2954754008,1,,4000.0
368030,3580896006,1,,4000.0
...,...,...,...,...
208646,2479305009,2,1.0,4000.0
32327,1245855002,1,1.0,4000.0
686855,7131014000,1,1.0,4000.0
160934,2169985008,1,0.0,4000.0


In [63]:
# Transformed values for index 23135, one of the minimum-DisbursementGross training rows found above.
train_out.loc[23135]

new_business                     1.000000
urban_flag                       1.000000
franchise_flag                   0.000000
missingindicator_LowDoc          0.000000
missingindicator_new_business    0.000000
missingindicator_urban_flag      0.000000
NoEmp                           -0.125757
CreateJob                       -0.071381
LowDoc                          -0.328749
DisbursementGross               -0.638317
Name: 23135, dtype: float64

## Transform Test, Validation Data

In [51]:
# Load the test split from the directory configured as setup.temp_path.
test_path = Path(setup.temp_path) / '01_DATA_combined_base_test.parquet'
test_df = pd.read_parquet(test_path)

In [52]:
# Load the validation split from the directory configured as setup.temp_path.
val_path = Path(setup.temp_path) / '01_DATA_combined_base_val.parquet'
val_df = pd.read_parquet(val_path)

In [53]:
# Apply the imputer that was fit on the training data to the test split (transform only).
test_out = imputer.transform(test_df)

In [54]:
# Apply the imputer that was fit on the training data to the validation split (transform only).
val_out = imputer.transform(val_df)

In [55]:
# Columns carried over unchanged from the input frames: the loan id, the target,
# the NAICS fields, and every column whose name starts with 'NS__'.
naics_fixed_cols = ['LoanNr_ChkDgt', 'target', 'NAICS', 'NAICS_sector', 'menc_NAICS', 'menc_NAICS_sector']
naics_ns_cols = [col for col in train_df.columns if col.startswith('NS__')]
naics_features = naics_fixed_cols + naics_ns_cols
print(naics_features)

['LoanNr_ChkDgt', 'target', 'NAICS', 'NAICS_sector', 'menc_NAICS', 'menc_NAICS_sector', 'NS___Accommodation and Food Services', 'NS___Construction', 'NS___Health Care and Social Assistance', 'NS___Manufacturing', 'NS___Other Services (except Public Administration)', 'NS___Professional, Scientific, and Technical Services', 'NS___Retail Trade', 'NS___Wholesale Trade', 'NS___infrequent_sklearn']


In [56]:
# Stack the NAICS/id columns of the three splits; the outer index level
# added by `keys` records which split each row came from.
naics_frames = [train_df[naics_features], test_df[naics_features], val_df[naics_features]]
comb_naics = pd.concat(naics_frames, axis=0, keys=['train', 'test', 'val'])

In [57]:
# Stack the imputer outputs with the same split keys as comb_naics so the two
# frames line up on (split, original index) when joined side by side.
scaled_all = pd.concat([train_out, test_out, val_out],
                       axis=0, keys=['train', 'test', 'val'])
comb_df = pd.concat([comb_naics, scaled_all], axis=1)
# Turn the split key (outer index level) into an ordinary column named 'dset'.
comb_df = comb_df.reset_index(level=0)
comb_df = comb_df.rename(columns={'level_0':'dset'}, errors='ignore')
print(comb_df.shape)

(688081, 26)


In [25]:
# List the combined columns: the split label ('dset'), the NAICS/id columns,
# then the columns produced by the imputer.
print(comb_df.columns)

Index(['dset', 'LoanNr_ChkDgt', 'target', 'NAICS', 'NAICS_sector',
       'menc_NAICS', 'menc_NAICS_sector',
       'NS___Accommodation and Food Services', 'NS___Construction',
       'NS___Health Care and Social Assistance', 'NS___Manufacturing',
       'NS___Other Services (except Public Administration)',
       'NS___Professional, Scientific, and Technical Services',
       'NS___Retail Trade', 'NS___Wholesale Trade', 'NS___infrequent_sklearn',
       'NoEmp', 'CreateJob', 'LowDoc', 'DisbursementGross', 'new_business',
       'urban_flag', 'franchise_flag', 'missingindicator_LowDoc',
       'missingindicator_new_business', 'missingindicator_urban_flag'],
      dtype='object')


In [58]:
# Write the combined train/test/val frame to the directory configured as setup.temp_path.
combined_out_path = Path(setup.temp_path) / 'TMP_11_DATA_combined_scaled_all.parquet'
comb_df.to_parquet(combined_out_path)