In [1]:
# Modify the file 00_setup.py to define input/output file paths on your system
# The information in 00_setup.py will be used across notebooks
import importlib.util
import sys
from importlib.machinery import SourceFileLoader

# Load 00_setup.py as a module named "setup".  SourceFileLoader.load_module()
# is deprecated, so build the module from a spec and execute it instead.
_setup_loader = SourceFileLoader("setup", "./00_setup.py")
_setup_spec = importlib.util.spec_from_loader("setup", _setup_loader)
setup = importlib.util.module_from_spec(_setup_spec)
sys.modules["setup"] = setup  # load_module() registered the module; keep doing so
_setup_loader.exec_module(setup)

# 10: Data Imputation and Scaling for GNN
Convert data into a format suitable for GNN, preparing for several models
  * Modify data so it is suitable for a neural network
    * Impute missing features
    * Scale values
  * Create data frames for node and edge features
  * Get edge lists

I do not create graphs here, just do the preliminary data prep.  Not all features created here will be used in all models.  In addition, not all cases/edges would be used.  I will filter and modify the data to compare scenarios later.

*Stellargraph Notes:* When batching, neighbors are sampled by edge type, and sampling is done with replacement.  So asking for 10 samples means 10 samples per edge type, and if there is just one edge of a type it would be repeated 10 times.  See https://stellargraph.readthedocs.io/en/latest/_modules/stellargraph/mapper/sampled_node_generators.html

*This script takes about 5 minutes on my MacBook Air*

In [2]:
import pandas as pd
import numpy as np

In [3]:
from pathlib import Path
import importlib, pickle
from sklearn import model_selection

In [4]:
# Imputer object for easy dataset conversion to GNN friendly format
from sba_gnn.sba_gnn import sg_imputer 
from sba_gnn.sba_gnn.sg_imputer import GNNImputer

## Input training data, descriptives, fit imputer

For a neural network, I need to handle missing values and also scale features.  For missing values, I will do a simple median fill for all features, but add missing indicators.  

To scale the data, I will do a quantile transform for features with > 5 levels.  This is to avoid scaling binary features.  Then I do a Min/Max scaling on all features, so they are in the [0,1] range

I define a class to do this in sg_imputer.  This class wraps several scikit-learn imputers/scalers so that I can easily fit the objects and then transform the data.  In this section, I show some descriptives to justify my imputation/scaling choices, and also fit the scaler

##### Import train data, show missing

In [5]:
# Read the training split from the temp directory configured in 00_setup.py
train_path = Path(setup.temp_path) / '01_DATA_combined_base_train.parquet'
train_df = pd.read_parquet(train_path)

In [6]:
# Examine missingness: for each predictor, count non-missing (False) vs. missing (True) rows
missing_by_feature = [train_df[col].isna().value_counts() for col in setup.predictor_features]
pd.concat(missing_by_feature, keys=setup.predictor_features)

NoEmp              False    447252
CreateJob          False    447252
LowDoc             False    444025
                   True       3227
DisbursementGross  False    447252
new_business       False    446708
                   True        544
urban_flag         False    358648
                   True      88604
franchise_flag     False    447252
Name: count, dtype: int64

In [7]:
# Same information, more compact: a single missing-value count per predictor
missing_counts = [pd.DataFrame([train_df[col].isna().sum()]) for col in setup.predictor_features]
pd.concat(missing_counts, keys=setup.predictor_features).set_axis(['count_missing'], axis=1)

Unnamed: 0,Unnamed: 1,count_missing
NoEmp,0,0
CreateJob,0,0
LowDoc,0,3227
DisbursementGross,0,0
new_business,0,544
urban_flag,0,88604
franchise_flag,0,0


Most fields aren't missing too much.  However I must handle those that are.  I will use a median fill, especially since many of the fields with missingness seem to be binary

##### Explore scaling

In [8]:
# Level counts: number of distinct non-missing values per predictor
level_counts = [pd.DataFrame([train_df[col].nunique()]) for col in setup.predictor_features]
feature_level_df = pd.concat(level_counts, keys=setup.predictor_features)
feature_level_df

Unnamed: 0,Unnamed: 1,0
NoEmp,0,425
CreateJob,0,182
LowDoc,0,2
DisbursementGross,0,81720
new_business,0,2
urban_flag,0,2
franchise_flag,0,2


A number of features are binary.  I don't need to scale these.  The others must be scaled.  

What I will do is scale features with more than a threshold number of levels using a quantile scaler.  Then I'll use a MinMax scaler on everything to be safe (as a 2 level feature could have values other than 0,1).

##### Create imputer to do the median fill and scaling
This imputer fills missing values, adding missingness indicator features.  Then it quantile scales features with over 5 levels, and finally MinMax scales all features so they lie in the [0,1] range.

In [9]:
# Build the imputer/scaler for the predictor columns listed in 00_setup.py
# (GNNImputer comes from sba_gnn.sg_imputer, which is not shown in this notebook)
imputer = GNNImputer(features=setup.predictor_features)

In [10]:
# Fit the imputer on the training split and transform that split in the same call
train_out = imputer.fit_transform(train_df)

In [11]:
# Inspect the index of the transformed training data
train_out.index

Index([538061, 220915, 237886, 852933, 370109, 825551, 462351, 886865, 687784,
       402987,
       ...
       614020,  30277, 692498, 641806,  12558, 127070, 631061, 821756, 296335,
       521331],
      dtype='int64', name='index', length=447252)

In [12]:
# Names of the columns produced by the fitted imputer
imputer.features_out

['NoEmp',
 'CreateJob',
 'LowDoc',
 'DisbursementGross',
 'new_business',
 'urban_flag',
 'franchise_flag',
 'missingindicator_LowDoc',
 'missingindicator_new_business',
 'missingindicator_urban_flag']

In [13]:
# Persist the list of output feature names as a pickle in the temp directory
features_path = Path(setup.temp_path) / '10_DATA_features.pkl'
with features_path.open('wb') as fout:
    pickle.dump(imputer.features_out, fout)

## Transform Test, Validation Data

In [14]:
# Read the test split from the temp directory configured in 00_setup.py
test_path = Path(setup.temp_path) / '01_DATA_combined_base_test.parquet'
test_df = pd.read_parquet(test_path)

In [15]:
# Read the validation split from the temp directory configured in 00_setup.py
val_path = Path(setup.temp_path) / '01_DATA_combined_base_val.parquet'
val_df = pd.read_parquet(val_path)

In [16]:
# Apply the imputer fitted on the training split to the test split (transform only, no refit)
test_out = imputer.transform(test_df)

In [17]:
# Apply the imputer fitted on the training split to the validation split (transform only, no refit)
val_out = imputer.transform(val_df)

## Combine data
Combine transformed datasets, with key information.  Include the business ID, target status, NAICS features, as well as transformed features from above. Save this data for later use (especially the indices, which indicate train/test status).

In [18]:
# List the columns available in the raw training data
train_df.columns

Index(['LoanNr_ChkDgt', 'Name', 'City', 'State', 'Zip', 'Bank', 'BankState',
       'NAICS', 'ApprovalDate', 'ApprovalFY', 'Term', 'NoEmp', 'NewExist',
       'CreateJob', 'RetainedJob', 'FranchiseCode', 'UrbanRural', 'RevLineCr',
       'LowDoc', 'ChgOffDate', 'DisbursementDate', 'DisbursementGross',
       'BalanceGross', 'MIS_Status', 'ChgOffPrinGr', 'GrAppv', 'SBA_Appv',
       'target', 'guaranteed_fract', 'new_business', 'urban_flag',
       'franchise_flag', 'NAICS_2', 'NAICS_sector', 'NAICS_sector_desc',
       'NAICS_num', 'menc_NAICS', 'menc_NAICS_sector',
       'NS___Accommodation and Food Services', 'NS___Construction',
       'NS___Health Care and Social Assistance', 'NS___Manufacturing',
       'NS___Other Services (except Public Administration)',
       'NS___Professional, Scientific, and Technical Services',
       'NS___Retail Trade', 'NS___Wholesale Trade', 'NS___infrequent_sklearn'],
      dtype='object')

In [19]:
# ID, target and NAICS-related columns to carry along with the scaled features
naics_key_cols = ['LoanNr_ChkDgt', 'target', 'NAICS', 'NAICS_sector', 'menc_NAICS', 'menc_NAICS_sector']
# Columns whose names begin with 'NS__'
naics_sector_cols = [col for col in train_df.columns if col.startswith('NS__')]
naics_features = naics_key_cols + naics_sector_cols
print(naics_features)

['LoanNr_ChkDgt', 'target', 'NAICS', 'NAICS_sector', 'menc_NAICS', 'menc_NAICS_sector', 'NS___Accommodation and Food Services', 'NS___Construction', 'NS___Health Care and Social Assistance', 'NS___Manufacturing', 'NS___Other Services (except Public Administration)', 'NS___Professional, Scientific, and Technical Services', 'NS___Retail Trade', 'NS___Wholesale Trade', 'NS___infrequent_sklearn']


In [20]:
# Stack the ID/target/NAICS columns of the three splits, keyed by split name
naics_frames = [split_df[naics_features] for split_df in (train_df, test_df, val_df)]
comb_naics = pd.concat(naics_frames, axis=0, keys=['train', 'test', 'val'])

In [21]:
# Stack the transformed features of the three splits with the same split keys,
# then join them column-wise to the NAICS info on the (split, row index) pair
comb_scaled = pd.concat([train_out, test_out, val_out],
                        axis=0, keys=['train', 'test', 'val'])
comb_df = (
    pd.concat([comb_naics, comb_scaled], axis=1)
    .reset_index(level=0)  # move the split key out of the index into a column
    .rename(columns={'level_0':'dset'}, errors='ignore')
)
print(comb_df.shape)

(688081, 26)


In [22]:
# Show the columns of the combined frame (split label, IDs/NAICS info, scaled features)
print(comb_df.columns)

Index(['dset', 'LoanNr_ChkDgt', 'target', 'NAICS', 'NAICS_sector',
       'menc_NAICS', 'menc_NAICS_sector',
       'NS___Accommodation and Food Services', 'NS___Construction',
       'NS___Health Care and Social Assistance', 'NS___Manufacturing',
       'NS___Other Services (except Public Administration)',
       'NS___Professional, Scientific, and Technical Services',
       'NS___Retail Trade', 'NS___Wholesale Trade', 'NS___infrequent_sklearn',
       'NoEmp', 'CreateJob', 'LowDoc', 'DisbursementGross', 'new_business',
       'urban_flag', 'franchise_flag', 'missingindicator_LowDoc',
       'missingindicator_new_business', 'missingindicator_urban_flag'],
      dtype='object')


In [23]:
# Save the combined, scaled data for all splits to the temp directory
comb_path = Path(setup.temp_path) / '10_DATA_combined_scaled_all.parquet'
comb_df.to_parquet(comb_path)

## Create Edge Map

I will have edges between a business and its NAICS, as well as edges to same-sector NAICS nodes.  This section creates general edge data, which may be filtered to create graphs

##### Create NAICS map to same-sector NAICS

In [24]:
# Distinct (NAICS, NAICS_sector) pairs seen in any split
sector_map = comb_df[['NAICS', 'NAICS_sector']].drop_duplicates()

In [25]:
# Self-join the sector map on sector so each NAICS code is paired with every
# NAICS code in the same sector (held in NAICS_sim)
sector_peers = sector_map.rename(columns={'NAICS':'NAICS_sim'})
naics_pairs = sector_map.merge(sector_peers, how='left', on='NAICS_sector')

# Drop the rows that pair a code with itself
not_self_pair = naics_pairs['NAICS'] != naics_pairs['NAICS_sim']
naics_all_sectors = naics_pairs[not_self_pair]

In [26]:
# Peek at a few random NAICS pairs; seeded so the displayed rows are the same on every run
naics_all_sectors.sample(5, random_state=2342)

Unnamed: 0,NAICS,NAICS_sector,NAICS_sim
252590,518112,51,512230
34092,333294,31-33,335121
191737,327122,31-33,332117
269935,311221,31-33,311513
110107,325320,31-33,316110


##### Functions for creating index for NAICS

In [27]:
def get_naics_index(naics_seq):
    """Build NAICS node identifiers by prefixing NAICS codes with 'n_'.

    naics_seq: a NAICS code string, or a pandas Series of code strings
        (this notebook passes DataFrame columns).
    Returns the same kind of object with 'n_' prepended to each code.
    """
    node_prefix = 'n_'
    return node_prefix + naics_seq

##### Get edges from businesses to their exact NAICS

In [28]:
# Edges from businesses to their NAICS code:
# the loan number is the edge source and the prefixed NAICS code the edge target
naics_info_df = (
    comb_df[['LoanNr_ChkDgt', 'NAICS', 'dset']]
    .copy()
    .rename(columns={'LoanNr_ChkDgt':'source'})
)
naics_info_df['target'] = get_naics_index(naics_info_df['NAICS'])

# Keep only the edge columns and tag every row with its edge type
edges_business_naics = naics_info_df[['source', 'target', 'dset']].assign(type='loan_naics')

##### Get edges from businesses to related NAICS codes

In [29]:
# Preview the business-to-NAICS lookup built above
naics_info_df.head(3)

Unnamed: 0_level_0,source,NAICS,dset,target
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
538061,5282874009,422210,train,n_422210
220915,2568556001,441222,train,n_441222
237886,2687465005,621310,train,n_621310


In [30]:
# Attach to every business all other NAICS codes in the same sector as its own code
edges_businesses_naics_sim = (
    naics_info_df
    .drop(columns='target', errors='ignore')
    .merge(naics_all_sectors, how='inner', on='NAICS')
)
# The edge target is the same-sector NAICS node; tag every row with the edge type
edges_businesses_naics_sim['target'] = get_naics_index(edges_businesses_naics_sim['NAICS_sim'])
edges_businesses_naics_sim['type'] = 'loan_sector_naics'
edges_businesses_naics_sim = edges_businesses_naics_sim[['source', 'target', 'dset', 'type']]
print(f'Sector NAICS edges: {edges_businesses_naics_sim.shape}')

Sector NAICS edges: (67912366, 4)


##### Sample these - Uncomment to Sample
The data is large so sampling could be considered here - I eliminated this

In [31]:
#edges_businesses_naics_sim = edges_businesses_naics_sim.groupby('source') \
#    .sample(20, replace=True, random_state=2342) \
#    .drop_duplicates()
#print(f'Sector NAICS edges post sample: {edges_businesses_naics_sim.shape}')

In [32]:
# Stack both edge sets into one table with a fresh 0..n-1 index
edge_frames = [edges_business_naics, edges_businesses_naics_sim]
edges_all = pd.concat(edge_frames, axis=0, ignore_index=True)

In [33]:
# Number of edges of each type
edges_all['type'].value_counts()

type
loan_sector_naics    67912366
loan_naics             688081
Name: count, dtype: int64

In [34]:
# Compare the number of distinct (source, target) pairs with the total row count;
# equal row counts mean no edge appears more than once
print(edges_all[['source', 'target']].drop_duplicates().shape)
print(edges_all.shape)

(68600447, 2)
(68600447, 4)


In [35]:
# Save the full edge list to the temp directory
edges_path = Path(setup.temp_path) / '10_DATA_graph_edges.parquet'
edges_all.to_parquet(edges_path)

## Get Node Features
This section creates general node features data, which may be filtered to create graphs.  Not all features may be used, and not all nodes may be in all graphs.

In [36]:
# Business node features - these are the original features post scaling.
# Include the target also - will be used to test the "label trick" in some future scripts
business_cols = ['LoanNr_ChkDgt', 'dset'] + imputer.features_out + ['target']
# Index the node features by loan number
features_business = comb_df[business_cols].set_index('LoanNr_ChkDgt')
business_path = Path(setup.temp_path) / '10_DATA_graph_node_features_loans.parquet'
features_business.to_parquet(business_path)

In [37]:
# NAICS features - One hots, also the mean encodings
naics_feature_cols = ['NAICS', 'menc_NAICS', 'menc_NAICS_sector'] + \
    [col for col in comb_df.columns if col.startswith('NS__')]
features_naics = comb_df[naics_feature_cols].drop_duplicates()

# Index the rows by the prefixed NAICS node ID and drop the raw code column
features_naics = (
    features_naics
    .assign(source=get_naics_index(features_naics['NAICS']))
    .set_index('source')
    .drop(columns='NAICS')
)
print(features_naics.shape)
naics_path = Path(setup.temp_path) / '10_DATA_graph_node_features_naics.parquet'
features_naics.to_parquet(naics_path)

(1311, 11)


## Label Data
Split the training data set - used for some "label trick" tests later

In [38]:
# Split label and target for every loan, indexed by loan number
label_df = comb_df[['dset', 'LoanNr_ChkDgt', 'target']].set_index('LoanNr_ChkDgt')

In [39]:
# Pull out the train and validation targets; the split column is dropped, leaving only the target
in_train = label_df['dset'] == 'train'
in_val = label_df['dset'] == 'val'
train_labels = label_df.loc[in_train].drop(columns=['dset'])
val_labels = label_df.loc[in_val].drop(columns=['dset'])

print(f'graph labels shape {label_df.shape}')
print(f'train labels shape {train_labels.shape}')
print(f'validation labels shape {val_labels.shape}')

graph labels shape (688081, 2)
train labels shape (447252, 1)
validation labels shape (96332, 1)


In [40]:
# Split train cases for testing
# train_size=0.7 applies to the first returned piece, so train_fix receives about
# 70% of the training loan IDs and train_train the remaining ~30%.
# The split is stratified on train_labels (the target column) and seeded via random_state.
train_fix, train_train = model_selection.train_test_split(
    train_labels.index, train_size=0.7, stratify=train_labels, random_state=23432
)

In [44]:
# Report the sizes of the two pieces of the training split
print(f'fixed part of train set {len(train_fix)}')
print(f'train part of train set {len(train_train)}')

fixed part of train set 313076
train part of train set 134176


In [53]:
# Flag the loans in the "fixed" part of the training split: 1 for loan IDs in
# train_fix, 0 for every other row (rest of train, plus test and validation).
# Assigning the whole column in one step avoids attribute-style column creation
# (which only sets a Python attribute) and chained in-place fillna (which does
# not update the frame under pandas copy-on-write).
label_df['train_fix'] = label_df.index.isin(train_fix).astype(int)

In [54]:
# Count flagged vs. unflagged rows
label_df['train_fix'].value_counts()

train_fix
0    375005
1    313076
Name: count, dtype: int64

In [49]:
# Number of distinct loan IDs in train_fix (should equal its length if IDs are unique)
len(set(train_fix))

313076

In [55]:
# Save the label table (split, target, train_fix flag) to the temp directory
label_path = Path(setup.temp_path) / '10_DATA_label_info.parquet'
label_df.to_parquet(label_path)

In [52]:
# Look up the train_fix IDs in label_df; the row count should match len(train_fix)
label_df.loc[train_fix].shape

(313076, 3)

In [56]:
# Preview the final label table
label_df.head()

Unnamed: 0_level_0,dset,target,train_fix
LoanNr_ChkDgt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5282874009,train,0,0
2568556001,train,0,0
2687465005,train,1,0
9174473001,train,0,1
3598634004,train,0,0
