In [1]:
# Modify the file 00_setup.py to define input/output file paths on your system
# The information in 00_setup.py will be used across notebooks
import importlib.util
import sys
from importlib.machinery import SourceFileLoader

# The file name starts with a digit, so it cannot be pulled in with a plain
# `import` statement; it is loaded by path instead.  `Loader.load_module()` is
# deprecated, so build the module from a spec and execute it explicitly.
_setup_loader = SourceFileLoader("setup", "./00_setup.py")
_setup_spec = importlib.util.spec_from_loader("setup", _setup_loader)
setup = importlib.util.module_from_spec(_setup_spec)
# load_module() used to register the module under this name; keep doing so in
# case anything else looks it up through sys.modules.
sys.modules["setup"] = setup
_setup_loader.exec_module(setup)

# Neural Network Baseline
Traditional, simple neural network model with no NAICS input

*This script takes about 2 hours on my MacBook Air*

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [3]:
from pathlib import Path
import importlib, pickle
from sklearn import model_selection

In [4]:
import pandas as pd
import os
import re

from tensorflow.keras import layers, optimizers, losses, metrics, Model
from sklearn import preprocessing, feature_extraction, model_selection
from IPython.display import display, HTML
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import tensorflow.keras.metrics as km

In [6]:
from sba_gnn.sba_gnn import sg_plot 

## Input Data

In [7]:
business_data = pd.read_parquet(Path(setup.temp_path).joinpath('10_DATA_combined_scaled_all.parquet'))

In [8]:
business_data.describe()

Unnamed: 0,target,dset_naics_holdout,menc_NAICS,cenc_NAICS,menc_grp_NAICS,NS___Accommodation and Food Services,NS___Construction,NS___Health Care and Social Assistance,NS___Manufacturing,NS___Other Services (except Public Administration),...,LowDoc,DisbursementGross,new_business,urban_flag,franchise_flag,missingindicator_LowDoc,missingindicator_new_business,missingindicator_urban_flag,NAICS,NAICS_alt
count,688081.0,688081.0,688081.0,688081.0,688081.0,688081.0,688081.0,688081.0,688081.0,688081.0,...,688081.0,688081.0,688081.0,688081.0,688081.0,688081.0,688081.0,688081.0,688081.0,688081.0
mean,0.203011,0.048426,0.202996,0.007089,0.203309,0.097958,0.095606,0.079398,0.096843,0.103658,...,-0.804555,0.009595,-0.444129,0.705279,-0.899352,-0.985316,-0.997529,-0.602849,776.722582,813.454941
std,0.402241,0.214665,0.095164,0.009881,0.095578,0.297258,0.294051,0.270359,0.295744,0.304816,...,0.593879,0.577435,0.895963,0.70893,0.437226,0.170743,0.070251,0.797856,377.413475,343.76757
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,1.0
25%,0.0,0.0,0.131198,0.000902,0.128356,0.0,0.0,0.0,0.0,0.0,...,-1.0,-0.490729,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,610.0,670.0
50%,0.0,0.0,0.203074,0.00321,0.20244,0.0,0.0,0.0,0.0,0.0,...,-1.0,0.019019,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,822.0,890.0
75%,0.0,0.0,0.261907,0.008929,0.262367,0.0,0.0,0.0,0.0,0.0,...,-1.0,0.513514,1.0,1.0,-1.0,-1.0,-1.0,-1.0,1090.0,1093.0
max,1.0,1.0,1.0,0.042806,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1232.0,1232.0


In [9]:
# Load the pickled collection of feature names stored under setup.temp_path.
features_pkl_path = Path(setup.temp_path).joinpath('10_DATA_features.pkl')
imputer_features = pickle.loads(features_pkl_path.read_bytes())

In [12]:
features = [f for f in imputer_features if 'NAICS' not in f]
print(features)

['NoEmp', 'CreateJob', 'LowDoc', 'DisbursementGross', 'new_business', 'urban_flag', 'franchise_flag', 'missingindicator_LowDoc', 'missingindicator_new_business', 'missingindicator_urban_flag']


##### Datasets for train, validation

In [13]:
X = business_data[['dset', 'LoanNr_ChkDgt'] + features].set_index('LoanNr_ChkDgt').sort_index()

In [14]:
# Training split: drop the split label from the features, and pull the targets
# for the same rows from the full frame (indexed and sorted by LoanNr_ChkDgt).
X_train = X[X['dset'] == 'train'].drop(columns='dset')
y_train = business_data[business_data['dset'] == 'train'].set_index('LoanNr_ChkDgt').sort_index()['target']
# X and y come from two independent filter/sort pipelines, and the model pairs
# them purely by row position -- fail fast if the loan ids ever stop lining up.
assert X_train.index.equals(y_train.index), 'X_train and y_train rows are misaligned'
print(f'training X: {X_train.shape}, y:{y_train.shape}')

training X: (425594, 10), y:(425594,)


In [15]:
# Validation split: drop the split label from the features, and pull the targets
# for the same rows from the full frame (indexed and sorted by LoanNr_ChkDgt).
X_val = X[X['dset'] == 'val'].drop(columns='dset')
y_val = business_data[business_data['dset'] == 'val'].set_index('LoanNr_ChkDgt').sort_index()['target']
# X and y come from two independent filter/sort pipelines, and the model pairs
# them purely by row position -- fail fast if the loan ids ever stop lining up.
assert X_val.index.equals(y_val.index), 'X_val and y_val rows are misaligned'
print(f'val X: {X_val.shape}, y:{y_val.shape}')

val X: (126041, 10), y:(126041,)


In [16]:
base_thresh = y_train.mean()
print(base_thresh)

0.20307382152943884


## Function to create model

In [17]:
def create_model(n_feat = len(features),
                 hidden_size = setup.nn_layer_sizes,
                 activation='tanh', lr=setup.nn_learning_rate,
                 opt_func = setup.nn_optimizer, dropout = setup.nn_dropout):
    """Build and compile a feed-forward binary classifier.

    The network is a stack of Dense layers (one per entry of ``hidden_size``),
    each followed by a Dropout layer, ending in a single sigmoid unit.

    Note that the defaults are evaluated once, when this cell is run, so later
    changes to ``features`` or ``setup`` require re-running the definition.

    Args:
        n_feat: Number of input features.
        hidden_size: Non-empty sequence of hidden layer widths, in order.
        activation: Activation used by every hidden Dense layer.
        lr: Learning rate handed to the optimizer.
        opt_func: Optimizer factory, called as ``opt_func(learning_rate=lr)``.
        dropout: Dropout rate applied after every hidden layer.

    Returns:
        A compiled ``Sequential`` model using binary cross-entropy loss and two
        AUC metrics: precision-recall first, then ROC.

    Raises:
        ValueError: If ``hidden_size`` is empty.
    """
    if len(hidden_size) == 0:
        raise ValueError('hidden_size must contain at least one layer width')

    model = Sequential()
    # Only the first layer needs an explicit input shape; every later layer
    # takes its input size from the layer added before it.
    model.add(Dense(hidden_size[0], input_shape=(n_feat,), activation=activation))
    for width in hidden_size[1:]:
        model.add(Dropout(dropout))
        model.add(Dense(width, activation=activation))
    model.add(Dropout(dropout))
    model.add(Dense(1, activation='sigmoid'))

    # Compile model.  Metric order matters downstream: the history columns are
    # produced in this order (PR AUC first, ROC AUC second).
    optimizer = opt_func(learning_rate=lr)
    model.compile(loss='binary_crossentropy', optimizer=optimizer,
                  metrics=[km.AUC(curve='PR'), km.AUC(curve='ROC')])
    return model

## Create, fit model

In [18]:
this_model = create_model()

2024-02-23 21:54:03.091555: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2024-02-23 21:54:03.091611: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-02-23 21:54:03.091624: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-02-23 21:54:03.092065: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-02-23 21:54:03.092317: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [19]:
this_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               1408      
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 9729 (38.00 KB)
Trainable params: 9729 (38.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Fit on the training split, passing the validation split to Keras as
# validation_data.  Batch size and epoch count are read from the setup module.
fit_options = dict(
    validation_data=(X_val, y_val),
    batch_size=setup.nn_batch_size,
    epochs=setup.nn_epochs,
)
this_history = this_model.fit(X_train, y_train, **fit_options)

Epoch 1/20


2024-02-23 21:54:04.491544: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2024-02-23 21:57:44.480259: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/20
Epoch 3/20

In [None]:
this_history_df = pd.DataFrame(this_history.history)

In [None]:
# Rename columns
# Metric names may carry a trailing numeric suffix (e.g. 'auc_1'); strip it so
# the column names do not depend on that counter.
this_history_df.columns = [re.sub(r'_\d+$', '', c) for c in this_history_df.columns]

# After stripping, the two AUC metrics share a name.  Tag the second column of
# each adjacent duplicate pair with '_roc' (the ROC metric is listed second in
# create_model's `metrics`).  Works for an empty frame as well.
cur_col = list(this_history_df.columns)
this_history_df.columns = [
    f'{c}_roc' if i > 0 and c == cur_col[i - 1] and 'auc' in c else c
    for i, c in enumerate(cur_col)
]

In [None]:
this_history_df.columns

In [None]:
this_history_df.to_csv(Path(setup.temp_path).joinpath('11_REPORT_fit_history.csv'))

In [None]:
# Each row of the history frame is one epoch, so the x axis is the epoch index.
ax = this_history_df[['loss', 'val_loss']].plot(title='Training vs. validation loss')
ax.set(xlabel='epoch', ylabel='binary cross-entropy');

In [None]:
# 'auc' is the first metric compiled into the model (precision-recall curve).
ax = this_history_df[['auc', 'val_auc']].plot(title='Training vs. validation PR AUC')
ax.set(xlabel='epoch', ylabel='AUC (precision-recall)');

In [None]:
# 'auc_roc' is the second metric compiled into the model (ROC curve).
ax = this_history_df[['auc_roc', 'val_auc_roc']].plot(title='Training vs. validation ROC AUC')
ax.set(xlabel='epoch', ylabel='AUC (ROC)');

In [None]:
# Persist the fitted model under setup.temp_path.
# NOTE(review): the path ends in '.keras' while save_format='tf' is requested
# explicitly -- confirm which on-disk format the code that reloads this model
# expects before changing either one.
this_model.save(Path(setup.temp_path).joinpath('11_DATA_model.keras'),save_format='tf')

## Predictions on all data

In [None]:
all_predictions = this_model.predict(X.drop(columns='dset'))

In [None]:
# Name the single probability column and turn the LoanNr_ChkDgt index back into
# a regular column, then attach the target and split labels for each loan.
pred_prob_df = pd.DataFrame(all_predictions, index=X.index, columns=['predict_prob']).reset_index()
loan_labels_df = business_data[['target', 'LoanNr_ChkDgt', 'dset', 'dset_naics_holdout']]
all_predictions_df = pred_prob_df.merge(loan_labels_df, on='LoanNr_ChkDgt')
    

In [None]:
all_predictions_df[['predict_prob', 'target']].corr(method='spearman')

##### Threshold Tune & Binary Predictions
Using training probability predictions

In [None]:
all_pred_train = all_predictions_df[all_predictions_df['dset'] == 'train']

In [None]:
thresh_tune_data = sg_plot.get_f1_frame(all_pred_train['target'], 
                                        all_pred_train['predict_prob'])

In [None]:
# Order candidate thresholds from highest to lowest F1 and preview the top three.
thresh_tune_data = thresh_tune_data.sort_values('f1', ascending=False)
thresh_tune_data.head(3)

In [None]:
best_thresh = thresh_tune_data['thresh'].iloc[0]
best_thresh

##### Append binary predictions to probability predictions

In [None]:
all_predictions_df['predict_bin'] = sg_plot.get_binary_predictions(all_predictions_df['predict_prob'], best_thresh)

In [None]:
all_predictions_df['predict_bin'].value_counts(normalize=True, dropna=False)

In [None]:
all_predictions_df.to_parquet(Path(setup.temp_path).joinpath('11_DATA_predictions.parquet'))

## Metrics

In [None]:
all_predictions_df = pd.read_parquet(Path(setup.temp_path).joinpath('11_DATA_predictions.parquet'))

In [None]:
def _group_metrics(grp):
    """Score one group of prediction rows with sg_plot.dset_metrics."""
    return sg_plot.dset_metrics(grp.target, grp.predict_bin, grp.predict_prob)

# Metrics for each value of 'dset'
by_dset = all_predictions_df.groupby('dset').apply(_group_metrics)
metrics_dset_df = by_dset.reset_index()

# Test rows only, further split by the NAICS holdout flag
test_rows = all_predictions_df[all_predictions_df['dset'] == 'test']
by_holdout = test_rows.groupby(['dset', 'dset_naics_holdout']).apply(_group_metrics)
metrics_test_df = by_holdout.reset_index()

# Stack both summaries, write the report, and display it
metrics_df = pd.concat([metrics_dset_df, metrics_test_df])
metrics_df.to_csv(Path(setup.temp_path).joinpath('11_REPORT_metrics.csv'), index=True)
metrics_df

## Embeddings
Save the activations of the last hidden layer (the input to the sigmoid output layer), for possible later use in an unsupervised GNN.

In [None]:
# Re-use the trained network up to the layer just before the final output layer.
penultimate_layer = this_model.layers[-2]
embed_model = Model(inputs=this_model.inputs, outputs=penultimate_layer.output)

In [None]:
embed_out = embed_model.predict(X.drop(columns='dset'))

In [None]:
embed_out.shape

In [None]:
# One row per entry of X's index; columns get zero-padded names 'emb_000',
# 'emb_001', ... and the index is then moved back into a regular column.
embed_df = pd.DataFrame(embed_out, index=X.index)
emb_col_names = [f'emb_{i:03d}' for i in range(embed_df.shape[1])]
embed_df = embed_df.set_axis(emb_col_names, axis=1).reset_index()

In [None]:
embed_df.head()

In [None]:
embed_df.to_parquet(Path(setup.temp_path).joinpath('11_DATA_embeddings.parquet'))