In [1]:
# Modify the file 00_setup.py to define input/output file paths on your system
# The information in 00_setup.py will be used across notebooks
from importlib.machinery import SourceFileLoader
setup = SourceFileLoader("setup", "./00_setup.py").load_module()

## Neural Network With NAICS Embedding Layer, Missing Values Injected in Fit
Use NAICS entity embeddings, plus a custom data generate to randomly inject the unseen code (1) into training cases, in order to help the model learn how to handle missing information

Encode only the base NAICS.

Use the dataset with no missing values. 

*This script takes about 2 hours on my MacBook Air*

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [3]:
from pathlib import Path
import importlib, pickle
from sklearn import model_selection

In [4]:
import pandas as pd
import os
import re

import keras
from tensorflow.keras import layers, optimizers, losses, metrics, Model
from sklearn import preprocessing, feature_extraction, model_selection
from IPython.display import display, HTML
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
from tensorflow.keras.layers import Dense, Dropout, Input, Embedding, \
    Concatenate, Reshape, concatenate, Flatten
import tensorflow.keras.metrics as km

In [6]:
from sba_nn.sba_nn import sbnn_metrics, sbnn_model
from sba_nn.sba_nn.sbnn_model import CatInjectGenerator

## Input Data

In [7]:
sba_loans = pd.read_parquet(Path(setup.temp_path).joinpath('01_DATA_transformed_nomiss.parquet'))

In [8]:
with open(Path(setup.temp_path).joinpath('01_DATA_features.pkl'), 'rb') as fin:
    imputer_features = pickle.load(fin)

In [9]:
with open(Path(setup.temp_path).joinpath('01_DATA_naics_max_encodings.pkl'), 'rb') as fin:
    naics_max_levels= pickle.load(fin)
naics_max_levels

{'NAICS': 1170,
 'NAICS_5': 764,
 'NAICS_4': 345,
 'NAICS_3': 107,
 'NAICS_sector': 21}

##### Features

In [10]:
# Numeric features
features_numeric = [f for f in imputer_features if 'NAICS' not in f]
features_numeric_len = len(features_numeric)
print(features_numeric)

['NoEmp', 'CreateJob', 'LowDoc', 'DisbursementGross', 'new_business', 'urban_flag', 'franchise_flag', 'missingindicator_LowDoc', 'missingindicator_new_business', 'missingindicator_urban_flag']


In [11]:
# NAICS feature - just bas
features_naics = ['NAICS']
features_naics_max_levels  = [naics_max_levels[n] for n in features_naics]
features_naics_emb_dim = [setup.nn_naics_embed_size_dict[n] for n in features_naics]

In [12]:
print(features_naics_max_levels)
print(features_naics_emb_dim)

[1170]
[8]


##### Datasets for train, validation

In [13]:
X = sba_loans[['dset', 'LoanNr_ChkDgt'] + features_numeric].set_index('LoanNr_ChkDgt').sort_index()

In [14]:
X_train = X[X['dset'] == 'train'].drop(columns='dset')
y_train = sba_loans[sba_loans['dset'] == 'train'].set_index('LoanNr_ChkDgt').sort_index()['target']
print(f'training X: {X_train.shape}, y:{y_train.shape}')

training X: (446848, 10), y:(446848,)


In [15]:
X_val = X[X['dset'] == 'val'].drop(columns='dset')
y_val = sba_loans[sba_loans['dset'] == 'val'].set_index('LoanNr_ChkDgt').sort_index()['target']
print(f'val X: {X_val.shape}, y:{y_val.shape}')

val X: (95604, 10), y:(95604,)


In [16]:
base_thresh = y_train.mean()
print(base_thresh)

0.20230592953308507


In [17]:
X_naics = [sba_loans[['dset', 'LoanNr_ChkDgt', n]].set_index('LoanNr_ChkDgt').sort_index() \
           for n in features_naics]

In [18]:
X_naics_train = [Xn[Xn['dset'] == 'train'].drop(columns='dset') for Xn in X_naics]
X_naics_val =  [Xn[Xn['dset'] == 'val'].drop(columns='dset') for Xn in X_naics]

## Create, fit model

In [22]:
this_model = sbnn_model.create_emb_model(features_numeric_len,
                                         features_naics_max_levels,
                                         features_naics_emb_dim,
                                         naics_embedding_names = features_naics,
                                         hidden_size = setup.nn_layer_sizes,
                                         activation='tanh',
                                         lr=setup.nn_learning_rate,
                                         opt_func = setup.nn_optimizer,
                                         dropout = setup.nn_dropout)

2024-05-17 22:12:25.968390: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2024-05-17 22:12:25.968439: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-05-17 22:12:25.968446: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-05-17 22:12:25.968545: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-05-17 22:12:25.968577: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [23]:
this_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_naics_00 (InputLayer  [(None, 1)]                  0         []                            
 )                                                                                                
                                                                                                  
 emb_naics_00 (Embedding)    (None, 1, 8)                 9360      ['input_naics_00[0][0]']      
                                                                                                  
 NAICS (Flatten)             (None, 8)                    0         ['emb_naics_00[0][0]']        
                                                                                                  
 input_numeric (InputLayer)  [(None, 10)]                 0         []                        

##### Use data generator to inject 1 values into training data

In [25]:
cat_feature_index = [i for i in range(1, len(X_naics_train)+1)]
cat_feature_index

[1]

In [74]:
importlib.reload(sbnn_model)
from sba_nn.sba_nn.sbnn_model import CatInjectGenerator 

In [75]:
generator = CatInjectGenerator(pd.concat([X_train]+ X_naics_train, axis=1), 
                               y_train,
                               categorical_columns=features_naics,
                               batch_size = setup.nn_batch_size)
                               #inject_value = 1,
                               #inject_fraction = 0.1)

In [None]:
this_history = this_model.fit(generator,
                              batch_size=setup.nn_batch_size,
                              epochs = 1,
                              validation_data=([X_val] + X_naics_val, y_val),
                             shuffle=True)



In [None]:
this_history = this_model.fit(generator,
                              validation_data=([X_val] + X_naics_val, y_val),
                              batch_size=setup.nn_batch_size,
                              epochs = 1)
                              #epochs=setup.nn_epochs)

batch size: 32
batch shapes: 32 32
32
batch size: 32
batch shapes: 32 32
32
batch size: 32
batch shapes: 32 32
32
batch size: 32
batch shapes: 32 32
32


2024-05-17 21:47:45.446168: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-05-17 21:47:45.545315: W tensorflow/core/framework/op_kernel.cc:1816] INVALID_ARGUMENT: TypeError: `generator` yielded an element of shape (32,) where an element of shape (None, None) was expected.
Traceback (most recent call last):

  File "/Users/valeriecarey/miniconda3/envs/tf_p38/lib/python3.8/site-packages/tensorflow/python/ops/script_ops.py", line 268, in __call__
    ret = func(*args)

  File "/Users/valeriecarey/miniconda3/envs/tf_p38/lib/python3.8/site-packages/tensorflow/python/autograph/impl/api.py", line 643, in wrapper
    return func(*args, **kwargs)

  File "/Users/valeriecarey/miniconda3/envs/tf_p38/lib/python3.8/site-packages/tensorflow/python/data/ops/from_generator_op.py", line 235, in generator_py_func
    raise TypeError(

TypeError: `generator` yielded an element of shape (32,) where an element of shape (No

InvalidArgumentError: Graph execution error:

2 root error(s) found.
  (0) INVALID_ARGUMENT:  TypeError: `generator` yielded an element of shape (32,) where an element of shape (None, None) was expected.
Traceback (most recent call last):

  File "/Users/valeriecarey/miniconda3/envs/tf_p38/lib/python3.8/site-packages/tensorflow/python/ops/script_ops.py", line 268, in __call__
    ret = func(*args)

  File "/Users/valeriecarey/miniconda3/envs/tf_p38/lib/python3.8/site-packages/tensorflow/python/autograph/impl/api.py", line 643, in wrapper
    return func(*args, **kwargs)

  File "/Users/valeriecarey/miniconda3/envs/tf_p38/lib/python3.8/site-packages/tensorflow/python/data/ops/from_generator_op.py", line 235, in generator_py_func
    raise TypeError(

TypeError: `generator` yielded an element of shape (32,) where an element of shape (None, None) was expected.


	 [[{{node PyFunc}}]]
	 [[IteratorGetNext]]
	 [[model/Cast_1/_10]]
  (1) INVALID_ARGUMENT:  TypeError: `generator` yielded an element of shape (32,) where an element of shape (None, None) was expected.
Traceback (most recent call last):

  File "/Users/valeriecarey/miniconda3/envs/tf_p38/lib/python3.8/site-packages/tensorflow/python/ops/script_ops.py", line 268, in __call__
    ret = func(*args)

  File "/Users/valeriecarey/miniconda3/envs/tf_p38/lib/python3.8/site-packages/tensorflow/python/autograph/impl/api.py", line 643, in wrapper
    return func(*args, **kwargs)

  File "/Users/valeriecarey/miniconda3/envs/tf_p38/lib/python3.8/site-packages/tensorflow/python/data/ops/from_generator_op.py", line 235, in generator_py_func
    raise TypeError(

TypeError: `generator` yielded an element of shape (32,) where an element of shape (None, None) was expected.


	 [[{{node PyFunc}}]]
	 [[IteratorGetNext]]
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_1415]

In [36]:
setup.nn_batch_size,

(32,)

In [None]:
this_history_df = sbnn_model.process_history(this_history)

In [None]:
this_history_df.to_csv(Path(setup.temp_path).joinpath('10_REPORT_fit_history.csv'))

In [None]:
this_history_df[['loss', 'val_loss']].plot()

In [None]:
this_history_df[['auc', 'val_auc']].plot()

In [None]:
this_history_df[['auc_roc', 'val_auc_roc']].plot()

In [None]:
this_model.save(Path(setup.temp_path).joinpath('10_DATA_model.keras'),save_format='tf')

## Predictions on all data

In [None]:
X_naics = [sba_loans[['dset', 'LoanNr_ChkDgt', n]].set_index('LoanNr_ChkDgt').sort_index() \
           for n in features_naics]

In [None]:
all_predictions = this_model.predict([X.drop(columns='dset')] +
                                     [Xn.drop(columns='dset') for Xn in X_naics])

In [None]:
all_predictions_df = pd.DataFrame(all_predictions, index=X.index) \
    .set_axis(['predict_prob'], axis=1) \
    .reset_index() \
    .merge(sba_loans[['target', 'LoanNr_ChkDgt', 'dset', 'dset_naics_holdout', 'NAICS']], 
           on='LoanNr_ChkDgt')

In [None]:
all_predictions_df[['predict_prob', 'target']].corr(method='spearman')

##### Threshold Tune & Binary Predictions
Using training probability predictions

In [None]:
all_pred_train = all_predictions_df[all_predictions_df['dset'] == 'train']

In [None]:
thresh_tune_data = sbnn_metrics.get_f1_frame(all_pred_train['target'], 
                                        all_pred_train['predict_prob'])

In [None]:
thresh_tune_data.sort_values('f1', ascending=False, inplace=True)
thresh_tune_data.head(3)

In [None]:
best_thresh = thresh_tune_data['thresh'].iloc[0]
best_thresh

##### Append binary predictions to probability predictions

In [None]:
all_predictions_df['predict_bin'] = sbnn_metrics.get_binary_predictions(all_predictions_df['predict_prob'], best_thresh)

In [None]:
all_predictions_df['predict_bin'].value_counts(normalize=True, dropna=False)

In [None]:
all_predictions_df.to_parquet(Path(setup.temp_path).joinpath('10_DATA_predictions.parquet'))

## Metrics

In [None]:
metrics_dset_df = all_predictions_df.groupby('dset') \
    .apply(lambda x: sbnn_metrics.dset_metrics(x.target, x.predict_bin, x.predict_prob)) \
    .reset_index()
metrics_test_df = all_predictions_df[all_predictions_df['dset'] == 'test'] \
    .groupby(['dset', 'dset_naics_holdout']) \
    .apply(lambda x: sbnn_metrics.dset_metrics(x.target, x.predict_bin, x.predict_prob)) \
    .reset_index()
metrics_df = pd.concat([metrics_dset_df, metrics_test_df])
metrics_df.to_csv(Path(setup.temp_path).joinpath('10_REPORT_metrics.csv'), index=True)
metrics_df