In [1]:
# Modify the file A00_setup.  Note the working directory is changed
import os
import sys
import importlib.util
from importlib.machinery import SourceFileLoader

# Move to the project's `code` directory so the relative paths below resolve.
# NOTE: rsplit() without a maxsplit splits at every occurrence, so [0] is the
# part of the current directory that precedes the FIRST "<sep>code".
os.chdir(os.getcwd().rsplit(os.path.sep + 'code')[0] + os.path.sep + 'code')

# Load A00_setup.py as the module `setup`.  Loader.load_module() is deprecated,
# so the module is created from a spec and executed explicitly instead.
_setup_loader = SourceFileLoader("setup", "./_A_embeddings/A00_setup.py")
_setup_spec = importlib.util.spec_from_loader(_setup_loader.name, _setup_loader)
setup = importlib.util.module_from_spec(_setup_spec)
sys.modules[_setup_spec.name] = setup  # keep the registration load_module() used to perform
_setup_loader.exec_module(setup)

## Get NAICS embeddings for many models
Retrieve embeddings for the relevant models (including parents), for later analysis and visualizations.

Models to embed:
  * 03 (parent): NAICS only, no randomization
  * 11 (parent): NAICS only, generator randomization
  * 04 (parent): NAICS+hierarchy, no randomization
  * 12 (parent): NAICS+hierarchy, data generator
  * A01: NAICS+hierarchy, data generator, intermediate layer
  * A02: NAICS+hierarchy, no randomization, intermediate layer

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [3]:
from pathlib import Path
import importlib, pickle

In [4]:
import os
import re

import keras
from tensorflow.keras import layers, optimizers, losses, metrics, Model
from sklearn import preprocessing, feature_extraction, model_selection
from IPython.display import display, HTML
import matplotlib.pyplot as plt
from keras.utils import plot_model
%matplotlib inline

In [5]:
from sba_nn.sba_nn import sbnn_metrics, sbnn_model

## Input Data

In [6]:
# Load the transformed loan table from the parent data folder
loans_file = Path(setup.parent_path) / '01_DATA_transformed_nomiss.parquet'
sba_loans = pd.read_parquet(loans_file)

In [7]:
# Read the pickled feature list from the parent data folder
features_file = Path(setup.parent_path) / '01_DATA_features.pkl'
with features_file.open('rb') as fin:
    imputer_features = pickle.load(fin)

In [8]:
# Numeric predictors: every saved feature whose name does not contain 'NAICS'
features_numeric = list(filter(lambda name: 'NAICS' not in name, imputer_features))
features_numeric_len = len(features_numeric)
print(features_numeric)

['NoEmp', 'CreateJob', 'LowDoc', 'DisbursementGross', 'new_business', 'urban_flag', 'franchise_flag', 'missingindicator_LowDoc', 'missingindicator_new_business', 'missingindicator_urban_flag']


In [9]:
# '_orig' counterparts of the numeric features, kept only when the column exists
orig_candidates = (c + '_orig' for c in features_numeric)
features_orig = [name for name in orig_candidates if name in sba_loans.columns]
print(features_orig)

['NoEmp_orig', 'CreateJob_orig', 'LowDoc_orig', 'DisbursementGross_orig', 'new_business_orig', 'urban_flag_orig', 'franchise_flag_orig']


## NAICS info table

In [10]:
# One row per distinct NAICS_orig value (the first occurrence is kept)
naics_unique = sba_loans.drop_duplicates(subset=['NAICS_orig'], keep='first')

In [11]:
# NAICS input columns used by the models
features_naics = ['NAICS', 'NAICS_4', 'NAICS_3', 'NAICS_sector']
# The same columns with the '_orig' suffix appended
features_naics_orig = list(map('{}_orig'.format, features_naics))

In [12]:
# Per-code summary: max of the holdout flag, mean target, and number of loans
naics_info_1 = (
    sba_loans
    .groupby('NAICS_orig')
    .agg(dset_naics_holdout=('dset_naics_holdout', 'max'),
         target=('target', 'mean'),
         count=('LoanNr_ChkDgt', 'count'))
    .reset_index()
)
# Attach the '_orig' hierarchy columns of each code to its summary
naics_info = pd.merge(naics_unique[features_naics_orig], naics_info_1, on='NAICS_orig')

In [13]:
# Recode the holdout flag: every value other than 1 (NaN included) becomes 0
holdout_flag = naics_info['dset_naics_holdout']
naics_info['dset_naics_holdout'] = holdout_flag.mask(holdout_flag != 1, 0)
naics_info.sample(3)

Unnamed: 0,NAICS_orig,NAICS_4_orig,NAICS_3_orig,NAICS_sector_orig,dset_naics_holdout,target,count
1261,111336,1113,111,11,0.0,0.0,1
624,327332,3273,327,31-33,0.0,0.2,15
408,333923,3339,333,31-33,0.0,0.072464,69


In [14]:
# Number of NAICS codes per value of the recoded holdout flag
naics_info['dset_naics_holdout'].value_counts()

dset_naics_holdout
0.0    1180
1.0     131
Name: count, dtype: int64

In [15]:
# Save the NAICS info table to the temp folder
naics_info_file = Path(setup.temp_path) / 'A10_DATA_naics_info.parquet'
naics_info.to_parquet(naics_info_file)

##### Append predictor means

In [16]:
# Aggregation spec: 'mean' for every numeric predictor and every '_orig' column
pred_agg = dict.fromkeys(features_numeric + features_orig, 'mean')

In [17]:
# Mean of each predictor within every NAICS code
loans_by_naics = sba_loans.groupby('NAICS_orig')
naics_info_2 = loans_by_naics.agg(pred_agg).reset_index()

In [18]:
# Save the per-code predictor means to the temp folder
naics_predictors_file = Path(setup.temp_path) / 'A10_DATA_naics_info_predictors.parquet'
naics_info_2.to_parquet(naics_predictors_file)

## Model Specs
Get lists containing relevant info for selected models

In [19]:
# Parallel lists: position i of every list describes the same model
model_prefix = ['03', '11', '04', '12', 'A01', 'A02']
num_models = len(model_prefix)

# The first four models are read from parent_path, the last two from temp_path
model_path = [setup.parent_path, setup.parent_path, setup.parent_path, setup.parent_path,
              setup.temp_path, setup.temp_path]
model_hier = [False, False, True, True, True, True]
model_rand = [False, True, False, True, True, False]
# Name of the layer whose output is extracted for each model
model_layer_name = ['NAICS', 'NAICS', 'NAICS', 'NAICS', 'embedding_int', 'embedding_int']

In [20]:
# Location of each saved model: <model_path>/<prefix>_DATA_model.keras
model_full_paths = [Path(folder) / (tag + '_DATA_model.keras')
                    for folder, tag in zip(model_path, model_prefix)]

In [21]:
# Collect the model specs in one table and keep a CSV copy in the temp folder
model_info_frame = pd.DataFrame(dict(
    model=model_prefix,
    model_path=model_full_paths,
    model_hier=model_hier,
    model_rand=model_rand,
    model_layer_name=model_layer_name,
))
model_info_csv = Path(setup.temp_path) / 'A10_REPORT_model_info.csv'
model_info_frame.to_csv(model_info_csv, index=False)
model_info_frame

Unnamed: 0,model,model_path,model_hier,model_rand,model_layer_name
0,03,../data/2024_05_16/03_DATA_model.keras,False,False,NAICS
1,11,../data/2024_05_16/11_DATA_model.keras,False,True,NAICS
2,04,../data/2024_05_16/04_DATA_model.keras,True,False,NAICS
3,12,../data/2024_05_16/12_DATA_model.keras,True,True,NAICS
4,A01,../data/2024_06_04/A01_DATA_model.keras,True,True,embedding_int
5,A02,../data/2024_06_04/A02_DATA_model.keras,True,False,embedding_int


In [22]:
# NAICS inputs per model: all four columns for hierarchical models, 'NAICS' alone otherwise
features_naics_base = ['NAICS']
features_naics_hier = ['NAICS', 'NAICS_4', 'NAICS_3', 'NAICS_sector']
model_naics_features = [features_naics_hier if is_hier else features_naics_base
                        for is_hier in model_hier[:num_models]]

## Function 
Get embeddings for a dataset from a model.  Optionally save the model diagram

In [23]:
def model_emb(full_path, data, 
              data_index = ['NAICS_orig'],
              data_append = naics_info,
              save_diagram=False,
              out_path = Path(setup.temp_path),
              out_prefix = 'XX', 
              layer_name = 'NAICS',
              features_numeric = features_numeric,
              features_naics = ['NAICS', 'NAICS_4', 'NAICS_3', 'NAICS_sector']):
    """Return the output of one layer of a saved Keras model for each row of `data`.

    The model at `full_path` is loaded, a sub-model running from the model
    inputs to the layer `layer_name` is built, and that sub-model predicts on
    `data`.

    Parameters
    ----------
    full_path : path handed to ``keras.models.load_model``.
    data : DataFrame containing the `data_index`, `features_numeric` and
        `features_naics` columns.
    data_index : list of key columns; they become the (sorted) index used for
        prediction and the keys of the final merge.
    data_append : DataFrame merged onto the result on `data_index`
        (``DataFrame.merge`` default, i.e. an inner join).
    save_diagram : when True, ``plot_model`` writes
        ``<out_path>/<out_prefix>_model_diag.png``.
    out_path, out_prefix : folder and file-name prefix of the diagram.
    layer_name : name of the layer whose output is returned.
    features_numeric : columns passed together as the first model input.
    features_naics : columns passed as one additional model input each,
        in the given order.

    Returns
    -------
    DataFrame with the `data_index` columns, one ``emb_NN`` column per column
    of the layer output, and the columns contributed by `data_append`.

    Note: the defaults of `data_append`, `out_path` and `features_numeric` are
    evaluated once, when this ``def`` is executed.
    """
    trained_model = keras.models.load_model(full_path)
    if save_diagram:
        diagram_file = Path(out_path).joinpath(out_prefix +'_model_diag.png')
        plot_model(trained_model, to_file=diagram_file, show_shapes=False)

    # Sub-model that shares the trained inputs but stops at the requested layer
    layer_output = trained_model.get_layer(layer_name).output
    emb_model = Model(inputs=trained_model.input, outputs=layer_output)

    # Keep only the needed columns; key columns become the sorted index
    selected_cols = data_index + features_numeric + features_naics
    X = data[selected_cols].copy().set_index(data_index).sort_index()

    # First input: numeric block; then one input per NAICS column
    model_inputs = [X[features_numeric]]
    model_inputs.extend(X[f] for f in features_naics)
    naics_emb = emb_model.predict(model_inputs)

    emb_cols = [f'emb_{i:02}' for i in range(naics_emb.shape[1])]
    emb_frame = pd.DataFrame(naics_emb, index=X.index, columns=emb_cols)

    return emb_frame.reset_index().merge(data_append, on=data_index)

In [24]:
# Function to apply model_emb to the dataframe

In [25]:
def model_emb_apply(model_info_series, 
                    data = naics_unique,
                    data_append = None,
                    save_diagram = True,
                    data_index = ['NAICS_orig'],
                    prefix = 'A10_PLOT_model'):
    """Run `model_emb` for one row of the model spec table.

    Parameters
    ----------
    model_info_series : row of `model_info_frame`; the keys 'model',
        'model_path', 'model_hier' and 'model_layer_name' are read.
    data : DataFrame passed to `model_emb` as the data to embed.
    data_append : DataFrame merged onto the embeddings; when None, the holdout
        flag plus the '_orig' NAICS columns of `naics_info` are used.
    save_diagram : forwarded to `model_emb`; when True the diagram file prefix
        is ``<prefix>_<model>``.
    data_index : key columns forwarded to `model_emb`.
    prefix : leading part of the diagram file name.

    Returns
    -------
    The DataFrame returned by `model_emb` with an extra 'model' column holding
    the model identifier.
    """
    model_id = model_info_series['model']

    # The diagram prefix is only needed when a diagram is written
    out_prefix = prefix + '_' + model_id if save_diagram else None

    # Hierarchical models take every NAICS column, the others only the base one
    features_naics = (features_naics_hier if model_info_series['model_hier']
                      else features_naics_base)

    if data_append is None:
        data_append = naics_info[['dset_naics_holdout'] + features_naics_orig]

    embeddings = model_emb(model_info_series['model_path'],
                           data,
                           data_append = data_append,
                           save_diagram = save_diagram,
                           data_index = data_index,
                           out_prefix = out_prefix,
                           layer_name = model_info_series['model_layer_name'],
                           features_naics = features_naics)

    return embeddings.assign(model = model_id)

## Embeddings for all models
Get embeddings for all models, combine them into one dataframe and save.

In [26]:
# Embed with every model in the spec table (row order) and stack the results
emb_all = pd.concat([model_emb_apply(spec) for _, spec in model_info_frame.iterrows()])

2024-07-15 21:56:44.491153: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2024-07-15 21:56:44.491187: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-07-15 21:56:44.491200: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-07-15 21:56:44.491256: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-07-15 21:56:44.491278: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)




2024-07-15 21:56:46.602792: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2024-07-15 21:56:47.131668: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2024-07-15 21:56:47.675044: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2024-07-15 21:56:48.189849: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-07-15 21:56:48.743681: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2024-07-15 21:56:50.138429: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


In [27]:
# Check: number of embedding rows produced per model
emb_all['model'].value_counts()

model
03     1311
11     1311
04     1311
12     1311
A01    1311
A02    1311
Name: count, dtype: int64

In [28]:
# Spot check: embeddings of a single NAICS code across all models
emb_all[emb_all['NAICS_orig'] == '111419']

Unnamed: 0,NAICS_orig,emb_00,emb_01,emb_02,emb_03,emb_04,emb_05,emb_06,emb_07,dset_naics_holdout,NAICS_4_orig,NAICS_3_orig,NAICS_sector_orig,model
17,111419,-0.030238,-0.020883,-0.008131,0.023881,-0.044659,-0.017204,-0.017765,0.004579,1.0,1114,111,11,03
17,111419,-0.12986,-0.041437,0.234216,-0.379106,-0.060093,0.303469,-0.257404,0.366726,1.0,1114,111,11,11
17,111419,0.011556,-0.04391,-0.049571,-0.044651,-0.037497,0.029903,0.034559,-0.026072,1.0,1114,111,11,04
17,111419,0.126656,-0.013041,-0.201824,-0.345253,-0.232339,0.175296,0.138054,0.17334,1.0,1114,111,11,12
17,111419,-0.197121,-0.284792,0.144166,-0.371903,0.370247,0.285222,-0.334234,-0.104493,1.0,1114,111,11,A01
17,111419,0.224055,0.166809,0.132954,0.217611,0.361108,-0.037709,-0.250264,0.053568,1.0,1114,111,11,A02


In [29]:
# Show the model spec table again for reference
model_info_frame

Unnamed: 0,model,model_path,model_hier,model_rand,model_layer_name
0,03,../data/2024_05_16/03_DATA_model.keras,False,False,NAICS
1,11,../data/2024_05_16/11_DATA_model.keras,False,True,NAICS
2,04,../data/2024_05_16/04_DATA_model.keras,True,False,NAICS
3,12,../data/2024_05_16/12_DATA_model.keras,True,True,NAICS
4,A01,../data/2024_06_04/A01_DATA_model.keras,True,True,embedding_int
5,A02,../data/2024_06_04/A02_DATA_model.keras,True,False,embedding_int


In [30]:
# Save the combined embeddings as both CSV and parquet in the temp folder
emb_out_dir = Path(setup.temp_path)
emb_all.to_csv(emb_out_dir / 'A10_DATA_embeddings.csv', index=False)
emb_all.to_parquet(emb_out_dir / 'A10_DATA_embeddings.parquet')

In [31]:
# Names of the embedding columns (labels containing 'emb_')
emb_feat = emb_all.filter(like='emb_').columns.tolist()
emb_feat

['emb_00',
 'emb_01',
 'emb_02',
 'emb_03',
 'emb_04',
 'emb_05',
 'emb_06',
 'emb_07']

In [32]:
# Standard deviation of each embedding dimension per holdout flag and model.
# NOTE(review): the saved output shows std == 0 for the holdout rows listed,
# i.e. those codes share a single embedding vector within a model.
emb_all.groupby(['dset_naics_holdout', 'model' ])[emb_feat].agg(['std'])

Unnamed: 0_level_0,Unnamed: 1_level_0,emb_00,emb_01,emb_02,emb_03,emb_04,emb_05,emb_06,emb_07
Unnamed: 0_level_1,Unnamed: 1_level_1,std,std,std,std,std,std,std,std
dset_naics_holdout,model,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
0.0,03,0.336414,0.25863,0.376186,0.332166,0.391243,0.281973,0.386383,0.343534
0.0,04,0.202458,0.306968,0.263039,0.364828,0.318304,0.347856,0.337176,0.326354
0.0,11,0.347715,0.26759,0.460955,0.357648,0.280502,0.452262,0.503583,0.378299
0.0,12,0.227926,0.199674,0.268447,0.319675,0.291202,0.309239,0.230302,0.311243
0.0,A01,0.30437,0.238561,0.294604,0.310306,0.308202,0.302769,0.328902,0.281968
0.0,A02,0.289704,0.28143,0.261905,0.274005,0.309345,0.26234,0.299625,0.317212
1.0,03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.0,04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.0,11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.0,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# Mean of each embedding dimension per holdout flag and model
emb_all.groupby(['dset_naics_holdout', 'model' ])[emb_feat].agg(['mean'])

Unnamed: 0_level_0,Unnamed: 1_level_0,emb_00,emb_01,emb_02,emb_03,emb_04,emb_05,emb_06,emb_07
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean,mean,mean,mean,mean,mean,mean
dset_naics_holdout,model,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
0.0,03,-0.443369,-0.255221,0.406945,0.336117,0.475787,-0.357603,0.547174,0.339206
0.0,04,0.109499,-0.343665,0.291973,-0.437716,0.346902,-0.46757,-0.461331,0.40962
0.0,11,-0.34379,-0.064857,0.540493,-0.486862,-0.159478,0.529685,-0.604031,0.461721
0.0,12,0.241371,-0.022571,-0.338312,-0.39761,-0.350991,0.34072,0.198512,0.35872
0.0,A01,0.243338,-0.044152,-0.252525,-0.101927,0.190771,0.150443,-0.292385,0.238891
0.0,A02,-0.227313,-0.224117,-0.212152,0.060614,-0.045353,-0.20057,-0.099945,-0.288401
1.0,03,-0.030238,-0.020883,-0.008131,0.023881,-0.044659,-0.017204,-0.017765,0.004579
1.0,04,0.011556,-0.04391,-0.049571,-0.044651,-0.037497,0.029903,0.034559,-0.026072
1.0,11,-0.12986,-0.041437,0.234216,-0.379106,-0.060093,0.303469,-0.257404,0.366726
1.0,12,0.126656,-0.013041,-0.201824,-0.345253,-0.232339,0.175296,0.138054,0.17334


## Data With Missingness
I want to compare the embeddings obtained when training-set NAICS codes are set to missing against the original embeddings (for hierarchical models whose embeddings represent all NAICS levels).  This will be used for distance measures, to see how close the hierarchy gets to the "actual" embedding, and only for the hierarchical models (as the rest are all the same).

In [34]:
# Number of NAICS input columns; sets how many missingness scenarios are built below
features_naics_len = len(features_naics)

In [35]:
# Show the NAICS input columns in the order they are set to missing
features_naics

['NAICS', 'NAICS_4', 'NAICS_3', 'NAICS_sector']

##### Lists for retrieval
Get lists of NAICS features set to missing, data tables, and NAICS info for non-missing only

In [36]:
# Build one scenario per prefix of features_naics.  In scenario i the first i
# NAICS columns are overwritten with the code 1 (used here as "missing"), the
# rows are de-duplicated on the encoded NAICS columns, and the '_orig' columns
# of the levels that were NOT overwritten are kept as the scenario's info table.
naics_mod, naics_info_mod, X_mod = [], [], []
for i in range(1, features_naics_len + 1):
    this_naics_list, keep_naics = features_naics[:i], features_naics[i:]
    print(f'{i}: set to missing: {this_naics_list}')

    this_X = naics_unique.copy()
    this_X[this_naics_list] = 1 # Set to missing for lower levels
    this_X = this_X.drop_duplicates(features_naics)

    this_info = this_X[[c + '_orig' for c in keep_naics]].copy()
    if not keep_naics:
        # Every level is missing: no key column is left, so add a placeholder key
        this_info['NAICS_orig'] = -999
        this_X['NAICS_orig']= -999
    print(f'{i}: data shape {this_X.shape}, info shape {this_info.shape}')

    X_mod.append(this_X)
    naics_mod.append(this_naics_list)
    naics_info_mod.append(this_info)

1: set to missing: ['NAICS']
1: data shape (352, 55), info shape (352, 3)
2: set to missing: ['NAICS', 'NAICS_4']
2: data shape (106, 55), info shape (106, 2)
3: set to missing: ['NAICS', 'NAICS_4', 'NAICS_3']
3: data shape (20, 55), info shape (20, 1)
4: set to missing: ['NAICS', 'NAICS_4', 'NAICS_3', 'NAICS_sector']
4: data shape (1, 55), info shape (1, 1)


In [37]:
# '_orig' names of the columns set to missing in each scenario
naics_mod_orig = [list(map('{}_orig'.format, level_cols)) for level_cols in naics_mod]
naics_mod_orig

[['NAICS_orig'],
 ['NAICS_orig', 'NAICS_4_orig'],
 ['NAICS_orig', 'NAICS_4_orig', 'NAICS_3_orig'],
 ['NAICS_orig', 'NAICS_4_orig', 'NAICS_3_orig', 'NAICS_sector_orig']]

## Embeddings with Missingness

In [38]:
# Restrict the spec table to models A01 and A02
is_selected_model = model_info_frame['model'].isin(['A02', 'A01'])
model_info_frame_hier = model_info_frame.loc[is_selected_model]
model_info_frame_hier

Unnamed: 0,model,model_path,model_hier,model_rand,model_layer_name
4,A01,../data/2024_06_04/A01_DATA_model.keras,True,True,embedding_int
5,A02,../data/2024_06_04/A02_DATA_model.keras,True,False,embedding_int


In [39]:
def model_emb_apply_miss(model_info,
                         data,
                         data_append,
                         save_diagram = False):
    """Get embeddings for one model on a dataset with NAICS levels set to missing.

    Wraps `model_emb_apply`, using every column of `data_append` as the key
    (index / merge) columns.

    Parameters
    ----------
    model_info : row of the model spec table, forwarded to `model_emb_apply`.
    data : DataFrame to embed (one of the missingness scenarios).
    data_append : DataFrame whose columns are used as the key columns and
        which is merged onto the embeddings.
    save_diagram : forwarded to `model_emb_apply`.  Defaults to False because
        the diagram depends only on the model, not on `data`; writing it for
        every scenario would only re-render and overwrite the same file.

    Returns
    -------
    The DataFrame from `model_emb_apply` without the 'NAICS_orig' column
    (dropped only if present).
    """
    data_index = list(data_append.columns)
    out_data = model_emb_apply(model_info, data, data_append,
                               save_diagram = save_diagram,
                               data_index = data_index) \
        .drop(columns='NAICS_orig', errors='ignore')
    return out_data

In [40]:
# For every missingness scenario, embed with each selected model and stack the
# per-model tables; then stack the scenarios, keyed by their position.
miss_frames = []
for scenario_X, scenario_info in zip(X_mod, naics_info_mod):
    per_model = [model_emb_apply_miss(spec,
                                      data = scenario_X,
                                      data_append = scenario_info)
                 for _, spec in model_info_frame_hier.iterrows()]
    miss_frames.append(pd.concat(per_model))

# The scenario key becomes the outer index level, then the 'miss_group' column
emb_all_miss = (
    pd.concat(miss_frames, keys = range(len(X_mod)))
    .reset_index(level=0)
    .rename(columns={'level_0':'miss_group'})
)



2024-07-15 21:56:50.971993: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2024-07-15 21:56:51.489323: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2024-07-15 21:56:52.026100: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2024-07-15 21:56:52.557340: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2024-07-15 21:56:53.064052: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2024-07-15 21:56:53.541970: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2024-07-15 21:56:54.023569: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2024-07-15 21:56:54.998911: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


In [41]:
# First rows of the missingness embeddings (first scenario)
emb_all_miss.head()

Unnamed: 0,miss_group,NAICS_4_orig,NAICS_3_orig,NAICS_sector_orig,emb_00,emb_01,emb_02,emb_03,emb_04,emb_05,emb_06,emb_07,model
0,0,1111,111,11,0.399143,-0.243259,-0.453278,-0.521713,0.526384,0.626523,-0.731203,0.410564,A01
1,0,1112,111,11,0.216578,-0.147997,-0.292293,-0.306259,0.386267,0.355314,-0.531091,0.266188,A01
2,0,1113,111,11,0.0707,-0.154308,-0.148617,-0.350522,0.363056,0.350788,-0.47046,0.150035,A01
3,0,1114,111,11,-0.197121,-0.284792,0.144166,-0.371903,0.370247,0.285222,-0.334234,-0.104493,A01
4,0,1119,111,11,0.286802,-0.319514,-0.344209,-0.600843,0.566535,0.668142,-0.728737,0.312444,A01


In [42]:
# Last rows of the missingness embeddings (last scenarios)
emb_all_miss.tail()

Unnamed: 0,miss_group,NAICS_4_orig,NAICS_3_orig,NAICS_sector_orig,emb_00,emb_01,emb_02,emb_03,emb_04,emb_05,emb_06,emb_07,model
17,2,,,72.0,0.185983,0.145803,0.132922,-0.026984,0.127725,0.05282,-0.034131,0.13605,A02
18,2,,,81.0,0.181248,0.141901,0.130869,-0.034045,0.119164,0.050529,-0.027457,0.135068,A02
19,2,,,92.0,0.202699,0.166374,0.152289,-0.031364,0.155457,0.055254,-0.032395,0.150308,A02
0,3,,,,0.084273,-0.029651,-0.136195,-0.056131,0.111378,0.030732,-0.14857,0.104896,A01
0,3,,,,0.196139,0.159282,0.146438,-0.033899,0.14422,0.053474,-0.029401,0.146268,A02


In [43]:
# Save the missingness embeddings as both CSV and parquet in the temp folder
miss_out_dir = Path(setup.temp_path)
emb_all_miss.to_csv(miss_out_dir / 'A10_DATA_embeddings_missing.csv', index=False)
emb_all_miss.to_parquet(miss_out_dir / 'A10_DATA_embeddings_missing.parquet')