In [1]:
# Modify the file 00_setup.py to define input/output file paths on your system
# The information in 00_setup.py will be used across notebooks
import importlib.util
import sys
from importlib.machinery import SourceFileLoader

# The file name starts with a digit, so a plain `import` statement cannot be
# used.  Loader.load_module() is deprecated, so build the module from a spec
# and execute it explicitly instead.
_setup_loader = SourceFileLoader("setup", "./00_setup.py")
_setup_spec = importlib.util.spec_from_loader(_setup_loader.name, _setup_loader)
setup = importlib.util.module_from_spec(_setup_spec)
# load_module() also registered the module in sys.modules; keep doing so
sys.modules[_setup_spec.name] = setup
_setup_loader.exec_module(setup)

# 20: GNN Data
Data for a GNN view of NAICS, where a shared NAICS code or a shared NAICS sector defines an edge connecting two nodes

*This script takes about 1 hour on my MacBook Air*

In [2]:
import pandas as pd
import numpy as np

In [3]:
from pathlib import Path
import importlib, pickle
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder

In [4]:
# Imputer object for easy dataset conversion to GNN friendly format
from sba_gnn.sba_gnn import sg_imputer 
from sba_gnn.sba_gnn.sg_imputer import GNNImputer

In [5]:
import itertools

In [24]:
from stellargraph import StellarGraph

2024-01-17 06:32:57.952022: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2024-01-17 06:32:57.952059: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-01-17 06:32:57.952073: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-01-17 06:32:57.952295: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-01-17 06:32:57.952318: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


## Input Data

In [6]:
# Load the '10_DATA_combined_scaled_all' dataset from the configured temp directory
comb_df = pd.read_parquet(Path(setup.temp_path) / '10_DATA_combined_scaled_all.parquet')

In [18]:
# Load the pickled feature list from the temp directory.
# NOTE: pickle executes code on load; only unpickle files this project wrote.
with (Path(setup.temp_path) / '10_DATA_features.pkl').open('rb') as fin:
    imputed_features = pickle.load(fin)

In [21]:
# Node features: every imputed feature except the NAICS code itself
num_feat = [feature for feature in imputed_features if feature != 'NAICS']
print(num_feat)

['NoEmp', 'CreateJob', 'LowDoc', 'DisbursementGross', 'new_business', 'urban_flag', 'franchise_flag', 'missingindicator_LowDoc', 'missingindicator_new_business', 'missingindicator_urban_flag']


## Edge Mapping Function

There are 2 edge types:
  * `naics` - connects nodes that have exactly the same NAICS code
  * `sector` - connects nodes that are in the same NAICS sector

There are a very large number of possible edges.  These will be sampled in the graph, but even loading the full graph would be too much for home hardware (and likely not really worth it).  So I will pre-sample edges for each node, for both edge types.

I want to get at least a set number of edges per node (setup.gnn_graph_sample_n), and so I sample accordingly. There may be more than this number of edges per node.  For higher-volume NAICS, the edge count per node will be closer to setup.gnn_graph_sample_n.

The GNN will do sampling also, and so I need the GNN samples to be less than setup.gnn_graph_sample_n.

##### Function to create samples for 1 code
Returns an edge list containing at least a set number of samples per node.  The list may contain more edges than requested, but will be smaller than the full edge list.  For high-volume codes, the number of edges per node is ~setup.gnn_graph_sample_n, and the total number of rows is close to (# businesses with the code) * setup.gnn_graph_sample_n

In [8]:
def get_sample_per_code(ser, min_n = setup.gnn_graph_sample_n,
                       big_thresh = 500, rng = None):
    """ Samples edges among the nodes that share one code.
      Inputs:
        ser:  Pandas series consisting of node indexes (the nodes for one code)
        min_n: Number of edges sampled per node.  Codes with min_n or fewer
          nodes get every possible pair instead of a sample.
        big_thresh: When min_n * len(ser) is at least this value, all targets
          are drawn in a single call (with replacement, from all nodes) and
          self loops are filtered afterwards.  Below it, targets are drawn
          per node, without replacement, from the other nodes.
        rng: Optional object with a numpy-style choice(a, size, replace=...)
          method (e.g. numpy.random.Generator) used for reproducible sampling.
          None uses the global numpy.random state.
      Value:
        Pandas dataframe containing columns 'source' and 'target' for the
          edges, without duplicate rows.  'source' is never greater than
          'target' (strictly less when node indexes are unique).  Codes with
          0 or 1 nodes give a dataframe with these columns and no rows.
          On the single-call path a node can end up with fewer than min_n
          sampled edges, because self loops and duplicates are dropped.
    """

    edge_cols = ['source', 'target']
    ser_len = len(ser)

    # Return no edges for isolated codes, but keep the edge columns and the
    # node index dtype so callers always see the same schema
    if ser_len <= 1:
        no_nodes = ser.to_numpy()[:0]
        return pd.DataFrame({'source': no_nodes, 'target': no_nodes})

    np_ary = ser.sort_values().to_numpy()

    # If we have a small list, use all pairs.  The array is sorted, so each
    # pair already has source <= target
    if ser_len <= min_n:
        pairs = itertools.combinations(np_ary, 2)
        return pd.DataFrame(pairs)\
           .set_axis(edge_cols, axis=1)

    # Default to the global numpy random state when no generator is given
    sampler = np.random if rng is None else rng

    # Otherwise get samples.  Get the sources first: each node min_n times
    sources = np_ary.repeat(min_n).reshape((-1, 1))

    # Here ser_len > min_n, so min_n*ser_len >= min_n*(min_n+1); with the
    # default big_thresh of 500 the per-node branch below is only reached
    # when min_n <= 21
    big_ser = (min_n*ser_len) >= big_thresh

    # If we have a very big series, just sample once.
    if big_ser:
        targets = sampler.choice(np_ary, min_n*ser_len, replace=True) \
            .reshape((-1, 1))
    else:
        # For a medium sized series, sample more carefully
        # Sample from nodes other than the source
        targets = np.concatenate([sampler.choice(np.delete(np_ary, i),
                                                 min_n, replace=False)
                                  for i in range(ser_len)]) \
            .reshape((-1, 1))

    # Combine sources and targets
    comb_data = np.concatenate((sources, targets), axis=1)

    # Sort within each row so (a, b) and (b, a) become identical rows,
    # then remove duplicates
    comb_data.sort(axis=1)
    samples = pd.DataFrame(comb_data, columns=edge_cols) \
        .drop_duplicates()

    # Remove self loops (only the single-call path can draw a node as its
    # own target)
    if big_ser:
        samples = samples[samples['target'] != samples['source']]

    return samples

## Edge Lists
Create edge lists for the full dataset, as well as train + validation only.  Also do both sector and single-code lists

In [9]:
# Function to get edges of both types
def get_edges(data):
    """ Builds the sampled edge list for both edge types.
      Inputs:
        data: Pandas dataframe with columns 'LoanNr_ChkDgt', 'NAICS_orig'
          and 'NAICS_sector'
      Value:
        Pandas dataframe of the edges returned by get_sample_per_code for
          every group, with a 'type' column holding 'naics' or 'sector'
          and a fresh 0..n-1 index.
    """

    # (edge type label, grouping column) -- NAICS first, then sector
    groupings = (('naics', 'NAICS_orig'), ('sector', 'NAICS_sector'))

    labels = []
    per_type = []
    for label, group_col in groupings:
        code_frames = [get_sample_per_code(grp.LoanNr_ChkDgt)
                       for _, grp in data.groupby(group_col)]
        labels.append(label)
        per_type.append(pd.concat(code_frames, axis=0))

    # The keys become the outer index level; move it into a column
    stacked = pd.concat(per_type, axis=0, keys=labels)
    typed = stacked.reset_index(level=0)
    typed = typed.reset_index(drop=True)
    return typed.rename(columns={'level_0':'type'}, errors='ignore')

##### Edges - all nodes

In [10]:
# Edge sampling draws from NumPy's global random state; fix the seed so that
# restart-and-run-all reproduces the same edge list
np.random.seed(20)
edges_all = get_edges(comb_df)

In [11]:
# Persist the full-dataset edge list to the temp directory
edges_all.to_parquet(Path(setup.temp_path) / '20_DATA_edges_naics_all.parquet')

##### Edges - train and validation

In [12]:
# Edge sampling draws from NumPy's global random state; fix the seed so that
# restart-and-run-all reproduces the same edge list
np.random.seed(20)
# Only rows whose 'dset' is train or val contribute nodes and edges here
edges_train_val = get_edges(comb_df[comb_df['dset'].isin(['val', 'train'])])

In [13]:
# Persist the train + validation edge list to the temp directory
edges_train_val.to_parquet(Path(setup.temp_path) / '20_DATA_edges_naics_train_val.parquet')

## Node features

In [22]:
# Node feature table indexed by node id; 'dset' is kept for later filtering
features_data_all = (
    comb_df
    .loc[:, ['LoanNr_ChkDgt', 'dset', *num_feat]]
    .set_index('LoanNr_ChkDgt')
)

## Create Graphs

##### All nodes

In [25]:
# Graph over all nodes: one node type, features without the 'dset' column,
# edges typed by the 'type' column of the edge list
sba_graph_all = StellarGraph(
    nodes={'LoanNr_ChkDgt': features_data_all.drop(columns='dset')},
    edges=edges_all,
    source_column="source",
    target_column="target",
    edge_type_column="type",
)

In [26]:
# Summarize node/edge counts and types as a sanity check on the graph
print(sba_graph_all.info())

StellarGraph: Undirected multigraph
 Nodes: 688081, Edges: 66553913

 Node types:
  LoanNr_ChkDgt: [688081]
    Features: float32 vector, length 10
    Edge types: LoanNr_ChkDgt-naics->LoanNr_ChkDgt, LoanNr_ChkDgt-sector->LoanNr_ChkDgt

 Edge types:
    LoanNr_ChkDgt-sector->LoanNr_ChkDgt: [34354688]
        Weights: all 1 (default)
        Features: none
    LoanNr_ChkDgt-naics->LoanNr_ChkDgt: [32199225]
        Weights: all 1 (default)
        Features: none


In [27]:
# Save the all-nodes stellargraph object as a pickle in the temp directory
with (Path(setup.temp_path) / '20_DATA_stellargraph_all.pkl').open('wb') as fout:
    pickle.dump(sba_graph_all, fout)

##### Train and validation

In [29]:
# Node features restricted to train + validation rows, without 'dset'
feat_train_val = (
    features_data_all
    .loc[features_data_all['dset'].isin(['val', 'train'])]
    .drop(columns='dset')
)

In [30]:
# Graph over train + validation nodes only, built from the matching edge list
sba_graph_train_val = StellarGraph(
    nodes={'LoanNr_ChkDgt': feat_train_val},
    edges=edges_train_val,
    source_column="source",
    target_column="target",
    edge_type_column="type",
)

In [31]:
# Summarize node/edge counts and types as a sanity check on the graph
print(sba_graph_train_val.info())

StellarGraph: Undirected multigraph
 Nodes: 551635, Edges: 53101575

 Node types:
  LoanNr_ChkDgt: [551635]
    Features: float32 vector, length 10
    Edge types: LoanNr_ChkDgt-naics->LoanNr_ChkDgt, LoanNr_ChkDgt-sector->LoanNr_ChkDgt

 Edge types:
    LoanNr_ChkDgt-sector->LoanNr_ChkDgt: [27532592]
        Weights: all 1 (default)
        Features: none
    LoanNr_ChkDgt-naics->LoanNr_ChkDgt: [25568983]
        Weights: all 1 (default)
        Features: none


In [32]:
# Save the train + validation stellargraph object as a pickle in the temp directory
with (Path(setup.temp_path) / '20_DATA_stellargraph_train_val.pkl').open('wb') as fout:
    pickle.dump(sba_graph_train_val, fout)

## Label Data

In [33]:
# Per-node label table: dataset split columns and the target, indexed by node id
label_df = (
    comb_df
    .loc[:, ['dset', 'dset_naics_holdout', 'LoanNr_ChkDgt', 'target']]
    .set_index('LoanNr_ChkDgt')
)

In [34]:
# Persist the label table to the temp directory
label_df.to_parquet(Path(setup.temp_path) / '20_DATA_label_info.parquet')