In [1]:
# Modify the file 00_setup.py to define input/output file paths on your system
# The information in 00_setup.py will be used across notebooks
#
# The module is loaded through the spec / exec_module API because
# SourceFileLoader.load_module() is deprecated.
import importlib.util
import sys
from importlib.machinery import SourceFileLoader

setup_loader = SourceFileLoader("setup", "./00_setup.py")
setup_spec = importlib.util.spec_from_loader(setup_loader.name, setup_loader)
setup = importlib.util.module_from_spec(setup_spec)

# load_module() registered the module in sys.modules; keep doing so, and
# remove the entry again if executing the file fails.
sys.modules[setup_spec.name] = setup
try:
    setup_loader.exec_module(setup)
except BaseException:
    sys.modules.pop(setup_spec.name, None)
    raise

# GNN - Data for Unsupervised Model

Create StellarGraph objects for unsupervised GNN runs.  The goal is to get node embeddings for use in downstream models, e.g. XGBoost.

Here I use raw training features, with no label feature information

The unsupervised model has both business and NAICS nodes, and uses NAICS nodes as the head type.

This script is nearly identical to script #20, but I reproduce the code here because I may later change things for one type of model without affecting the other.

*This script takes about 15 minutes on my MacBook Air*

In [2]:
import pandas as pd
import numpy as np

In [3]:
from pathlib import Path
import importlib
import pickle, re

In [4]:
from sba_gnn.sba_gnn import sg_gnn

In [5]:
import stellargraph as sg
from stellargraph import StellarGraph

2024-03-03 06:51:29.511877: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2024-03-03 06:51:29.511899: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-03-03 06:51:29.511905: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-03-03 06:51:29.512312: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-03-03 06:51:29.512676: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


## Import Data

In [6]:
## NN scaled input data
# Read the combined parquet file from the temp directory configured in 00_setup.py
combined_scaled_path = Path(setup.temp_path) / '10_DATA_combined_scaled_all.parquet'
comb_df = pd.read_parquet(combined_scaled_path)

In [7]:
# Load the pickled collection of feature names from the temp directory.
# NOTE(review): pickle.load can execute arbitrary code; only load files produced by this project.
features_pickle_path = Path(setup.temp_path) / '10_DATA_features.pkl'
with features_pickle_path.open('rb') as fin:
    imputed_features = pickle.load(fin)

In [8]:
# Keep every feature whose name does not contain 'NAICS'
num_feat = []
for feature_name in imputed_features:
    if 'NAICS' in feature_name:
        continue
    num_feat.append(feature_name)
print(num_feat)

['NoEmp', 'CreateJob', 'LowDoc', 'DisbursementGross', 'new_business', 'urban_flag', 'franchise_flag', 'missingindicator_LowDoc', 'missingindicator_new_business', 'missingindicator_urban_flag']


## Create Edge Map

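I will have edges between a business and its NAICS, as well as edges to same-sector NAICS nodes.  This section creates general edge data, which may be filtered to create graphs.  Note that only the business-to-NAICS (`loan_naics`) edges are created in this notebook; no same-sector NAICS edges are added below.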

##### Get edges from businesses to their exact NAICS

In [9]:
# Edges from businesses to their NAICS code
# The loan number becomes the edge source; the target is the NAICS node index
# returned by sg_gnn.get_naics_index for the loan's NAICS_orig value.
naics_info_df = (
    comb_df[['LoanNr_ChkDgt', 'NAICS_orig', 'dset']]
    .rename(columns={'LoanNr_ChkDgt': 'source'})
    .assign(target=lambda loans: sg_gnn.get_naics_index(loans['NAICS_orig']))
)

# Edge table: one row per loan, tagged with a single edge type
edges_all = (
    naics_info_df[['source', 'target', 'dset']]
    .assign(type='loan_naics')
)

In [11]:
# Number of edges per edge type
edge_type_counts = edges_all['type'].value_counts()
edge_type_counts

type
loan_naics    688081
Name: count, dtype: int64

In [12]:
# Compare the number of distinct (source, target) pairs with the total edge count
unique_edge_pairs = edges_all[['source', 'target']].drop_duplicates()
print(unique_edge_pairs.shape)
print(edges_all.shape)

(688081, 2)
(688081, 4)


In [13]:
# Persist the full edge table to the temp directory
edges_path = Path(setup.temp_path) / '30_DATA_graph_edges.parquet'
edges_all.to_parquet(edges_path)

## Get Node Features
Use the scaled NN data.  For NAICS, just set an indicator for now.  

In [14]:
# Business (loan) node features: dataset label, original NAICS code and the
# non-NAICS features, indexed by loan number. Saved to the temp directory.
business_columns = ['LoanNr_ChkDgt', 'dset', 'NAICS_orig', *num_feat]
features_business = comb_df[business_columns].set_index('LoanNr_ChkDgt')

business_features_path = Path(setup.temp_path) / '30_DATA_graph_node_features_loans.parquet'
features_business.to_parquet(business_features_path)

In [15]:
# NAICS features - Just use indicator.  Include original code for mapping
# One row per distinct NAICS_orig value, a constant 'feat' column, and the
# NAICS node index (from sg_gnn.get_naics_index) as the frame index.
features_naics = (
    comb_df[['NAICS_orig']]
    .drop_duplicates()
    .assign(feat=1,
            source=lambda naics: sg_gnn.get_naics_index(naics['NAICS_orig']))
    .set_index('source')
)
print(features_naics.shape)

naics_features_path = Path(setup.temp_path) / '30_DATA_graph_node_features_naics.parquet'
features_naics.to_parquet(naics_features_path)

(1311, 2)


## Stellargraph - train and validation
Graph without test nodes (make sure the set-aside NAICS nodes are not included).

In [16]:
# Get only relevant edges and node data
# The NAICS frame is passed without its NAICS_orig column, so only the
# indicator feature goes in.
graph_dsets = ['train', 'val']
naics_indicator_features = features_naics.drop(columns='NAICS_orig')

train_val_parts = sg_gnn.limit_data(edges_all, features_business,
                                    naics_indicator_features, graph_dsets)
edges_train_val, features_business_train_val, features_naics_train_val = train_val_parts

In [17]:
# Report how many rows each table had before and after limiting to train+val
print('Train+val graph data')
row_count_report = [
    ('business features', features_business, features_business_train_val),
    ('naics features', features_naics, features_naics_train_val),
    ('edges', edges_all, edges_train_val),
]
for table_label, table_before, table_after in row_count_report:
    print(f'{table_label}, start rows {table_before.shape[0]}, end {table_after.shape[0]}')

Train+val graph data
business features, start rows 688081, end 551635
naics features, start rows 1311, end 1240
edges, start rows 688081, end 551635


In [18]:
# Build the train+val graph with two node types (loans and NAICS codes).
# NAICS_orig is dropped from the loan features before they become node features.
train_val_node_features = {
    'LoanNr_ChkDgt': features_business_train_val.drop(columns='NAICS_orig'),
    'NAICS': features_naics_train_val,
}
sba_graph_train_val = StellarGraph(
    train_val_node_features,
    edges_train_val,
    source_column="source",
    target_column="target",
    edge_type_column="type",
)

In [19]:
# Text summary of the train+val graph
train_val_graph_summary = sba_graph_train_val.info()
print(train_val_graph_summary)

StellarGraph: Undirected multigraph
 Nodes: 552875, Edges: 551635

 Node types:
  LoanNr_ChkDgt: [551635]
    Features: float32 vector, length 10
    Edge types: LoanNr_ChkDgt-loan_naics->NAICS
  NAICS: [1240]
    Features: float32 vector, length 1
    Edge types: NAICS-loan_naics->LoanNr_ChkDgt

 Edge types:
    LoanNr_ChkDgt-loan_naics->NAICS: [551635]
        Weights: all 1 (default)
        Features: none


In [20]:
# Save stellargraph object
# Pickle the train+val graph into the temp directory
train_val_graph_path = Path(setup.temp_path) / '30_DATA_stellargraph_train_val.pkl'
with train_val_graph_path.open('wb') as fout:
    pickle.dump(sba_graph_train_val, fout)

In [21]:
# Show the columns of the full NAICS feature frame (NAICS_orig is dropped
# before the frame is used for graph node features)
naics_feature_columns = features_naics.columns
naics_feature_columns

Index(['NAICS_orig', 'feat'], dtype='object')

## Stellargraph - all nodes

In [22]:
# Build the graph containing every loan and NAICS node.
# Columns that are not node features (dset, NAICS_orig) are dropped first, and
# the edge table is de-duplicated after removing its dset column.
all_node_features = {
    'LoanNr_ChkDgt': features_business.drop(columns=['dset', 'NAICS_orig']),
    'NAICS': features_naics.drop(columns='NAICS_orig'),
}
all_graph_edges = edges_all.drop(columns=['dset']).drop_duplicates()

sba_graph_all = StellarGraph(
    all_node_features,
    all_graph_edges,
    source_column="source",
    target_column="target",
    edge_type_column="type",
)

In [23]:
# Text summary of the all-nodes graph
all_graph_summary = sba_graph_all.info()
print(all_graph_summary)

StellarGraph: Undirected multigraph
 Nodes: 689392, Edges: 688081

 Node types:
  LoanNr_ChkDgt: [688081]
    Features: float32 vector, length 10
    Edge types: LoanNr_ChkDgt-loan_naics->NAICS
  NAICS: [1311]
    Features: float32 vector, length 1
    Edge types: NAICS-loan_naics->LoanNr_ChkDgt

 Edge types:
    LoanNr_ChkDgt-loan_naics->NAICS: [688081]
        Weights: all 1 (default)
        Features: none


In [24]:
# Save stellargraph object
# Pickle the all-nodes graph into the temp directory
all_graph_path = Path(setup.temp_path) / '30_DATA_stellargraph_all.pkl'
with all_graph_path.open('wb') as fout:
    pickle.dump(sba_graph_all, fout)