In [1]:
# Modify the file 00_setup.py to define input/output file paths on your system
# The information in 00_setup.py will be used across notebooks
#
# The module is loaded via an import spec rather than the deprecated
# Loader.load_module(); it is still registered in sys.modules under "setup".
import importlib.util
import sys
from importlib.machinery import SourceFileLoader

_setup_loader = SourceFileLoader("setup", "./00_setup.py")
_setup_spec = importlib.util.spec_from_loader("setup", _setup_loader)
setup = importlib.util.module_from_spec(_setup_spec)
sys.modules["setup"] = setup
_setup_loader.exec_module(setup)

# GNN - Unsup Model Data, With Label

Create Stellargraph objects for unsupervised GNN runs, including original features, plus the target as a feature.

For the unseen / test data, the target-as-feature value is set randomly, drawn at the training-set base target rate.

*This script takes about 15 minutes on my MacBook Air*

In [2]:
import pandas as pd
import numpy as np

In [3]:
from pathlib import Path
import importlib
import pickle, re

In [4]:
from sba_gnn.sba_gnn import sg_gnn

In [5]:
import stellargraph as sg
from stellargraph import StellarGraph

2024-03-16 13:51:35.550796: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2024-03-16 13:51:35.550818: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-03-16 13:51:35.550826: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-03-16 13:51:35.550862: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-03-16 13:51:35.550887: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


## Import Data

In [6]:
# NN scaled input data; the 'dset' column is used below to split the rows
comb_path = Path(setup.temp_path) / '20_DATA_combined_scaled_all.parquet'
comb_df = pd.read_parquet(comb_path)

In [7]:
# Load the pickled collection of feature names.
# NOTE: pickle.load executes arbitrary code; only use files this project wrote.
features_path = Path(setup.temp_path) / '20_DATA_features.pkl'
with features_path.open('rb') as features_file:
    imputed_features = pickle.load(features_file)

In [8]:
# Keep every feature whose name does not contain 'NAICS'
num_feat = list(filter(lambda name: 'NAICS' not in name, imputed_features))
print(num_feat)

['NoEmp', 'CreateJob', 'LowDoc', 'DisbursementGross', 'new_business', 'urban_flag', 'franchise_flag', 'missingindicator_LowDoc', 'missingindicator_new_business', 'missingindicator_urban_flag']


###### Data used to create previous graphs

In [9]:
# Edge table saved when the previous graphs were built
edges_path = Path(setup.temp_path) / '30_DATA_graph_edges.parquet'
edges_all = pd.read_parquet(edges_path)

In [10]:
# Loan (business) node features saved when the previous graphs were built
loan_features_path = Path(setup.temp_path) / '30_DATA_graph_node_features_loans.parquet'
node_features_business = pd.read_parquet(loan_features_path)

In [11]:
# NAICS node features saved when the previous graphs were built
naics_features_path = Path(setup.temp_path) / '30_DATA_graph_node_features_naics.parquet'
node_features_naics = pd.read_parquet(naics_features_path)

## Add target to node features
Add the target as a feature for the train and validation data. For test, set the feature randomly, using the training base target rate as the probability of 1.

In [12]:
# True target (renamed to 'feature_target') for train and val rows, keyed by loan ID;
# 'dset' is kept so the train-only rate can be computed afterwards.
is_train_val = comb_df['dset'].isin(['train', 'val'])
targ_feat_train_val = (
    comb_df.loc[is_train_val, ['LoanNr_ChkDgt', 'target', 'dset']]
    .set_index('LoanNr_ChkDgt')
    .rename(columns={'target': 'feature_target'})
)

In [13]:
# Mean of the target over the training rows only
is_train_row = targ_feat_train_val['dset'] == 'train'
base_target_rate = targ_feat_train_val.loc[is_train_row, 'feature_target'].mean()
base_target_rate

0.2046546821975603

In [14]:
# Test rows keyed by loan ID; the true target is deliberately not selected here
targ_feat_test = (
    comb_df.loc[comb_df['dset'] == 'test', ['LoanNr_ChkDgt', 'dset']]
    .set_index('LoanNr_ChkDgt')
)

In [15]:
# Seed the global NumPy RNG so the random test-set feature is reproducible
np.random.seed(1234)

# Draw 1 with probability base_target_rate, otherwise 0, once per test row
random_test_targets = np.random.choice(
    [1, 0],
    size=len(targ_feat_test),
    p=[base_target_rate, 1-base_target_rate],
)
targ_feat_test['feature_target'] = random_test_targets

In [16]:
# Stack the train/val and test target features, then order rows by loan ID
target_feature_parts = [targ_feat_train_val, targ_feat_test]
targ_feat = pd.concat(target_feature_parts, axis=0).sort_index()

In [17]:
# Append the target-as-feature column to the loan node features and save the result.
# This cell reassigns node_features_business from itself, so first drop any
# 'feature_target' left by a previous run; otherwise re-running the cell would
# add a second, duplicate 'feature_target' column.
# pd.concat(axis=1) aligns the two frames on their index.
node_features_business = pd.concat(
    [node_features_business.drop(columns='feature_target', errors='ignore').sort_index(),
     targ_feat[['feature_target']]],
    axis=1)
node_features_business.to_parquet(Path(setup.temp_path).joinpath('32_DATA_graph_node_features_loans.parquet'))

In [18]:
# Preview the first three rows of the augmented loan node features
node_features_business.iloc[:3]

Unnamed: 0_level_0,dset,NAICS_orig,NoEmp,CreateJob,LowDoc,DisbursementGross,new_business,urban_flag,franchise_flag,missingindicator_LowDoc,missingindicator_new_business,missingindicator_urban_flag,feature_target
LoanNr_ChkDgt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1000014003,train,451120,0.024024,-1.0,1.0,-0.195195,1.0,1.0,-1.0,-1.0,-1.0,1.0,0
1000024006,test,722410,-0.447447,-1.0,1.0,-0.451451,1.0,1.0,-1.0,-1.0,-1.0,1.0,1
1000034009,train,621210,0.381381,-1.0,-1.0,0.620502,-1.0,1.0,-1.0,-1.0,-1.0,1.0,0


## Stellargraph - train and validation
Graph without test nodes (make sure the set-aside NAICS codes are not included)

In [19]:
# Keep only the edges and node data relevant to the train and val splits.
# NAICS_orig is dropped from the NAICS table before it is handed to limit_data.
naics_features_no_orig = node_features_naics.drop(columns='NAICS_orig')
(edges_train_val,
 features_business_train_val,
 features_naics_train_val) = sg_gnn.limit_data(
    edges_all, node_features_business, naics_features_no_orig, ['train', 'val'])

In [20]:
# Report row counts before and after limiting the graph inputs to train+val
print('Train+val graph data')
print(f'business features, start rows {len(node_features_business)}, end {len(features_business_train_val)}')
print(f'naics features, start rows {len(node_features_naics)}, end {len(features_naics_train_val)}')
print(f'edges, start rows {len(edges_all)}, end {len(edges_train_val)}')

Train+val graph data
business features, start rows 688081, end 529574
naics features, start rows 1311, end 1173
edges, start rows 688081, end 529574


In [21]:
# Two node types (loans and NAICS codes), each with its own feature table;
# NAICS_orig is removed from the loan table before it is used as node features.
train_val_nodes = {
    'LoanNr_ChkDgt': features_business_train_val.drop(columns='NAICS_orig'),
    'NAICS': features_naics_train_val,
}
sba_graph_train_val = StellarGraph(
    train_val_nodes,
    edges_train_val,
    source_column="source",
    target_column="target",
    edge_type_column="type",
)

In [22]:
# Print the summary of the train+val graph
train_val_graph_summary = sba_graph_train_val.info()
print(train_val_graph_summary)

StellarGraph: Undirected multigraph
 Nodes: 530747, Edges: 529574

 Node types:
  LoanNr_ChkDgt: [529574]
    Features: float32 vector, length 11
    Edge types: LoanNr_ChkDgt-loan_naics->NAICS
  NAICS: [1173]
    Features: float32 vector, length 1
    Edge types: NAICS-loan_naics->LoanNr_ChkDgt

 Edge types:
    LoanNr_ChkDgt-loan_naics->NAICS: [529574]
        Weights: all 1 (default)
        Features: none


In [23]:
# Pickle the train+val StellarGraph object to the temp directory
train_val_graph_path = Path(setup.temp_path) / '32_DATA_stellargraph_train_val.pkl'
with train_val_graph_path.open('wb') as graph_file:
    pickle.dump(sba_graph_train_val, graph_file)

## Stellargraph - all nodes

In [24]:
# Graph over every loan and NAICS node. The bookkeeping columns (dset, NAICS_orig)
# are removed so they are not passed as node features, and the edge table is
# de-duplicated after its dset column is dropped.
all_nodes = {
    'LoanNr_ChkDgt': node_features_business.drop(columns=['dset', 'NAICS_orig']),
    'NAICS': node_features_naics.drop(columns='NAICS_orig'),
}
all_edges = edges_all.drop(columns=['dset']).drop_duplicates()
sba_graph_all = StellarGraph(
    all_nodes,
    all_edges,
    source_column="source",
    target_column="target",
    edge_type_column="type",
)

In [25]:
# Print the summary of the all-nodes graph
all_graph_summary = sba_graph_all.info()
print(all_graph_summary)

StellarGraph: Undirected multigraph
 Nodes: 689392, Edges: 688081

 Node types:
  LoanNr_ChkDgt: [688081]
    Features: float32 vector, length 11
    Edge types: LoanNr_ChkDgt-loan_naics->NAICS
  NAICS: [1311]
    Features: float32 vector, length 1
    Edge types: NAICS-loan_naics->LoanNr_ChkDgt

 Edge types:
    LoanNr_ChkDgt-loan_naics->NAICS: [688081]
        Weights: all 1 (default)
        Features: none


In [26]:
# Pickle the all-nodes StellarGraph object to the temp directory
all_graph_path = Path(setup.temp_path) / '32_DATA_stellargraph_all.pkl'
with all_graph_path.open('wb') as graph_file:
    pickle.dump(sba_graph_all, graph_file)