In [4]:
# Modify the file 00_setup.py to define input/output file paths on your system
# The information in 00_setup.py will be used across notebooks
import importlib.util
import sys
from importlib.machinery import SourceFileLoader

# The file name starts with a digit, so it cannot be loaded with a plain
# `import` statement; load it from its path instead.
# `SourceFileLoader.load_module()` is deprecated, so the module is built from a
# spec and executed explicitly (the recipe given in the importlib docs).
_setup_loader = SourceFileLoader("setup", "./00_setup.py")
_setup_spec = importlib.util.spec_from_loader("setup", _setup_loader)
setup = importlib.util.module_from_spec(_setup_spec)
# load_module() also registered the module in sys.modules; keep doing so.
sys.modules["setup"] = setup
try:
    _setup_loader.exec_module(setup)
except BaseException:
    # Do not leave a half-initialised module registered if 00_setup.py fails.
    sys.modules.pop("setup", None)
    raise

# GNN - Unsup Model Data, Features From NN Model

Create StellarGraph objects for unsupervised GNN runs. Loan node features are taken from the businesses' NN model embeddings (the model without NAICS, script 11). No target feature is included.

*This script takes about 15 minutes on my MacBook Air*

In [5]:
import pandas as pd
import numpy as np

In [6]:
from pathlib import Path
import importlib
import pickle, re

In [7]:
from sba_gnn.sba_gnn import sg_gnn

In [8]:
import stellargraph as sg
from stellargraph import StellarGraph

2024-01-29 15:28:36.557051: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2024-01-29 15:28:36.557084: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-01-29 15:28:36.557100: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-01-29 15:28:36.557159: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-01-29 15:28:36.557183: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


## Import Data

In [9]:
# Business embeddings from the NN model, read from the temp directory
embed_df = pd.read_parquet(Path(setup.temp_path) / '11_DATA_embeddings.parquet')

In [10]:
# Show every column name of the embeddings frame on one line
print(embed_df.columns.tolist())

['LoanNr_ChkDgt', 'emb_000', 'emb_001', 'emb_002', 'emb_003', 'emb_004', 'emb_005', 'emb_006', 'emb_007', 'emb_008', 'emb_009', 'emb_010', 'emb_011', 'emb_012', 'emb_013', 'emb_014', 'emb_015', 'emb_016', 'emb_017', 'emb_018', 'emb_019', 'emb_020', 'emb_021', 'emb_022', 'emb_023', 'emb_024', 'emb_025', 'emb_026', 'emb_027', 'emb_028', 'emb_029', 'emb_030', 'emb_031', 'emb_032', 'emb_033', 'emb_034', 'emb_035', 'emb_036', 'emb_037', 'emb_038', 'emb_039', 'emb_040', 'emb_041', 'emb_042', 'emb_043', 'emb_044', 'emb_045', 'emb_046', 'emb_047', 'emb_048', 'emb_049', 'emb_050', 'emb_051', 'emb_052', 'emb_053', 'emb_054', 'emb_055', 'emb_056', 'emb_057', 'emb_058', 'emb_059', 'emb_060', 'emb_061', 'emb_062', 'emb_063']


In [11]:
# Raw (combined, scaled) features, read from the temp directory
comb_df = pd.read_parquet(Path(setup.temp_path) / '10_DATA_combined_scaled_all.parquet')

In [19]:
# List the columns of the raw feature frame whose name contains "NAICS"
comb_df.filter(regex='NAICS').columns

Index(['NAICS_orig', 'NAICS_sector', 'menc_NAICS', 'menc_NAICS_sector',
       'NAICS'],
      dtype='object')

###### Data used to create previous graphs

In [12]:
# Edge list saved as 40_DATA_graph_edges.parquet
edges_all = pd.read_parquet(Path(setup.temp_path) / '40_DATA_graph_edges.parquet')

In [13]:
# NAICS node features saved as 40_DATA_graph_node_features_naics.parquet
node_features_naics = pd.read_parquet(
    Path(setup.temp_path) / '40_DATA_graph_node_features_naics.parquet'
)

## Loan node features
From business embeddings

In [20]:
# Attach the 'dset' and 'NAICS_orig' columns from the raw feature frame to each
# loan's embedding row, then index the result by loan id.
#
# Both frames carry LoanNr_ChkDgt as a regular column, so merge on that column
# directly and set the index once afterwards (the earlier set_index before the
# merge was redundant and relied on index-level/column key matching).
# validate='one_to_one' makes the merge fail loudly if a loan id is duplicated
# on either side, instead of silently producing duplicate node ids.
node_features_business = (
    embed_df
    .merge(
        comb_df[['LoanNr_ChkDgt', 'dset', 'NAICS_orig']],
        on='LoanNr_ChkDgt',
        how='inner',
        validate='one_to_one',
    )
    .set_index('LoanNr_ChkDgt')
    .sort_index()
)

In [21]:
# Save the loan node features (embeddings plus dset / NAICS_orig) to the temp directory
loan_features_path = Path(setup.temp_path) / '44_DATA_graph_node_features_loans.parquet'
node_features_business.to_parquet(loan_features_path)

In [22]:
# Preview the first three loan rows
node_features_business.iloc[:3]

Unnamed: 0_level_0,emb_000,emb_001,emb_002,emb_003,emb_004,emb_005,emb_006,emb_007,emb_008,emb_009,...,emb_056,emb_057,emb_058,emb_059,emb_060,emb_061,emb_062,emb_063,dset,NAICS_orig
LoanNr_ChkDgt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000014003,0.654536,-0.994952,0.987932,1.0,0.735649,-0.999932,-0.951871,0.999999,0.876765,0.999992,...,0.809824,0.626924,0.99871,-0.267917,0.954344,-0.999909,-0.985373,0.999995,train,451120
1000024006,0.547691,-0.999159,0.998841,0.999999,0.752479,-0.999985,-0.975103,0.999999,0.932148,1.0,...,-0.401658,0.968849,0.99997,-0.574112,0.98854,-0.999994,-0.978627,0.999904,test,722410
1000034009,-0.819275,-0.679908,-0.980788,1.0,0.991393,-0.998462,-0.744888,1.0,0.99317,0.975103,...,-0.940089,0.990855,-0.999382,-0.323084,0.987167,0.179436,-0.99801,1.0,train,621210


## Stellargraph - train and validation
Graph without test nodes (check that the set-aside NAICS codes are not included).

In [23]:
# Restrict edges and node features to the 'train' and 'val' data sets.
# NAICS_orig is dropped from the NAICS node features before the call.
(edges_train_val,
 features_business_train_val,
 features_naics_train_val) = sg_gnn.limit_data(
    edges_all,
    node_features_business,
    node_features_naics.drop(columns=['NAICS_orig']),
    ['train', 'val'],
)

In [24]:
# Report row counts before and after restricting to train+val
print('Train+val graph data')
for label, before, after in [
    ('business features', node_features_business, features_business_train_val),
    ('naics features', node_features_naics, features_naics_train_val),
    ('edges', edges_all, edges_train_val),
]:
    print(f'{label}, start rows {before.shape[0]}, end {after.shape[0]}')

Train+val graph data
business features, start rows 688081, end 551635
naics features, start rows 1311, end 1240
edges, start rows 23171852, end 17688739


In [25]:
# Heterogeneous graph with two node types (loans and NAICS codes), built from
# the train+val subset. NAICS_orig is dropped from the loan frame before it is
# passed as node data.
sba_graph_train_val = StellarGraph(
    nodes={
        'LoanNr_ChkDgt': features_business_train_val.drop(columns=['NAICS_orig']),
        'NAICS': features_naics_train_val,
    },
    edges=edges_train_val,
    source_column="source",
    target_column="target",
    edge_type_column="type",
)

In [26]:
# Text summary of the train+val graph (node/edge types and counts)
train_val_graph_summary = sba_graph_train_val.info()
print(train_val_graph_summary)

StellarGraph: Undirected multigraph
 Nodes: 552875, Edges: 17688739

 Node types:
  LoanNr_ChkDgt: [551635]
    Features: float32 vector, length 64
    Edge types: LoanNr_ChkDgt-loan_naics->NAICS, LoanNr_ChkDgt-loan_sector_naics->NAICS
  NAICS: [1240]
    Features: float32 vector, length 1
    Edge types: NAICS-loan_naics->LoanNr_ChkDgt, NAICS-loan_sector_naics->LoanNr_ChkDgt

 Edge types:
    LoanNr_ChkDgt-loan_sector_naics->NAICS: [17137104]
        Weights: all 1 (default)
        Features: none
    LoanNr_ChkDgt-loan_naics->NAICS: [551635]
        Weights: all 1 (default)
        Features: none


In [27]:
# Pickle the train+val StellarGraph object into the temp directory
train_val_graph_path = Path(setup.temp_path) / '44_DATA_stellargraph_train_val.pkl'
with train_val_graph_path.open('wb') as pkl_file:
    pickle.dump(sba_graph_train_val, pkl_file)

## Stellargraph - all nodes

In [28]:
# Graph over all loans and NAICS nodes. The dset / NAICS_orig columns are
# removed so they are not passed as node data, and the edge list is
# de-duplicated after its dset column is dropped.
sba_graph_all = StellarGraph(
    nodes={
        'LoanNr_ChkDgt': node_features_business.drop(columns=['dset', 'NAICS_orig']),
        'NAICS': node_features_naics.drop(columns=['NAICS_orig']),
    },
    edges=edges_all.drop(columns='dset').drop_duplicates(),
    source_column="source",
    target_column="target",
    edge_type_column="type",
)

In [29]:
# Text summary of the all-nodes graph (node/edge types and counts)
all_graph_summary = sba_graph_all.info()
print(all_graph_summary)

StellarGraph: Undirected multigraph
 Nodes: 689392, Edges: 23171852

 Node types:
  LoanNr_ChkDgt: [688081]
    Features: float32 vector, length 64
    Edge types: LoanNr_ChkDgt-loan_naics->NAICS, LoanNr_ChkDgt-loan_sector_naics->NAICS
  NAICS: [1311]
    Features: float32 vector, length 1
    Edge types: NAICS-loan_naics->LoanNr_ChkDgt, NAICS-loan_sector_naics->LoanNr_ChkDgt

 Edge types:
    LoanNr_ChkDgt-loan_sector_naics->NAICS: [22483771]
        Weights: all 1 (default)
        Features: none
    LoanNr_ChkDgt-loan_naics->NAICS: [688081]
        Weights: all 1 (default)
        Features: none


In [30]:
# Pickle the all-nodes StellarGraph object into the temp directory
all_graph_path = Path(setup.temp_path) / '44_DATA_stellargraph_all.pkl'
with all_graph_path.open('wb') as pkl_file:
    pickle.dump(sba_graph_all, pkl_file)