In [1]:
# Modify the file 00_setup.py to define input/output file paths on your system
# The information in 00_setup.py will be used across notebooks
import importlib.util
import sys
from importlib.machinery import SourceFileLoader

# Load ./00_setup.py as a module object named "setup".  The file name starts
# with a digit, so a plain `import` statement cannot be used for it.
# Loader.load_module() is deprecated, so build a module spec from the loader
# and execute it with exec_module() instead.
_setup_loader = SourceFileLoader("setup", "./00_setup.py")
_setup_spec = importlib.util.spec_from_loader("setup", _setup_loader)
setup = importlib.util.module_from_spec(_setup_spec)
# load_module() registered the module in sys.modules; keep that behavior
sys.modules["setup"] = setup
_setup_loader.exec_module(setup)

# 11: Base GNN - Stellargraphs
Here, I create StellarGraph objects. I do *not* include the target as a feature (i.e., no "label trick") at this point. I also don't include mean-encoded features. My baseline GNN uses only the predictor features for business nodes, while NAICS nodes have one-hot encoded sector indicators.

However, I create several graphs for testing.
  * Graph with training and validation cases, for model training
  * Graph with all data (training, validation, test)
  * Graph with test only

I also save label data relevant for the cases above

*This script takes about 5 minutes on my MacBook Air*

In [2]:
import pandas as pd
import numpy as np

In [3]:
from pathlib import Path
import importlib, pickle

In [4]:
from stellargraph import StellarGraph

2023-12-28 07:54:58.800926: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2023-12-28 07:54:58.800952: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2023-12-28 07:54:58.800960: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2023-12-28 07:54:58.800999: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-12-28 07:54:58.801018: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [5]:
from sba_gnn.sba_gnn import sg_gnn 

## Import Data
Node features and edge list

In [6]:
# Read the NAICS node feature table from the temp directory configured in 00_setup.py
features_naics = pd.read_parquet(
    Path(setup.temp_path) / '10_DATA_graph_node_features_naics.parquet'
)

In [7]:
# The baseline graph excludes mean-encoded columns (prefix 'menc_') on NAICS nodes
menc_feat = [
    col_name
    for col_name in features_naics.columns
    if col_name.startswith('menc_')
]
print(f'Dropping features {menc_feat}')
# Only build a new frame when there is something to remove
if menc_feat:
    features_naics = features_naics.drop(columns=menc_feat)

Dropping features ['menc_NAICS', 'menc_NAICS_sector']


In [8]:
# Read the loan (business) node feature table from the configured temp directory
features_business = pd.read_parquet(
    Path(setup.temp_path) / '10_DATA_graph_node_features_loans.parquet'
)

In [9]:
# Business nodes must not carry the target as an input feature;
# errors='ignore' makes this a no-op when the column is already absent
features_business = features_business.drop(
    columns='target',
    errors='ignore',
)

In [10]:
# Read the full edge list from the configured temp directory
edges_all = pd.read_parquet(
    Path(setup.temp_path) / '10_DATA_graph_edges.parquet'
)

In [11]:
# Read the label information table from the configured temp directory
label_info = pd.read_parquet(
    Path(setup.temp_path) / '10_DATA_label_info.parquet'
)

## Stellargraph - Train plus Validation

In [12]:
# Restrict edges and node features to the 'train' and 'val' subsets
# (NOTE(review): presumably these are values of a 'dset' column -- confirm in sg_gnn.limit_data)
(edges_train_val,
 features_business_train_val,
 features_naics_train_val) = sg_gnn.limit_data(
    edges_all, features_business, features_naics, ['train', 'val'])

In [13]:
# Report row counts before and after limiting to train+val
print(
    'Train+val graph data',
    f'business features, start rows {features_business.shape[0]}, end {features_business_train_val.shape[0]}',
    f'naics features, start rows {features_naics.shape[0]}, end {features_naics_train_val.shape[0]}',
    sep='\n',
)

Train+val graph data
business features, start rows 688081, end 633035
naics features, start rows 1311, end 1311


In [14]:
# Heterogeneous graph with two node types (loans and NAICS codes);
# the edge type is read from the "type" column of the edge table
sba_graph_train_val = StellarGraph(
    nodes={
        'LoanNr_ChkDgt': features_business_train_val,
        'NAICS': features_naics_train_val,
    },
    edges=edges_train_val,
    source_column="source",
    target_column="target",
    edge_type_column="type",
)

In [15]:
# Print the summary produced by info() for the train+validation graph
# (node/edge counts per type and feature vector lengths, as shown in the output)
print(sba_graph_train_val.info())

StellarGraph: Undirected multigraph
 Nodes: 634346, Edges: 11132277

 Node types:
  LoanNr_ChkDgt: [633035]
    Features: float32 vector, length 10
    Edge types: LoanNr_ChkDgt-loan_naics->NAICS, LoanNr_ChkDgt-loan_sector_naics->NAICS
  NAICS: [1311]
    Features: float32 vector, length 9
    Edge types: NAICS-loan_naics->LoanNr_ChkDgt, NAICS-loan_sector_naics->LoanNr_ChkDgt

 Edge types:
    LoanNr_ChkDgt-loan_sector_naics->NAICS: [10499242]
        Weights: all 1 (default)
        Features: none
    LoanNr_ChkDgt-loan_naics->NAICS: [633035]
        Weights: all 1 (default)
        Features: none


In [16]:
# Pickle the train+validation graph into the configured temp directory
with (Path(setup.temp_path) / '11_DATA_stellargraph_train_val.pkl').open('wb') as fout:
    pickle.dump(sba_graph_train_val, fout)

In [17]:
# Save labels for the train+validation graph

In [18]:
# Re-import sg_gnn so edits made to the module on disk are picked up without
# restarting the kernel.
# NOTE(review): looks like a development-time convenience -- sg_gnn was already
# used in earlier cells; confirm whether this is still needed.
importlib.reload(sg_gnn)

<module 'sba_gnn.sba_gnn.sg_gnn' from '/Users/valeriecarey/Documents/projects/2023_10_blog_gnn_sba/code/sba_gnn/sba_gnn/sg_gnn.py'>

In [19]:
# Build the label table for the train+validation graph from label_info.
# NOTE(review): presumably keeps only labels for nodes present in the graph --
# the shapes printed in the next cell show one row per LoanNr_ChkDgt node; confirm in sg_gnn.graph_labels
graph_labels_train_val = sg_gnn.graph_labels(label_info, sba_graph_train_val)

In [20]:
# Compare the full label table size with the train+validation label table size
print(label_info.shape, graph_labels_train_val.shape, sep='\n')

(688081, 3)
(633035, 4)


In [21]:
# Write the train+validation labels to the configured temp directory
graph_labels_train_val.to_parquet(
    Path(setup.temp_path) / '11_DATA_labels_train_val.parquet'
)

## Stellargraph - Full Graph

In [22]:
# Full graph over every loan: the 'dset' split marker is removed from the
# business features and from the edge table, and duplicate edge rows are dropped
sba_graph_all = StellarGraph(
    nodes={
        'LoanNr_ChkDgt': features_business.drop(columns=['dset']),
        'NAICS': features_naics,
    },
    edges=edges_all.drop(columns=['dset']).drop_duplicates(),
    source_column="source",
    target_column="target",
    edge_type_column="type",
)

In [23]:
# Print the summary produced by info() for the full graph
# (node/edge counts per type and feature vector lengths, as shown in the output)
print(sba_graph_all.info())

StellarGraph: Undirected multigraph
 Nodes: 689392, Edges: 12101146

 Node types:
  LoanNr_ChkDgt: [688081]
    Features: float32 vector, length 10
    Edge types: LoanNr_ChkDgt-loan_naics->NAICS, LoanNr_ChkDgt-loan_sector_naics->NAICS
  NAICS: [1311]
    Features: float32 vector, length 9
    Edge types: NAICS-loan_naics->LoanNr_ChkDgt, NAICS-loan_sector_naics->LoanNr_ChkDgt

 Edge types:
    LoanNr_ChkDgt-loan_sector_naics->NAICS: [11413065]
        Weights: all 1 (default)
        Features: none
    LoanNr_ChkDgt-loan_naics->NAICS: [688081]
        Weights: all 1 (default)
        Features: none


In [24]:
# Pickle the full graph into the configured temp directory
with (Path(setup.temp_path) / '11_DATA_stellargraph_all.pkl').open('wb') as fout:
    pickle.dump(sba_graph_all, fout)

## Stellargraph - Test Only

In [25]:
# Restrict edges and node features to the 'test' subset
# (NOTE(review): presumably a value of a 'dset' column -- confirm in sg_gnn.limit_data)
(edges_test,
 features_business_test,
 features_naics_test) = sg_gnn.limit_data(
    edges_all, features_business, features_naics, ['test'])

In [26]:
# Report row counts before and after limiting to the test subset
print(
    'Test graph data',
    f'business features, start rows {features_business.shape[0]}, end {features_business_test.shape[0]}',
    f'naics features, start rows {features_naics.shape[0]}, end {features_naics_test.shape[0]}',
    sep='\n',
)

Test graph data
business features, start rows 688081, end 55046
naics features, start rows 1311, end 1311


In [27]:
# Heterogeneous graph built from the test-only node features and edges;
# the edge type is read from the "type" column of the edge table
sba_graph_test = StellarGraph(
    nodes={
        'LoanNr_ChkDgt': features_business_test,
        'NAICS': features_naics_test,
    },
    edges=edges_test,
    source_column="source",
    target_column="target",
    edge_type_column="type",
)

In [28]:
# Print the summary produced by info() for the test-only graph
# (node/edge counts per type and feature vector lengths, as shown in the output)
print(sba_graph_test.info())

StellarGraph: Undirected multigraph
 Nodes: 56357, Edges: 968869

 Node types:
  LoanNr_ChkDgt: [55046]
    Features: float32 vector, length 10
    Edge types: LoanNr_ChkDgt-loan_naics->NAICS, LoanNr_ChkDgt-loan_sector_naics->NAICS
  NAICS: [1311]
    Features: float32 vector, length 9
    Edge types: NAICS-loan_naics->LoanNr_ChkDgt, NAICS-loan_sector_naics->LoanNr_ChkDgt

 Edge types:
    LoanNr_ChkDgt-loan_sector_naics->NAICS: [913823]
        Weights: all 1 (default)
        Features: none
    LoanNr_ChkDgt-loan_naics->NAICS: [55046]
        Weights: all 1 (default)
        Features: none


In [29]:
# Pickle the test-only graph into the configured temp directory
with (Path(setup.temp_path) / '11_DATA_stellargraph_test.pkl').open('wb') as fout:
    pickle.dump(sba_graph_test, fout)

In [30]:
# Build the label table for the test-only graph from label_info.
# NOTE(review): presumably keeps only labels for nodes present in the graph --
# the shapes printed in the next cell show one row per LoanNr_ChkDgt node; confirm in sg_gnn.graph_labels
graph_labels_test = sg_gnn.graph_labels(label_info, sba_graph_test)

In [31]:
# Compare the full label table size with the test label table size
print(label_info.shape, graph_labels_test.shape, sep='\n')

(688081, 3)
(55046, 4)


In [32]:
# Write the test labels to the configured temp directory
graph_labels_test.to_parquet(
    Path(setup.temp_path) / '11_DATA_labels_test.parquet'
)