In [10]:
# Modify the file 00_setup.py to define input/output file paths on your system
# The information in 00_setup.py will be used across notebooks
import importlib.util
import sys
from importlib.machinery import SourceFileLoader

# The file name starts with a digit, so it cannot be imported with a plain
# `import` statement.  SourceFileLoader.load_module() is deprecated, so build
# the module from a spec and execute it explicitly instead.
_setup_loader = SourceFileLoader("setup", "./00_setup.py")
_setup_spec = importlib.util.spec_from_loader("setup", _setup_loader)
setup = importlib.util.module_from_spec(_setup_spec)
# Register the module before executing it, as load_module() used to do
sys.modules["setup"] = setup
_setup_loader.exec_module(setup)

# GNN - Data, Sector Node for Class Model

Alternative graph structure for GNN node classification.  Adds a third node type, NAICS sector, which is linked to NAICS nodes by NAICS-to-sector edges (there are no direct LoanNr_ChkDgt-loan_sector_naics->NAICS links).

*This script takes about 15 minutes on my MacBook Air*

In [11]:
import pandas as pd
import numpy as np

In [12]:
from pathlib import Path
import importlib
import pickle, re

In [13]:
from sba_gnn.sba_gnn import sg_gnn

In [14]:
import stellargraph as sg
from stellargraph import StellarGraph

## Import Data

In [15]:
# Load the scaled NN input data from the project's temporary directory
combined_data_path = Path(setup.temp_path) / '10_DATA_combined_scaled_all.parquet'
comb_df = pd.read_parquet(combined_data_path)

In [16]:
# Read the pickled collection of feature names stored in the temporary directory
features_pickle_path = Path(setup.temp_path) / '10_DATA_features.pkl'
with features_pickle_path.open('rb') as fin:
    imputed_features = pickle.load(fin)

In [17]:
# Loan node features: every imputed feature except the NAICS code itself
num_feat = [feature for feature in imputed_features if feature != 'NAICS']
print(num_feat)

['NoEmp', 'CreateJob', 'LowDoc', 'DisbursementGross', 'new_business', 'urban_flag', 'franchise_flag', 'missingindicator_LowDoc', 'missingindicator_new_business', 'missingindicator_urban_flag']


## Create Edge Map

I will have edges between a business and its NAICS code, as well as edges between each NAICS code and its NAICS sector node.  This section creates general edge data, which may be filtered to create graphs.

##### Create NAICS to NAICS sector edge

In [75]:
# Development aid: re-import sg_gnn so that edits to the module are picked up
# without restarting the kernel
importlib.reload(sg_gnn)

<module 'sba_gnn.sba_gnn.sg_gnn' from '/Users/valeriecarey/Documents/projects/2023_10_blog_gnn_sba/code/sba_gnn/sba_gnn/sg_gnn.py'>

In [44]:
# One row per distinct NAICS code, together with the sector it belongs to
naics_sector_columns = ['NAICS_orig', 'NAICS_sector']
sector_map = comb_df[naics_sector_columns].drop_duplicates(subset='NAICS_orig')

In [53]:
# Attach graph node ids to the map: the NAICS node is the edge target and the
# sector node is the edge source
sector_map = sector_map.assign(
    target=sg_gnn.get_naics_index(sector_map['NAICS_orig']),
    source=sg_gnn.get_naics_sector_index(sector_map['NAICS_sector']),
)
# Edge table for the NAICS <-> sector links, tagged with its edge type
edges_naics_sector = sector_map[['source', 'target']].assign(type='naics_sector')

In [54]:
# Preview a few NAICS-sector edges; the fixed seed keeps the preview
# reproducible when the notebook is re-run from a fresh kernel
edges_naics_sector.sample(3, random_state=0)

Unnamed: 0_level_0,source,target,type
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5981,ns_31-33,n_339992,naics_sector
298269,ns_31-33,n_313112,naics_sector
5711,ns_62,n_623312,naics_sector


##### Get edges from businesses to their exact NAICS

In [55]:
# Loan -> NAICS edges: the loan id is the source, the node id of the loan's
# NAICS code is the target.  'dset' is carried along so edges can be filtered later.
naics_info_df = (
    comb_df[['LoanNr_ChkDgt', 'NAICS_orig', 'dset']]
    .rename(columns={'LoanNr_ChkDgt': 'source'})
)
naics_info_df = naics_info_df.assign(
    target=sg_gnn.get_naics_index(naics_info_df['NAICS_orig'])
)

edges_business_naics = naics_info_df[['source', 'target', 'dset']].assign(type='loan_naics')

In [56]:
# Preview the first three loan-to-NAICS edges
edges_business_naics.head(3)

Unnamed: 0_level_0,source,target,dset,type
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1000014003,n_451120,train,loan_naics
2,1000034009,n_621210,train,loan_naics
7,1000094005,n_811118,train,loan_naics


In [57]:
# Stack both edge tables under a fresh 0..n-1 index.  The NAICS-sector edges
# have no 'dset' column, so that column is missing (NaN) for those rows.
edge_tables = [edges_business_naics, edges_naics_sector]
edges_all = pd.concat(edge_tables, axis=0, ignore_index=True)

In [58]:
# Number of edges of each type in the combined edge table
edges_all['type'].value_counts()

type
loan_naics      688081
naics_sector      1311
Name: count, dtype: int64

In [59]:
# Sanity check: if no (source, target) pair is repeated, both shapes report
# the same number of rows
distinct_edge_pairs = edges_all[['source', 'target']].drop_duplicates()
print(distinct_edge_pairs.shape)
print(edges_all.shape)

(689392, 2)
(689392, 4)


In [60]:
# Persist the combined edge table
edges_out_path = Path(setup.temp_path) / '23_DATA_graph_edges.parquet'
edges_all.to_parquet(edges_out_path)

## Get Node Features
Use the scaled NN data.  For NAICS, just set an indicator for now.  

In [61]:
# Loan node features, indexed by loan id.  'dset' and 'NAICS_orig' are kept
# here for filtering/mapping and are dropped before building the graphs.
business_columns = ['LoanNr_ChkDgt', 'dset', 'NAICS_orig'] + num_feat
features_business = comb_df[business_columns].set_index('LoanNr_ChkDgt')
loan_features_path = Path(setup.temp_path) / '23_DATA_graph_node_features_loans.parquet'
features_business.to_parquet(loan_features_path)

In [62]:
# NAICS node features: a constant indicator only.  The original code is kept
# as a column so node ids can be mapped back to NAICS codes.
naics_codes = comb_df[['NAICS_orig']].drop_duplicates()
features_naics = (
    naics_codes
    .assign(feat=1, source=sg_gnn.get_naics_index(naics_codes['NAICS_orig']))
    .set_index('source')
)
print(features_naics.shape)
naics_features_path = Path(setup.temp_path) / '23_DATA_graph_node_features_naics.parquet'
features_naics.to_parquet(naics_features_path)

(1311, 2)


In [71]:
# NAICS sector node features: one row per sector, indexed by the sector node
# id, again with only a constant indicator
features_naics_sector = (
    sector_map[['source', 'NAICS_sector']]
    .drop_duplicates(subset='NAICS_sector')
    .assign(feat=1)
    .set_index('source')
)
print(features_naics_sector.shape)
sector_features_path = Path(setup.temp_path) / '23_DATA_graph_node_features_naics_sector.parquet'
features_naics_sector.to_parquet(sector_features_path)

(20, 2)


## Stellargraph - train and validation
Graph without test nodes (set-aside NAICS codes must not be included).

In [76]:
# Restrict edges and node data to the train and validation sets
# (the filtering itself is implemented in sg_gnn.limit_data)
naics_indicator_features = features_naics.drop(columns='NAICS_orig')
edges_train_val, features_business_train_val, features_naics_train_val = sg_gnn.limit_data(
    edges_all, features_business, naics_indicator_features, ['train', 'val']
)

In [77]:
# Number of edges of each type remaining after limiting to train and validation
edges_train_val['type'].value_counts()

type
loan_naics      551635
naics_sector      1240
Name: count, dtype: int64

In [78]:
# Report how many rows each table had before and after limiting to train+val
row_count_report = (
    ('business features', features_business, features_business_train_val),
    ('naics features', features_naics, features_naics_train_val),
    ('edges', edges_all, edges_train_val),
)
print('Train+val graph data')
for label, full_table, limited_table in row_count_report:
    print(f'{label}, start rows {full_table.shape[0]}, end {limited_table.shape[0]}')

Train+val graph data
business features, start rows 688081, end 551635
naics features, start rows 1311, end 1240
edges, start rows 689392, end 552875


In [79]:
# Node feature tables keyed by node type; helper columns that are not model
# features are dropped first
train_val_nodes = {
    'LoanNr_ChkDgt': features_business_train_val.drop(columns='NAICS_orig'),
    'NAICS': features_naics_train_val,
    'NAICS_sector': features_naics_sector.drop(columns='NAICS_sector'),
}
sba_graph_train_val = StellarGraph(
    train_val_nodes,
    edges_train_val,
    source_column="source",
    target_column="target",
    edge_type_column="type",
)

In [80]:
# Print the node and edge type summary of the train+validation graph
print(sba_graph_train_val.info())

StellarGraph: Undirected multigraph
 Nodes: 552895, Edges: 552875

 Node types:
  LoanNr_ChkDgt: [551635]
    Features: float32 vector, length 10
    Edge types: LoanNr_ChkDgt-loan_naics->NAICS
  NAICS: [1240]
    Features: float32 vector, length 1
    Edge types: NAICS-loan_naics->LoanNr_ChkDgt, NAICS-naics_sector->NAICS_sector
  NAICS_sector: [20]
    Features: float32 vector, length 1
    Edge types: NAICS_sector-naics_sector->NAICS

 Edge types:
    LoanNr_ChkDgt-loan_naics->NAICS: [551635]
        Weights: all 1 (default)
        Features: none
    NAICS-naics_sector->NAICS_sector: [1240]
        Weights: all 1 (default)
        Features: none


In [81]:
# Pickle the train+validation StellarGraph object
train_val_graph_path = Path(setup.temp_path) / '23_DATA_stellargraph_train_val.pkl'
with train_val_graph_path.open('wb') as fout:
    pickle.dump(sba_graph_train_val, fout)

## Stellargraph - all nodes

In [82]:
# Graph over all loans regardless of 'dset', with every NAICS and sector node.
# Helper columns that are not model features are dropped from the node tables.
all_nodes = {
    'LoanNr_ChkDgt': features_business.drop(columns=['dset', 'NAICS_orig']),
    'NAICS': features_naics.drop(columns='NAICS_orig'),
    'NAICS_sector': features_naics_sector.drop(columns='NAICS_sector'),
}
# Drop the 'dset' column from the edge table, then remove duplicate rows
all_edges = edges_all.drop(columns=['dset']).drop_duplicates()
sba_graph_all = StellarGraph(
    all_nodes,
    all_edges,
    source_column="source",
    target_column="target",
    edge_type_column="type",
)

In [83]:
# Print the node and edge type summary of the graph containing all nodes
print(sba_graph_all.info())

StellarGraph: Undirected multigraph
 Nodes: 689412, Edges: 689392

 Node types:
  LoanNr_ChkDgt: [688081]
    Features: float32 vector, length 10
    Edge types: LoanNr_ChkDgt-loan_naics->NAICS
  NAICS: [1311]
    Features: float32 vector, length 1
    Edge types: NAICS-loan_naics->LoanNr_ChkDgt, NAICS-naics_sector->NAICS_sector
  NAICS_sector: [20]
    Features: float32 vector, length 1
    Edge types: NAICS_sector-naics_sector->NAICS

 Edge types:
    LoanNr_ChkDgt-loan_naics->NAICS: [688081]
        Weights: all 1 (default)
        Features: none
    NAICS-naics_sector->NAICS_sector: [1311]
        Weights: all 1 (default)
        Features: none


In [84]:
# Pickle the all-nodes StellarGraph object
all_graph_path = Path(setup.temp_path) / '23_DATA_stellargraph_all.pkl'
with all_graph_path.open('wb') as fout:
    pickle.dump(sba_graph_all, fout)