In [1]:
# Modify the file 00_setup.py to define input/output file paths on your system
# The information in 00_setup.py will be used across notebooks
import importlib.util
import sys
from importlib.machinery import SourceFileLoader

# Loader.load_module() is deprecated, so the module is built from a spec and
# executed explicitly.  It is still registered in sys.modules under the name
# "setup", which is what load_module() did.
_setup_loader = SourceFileLoader("setup", "./00_setup.py")
_setup_spec = importlib.util.spec_from_loader("setup", _setup_loader)
setup = importlib.util.module_from_spec(_setup_spec)
sys.modules["setup"] = setup
_setup_loader.exec_module(setup)

# 30: GNN Data
New GNN graphs, with the "label trick" (using the label as a feature on some nodes)

The "train" set is split into rows that keep their label information and rows whose label feature is masked (generally set to 0).  The masked rows become the new training dataset.

An equal fraction of the train and validation rows have their label feature set to 0; these rows become the new train/validation data. The remaining "fix" rows stay in the graph with their labels but are not used in training.

For the test phase, actual label features are used for train and validation but the test label features are set to 0 

*This script takes about 1 hour on my MacBook Air*

In [2]:
import pandas as pd
import numpy as np

In [3]:
from pathlib import Path
import importlib, pickle
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [4]:
# Imputer object for easy dataset conversion to GNN friendly format
from sba_gnn.sba_gnn import sg_imputer 
from sba_gnn.sba_gnn.sg_imputer import GNNImputer

In [5]:
import itertools

In [6]:
from stellargraph import StellarGraph

2024-01-21 13:31:33.667649: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2024-01-21 13:31:33.667673: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-01-21 13:31:33.667681: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-01-21 13:31:33.667715: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-01-21 13:31:33.667731: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


## Input Data

In [7]:
# Read the combined, scaled dataset from the temp directory configured in 00_setup.py
comb_df = pd.read_parquet(Path(setup.temp_path) / '10_DATA_combined_scaled_all.parquet')

In [8]:
# Partition the combined data by its 'dset' split label
train_df = comb_df[comb_df['dset'].eq('train')]
test_df = comb_df[comb_df['dset'].eq('test')]
val_df = comb_df[comb_df['dset'].eq('val')]

In [9]:
# Load the pickled feature list written by an earlier step.
# NOTE: pickle should only be used on files this project produced itself.
features_path = Path(setup.temp_path) / '10_DATA_features.pkl'
imputed_features = pickle.loads(features_path.read_bytes())

In [10]:
# Node features are every imputed feature except 'NAICS'
num_feat = [feat for feat in imputed_features if feat != 'NAICS']
print(num_feat)

['NoEmp', 'CreateJob', 'LowDoc', 'DisbursementGross', 'new_business', 'urban_flag', 'franchise_flag', 'missingindicator_LowDoc', 'missingindicator_new_business', 'missingindicator_urban_flag']


## Label Data - Train/Validation
Process new label data, zeroing out the new "train" and creating a "fix" slice to retain labels.
This new label data is used as node features. The original label is used for evaluation metrics. 

In [11]:
# Fraction of all rows that belong to the test split
n_test_rows = int(comb_df['dset'].eq('test').sum())
split_ratio = n_test_rows / len(comb_df)
print(split_ratio)

0.1982993281314264


In [12]:
# Split train and validation: a split_ratio fraction of each becomes the new
# train / validation set, and the remaining IDs form the "fix" rows
train_index, train_fix_index = train_test_split(
    train_df['LoanNr_ChkDgt'], train_size=split_ratio, random_state=343)
val_index, val_fix_index = train_test_split(
    val_df['LoanNr_ChkDgt'], train_size=split_ratio, random_state=345)

In [13]:
# Collect the held-back loan IDs from both splits and tag them as 'fix'
fix_index_df = (
    pd.concat([train_fix_index, val_fix_index])
    .to_frame()
    .assign(dset='fix')
)

##### New label data, with new slices (split "train")

In [14]:
# Label table for the train and validation rows only
label_cols = ['dset', 'dset_naics_holdout', 'LoanNr_ChkDgt', 'target']
in_train_val = comb_df['dset'].isin(['train', 'val'])
train_val_label_df = comb_df[in_train_val][label_cols]

In [15]:
# Keep the original split assignment under 'dset_old'.  The result is rebound
# instead of using inplace=True, so the frame is not mutated in place.
train_val_label_df = train_val_label_df.rename(columns={'dset': 'dset_old'})

In [16]:
# Attach the 'fix' marker to the held-back rows.  validate='many_to_one' makes
# the merge raise if a loan ID appears more than once in fix_index_df, which
# would otherwise silently duplicate rows in the label table.
train_val_label_df = train_val_label_df.merge(
    fix_index_df, how='left', on=['LoanNr_ChkDgt'], validate='many_to_one')
# Rows without a 'fix' marker keep their original split name
train_val_label_df['dset'] = train_val_label_df['dset'].fillna(train_val_label_df['dset_old'])

In [17]:
# Count rows for each (original split, new split) pair to check the re-split
train_val_label_df[['dset_old', 'dset']].value_counts()

dset_old  dset 
train     fix      341199
val       fix      101048
train     train     84395
val       val       24993
Name: count, dtype: int64

##### Target features - set to 0 for the new train and validation datasets.  

In [18]:
# Label feature: keep the true target only for 'fix' rows, zero it everywhere else
not_fix = train_val_label_df['dset'].ne('fix')
train_val_label_df['target_remap'] = train_val_label_df['target'].mask(not_fix, 0)

In [19]:
# Summarize the remapped label feature per split; only 'fix' rows keep non-zero values
train_val_label_df.groupby('dset')['target_remap'].agg(['mean', 'std', 'min', 'max'])

Unnamed: 0_level_0,mean,std,min,max
dset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
fix,0.203884,0.402884,0,1
train,0.0,0.0,0,0
val,0.0,0.0,0,0


In [20]:
# Save the train/validation label table to the temp directory
label_train_val_path = Path(setup.temp_path) / '30_DATA_label_info_train_val.parquet'
train_val_label_df.to_parquet(label_train_val_path)

## Label Data - Full data (For Testing)
The all-nodes dataset is used for testing.  It keeps the original labels as features for the train and validation rows, while the label feature of the test rows is set to 0.

In [21]:
# Label table for every row (train, validation and test).
# The columns are selected first and copied afterwards, so only these four
# columns are duplicated rather than the entire combined frame.
all_label_df = comb_df[['dset', 'dset_naics_holdout', 'LoanNr_ChkDgt', 'target']].copy()

In [22]:
# Label feature: zero out the target for test rows, keep it for all other rows
is_test = all_label_df['dset'].eq('test')
all_label_df['target_remap'] = all_label_df['target'].mask(is_test, 0)

In [23]:
# Summarize the remapped label feature per split; test rows were set to 0 above
all_label_df.groupby('dset')['target_remap'].agg(['mean', 'std', 'min', 'max'])

Unnamed: 0_level_0,mean,std,min,max
dset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
test,0.0,0.0,0,0
train,0.203074,0.402287,0,1
val,0.205505,0.404071,0,1


In [24]:
# Save the all-rows label table to the temp directory
label_all_path = Path(setup.temp_path) / '30_DATA_label_info_all.parquet'
all_label_df.to_parquet(label_all_path)

In [25]:
# Preview the first rows of the all-rows label table
all_label_df.head()

Unnamed: 0_level_0,dset,dset_naics_holdout,LoanNr_ChkDgt,target,target_remap
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,train,0,1000014003,0,0
2,train,0,1000034009,0,0
7,train,0,1000094005,0,0
8,train,0,1000104006,0,0
10,train,0,1000134004,0,0


## Node features

##### Train/validation

In [26]:
# Node features for train/validation rows: numeric features plus the remapped
# label feature, indexed by loan ID
features_data_train_val = pd.merge(
    comb_df[comb_df['dset'].isin(['train', 'val'])][['LoanNr_ChkDgt', 'dset'] + num_feat],
    train_val_label_df[['LoanNr_ChkDgt', 'target_remap']],
    on='LoanNr_ChkDgt',
).set_index('LoanNr_ChkDgt')

In [27]:
# Preview the first two rows of the train/validation node feature table
features_data_train_val.head(2)

Unnamed: 0_level_0,dset,NoEmp,CreateJob,LowDoc,DisbursementGross,new_business,urban_flag,franchise_flag,missingindicator_LowDoc,missingindicator_new_business,missingindicator_urban_flag,target_remap
LoanNr_ChkDgt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1000014003,train,0.029029,-1.0,1.0,-0.228228,1.0,1.0,-1.0,-1.0,-1.0,1.0,0
1000034009,train,0.394394,-1.0,-1.0,0.610611,-1.0,1.0,-1.0,-1.0,-1.0,1.0,0


##### All data

In [28]:
# Node features for all rows: numeric features plus the remapped label
# feature, indexed by loan ID
features_data_all = pd.merge(
    comb_df[['LoanNr_ChkDgt', 'dset'] + num_feat],
    all_label_df[['LoanNr_ChkDgt', 'target_remap']],
    on='LoanNr_ChkDgt',
).set_index('LoanNr_ChkDgt')

In [29]:
# Preview the first two rows of the all-rows node feature table
features_data_all.head(2)

Unnamed: 0_level_0,dset,NoEmp,CreateJob,LowDoc,DisbursementGross,new_business,urban_flag,franchise_flag,missingindicator_LowDoc,missingindicator_new_business,missingindicator_urban_flag,target_remap
LoanNr_ChkDgt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1000014003,train,0.029029,-1.0,1.0,-0.228228,1.0,1.0,-1.0,-1.0,-1.0,1.0,0
1000034009,train,0.394394,-1.0,-1.0,0.610611,-1.0,1.0,-1.0,-1.0,-1.0,1.0,0


## Edges (reused from the earlier `20_DATA_edges_*` files)

In [30]:
# Load the edge lists for the all-nodes graph and for the train/validation graph
temp_dir = Path(setup.temp_path)
edges_all = pd.read_parquet(temp_dir / '20_DATA_edges_naics_all.parquet')
edges_train_val = pd.read_parquet(temp_dir / '20_DATA_edges_naics_train_val.parquet')

## Create Graphs

##### All nodes

In [31]:
# Build the all-nodes graph; the 'dset' column is dropped so that only the
# remaining columns are passed as node features
sba_graph_all = StellarGraph(
    {'LoanNr_ChkDgt': features_data_all.drop(columns='dset')},
    edges_all,
    source_column="source",
    target_column="target",
    edge_type_column="type",
)

In [32]:
# Print the summary text returned by info() for the all-nodes graph
print(sba_graph_all.info())

StellarGraph: Undirected multigraph
 Nodes: 688081, Edges: 66553913

 Node types:
  LoanNr_ChkDgt: [688081]
    Features: float32 vector, length 11
    Edge types: LoanNr_ChkDgt-naics->LoanNr_ChkDgt, LoanNr_ChkDgt-sector->LoanNr_ChkDgt

 Edge types:
    LoanNr_ChkDgt-sector->LoanNr_ChkDgt: [34354688]
        Weights: all 1 (default)
        Features: none
    LoanNr_ChkDgt-naics->LoanNr_ChkDgt: [32199225]
        Weights: all 1 (default)
        Features: none


In [33]:
# Persist the all-nodes StellarGraph object as a pickle file
graph_all_path = Path(setup.temp_path) / '30_DATA_stellargraph_all.pkl'
with graph_all_path.open('wb') as fout:
    pickle.dump(sba_graph_all, fout)

##### Train and validation

In [35]:
# Build the train/validation graph; the 'dset' column is dropped so that only
# the remaining columns are passed as node features
sba_graph_train_val = StellarGraph(
    {'LoanNr_ChkDgt': features_data_train_val.drop(columns='dset')},
    edges_train_val,
    source_column="source",
    target_column="target",
    edge_type_column="type",
)

In [36]:
# Print the summary text returned by info() for the train/validation graph
print(sba_graph_train_val.info())

StellarGraph: Undirected multigraph
 Nodes: 551635, Edges: 53101575

 Node types:
  LoanNr_ChkDgt: [551635]
    Features: float32 vector, length 11
    Edge types: LoanNr_ChkDgt-naics->LoanNr_ChkDgt, LoanNr_ChkDgt-sector->LoanNr_ChkDgt

 Edge types:
    LoanNr_ChkDgt-sector->LoanNr_ChkDgt: [27532592]
        Weights: all 1 (default)
        Features: none
    LoanNr_ChkDgt-naics->LoanNr_ChkDgt: [25568983]
        Weights: all 1 (default)
        Features: none


In [37]:
# Persist the train/validation StellarGraph object as a pickle file
graph_train_val_path = Path(setup.temp_path) / '30_DATA_stellargraph_train_val.pkl'
with graph_train_val_path.open('wb') as fout:
    pickle.dump(sba_graph_train_val, fout)