In [1]:
# Modify the file 00_setup.py to define input/output file paths on your system
# The information in 00_setup.py will be used across notebooks
import sys
from importlib.machinery import SourceFileLoader
from importlib.util import module_from_spec, spec_from_loader

# The file name starts with digits, so it cannot be imported with a plain
# `import` statement; it is loaded from its path instead.
# Loader.load_module() is deprecated, so the module is built from a spec and
# run with exec_module(), following the importlib documentation recipe.
_setup_loader = SourceFileLoader("setup", "./00_setup.py")
_setup_spec = spec_from_loader("setup", _setup_loader)
setup = module_from_spec(_setup_spec)
sys.modules["setup"] = setup  # load_module() registered the module here as well
_setup_loader.exec_module(setup)

# GNN Training

In [2]:
import pandas as pd
import numpy as np

In [3]:
from pathlib import Path
import importlib
import pickle

In [4]:
import stellargraph as sg
#from stellargraph.mapper import GraphSAGENodeGenerator
from stellargraph.layer import HinSAGE
from stellargraph.mapper import HinSAGENodeGenerator
from stellargraph.layer import HinSAGE, link_regression

from tensorflow.keras import layers, optimizers, losses, metrics, Model
from sklearn import preprocessing, feature_extraction, model_selection

import tensorflow.keras.metrics as km
from keras import initializers
import tensorflow as tf

2023-12-05 21:41:31.074893: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2023-12-05 21:41:31.074931: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2023-12-05 21:41:31.074945: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2023-12-05 21:41:31.075464: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-12-05 21:41:31.075655: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [5]:
from stellargraph import StellarGraph

In [6]:
import matplotlib.pyplot as plt
%matplotlib inline

## Import Data
Import the StellarGraph datasets used for training and parameter tuning.

##### Heterogeneous graphs

In [7]:
# Validation plus training clients
#with open(Path(setup.temp_path).joinpath('10_DATA_stellargraph_val.pkl'), 'rb') as fin:
#      sba_graph_val = pickle.load(fin)

In [8]:
# Training clients only.
# Deserialize the pickled training graph stored under setup.temp_path.
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this project's own notebooks.
train_graph_path = Path(setup.temp_path) / '10_DATA_stellargraph_train.pkl'
with train_graph_path.open('rb') as graph_file:
    sba_graph_train = pickle.load(graph_file)

In [11]:
# Print the text summary of the training graph (node/edge types and counts).
print(sba_graph_train.info())

StellarGraph: Undirected multigraph
 Nodes: 448570, Edges: 448550

 Node types:
  LoanNr_ChkDgt: [447252]
    Features: float32 vector, length 10
    Edge types: LoanNr_ChkDgt-loan_naics->NAICS
  NAICS: [1298]
    Features: float32 vector, length 9
    Edge types: NAICS-loan_naics->LoanNr_ChkDgt, NAICS-naics_sector->NAICS_sector
  NAICS_sector: [20]
    Features: float32 vector, length 9
    Edge types: NAICS_sector-naics_sector->NAICS

 Edge types:
    LoanNr_ChkDgt-loan_naics->NAICS: [447252]
        Weights: all 1 (default)
        Features: none
    NAICS-naics_sector->NAICS_sector: [1298]
        Weights: all 1 (default)
        Features: none


##### Labels

In [12]:
# Read the combined scaled dataset and keep only the columns needed for labels:
# the dataset split flag, the loan identifier and the target.
labels_path = Path(setup.temp_path) / '10_DATA_combined_scaled_all.parquet'
label_columns = ['dset', 'LoanNr_ChkDgt', 'target']
label_df = pd.read_parquet(labels_path)[label_columns]

In [13]:
# Target series for the rows whose dset flag is 'train', indexed by loan number.
is_train_row = label_df['dset'] == 'train'
train_ind = label_df.loc[is_train_row].set_index('LoanNr_ChkDgt')['target']

In [14]:
# Display the training target series (index: LoanNr_ChkDgt, values: target).
train_ind

LoanNr_ChkDgt
5282874009    0
2568556001    0
2687465005    1
9174473001    0
3598634004    0
             ..
1933115007    0
6285954009    0
8838754008    0
3058006005    0
5064344005    0
Name: target, Length: 447252, dtype: int64

##### Set random seeds for reproducibility

In [15]:
# Seed the global NumPy and TensorFlow random number generators with the same value.
seed_value = 16
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

## Train

##### Split the train dataset

In [16]:
# Stratified split of the training loan IDs, keeping the class balance of
# `train_ind` in both parts:
#   train_fix  - 10% of the IDs (train_size=0.1), used to fit the model
#   train_test - the remaining IDs (test_size=None takes the complement),
#                used as the held-out set during training
# No random_state is passed, so the split draws from NumPy's global RNG and is
# only reproducible when the seed cell has been run immediately beforehand.
loan_ids = train_ind.index
train_fix, train_test = model_selection.train_test_split(
    loan_ids,
    stratify=train_ind,
    train_size=0.1,
    test_size=None,
)

##### Node Predictor Generator

In [18]:
# Node generator over the training graph; loan nodes are the head nodes.
# `neighbor_samples` has three entries, matching the three HinSAGE layer sizes
# configured further down in this notebook.
neighbor_samples = [15, 10, 10]
generator = HinSAGENodeGenerator(
    sba_graph_train,
    batch_size=20,
    num_samples=neighbor_samples,
    head_node_type='LoanNr_ChkDgt',
    seed=2454,
)

In [19]:
# Shuffled batch sequence over the fitting subset, paired with its targets.
train_targets = train_ind.loc[train_fix]
train_gen = generator.flow(train_fix, train_targets, shuffle=True)

In [20]:
# Three-layer HinSAGE encoder (8 units per layer) with bias terms and
# a dropout rate of 0.7, wired to the node generator above.
graphsage_model = HinSAGE(
    generator=generator,
    layer_sizes=[8, 8, 8],
    dropout=0.7,
    bias=True,
)

In [24]:
# Re-seed TensorFlow's global RNG (same value as the earlier seed cell)
# right before the model tensors are created.
tf.random.set_seed(16)

In [25]:
# Expose the HinSAGE input/output tensors and attach a single sigmoid unit
# as the binary prediction head; its kernel initializer is seeded.
x_inp, x_out = graphsage_model.in_out_tensors()
output_layer = layers.Dense(
    units=1,
    activation="sigmoid",
    kernel_initializer=initializers.glorot_uniform(seed=676),
)
prediction = output_layer(x_out)

In [26]:
# Assemble the end-to-end Keras model and compile it for binary classification.
model = Model(inputs=x_inp, outputs=prediction)

# Optimizer and metrics are created in the same order as they would be when
# passed inline, so automatically generated metric names are unaffected.
adam = tf.keras.optimizers.legacy.Adam(learning_rate=0.003)
tracked_metrics = ["acc", km.AUC(curve='PR'), km.Precision(), km.Recall()]
model.compile(
    optimizer=adam,
    loss=losses.binary_crossentropy,
    metrics=tracked_metrics,
)

In [27]:
# Unshuffled batch sequence over the held-out part of the training data.
held_out_targets = train_ind.loc[train_test]
train_test_gen = generator.flow(train_test, held_out_targets, shuffle=False)

In [None]:
# Fit for 20 epochs, validating on the held-out split after each epoch.
# verbose=2 logs one line per epoch.
fit_options = dict(
    epochs=20,
    validation_data=train_test_gen,
    verbose=2,
    shuffle=False,
)
history = model.fit(train_gen, **fit_options)

Epoch 1/20


2023-12-05 21:42:53.007962: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-12-05 22:02:07.669055: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


In [None]:
# Persist the fitted model as a pickle under setup.temp_path.
# NOTE(review): this relies on the Keras model (with its StellarGraph layers)
# being picklable in the installed versions - confirm it round-trips on load.
model_path = Path(setup.temp_path) / '11_DATA_model.pkl'
with model_path.open('wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Persist the node generator as a pickle under setup.temp_path.
generator_path = Path(setup.temp_path) / '11_DATA_generator.pkl'
with generator_path.open('wb') as generator_file:
    pickle.dump(generator, generator_file)

In [None]:
# Plot the per-epoch curves recorded in `history` by model.fit.
sg.utils.plot_history(history)

In [None]:
# Evaluate the fitted model on the held-out split (train_test_gen).
train_test_metrics = model.evaluate(train_test_gen)

In [None]:
# Evaluate the fitted model on the subset it was trained on (train_gen).
train_metrics = model.evaluate(train_gen)

In [None]:
# Build a two-row table of evaluation results: one row for the training subset
# and one for the held-out split, with 'train' / 'test' as the outer index level.
# Columns follow model.metrics_names.
#
# Fix: the held-out results are stored in `train_test_metrics`; this cell used to
# reference an undefined `test_metrics` and raised NameError.
#
# NOTE: the name `metrics` shadows the `metrics` module imported from
# tensorflow.keras; it is kept so any later use of this variable still works.
metrics = pd.concat(
    [
        pd.DataFrame([dict(zip(model.metrics_names, values))])
        for values in (train_metrics, train_test_metrics)
    ],
    keys=['train', 'test'],
)

# NOTE: the file is written as CSV although its name ends in '.pkl'. The name is
# left unchanged so that anything reading this path keeps working.
metrics.to_csv(Path(setup.temp_path).joinpath('11_REPORT_train_metrics.pkl'), index=True)
metrics