In [33]:
# Modify the file 00_setup.py to define input/output file paths on your system
# The information in 00_setup.py will be used across notebooks
# The file name starts with a digit, so it cannot be brought in with a plain
# `import` statement; it is loaded by path and bound to the name `setup`.
# NOTE(review): Loader.load_module() is documented as deprecated in importlib;
# consider exec_module() - confirm against the Python version in use.
from importlib.machinery import SourceFileLoader
setup = SourceFileLoader("setup", "./00_setup.py").load_module()

# 20: GNN Data
Data for a GNN view of NAICS, where NAICS and NAICS sectors are edges connecting nodes

*This script takes about 5 minutes on my MacBook Air*

In [7]:
import pandas as pd
import numpy as np

In [8]:
from pathlib import Path
import importlib, pickle
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder

In [9]:
# Imputer object for easy dataset conversion to GNN friendly format
from sba_gnn.sba_gnn import sg_imputer 
from sba_gnn.sba_gnn.sg_imputer import GNNImputer

In [106]:
import itertools

## Input Data

In [10]:
comb_df = pd.read_parquet(Path(setup.temp_path).joinpath('10_DATA_combined_scaled_all.parquet'))

## Create Edge Maps

There are 2 edge types:
  * NAICS - connects nodes with an exact NAICS code match
  * naics_sector - connects nodes in the same NAICS sector

The number of possible edges is very large.  Edges will be sampled again when the graph is used, but even loading the full edge list would be too much for home hardware (and likely not really worth it).  So I will sample edges for each node, for both edge types.

I want to get at least a set number of edges per node (setup.gnn_graph_sample_n), and so I sample accordingly. There may be more than this number of edges per node.  For higher-volume NAICS, the edge count per node will be closer to setup.gnn_graph_sample_n.

The GNN will also do its own sampling, so the GNN's per-node sample size needs to be less than setup.gnn_graph_sample_n.

##### Function to create samples for 1 code
Returns an edge list containing at least a set number of samples per node.  The list may contain more edges than requested, but will be smaller than the full edge list.  For a high-volume code, the number of edges per node is ~setup.gnn_graph_sample_n, and the total number of rows is close to (# businesses with the code) * setup.gnn_graph_sample_n.

See also: https://stackoverflow.com/questions/48061508/numpy-slicing-all-except-one-array-entry

In [467]:
def get_sample_per_code_pd(ser, min_n = setup.gnn_graph_sample_n, random_state = None):
    """ Samples edges for each node using pandas operations.  Each node is paired
      with min_n other nodes drawn without replacement; when there are min_n or
      fewer nodes, every possible pair is returned instead.
      Inputs:
        ser:  Pandas series consisting of node indexes
        min_n: Minimum number of edges per node
        random_state: Optional integer seed (or numpy RandomState) used for the
          sampling.  None keeps the previous behavior (numpy's global state).
      Value:
        Pandas dataframe containing columns 'source' and 'target' for the edges,
          with duplicate edges removed.  The 'source' value is always less than
          'target' (node indexes are assumed to be unique).  For 0 or 1 nodes
          the frame is empty but still has both columns.
    """
    edge_cols = ['source', 'target']
    ser_len = len(ser)

    # Return no edges for isolated codes.  Keep the column names so callers
    # can still select 'source' / 'target' on the empty result.
    if ser_len <= 1:
        return pd.DataFrame(columns=edge_cols)

    ser = ser.copy().rename('source')

    # If we have a small list, return all pairs.  Sorting first makes
    # combinations() emit each pair as (smaller, larger).
    if ser_len <= min_n:
        pairs = itertools.combinations(ser.sort_values(), 2)
        return pd.DataFrame(pairs)\
           .set_axis(edge_cols, axis=1)

    # One shared generator for all draws; a bare integer passed to every
    # .sample() call would re-seed each time and repeat the same positions.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Otherwise get samples: for node i, draw min_n targets from all other nodes
    samples = pd.concat([pd.concat([ser.iloc[:i], ser.iloc[i+1:]]) \
                             .sample(n=min_n, random_state=rng) \
                             .reset_index(drop=True).rename('target') \
                         for i in range(ser_len)],
                        axis=0, keys=ser) \
        .reset_index(level=0) \
        .reset_index(drop=True)

    # Sort within each row so (a, b) and (b, a) become the same edge,
    # then remove duplicates
    data = samples.to_numpy()
    data.sort(axis=1)
    samples = pd.DataFrame(data, columns=edge_cols) \
        .drop_duplicates()

    return samples

In [468]:
def get_sample_per_code(ser, min_n = setup.gnn_graph_sample_n, random_state = None):
    """ Samples edges for each node using numpy arrays.  Each node is paired with
      min_n other nodes drawn without replacement; when there are min_n or
      fewer nodes, every possible pair is returned instead.
      Inputs:
        ser:  Pandas series consisting of node indexes
        min_n: Minimum number of edges per node
        random_state: Optional integer seed (or numpy RandomState) used for the
          sampling.  None keeps the previous behavior (numpy's global state).
      Value:
        Pandas dataframe containing columns 'source' and 'target' for the edges,
          with duplicate edges removed.  The 'source' value is always less than
          'target' (node indexes are assumed to be unique).  For 0 or 1 nodes
          the frame is empty but still has both columns.
    """
    edge_cols = ['source', 'target']
    ser_len = len(ser)

    # Return no edges for isolated codes.  Keep the column names so callers
    # can still select 'source' / 'target' on the empty result.
    if ser_len <= 1:
        return pd.DataFrame(columns=edge_cols)

    np_ary = ser.sort_values().to_numpy()

    # If we have a small list, return all pairs (sorted input means each
    # pair comes out as (smaller, larger))
    if ser_len <= min_n:
        pairs = itertools.combinations(np_ary, 2)
        return pd.DataFrame(pairs)\
           .set_axis(edge_cols, axis=1)

    # Pick the random source: numpy's global state unless a seed was given
    if random_state is None:
        rng = np.random
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Otherwise get samples.  Get the sources first (each node repeated min_n times)
    sources = np_ary.repeat(min_n).reshape((-1, 1))

    # Targets - sample from nodes other than the source.  Positions are drawn
    # among the other ser_len - 1 nodes and those at or after i are shifted
    # by one to skip node i, so no per-node copy of the array is needed.
    other_count = ser_len - 1
    target_pos = np.empty((ser_len, min_n), dtype=np.intp)
    for i in range(ser_len):
        picks = rng.choice(other_count, min_n, replace=False)
        picks[picks >= i] += 1
        target_pos[i] = picks
    targets = np_ary[target_pos.ravel()].reshape((-1, 1))

    # Combine sources and targets
    comb_data = np.concatenate((sources, targets), axis=1)

    # Sort within each row so (a, b) and (b, a) become the same edge,
    # then remove duplicates
    comb_data.sort(axis=1)
    samples = pd.DataFrame(comb_data, columns=edge_cols) \
        .drop_duplicates()

    return samples

In [385]:
# LoanNr_ChkDgt values for the rows whose NAICS_orig is '327215'
# (used below as a test input for the edge samplers)
id_ser = comb_df.loc[comb_df['NAICS_orig'] == '327215', 'LoanNr_ChkDgt'].copy()

In [458]:
s = get_sample_per_code(id_ser)
s

Unnamed: 0,source,target
0,1019465005,2101075010
1,1019465005,9780793003
2,1019465005,5311114004
3,1019465005,3873514009
4,1019465005,3166904009
...,...,...
47092,2238886003,9994873009
47094,7442553010,9994873009
47095,1857175000,9994873009
47096,1263524005,9994873009


In [460]:
ser_100 = pd.Series([i for i in range(100)])

In [462]:
s = get_sample_per_code(ser_100)
print(s.shape)

NameError: name 'ser_list' is not defined

In [None]:
s = get_sample_per_code(id_ser)
print(s.shape)

In [339]:
s = get_sample_per_code(id_ser)
s

Unnamed: 0,source,target
0,1019465005,6599364001
1,1019465005,1527665002
2,1019465005,2342714006
3,1019465005,5455884000
4,1019465005,2042265007
...,...,...
95,7418443001,7703064000
96,7418443001,9859713004
97,7418443001,8542683005
98,7418443001,9017114001


In [332]:
id_ser.shape

(471,)

In [261]:
471*100

47100

In [333]:
s[s['source'] >= s['target']]

Unnamed: 0,source,target


In [334]:
s

Unnamed: 0,source,target
0,1019465005,5594583008
1,1019465005,9085813005
2,1019465005,3125334008
3,1019465005,1579195005
4,1019465005,2238886003
...,...,...
47092,5126594006,9994873009
47093,4919124000,9994873009
47094,5692484002,9994873009
47097,3317654004,9994873009


In [258]:
s_cnt = s.groupby(['source'])['target'].agg('count')
s_cnt.describe()

count    371.0
mean     100.0
std        0.0
min      100.0
25%      100.0
50%      100.0
75%      100.0
max      100.0
Name: target, dtype: float64

In [202]:
s_cnt[s_cnt == 1]

source
9868093010    1
Name: target, dtype: int64

In [204]:
s[s['target'] == '9868093010']

Unnamed: 0,source,target
468,1019465005,9868093010
937,1021505003,9868093010
1405,1023475000,9868093010
1872,1046185010,9868093010
2338,1053014007,9868093010
...,...,...
110668,8984483001,9868093010
110673,9119503005,9868093010
110677,9262183008,9868093010
110680,9606403007,9868093010


In [128]:
get_sample_per_code(comb_df[comb_df['NAICS_orig'] == '926130']['LoanNr_ChkDgt'])

Unnamed: 0,source,target
0,1435695003,7818243010
1,1435695003,7828953004
2,7818243010,7828953004


In [439]:
large_ser = comb_df[comb_df['NAICS_orig'] == '722110']['LoanNr_ChkDgt']

In [443]:
import time

In [None]:
start = time.time()
sc_large = get_sample_per_code(large_ser)
end = time.time()
print(end - start)
print (sc_large.shape)

In [None]:
start = time.time()
sc_large = get_sample_per_code_pd(large_ser)
end = time.time()
print(end - start)
print (sc_large.shape)

In [381]:
sc_large.groupby('source')['target'].agg('count').describe()

count    28377.000000
mean        98.463544
std        414.754073
min          1.000000
25%         31.000000
50%         57.000000
75%         78.000000
max       6031.000000
Name: target, dtype: float64

In [382]:
sc_large.groupby('target')['source'].agg('count').describe()

count    28310.000000
mean        98.696574
std        466.100230
min          1.000000
25%         23.000000
50%         45.000000
75%         71.000000
max       6042.000000
Name: source, dtype: float64

In [167]:
sc_large[sc_large['target'] == '1134835004']

Unnamed: 0,target,source


In [None]:
9994873009

In [126]:
27941*100

2794100

In [22]:
vc = comb_df['NAICS_orig'].value_counts(sort=False)

In [24]:
vc.sort_values().head()

NAICS_orig
315240    1
333241    1
514190    1
925120    1
212210    1
Name: count, dtype: int64

In [130]:
vc[vc == 4950].head(3)

Series([], Name: count, dtype: int64)

In [122]:
vc.sort_values(ascending=False).head()

NAICS_orig
722110    27941
722211    19435
811111    14235
621210    13756
624410     9891
Name: count, dtype: int64

In [14]:
comb_df[comb_df['NAICS_orig'] == '327215'].shape

(471, 28)

In [15]:
comb_df[comb_df['NAICS_orig'] == '811111'].shape

(14235, 28)

In [18]:
comb_df[comb_df['NAICS_orig'] == '332321'].shape

(361, 28)

In [61]:
id_ser = comb_df[comb_df['NAICS_orig'] == '327215'][['NAICS_orig', 'LoanNr_ChkDgt']].copy() \
    ['LoanNr_ChkDgt'].rename('target') 
id_ser.index.name = 'source_index'

In [68]:
id_np = id_ser.to_numpy()

In [81]:
id_ser

source_index
2948      1019465005
3246      1021505003
3507      1023475000
6757      1046185010
7803      1053014007
             ...    
859005    9262183008
880861    9606403007
883096    9681803009
891063    9868093010
897124    9994873009
Name: target, Length: 471, dtype: object

In [39]:
sample_n = np.min([setup.gnn_graph_sample_n, len(id_ser)])
sample_n

100

In [93]:
samples = pd.concat([id_ser.sample(n=sample_n).reset_index(drop=True) \
                    for i in range(len(id_ser))],
                    axis=0, keys=id_ser.index) \
    .reset_index(level=0) \
    .merge(id_ser.reset_index().rename(columns={'target':'source'}), on='source_index') \
    .drop(columns=['source_index'])

In [89]:
samples = samples[samples['target'] != samples['source']]

In [94]:
samples

Unnamed: 0,target,source
0,2428324008,1019465005
1,8258893001,1019465005
2,2717376000,1019465005
3,2013635001,1019465005
4,8036383006,1019465005
...,...,...
47095,2859944008,9994873009
47096,1614015004,9994873009
47097,4621923005,9994873009
47098,1699245001,9994873009


In [84]:
samples.merge(id_ser.reset_index().rename(columns={'target':'source'}), on='source_index') 

Unnamed: 0,source_index,target,source
0,2948,4055203006,1019465005
1,2948,7839043008,1019465005
2,2948,2415264004,1019465005
3,2948,3698213008,1019465005
4,2948,4083014001,1019465005
...,...,...,...
47095,897124,4652564003,9994873009
47096,897124,6374404010,9994873009
47097,897124,7200323008,9994873009
47098,897124,9185203007,9994873009


In [46]:
samples.loc[2948]

index
2948    1019465005
2948    1019465005
2948    1019465005
2948    1019465005
2948    1019465005
           ...    
2948    1019465005
2948    1019465005
2948    1019465005
2948    1019465005
2948    1019465005
Name: target, Length: 101, dtype: object

In [None]:
# FIXME: this assignment was left without a right-hand side (a SyntaxError that
# stops Restart & Run All).  The intended value is unknown, so a placeholder is
# bound here; fill it in or delete the cell.
vc_ex = None

##### Same-NAICS edges
There are a very large number of possible relationships, so I may need to sample.

In [11]:
comb_df

Unnamed: 0_level_0,dset,LoanNr_ChkDgt,target,dset_naics_holdout,NAICS_orig,NAICS_sector,menc_NAICS,menc_NAICS_sector,NS___Accommodation and Food Services,NS___Construction,...,CreateJob,LowDoc,DisbursementGross,new_business,urban_flag,franchise_flag,missingindicator_LowDoc,missingindicator_new_business,missingindicator_urban_flag,NAICS
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,train,1000014003,0,0,451120,44-45,0.226337,0.225793,0.0,0.0,...,-1.0,1.0,-0.228228,1.0,1.0,-1.0,-1.0,-1.0,1.0,778.0
2,train,1000034009,0,0,621210,62,0.042125,0.100900,0.0,0.0,...,-1.0,-1.0,0.610611,-1.0,1.0,-1.0,-1.0,-1.0,1.0,1089.0
7,train,1000094005,0,0,811118,81,0.147149,0.202440,0.0,0.0,...,-1.0,1.0,-0.429429,1.0,1.0,-1.0,-1.0,-1.0,1.0,1167.0
8,train,1000104006,0,0,721310,72,0.050785,0.217493,1.0,0.0,...,-1.0,-1.0,0.644675,1.0,1.0,-1.0,-1.0,-1.0,1.0,1152.0
10,train,1000134004,0,0,811111,81,0.155980,0.202440,0.0,0.0,...,-1.0,1.0,-0.154154,1.0,1.0,-1.0,-1.0,-1.0,1.0,1164.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
897124,val,9994873009,0,0,327215,31-33,0.126623,0.155421,0.0,0.0,...,-1.0,1.0,-0.532533,-1.0,1.0,-1.0,-1.0,-1.0,1.0,356.0
897133,val,9995023003,0,0,422410,42,0.074783,0.189649,0.0,0.0,...,-1.0,-1.0,0.070070,-1.0,1.0,-1.0,-1.0,-1.0,1.0,642.0
897136,val,9995063004,0,0,322211,31-33,0.103229,0.155421,0.0,0.0,...,-1.0,1.0,0.029029,1.0,1.0,-1.0,-1.0,-1.0,1.0,265.0
897146,val,9995213001,1,0,235610,23,0.090024,0.245595,0.0,1.0,...,-1.0,-1.0,0.320320,1.0,1.0,-1.0,-1.0,-1.0,1.0,104.0


In [None]:
# Same-NAICS edges: pair every row with every row that shares its NAICS_orig
# (self-merge on the code).  Parentheses are used for the chain instead of
# backslash continuations; a stray character after a backslash previously
# made this cell a SyntaxError.
# NOTE(review): this materialises all same-code pairs and can be very large.
naics_edges = (
    comb_df[['LoanNr_ChkDgt', 'NAICS_orig']]
    .rename(columns={'LoanNr_ChkDgt':'source'})
    .merge(comb_df[['LoanNr_ChkDgt', 'NAICS_orig']]
           .rename(columns={'LoanNr_ChkDgt':'target'}),
           on='NAICS_orig', how='left')
    .drop(columns=['NAICS_orig'])
)
# Remove self loops (optional)
naics_edges = naics_edges[naics_edges['source'] != naics_edges['target']]

In [None]:
naics_edges.head()

In [None]:
print(f'NAICS edge list: {naics_edges.shape}')

In [325]:
sector_map = comb_df[['NAICS_orig', 'NAICS_sector']].drop_duplicates()

In [326]:
# Pair each NAICS code with the other codes of its sector: self-merge the
# code/sector map on NAICS_sector, then drop rows pairing a code with itself
naics_all_sectors = (
    sector_map
    .merge(sector_map.rename(columns={'NAICS_orig': 'NAICS_sim'}),
           on='NAICS_sector', how='left')
    .loc[lambda pairs: pairs['NAICS_orig'] != pairs['NAICS_sim']]
)

In [327]:
naics_all_sectors.sample(5)

Unnamed: 0,NAICS_orig,NAICS_sector,NAICS_sim
278114,334111,31-33,327331
272109,445299,44-45,441310
139121,332431,31-33,313111
272717,484220,48-49,485999
281184,314911,31-33,311520


##### Functions for creating index for NAICS

In [328]:
def get_naics_index(naics_seq):
    """ Builds the node index for NAICS codes by prepending 'n_'.
      Inputs:
        naics_seq:  NAICS code(s) as a string, or a Pandas series of strings
      Value:
        The input with 'n_' prepended (element-wise for a series)
    """
    node_prefix = 'n_'
    node_index = node_prefix + naics_seq
    return node_index

##### Get edges from businesses to their exact NAICS

In [329]:
# Business -> exact NAICS edges.  The business id becomes 'source' and the
# prefixed NAICS code (see get_naics_index) becomes 'target'
naics_info_df = (
    comb_df[['LoanNr_ChkDgt', 'NAICS_orig', 'dset']]
    .rename(columns={'LoanNr_ChkDgt': 'source'})
    .assign(target=lambda loans: get_naics_index(loans['NAICS_orig']))
)

# Edge table for this type: keep only the edge columns and tag the edge type
edges_business_naics = (
    naics_info_df[['source', 'target', 'dset']]
    .assign(type='loan_naics')
)

##### Get edges from businesses to related NAICS codes

In [330]:
naics_info_df.head(3)

Unnamed: 0_level_0,source,NAICS_orig,dset,target
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1000014003,451120,train,n_451120
2,1000034009,621210,train,n_621210
7,1000094005,811118,train,n_811118


In [331]:
# Edges from each business to the other NAICS codes in its sector: join each
# row's NAICS_orig to the same-sector code pairs held in naics_all_sectors
edges_businesses_naics_sim = naics_info_df.drop(columns='target', errors='ignore') \
    .merge(naics_all_sectors, how='inner', on='NAICS_orig')
# Target node id of the related code, built with the same helper as the exact-NAICS edges
edges_businesses_naics_sim['target'] = get_naics_index(edges_businesses_naics_sim['NAICS_sim'])
# Keep only the edge columns, then tag the edge type
edges_businesses_naics_sim = edges_businesses_naics_sim[['source', 'target', 'dset']]
edges_businesses_naics_sim['type'] = 'loan_sector_naics'
print(f'Sector NAICS edges: {edges_businesses_naics_sim.shape}')

Sector NAICS edges: (67912366, 4)


##### Sample these - See 00_setup.py
The data is large so sampling could be considered here.  This is controlled in 00_setup.py.

In [332]:
# Optional down-sampling of the sector edges; the switch and the per-source
# sample size both come from 00_setup.py.  Draws are made with replacement,
# so repeated edges are removed afterwards.
if setup.naics_sector_sample:
    edges_businesses_naics_sim = (
        edges_businesses_naics_sim
        .groupby('source')
        .sample(n=setup.naics_sector_sample_n, replace=True, random_state=2342)
        .drop_duplicates()
    )
    print(f'Sector NAICS edges post sample: {edges_businesses_naics_sim.shape}')

Sector NAICS edges post sample: (11413065, 4)


In [333]:
edges_all = pd.concat([edges_business_naics, edges_businesses_naics_sim], axis=0) \
    .reset_index(drop=True)

In [334]:
edges_all['type'].value_counts()

type
loan_sector_naics    11413065
loan_naics             688081
Name: count, dtype: int64

In [335]:
print(edges_all[['source', 'target']].drop_duplicates().shape)
print(edges_all.shape)

(12101146, 2)
(12101146, 4)


In [336]:
edges_all.to_parquet(Path(setup.temp_path).joinpath('10_DATA_graph_edges.parquet'))

## Get Node Features
This section creates general node features data, which may be filtered to create graphs.  Not all features may be used, and not all nodes may be in all graphs.

In [337]:
# Business node features - these are the original features post scaling.
# Include the target also - will be used to test the "label trick" in some future scripts
features_business = comb_df[['LoanNr_ChkDgt', 'dset'] + imputer.features_out + ['target']] \
    .set_index('LoanNr_ChkDgt')
features_business.to_parquet(Path(setup.temp_path).joinpath('10_DATA_graph_node_features_loans.parquet'))

In [338]:
# NAICS features - One hots, also the mean encodings
features_naics = comb_df[['NAICS_orig', 'menc_NAICS', 'menc_NAICS_sector'] + [c for c in comb_df.columns if c.startswith('NS__')]] \
    .drop_duplicates()
features_naics['source'] = get_naics_index(features_naics['NAICS'])
features_naics.set_index('source', inplace=True) 
features_naics.drop(columns='NAICS_orig', inplace=True) 
print(features_naics.shape)
features_naics.to_parquet(Path(setup.temp_path).joinpath('10_DATA_graph_node_features_naics.parquet'))

KeyError: 'NAICS'

## Label Data
Split the training data set - used for some "label trick" tests later

In [None]:
label_df = comb_df[['dset', 'dset_naics_holdout', 'LoanNr_ChkDgt', 'target']].set_index('LoanNr_ChkDgt')

In [None]:
comb_df.columns

In [None]:
train_labels = label_df[label_df['dset'] == 'train'].drop(columns=['dset'])
val_labels = label_df[label_df['dset'] == 'val'].drop(columns=['dset'])
print(f'graph labels shape {label_df.shape}')
print(f'train labels shape {train_labels.shape}')
print(f'validation labels shape {val_labels.shape}')

In [None]:
# Split train cases for testing
train_fix, train_train = model_selection.train_test_split(
    train_labels.index, train_size=0.7, stratify=train_labels, random_state=23432
)

In [None]:
print(f'fixed part of train set {len(train_fix)}')
print(f'train part of train set {len(train_train)}')

In [None]:
# Flag the rows that belong to the "fixed" part of the train split:
# 1.0 when the index value is in train_fix, 0.0 otherwise.
# Built in one assignment: `label_df.train_fix = None` only sets a Python
# attribute (not a column), and fillna(inplace=True) on a selected column is
# not guaranteed to write back to the frame.  Float dtype is kept so the
# saved column matches what the previous NaN-then-fill approach produced.
label_df['train_fix'] = label_df.index.isin(train_fix).astype(float)

In [None]:
label_df['train_fix'].value_counts()

In [None]:
len(set(list(train_fix)))

In [None]:
label_df.to_parquet(Path(setup.temp_path).joinpath('10_DATA_label_info.parquet'))

In [None]:
label_df.loc[train_fix].shape

In [None]:
label_df.head()