In [4]:
# Modify the file 00_setup.py to define input/output file paths on your system
# The information in 00_setup.py will be used across notebooks
import importlib.util
import sys
from importlib.machinery import SourceFileLoader

# The file name starts with a digit, so it cannot be imported with a plain
# `import` statement; load it by path instead.  Loader.load_module() is
# deprecated, so build the module from a spec and execute it explicitly.
_setup_loader = SourceFileLoader("setup", "./00_setup.py")
_setup_spec = importlib.util.spec_from_loader("setup", _setup_loader)
setup = importlib.util.module_from_spec(_setup_spec)
# Register under the same name load_module() used, before executing the file
sys.modules["setup"] = setup
_setup_loader.exec_module(setup)

# 50: Process GNN Encodings for XGBoost Usage
Mean encode the node embeddings produced by previous scripts, fitting the encoders on the training data only.

*This script takes about 5 minutes on my MacBook Air*

In [5]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import pickle

In [118]:
from pathlib import Path, PurePath

In [26]:
from category_encoders import TargetEncoder

In [3]:
from sba_gnn.sba_gnn import sg_plot 

## Import data

In [65]:
# Training split of the base data (supplies NAICS features and the target)
train_df = pd.read_parquet(Path(setup.temp_path) / '01_DATA_combined_base_train.parquet')
train_df.head(3)[['LoanNr_ChkDgt', 'NAICS', 'target']]

Unnamed: 0_level_0,LoanNr_ChkDgt,NAICS,target
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
538061,5282874009,422210,0
220915,2568556001,441222,0
237886,2687465005,621310,1


In [67]:
# Test split of the base data
test_df = pd.read_parquet(Path(setup.temp_path) / '01_DATA_combined_base_test.parquet')

In [68]:
# Validation split of the base data
val_df = pd.read_parquet(Path(setup.temp_path) / '01_DATA_combined_base_val.parquet')

##### Create a combined dataset for all rows (needed features only)

In [109]:
# Stack the three splits, tag every row with the split it came from ('dset'),
# keep only the columns needed below, and index by loan number
comb_df = (
    pd.concat(
        [train_df, test_df, val_df],
        axis=0,
        keys=['train', 'test', 'val'],
    )[['LoanNr_ChkDgt', 'target', 'NAICS', 'NAICS_sector']]
    # the concat keys form the unnamed outer index level, which resets to 'level_0'
    .reset_index(level=0)
    .rename(columns={'level_0': 'dset'})
    .set_index('LoanNr_ChkDgt')
)

In [110]:
# Preview the first rows of the combined frame
comb_df.head(3)

Unnamed: 0_level_0,dset,target,NAICS,NAICS_sector
LoanNr_ChkDgt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5282874009,train,0,422210,42
2568556001,train,0,441222,44-45
2687465005,train,1,621310,62


## Folder for saving transformed embeddings

In [122]:
# Sub-folder of the temp directory that receives the mean-encoded embeddings
outpath = Path(setup.temp_path) / '50_DATA_embeddings_menc'

In [124]:
# Create the output folder; exist_ok=True makes re-running this cell harmless
outpath.mkdir(exist_ok=True)

## Functions to mean encode embeddings

In [89]:
# Fit one target (mean) encoder of the NAICS columns against a single embedding column
def enc_create_fit(data, col_name):
    """Fit a TargetEncoder on 'NAICS' / 'NAICS_sector' with one column as target.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to fit on.  Must contain 'NAICS', 'NAICS_sector' and `col_name`.
    col_name : str
        Name of the column in `data` used as the encoding target.

    Returns
    -------
    TargetEncoder
        The fitted encoder (created with min_samples_leaf = 25).
    """
    naics_cols = ['NAICS', 'NAICS_sector']
    naics_encoder = TargetEncoder(cols=naics_cols, min_samples_leaf = 25)
    # Take the target from `data` itself (not from a global frame) so that the
    # features and the target always cover the same rows with the same index.
    naics_encoder.fit(data[naics_cols], data[col_name])
    return naics_encoder

In [153]:
# Import one embeddings file, mean encode NAICS against each embedding column,
# and save the transformed data.
def emb_enc(emb_path, comb_data = comb_df, out_path = outpath,
           outname = None):
    """Mean encode NAICS / NAICS_sector against every column of an embeddings file.

    Parameters
    ----------
    emb_path : pathlib.Path
        Parquet file holding the embeddings; every column is used as a target.
    comb_data : pandas.DataFrame
        Frame providing the 'NAICS', 'NAICS_sector' and 'dset' columns.  It is
        joined to the embeddings on the index (assumes both share the same
        index -- TODO confirm for new embedding files).
    out_path : pathlib.Path
        Folder the result is written to.
    outname : str, optional
        Output file name; defaults to the name of `emb_path`.

    Returns
    -------
    None
        The result is written to parquet.  If the output file would be the
        input file, a message is printed and nothing is written.
    """
    # Work out where the result goes and refuse to overwrite the input
    if outname is None:
        target_name = emb_path.name
    else:
        target_name = outname
    target_file = out_path.joinpath(target_name)
    if target_file == emb_path:
        print('ERROR - cannot use same output file as input')
        return

    # Load the embeddings and attach the NAICS columns and the split label
    embeddings = pd.read_parquet(emb_path)
    emb_cols = list(embeddings.columns)
    naics_cols = ['NAICS', 'NAICS_sector']
    emb_data_full = pd.concat([embeddings, comb_data[naics_cols + ['dset']]],
                              axis=1)

    # One encoder per embedding column, fitted on the training rows only
    is_train = emb_data_full['dset'] == 'train'
    emb_data_train = emb_data_full[is_train].drop(columns='dset')
    naics_encoders = [enc_create_fit(emb_data_train, col) for col in emb_cols]

    # Apply every encoder to all rows; the keys give (embedding col, NAICS col) pairs
    encoded_parts = [encoder.transform(emb_data_full[naics_cols])
                     for encoder in naics_encoders]
    trans_df = pd.concat(encoded_parts, axis = 1, keys = emb_cols)
    # Flatten the two-level column labels to '<embedding col>_<NAICS col>'
    trans_df.columns = ['_'.join(pair) for pair in trans_df.columns]

    # Save transformed data in the output folder
    trans_df.to_parquet(target_file)

## Process embeddings

In [165]:
# Node embedding files written by earlier scripts, all in the temp folder
embedding_paths = [
    Path(setup.temp_path) / file_name
    for file_name in (
        '13_DATA_embeddings.parquet',  # base GNN
        '22_DATA_embeddings.parquet',  # label trick GNN
        '32_DATA_embeddings.parquet',  # mean encoder GNN
    )
]

In [155]:
# Output file names, one per entry of the embeddings list and in the same order
embedding_names = [
    '50_emb_base.parquet',
    '50_emb_label_trick.parquet',
    '50_emb_menc.parquet',
]

In [156]:
# The two lists are matched by position; check this before writing any file so
# that a mismatch cannot leave the output folder half populated.
if len(embedding_paths) != len(embedding_names):
    raise ValueError('embedding_paths and embedding_names must have the same length')

# Mean encode every embeddings file and save it under its new name
for emb_file, emb_name in zip(embedding_paths, embedding_names):
    print(emb_file)
    emb_enc(emb_file, outname = emb_name)

../data/2023_12_15/13_DATA_embeddings.parquet
../data/2023_12_15/22_DATA_embeddings.parquet
../data/2023_12_15/32_DATA_embeddings.parquet


In [162]:
# Read one of the saved results back for a sanity check
trans_df = pd.read_parquet(outpath / '50_emb_base.parquet')

In [163]:
# Shape of the full frame, then of its distinct rows
print(trans_df.shape, trans_df.drop_duplicates().shape, sep='\n')

(688081, 16)
(1304, 16)


In [164]:
# Preview the first rows of the encodings read back from '50_emb_base.parquet'
trans_df.head(5)

Unnamed: 0_level_0,emb_00_NAICS,emb_00_NAICS_sector,emb_01_NAICS,emb_01_NAICS_sector,emb_02_NAICS,emb_02_NAICS_sector,emb_03_NAICS,emb_03_NAICS_sector,emb_04_NAICS,emb_04_NAICS_sector,emb_05_NAICS,emb_05_NAICS_sector,emb_06_NAICS,emb_06_NAICS_sector,emb_07_NAICS,emb_07_NAICS_sector
LoanNr_ChkDgt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
5282874009,-0.090746,-0.162568,-0.43142,-0.239123,-0.226072,-0.280463,0.123255,0.089394,0.2452,0.086713,-0.332046,-0.270013,-0.031899,0.096427,0.635116,0.708287
2568556001,-0.224277,-0.254972,-0.335808,-0.204232,-0.171521,-0.204713,0.170227,0.137785,0.533708,0.235135,-0.054817,-0.046233,0.111827,0.296616,0.618582,0.783837
2687465005,-0.010788,-0.013005,-0.1404,-0.134427,-0.366313,-0.367353,0.186497,0.186067,0.052724,0.103372,-0.539575,-0.538084,-0.022882,-0.053171,0.682114,0.654524
9174473001,-0.247923,-0.254972,-0.244501,-0.204232,-0.195701,-0.204713,0.148148,0.137785,0.256208,0.235135,-0.040249,-0.046233,0.29077,0.296616,0.774004,0.783837
3598634004,-0.109511,-0.117992,-0.18035,-0.160999,-0.398751,-0.40442,0.160877,0.158372,-0.024255,-0.021966,-0.398764,-0.377281,0.171193,0.178724,0.705284,0.699564


In [166]:
# Pairwise correlations between the encoded columns
trans_df.corr()

Unnamed: 0,emb_00_NAICS,emb_00_NAICS_sector,emb_01_NAICS,emb_01_NAICS_sector,emb_02_NAICS,emb_02_NAICS_sector,emb_03_NAICS,emb_03_NAICS_sector,emb_04_NAICS,emb_04_NAICS_sector,emb_05_NAICS,emb_05_NAICS_sector,emb_06_NAICS,emb_06_NAICS_sector,emb_07_NAICS,emb_07_NAICS_sector
emb_00_NAICS,1.0,0.908166,-0.212652,0.314497,-0.162432,-0.282538,0.162446,0.01508,0.215814,0.003438,-0.257605,-0.200676,-0.610757,-0.535477,-0.103865,0.095406
emb_00_NAICS_sector,0.908166,1.0,0.137368,0.351064,-0.29757,-0.308115,0.015013,0.013346,0.004599,0.004144,-0.215284,-0.218139,-0.480333,-0.587297,0.081976,0.109374
emb_01_NAICS,-0.212652,0.137368,1.0,0.390488,-0.154184,0.048026,-0.41493,0.060237,-0.741358,-0.171003,0.032328,-0.130368,0.490653,0.016817,0.54341,0.060044
emb_01_NAICS_sector,0.314497,0.351064,0.390488,1.0,0.11486,0.124856,0.123064,0.134081,-0.251951,-0.433086,-0.317911,-0.324096,0.029116,0.046923,0.118469,0.158255
emb_02_NAICS,-0.162432,-0.29757,-0.154184,0.11486,1.0,0.956829,-0.122155,-0.235745,0.134348,0.038041,-0.01461,0.018259,0.201484,0.362783,0.145793,0.302879
emb_02_NAICS_sector,-0.282538,-0.308115,0.048026,0.124856,0.956829,1.0,-0.203882,-0.249018,0.02464,0.039399,0.018871,0.016821,0.307665,0.382538,0.25068,0.313207
emb_03_NAICS,0.162446,0.015013,-0.41493,0.123064,-0.122155,-0.203882,1.0,0.826003,0.448377,0.060692,-0.22391,-0.140244,-0.583243,-0.357323,-0.364464,-0.069344
emb_03_NAICS_sector,0.01508,0.013346,0.060237,0.134081,-0.235745,-0.249018,0.826003,1.0,0.043155,0.075944,-0.164011,-0.168115,-0.348628,-0.432927,-0.065837,-0.086885
emb_04_NAICS,0.215814,0.004599,-0.741358,-0.251951,0.134348,0.02464,0.448377,0.043155,1.0,0.594298,0.291809,0.43536,-0.579452,-0.141372,-0.32073,0.202619
emb_04_NAICS_sector,0.003438,0.004144,-0.171003,-0.433086,0.038041,0.039399,0.060692,0.075944,0.594298,1.0,0.714314,0.731139,-0.192475,-0.239033,0.269773,0.337516
