In [2]:
# Modify the file 00_setup.py to define input/output file paths on your system
# The information in 00_setup.py will be used across notebooks
from importlib.machinery import SourceFileLoader
setup = SourceFileLoader("setup", "./00_setup.py").load_module()

# Performance Summary
Combine performance of models tested, and summarize

Also combine all metrics for easy comparisons

*This script takes about 1 minutes on my MacBook Air*

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
from pathlib import Path

In [5]:
from sba_gnn.sba_gnn import sg_plot 

In [6]:
sg_plot.plot_defaults()

## File Dictionaries

##### XGB basic tests

In [7]:
dict_pred_1_xgb_basic = {
    'xgb base': Path(setup.temp_path).joinpath('03_DATA_combined_predictions.parquet'),
    'xgb mean':  Path(setup.temp_path).joinpath('04_DATA_combined_predictions.parquet'),
    'xgb one hot':  Path(setup.temp_path).joinpath('05_DATA_combined_predictions.parquet'),
    'xgb numeric naics':  Path(setup.temp_path).joinpath('06_DATA_combined_predictions.parquet'),
    'xgb mean x naics sector':  Path(setup.temp_path).joinpath('07_DATA_combined_predictions.parquet'),
    'xgb mean x naics subsector':  Path(setup.temp_path).joinpath('08_DATA_combined_predictions.parquet'),
    'xgb mean x naics industry':  Path(setup.temp_path).joinpath('09_DATA_combined_predictions.parquet')
}

##### Neural network basic

In [8]:
dict_pred_2_nn_basic = {
    'nn base': Path(setup.temp_path).joinpath('11_DATA_predictions.parquet'),
    'nn emb': Path(setup.temp_path).joinpath('12_DATA_predictions.parquet'),
}

##### XGB + embeddings

In [9]:
dict_pred_3_emb = {
    'xgb emb nn':  Path(setup.temp_path).joinpath('40_DATA_combined_predictions.parquet'),
    'xgb dgi base':  Path(setup.temp_path).joinpath('50_DATA_combined_predictions.parquet'),
    'xgb dgi base+label':  Path(setup.temp_path).joinpath('52_DATA_combined_predictions.parquet')
}

##### Clustering - Mean Encoding

In [10]:
dict_pred_4_clus_menc = {
    'xgb clus menc nn':  Path(setup.temp_path).joinpath('62_DATA_combined_predictions.parquet'),
    'xgb clus menc dgi base':  Path(setup.temp_path).joinpath('68_DATA_combined_predictions.parquet'),
    'xgb clus menc dgi base+label':  Path(setup.temp_path).joinpath('71_DATA_combined_predictions.parquet')
}

##### Clustering - One Hot Encoding

In [13]:
dict_pred_5_oh = {
    'xgb clus oh nn':  Path(setup.temp_path).joinpath('61_DATA_combined_predictions.parquet'),
    'xgb clus oh dgi base':  Path(setup.temp_path).joinpath('67_DATA_combined_predictions.parquet'),
    'xgb clus oh dgi base+label':  Path(setup.temp_path).joinpath('70_DATA_combined_predictions.parquet')
}

## Combine predictions

##### Raw predictions - will get metrics for low volume naics

In [14]:
all_dict_list = [dict_pred_1_xgb_basic, dict_pred_2_nn_basic,
             dict_pred_3_emb, dict_pred_4_clus_menc, dict_pred_5_oh]

In [15]:
dict_pred = {k: v for d in all_dict_list for k, v in d.items()}

In [16]:
# Verify these all exist
all([Path(v).exists() for k, v in dict_pred.items()])

True

##### Standard metrics
Get filenames for standard metrics calculated in all scripts. These all have similar file naming

In [17]:
dict_metrics = {k:Path(setup.temp_path).joinpath(v.name[0:2] + '_REPORT_metrics.csv') \
                for k, v in dict_pred.items()}

In [18]:
# Verify these all exist
all([Path(v).exists() for k, v in dict_metrics.items()])

True

## Combine standard metrics

In [46]:
all_metrics = pd.concat([pd.read_csv(v) for k, v in dict_metrics.items()],
                        keys = dict_metrics.keys()) \
    .reset_index(level=0) \
    .drop(columns=['Unnamed: 0'], errors='ignore') \
    .rename(columns={'level_0':'model'})

In [47]:
all_metrics.to_csv(Path(setup.temp_path).joinpath('80_REPORT_metrics_combined.csv'), index=False)

## Summary Data
For test dataset only, show AUC and f1 scores for test dataset, along with f1 scores for holdout and low-volume NAICS

In [48]:
overall_stats =  all_metrics[(all_metrics['dset'] == 'test') & \
                             (all_metrics['dset_naics_holdout'] == 0)] \
    [['model', 'roc_auc_score', 'average_precision_score', 'f1_score']]

In [49]:
holdout_stats = all_metrics[(all_metrics['dset'] == 'test') & \
                            (all_metrics['dset_naics_holdout'] == 1)] \
    [['model', 'roc_auc_score', 'average_precision_score', 'f1_score']] \
    .rename(columns={'f1_score':'f1_score_ho', 'average_precision_score':'ap_ho',
                    'roc_auc_score':'roc_ho'})

In [50]:
overall_stats = overall_stats.merge(holdout_stats, on='model')

##### Type information

In [53]:
overall_stats[overall_stats['model'].isin(dict_pred_1_xgb_basic.keys())] \
.sort_values('ap_ho', ascending=False)

Unnamed: 0,model,roc_auc_score,average_precision_score,f1_score,roc_ho,ap_ho,f1_score_ho
3,xgb numeric naics,0.725089,0.378392,0.444074,0.729245,0.362891,0.429051
0,xgb base,0.706474,0.353071,0.431561,0.723115,0.356861,0.424024
5,xgb mean x naics4,0.732462,0.390023,0.449633,0.724233,0.356358,0.419578
2,xgb one hot,0.714582,0.360212,0.438666,0.722105,0.353897,0.425756
1,xgb mean,0.732194,0.389849,0.449935,0.721075,0.352738,0.424237
4,xgb mean x naics sector,0.732311,0.389739,0.450566,0.712509,0.338828,0.420017
6,xgb mean x naics3,0.733338,0.390645,0.451238,0.713045,0.337917,0.411777


In [68]:
overall_stats[overall_stats['model'].isin(list(dict_pred_2_nn_basic.keys()))] \
.sort_values('average_precision_score', ascending=False)

Unnamed: 0,model,roc_auc_score,average_precision_score,f1_score,roc_ho,ap_ho,f1_score_ho
8,nn emb,0.73229,0.388653,0.450865,0.708303,0.336031,0.406653
10,nn one hot,0.712272,0.354201,0.438092,0.71666,0.345082,0.421648
9,nn numeric naics,0.704058,0.347964,0.429553,0.715936,0.344513,0.419954
7,nn base,0.702495,0.345526,0.430669,0.718693,0.348599,0.425718


In [62]:
overall_stats[overall_stats['model'].isin(list(dict_pred_3_emb.keys())] \
.sort_values('average_precision_score', ascending=False)

Unnamed: 0,model,roc_auc_score,average_precision_score,f1_score,roc_ho,ap_ho,f1_score_ho
11,xgb emb nn,0.734922,0.394202,0.452451,0.715999,0.343732,0.411568
13,xgb dgi base+label,0.727737,0.381411,0.446389,0.716325,0.343496,0.410392
12,xgb dgi base,0.728501,0.381393,0.446686,0.724172,0.351215,0.426014
14,xgb dgi nn,0.7266,0.379323,0.446571,0.722928,0.352367,0.423382


In [59]:
overall_stats[overall_stats['model'].isin(dict_pred_4_clus_menc.keys())] \
    .sort_values('average_precision_score', ascending=False)

Unnamed: 0,model,roc_auc_score,average_precision_score,f1_score,roc_ho,ap_ho,f1_score_ho
17,xgb clus menc dgi base+label,0.732375,0.390136,0.451332,0.716507,0.343121,0.416203
16,xgb clus menc dgi base,0.732172,0.389762,0.450391,0.722189,0.350638,0.426248
15,xgb clus menc nn,0.732102,0.389748,0.450267,0.709013,0.32558,0.406173
18,xgb clus menc dgi nn,0.732162,0.389674,0.450031,0.715159,0.342118,0.419236


In [65]:
overall_stats[overall_stats['model'].isin(list(dict_pred_5_oh.keys()) + \
                                          ['xgb one hot'])] \
    .sort_values('average_precision_score', ascending=False)

Unnamed: 0,model,roc_auc_score,average_precision_score,f1_score,roc_ho,ap_ho,f1_score_ho
19,xgb clus oh nn,0.729904,0.383323,0.450028,0.716921,0.348051,0.408084
22,xgb clus oh dgi nn,0.71494,0.362428,0.436111,0.720162,0.349227,0.423375
21,xgb clus oh dgi base+label,0.71409,0.36171,0.435064,0.720058,0.346136,0.425508
20,xgb clus oh dgi base,0.715041,0.361347,0.437073,0.722276,0.348755,0.427212
2,xgb one hot,0.714582,0.360212,0.438666,0.722105,0.353897,0.425756


In [69]:
overall_stats.to_csv(Path(setup.temp_path).joinpath('80_REPORT_summary_stats.csv'),
                    index=False)

In [55]:
TO DO
ANOVA
Cross-silhouette scores
Further prune code
DONE Bernoulli error https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval
recolor DGI embeddings (meanc)
DGI emb ples meanc same w nn

SyntaxError: invalid syntax (3051542928.py, line 1)

In [1]:
Consider making hierarchical clusters - factors of 2 - for mean encoding DGI
67 -> 64 etc
Do the hiearchical blending from 3 levels

SyntaxError: invalid syntax (3866325392.py, line 1)

In [73]:
naics = pd.read_csv('../data/naics/2022_struct.csv', dtype='str', header=None)

In [75]:
naics['len'] = naics[0].str.len()

In [77]:
naics.groupby('len').agg('count')

Unnamed: 0_level_0,0,1
len,Unnamed: 1_level_1,Unnamed: 2_level_1
2.0,17,17
3.0,96,96
4.0,308,308
5.0,692,692
6.0,1012,1012
