In [1]:
# Modify the file 00_setup.py to define input/output file paths on your system
# The information in 00_setup.py will be used across notebooks
import importlib.util
import sys
from importlib.machinery import SourceFileLoader

# "00_setup" is not a valid identifier, so the file is loaded by path rather
# than with a normal import statement. Loader.load_module() is deprecated, so
# the module is built from a spec and run with exec_module() instead.
_setup_loader = SourceFileLoader("setup", "./00_setup.py")
_setup_spec = importlib.util.spec_from_loader("setup", _setup_loader)
setup = importlib.util.module_from_spec(_setup_spec)
# load_module() used to register the module in sys.modules; keep doing so.
sys.modules["setup"] = setup
_setup_loader.exec_module(setup)

# Performance Summary
Combine performance of models tested, and summarize

Also combine all metrics for easy comparisons

*This script takes about 1 minute on my MacBook Air*

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from pathlib import Path

In [4]:
from sba_gnn.sba_gnn import sg_plot 

In [5]:
# Apply the project's plotting defaults from sba_gnn.sg_plot
# (presumably matplotlib/seaborn styling -- confirm in sg_plot.plot_defaults).
sg_plot.plot_defaults()

## File Dictionaries

##### XGB basic tests

In [6]:
# Model label -> combined-predictions parquet in the temp directory,
# for the basic XGBoost experiments.
dict_pred_1_xgb_basic = {
    label: Path(setup.temp_path) / filename
    for label, filename in (
        ('xgb base', '03_DATA_combined_predictions.parquet'),
        ('xgb mean', '04_DATA_combined_predictions.parquet'),
        ('xgb one hot', '05_DATA_combined_predictions.parquet'),
        ('xgb numeric naics', '06_DATA_combined_predictions.parquet'),
        ('xgb mean x naics sector', '07_DATA_combined_predictions.parquet'),
        ('xgb mean x naics subsector', '08_DATA_combined_predictions.parquet'),
        ('xgb mean x naics industry', '09_DATA_combined_predictions.parquet'),
    )
}

##### Neural network basic

In [7]:
# Model label -> predictions parquet in the temp directory,
# for the basic neural-network experiments.
dict_pred_2_nn_basic = {
    label: Path(setup.temp_path) / filename
    for label, filename in (
        ('nn base', '11_DATA_predictions.parquet'),
        ('nn emb', '12_DATA_predictions.parquet'),
    )
}

##### XGB + embeddings

In [8]:
# Model label -> combined-predictions parquet in the temp directory,
# for XGBoost models that consume embeddings.
dict_pred_3_emb = {
    label: Path(setup.temp_path) / filename
    for label, filename in (
        ('xgb emb nn', '40_DATA_combined_predictions.parquet'),
        ('xgb dgi base', '50_DATA_combined_predictions.parquet'),
        ('xgb dgi base+label', '52_DATA_combined_predictions.parquet'),
    )
}

##### Clustering - Mean Encoding

In [9]:
# Model label -> combined-predictions parquet in the temp directory,
# for the clustering + mean-encoding experiments.
dict_pred_4_clus_menc = {
    label: Path(setup.temp_path) / filename
    for label, filename in (
        ('xgb clus menc nn', '62_DATA_combined_predictions.parquet'),
        ('xgb clus menc dgi base', '65_DATA_combined_predictions.parquet'),
        ('xgb clus menc dgi base 20', '66_DATA_combined_predictions.parquet'),
        ('xgb clus menc dgi base 80', '67_DATA_combined_predictions.parquet'),
        ('xgb clus menc dgi base+label', '72_DATA_combined_predictions.parquet'),
    )
}

##### Clustering - One Hot Encoding

In [10]:
# Model label -> combined-predictions parquet in the temp directory,
# for the clustering + one-hot-encoding experiments.
dict_pred_5_oh = {
    label: Path(setup.temp_path) / filename
    for label, filename in (
        ('xgb clus oh nn', '61_DATA_combined_predictions.parquet'),
        ('xgb clus oh dgi base', '64_DATA_combined_predictions.parquet'),
        ('xgb clus oh dgi base+label', '71_DATA_combined_predictions.parquet'),
    )
}

## Combine predictions

##### Raw predictions - will get metrics for low volume naics

In [11]:
# Every model-group dictionary, in the order the groups are defined above.
all_dict_list = [
    dict_pred_1_xgb_basic,
    dict_pred_2_nn_basic,
    dict_pred_3_emb,
    dict_pred_4_clus_menc,
    dict_pred_5_oh,
]

In [12]:
# Flatten the group dictionaries into one label -> path mapping.
# A label repeated across groups keeps the path from the last group listing it.
dict_pred = dict(pair for group in all_dict_list for pair in group.items())

In [13]:
# Verify these all exist. Any missing file is printed so that a False result
# says which path is absent; the cell still evaluates to the same boolean.
missing_pred_files = [str(path) for path in dict_pred.values()
                      if not Path(path).exists()]
if missing_pred_files:
    print('Missing prediction files:', *missing_pred_files, sep='\n  ')
not missing_pred_files

False

##### Standard metrics
Get the filenames of the standard metrics calculated in all scripts. These files all follow the same naming pattern.

In [14]:
# Model label -> metrics report CSV. The report name reuses the first two
# characters of the predictions file name as its prefix.
dict_metrics = {
    label: Path(setup.temp_path) / (pred_path.name[:2] + '_REPORT_metrics.csv')
    for label, pred_path in dict_pred.items()
}

In [15]:
# Verify these all exist. Any missing file is printed so that a False result
# says which path is absent; the cell still evaluates to the same boolean.
# Every one of these files is read with pd.read_csv further down.
missing_metrics_files = [str(path) for path in dict_metrics.values()
                         if not Path(path).exists()]
if missing_metrics_files:
    print('Missing metrics files:', *missing_metrics_files, sep='\n  ')
not missing_metrics_files

False

## Combine standard metrics

In [17]:
# Read each model's metrics CSV and stack them into one frame. The concat keys
# (model labels) form the outer index level, which is moved into a 'model' column.
all_metrics = (
    pd.concat(
        [pd.read_csv(csv_path) for csv_path in dict_metrics.values()],
        keys=list(dict_metrics),
    )
    .reset_index(level=0)
    .rename(columns={'level_0': 'model'})
    # 'Unnamed: 0' is presumably a saved row index from the upstream CSVs;
    # it is dropped when present and ignored otherwise.
    .drop(columns=['Unnamed: 0'], errors='ignore')
)

In [25]:
# Tag each row with the first three characters of its metrics file name.
# NOTE(review): a three-character slice keeps the underscore (e.g. '03_');
# confirm whether downstream readers expect that before changing it.
all_metrics['script_num'] = all_metrics['model'].map(
    lambda model_name: dict_metrics[model_name].name[:3]
)

In [26]:
# Preview the first three rows of the combined metrics
all_metrics.iloc[:3]

Unnamed: 0,model,dset,accuracy_score,f1_score,precision_score,recall_score,average_precision_score,roc_auc_score,dset_naics_holdout,script_num
0,xgb base,test,0.615897,0.430079,0.303129,0.739982,0.349659,0.716077,,03_
1,xgb base,train,0.604517,0.432929,0.306368,0.737659,0.352549,0.707118,,03_
2,xgb base,val,0.602189,0.4323,0.30707,0.730015,0.353011,0.703115,,03_


In [27]:
# Write the combined metrics table to the temp directory, without the row index
all_metrics.to_csv(
    Path(setup.temp_path) / '80_REPORT_metrics_combined.csv',
    index=False,
)

## Summary Data
For the test dataset only, show AUC and F1 scores, along with F1 scores for holdout and low-volume NAICS

In [28]:
# Test-set rows flagged dset_naics_holdout == 0, reduced to the model label
# and the three headline metrics.
overall_stats = all_metrics.loc[
    all_metrics['dset'].eq('test') & all_metrics['dset_naics_holdout'].eq(0),
    ['model', 'roc_auc_score', 'average_precision_score', 'f1_score'],
]

In [29]:
# Test-set rows flagged dset_naics_holdout == 1, with the metric columns
# renamed to *_ho so they can sit beside the overall metrics.
holdout_stats = (
    all_metrics
    .loc[
        all_metrics['dset'].eq('test') & all_metrics['dset_naics_holdout'].eq(1),
        ['model', 'roc_auc_score', 'average_precision_score', 'f1_score'],
    ]
    .rename(columns={
        'roc_auc_score': 'roc_ho',
        'average_precision_score': 'ap_ho',
        'f1_score': 'f1_score_ho',
    })
)

In [30]:
# Attach the holdout metric columns to the overall test metrics, joined on model.
# This cell reassigns overall_stats, so first drop any holdout columns left by
# a previous run. Without that, re-running the cell would merge them a second
# time and pandas would rename the duplicates with _x/_y suffixes, breaking the
# later sort on 'ap_ho'.
overall_stats = (
    overall_stats
    .drop(columns=holdout_stats.columns.difference(['model']), errors='ignore')
    .merge(holdout_stats, on='model')
)

##### Selected stats

In [40]:
# Top 12 models ranked by average precision on the test set
overall_stats.sort_values(by='average_precision_score', ascending=False).iloc[:12]

Unnamed: 0,model,roc_auc_score,average_precision_score,f1_score,roc_ho,ap_ho,f1_score_ho
9,xgb emb nn,0.73049,0.386461,0.451821,0.717261,0.332862,0.390534
14,xgb clus menc dgi base 20,0.728654,0.384045,0.450194,0.736758,0.357617,0.423053
1,xgb mean,0.728321,0.383767,0.45014,0.729034,0.347848,0.420046
12,xgb clus menc nn,0.728576,0.383574,0.449642,0.714015,0.313323,0.393603
6,xgb mean x naics industry,0.728229,0.383484,0.449909,0.738906,0.370568,0.433739
4,xgb mean x naics sector,0.727947,0.382885,0.449732,0.727478,0.344311,0.424421
5,xgb mean x naics subsector,0.728005,0.382785,0.449474,0.735679,0.360168,0.426707
15,xgb clus menc dgi base 80,0.72815,0.382723,0.449186,0.730578,0.346508,0.424069
16,xgb clus menc dgi base+label,0.727949,0.382644,0.448784,0.733238,0.352632,0.428239
13,xgb clus menc dgi base,0.727987,0.382625,0.4494,0.734168,0.356434,0.421747


In [39]:
# Top 12 models ranked by average precision on the holdout NAICS rows
overall_stats.sort_values(by='ap_ho', ascending=False).iloc[:12]

Unnamed: 0,model,roc_auc_score,average_precision_score,f1_score,roc_ho,ap_ho,f1_score_ho
6,xgb mean x naics industry,0.728229,0.383484,0.449909,0.738906,0.370568,0.433739
3,xgb numeric naics,0.723166,0.373796,0.446564,0.738718,0.362137,0.429141
18,xgb clus oh dgi base,0.711046,0.356489,0.438078,0.7376,0.360423,0.42791
5,xgb mean x naics subsector,0.728005,0.382785,0.449474,0.735679,0.360168,0.426707
19,xgb clus oh dgi base+label,0.711826,0.359239,0.436635,0.737109,0.358599,0.428162
14,xgb clus menc dgi base 20,0.728654,0.384045,0.450194,0.736758,0.357617,0.423053
13,xgb clus menc dgi base,0.727987,0.382625,0.4494,0.734168,0.356434,0.421747
11,xgb dgi base+label,0.723915,0.373773,0.446626,0.735716,0.35502,0.429074
16,xgb clus menc dgi base+label,0.727949,0.382644,0.448784,0.733238,0.352632,0.428239
0,xgb base,0.704601,0.348985,0.434293,0.732046,0.351437,0.423421


In [32]:
# Neural-network models only, best average precision first
(
    overall_stats
    .loc[overall_stats['model'].isin(list(dict_pred_2_nn_basic))]
    .sort_values(by='average_precision_score', ascending=False)
)

Unnamed: 0,model,roc_auc_score,average_precision_score,f1_score,roc_ho,ap_ho,f1_score_ho
8,nn emb,0.728026,0.382612,0.448303,0.709593,0.318813,0.395795
7,nn base,0.700443,0.343516,0.43085,0.728409,0.344602,0.419681


In [34]:
# XGBoost + embedding models only, best average precision first
(
    overall_stats
    .loc[overall_stats['model'].isin(list(dict_pred_3_emb))]
    .sort_values(by='average_precision_score', ascending=False)
)

Unnamed: 0,model,roc_auc_score,average_precision_score,f1_score,roc_ho,ap_ho,f1_score_ho
9,xgb emb nn,0.73049,0.386461,0.451821,0.717261,0.332862,0.390534
10,xgb dgi base,0.724011,0.375218,0.447336,0.733705,0.346721,0.425681
11,xgb dgi base+label,0.723915,0.373773,0.446626,0.735716,0.35502,0.429074


In [35]:
# Clustering + mean-encoding models only, best average precision first
(
    overall_stats
    .loc[overall_stats['model'].isin(list(dict_pred_4_clus_menc))]
    .sort_values(by='average_precision_score', ascending=False)
)

Unnamed: 0,model,roc_auc_score,average_precision_score,f1_score,roc_ho,ap_ho,f1_score_ho
14,xgb clus menc dgi base 20,0.728654,0.384045,0.450194,0.736758,0.357617,0.423053
12,xgb clus menc nn,0.728576,0.383574,0.449642,0.714015,0.313323,0.393603
15,xgb clus menc dgi base 80,0.72815,0.382723,0.449186,0.730578,0.346508,0.424069
16,xgb clus menc dgi base+label,0.727949,0.382644,0.448784,0.733238,0.352632,0.428239
13,xgb clus menc dgi base,0.727987,0.382625,0.4494,0.734168,0.356434,0.421747


In [36]:
# Clustering + one-hot models, plus the plain 'xgb one hot' model for
# comparison, best average precision first
(
    overall_stats
    .loc[overall_stats['model'].isin([*dict_pred_5_oh, 'xgb one hot'])]
    .sort_values(by='average_precision_score', ascending=False)
)

Unnamed: 0,model,roc_auc_score,average_precision_score,f1_score,roc_ho,ap_ho,f1_score_ho
17,xgb clus oh nn,0.726223,0.374882,0.448602,0.727376,0.343861,0.162614
19,xgb clus oh dgi base+label,0.711826,0.359239,0.436635,0.737109,0.358599,0.428162
2,xgb one hot,0.713595,0.358867,0.439001,0.733087,0.350732,0.423228
18,xgb clus oh dgi base,0.711046,0.356489,0.438078,0.7376,0.360423,0.42791


In [37]:
# Write the per-model summary table to the temp directory, without the row index
overall_stats.to_csv(
    Path(setup.temp_path) / '80_REPORT_summary_stats.csv',
    index=False,
)

In [None]:
# TODO:
#   - ANOVA
#   - Cross-silhouette scores
