In [1]:
# Modify the file 00_setup.py to define input/output file paths on your system
# The information in 00_setup.py will be used across notebooks
import importlib.util
import sys
from importlib.machinery import SourceFileLoader

# Loader.load_module() is deprecated, so build the module from a spec and
# execute it explicitly instead.
_setup_loader = SourceFileLoader("setup", "./00_setup.py")
_setup_spec = importlib.util.spec_from_loader("setup", _setup_loader)
setup = importlib.util.module_from_spec(_setup_spec)
# load_module() also registered the module in sys.modules; keep doing so.
sys.modules["setup"] = setup
_setup_loader.exec_module(setup)

# ANOVA
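One-way ANOVA and variance-explained analyses of the loan target and of the NN and DGI embeddings, grouped by NAICS sector, NAICS code, and embedding cluster.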

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
import importlib, os

In [3]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy import stats

## Import Data

In [4]:
# Read the transformed dataset from the temp path configured in 00_setup.py
sba_loans = pd.read_parquet(Path(setup.temp_path) / '01_DATA_transformed.parquet')

In [5]:
# Keep only the rows whose 'dset' value is 'train'
train_df = sba_loans.loc[sba_loans['dset'].eq('train')]

##### NAICS info

In [6]:
# Per-NAICS summary statistics saved under the configured temp path
naics_info = pd.read_parquet(Path(setup.temp_path) / '60_DATA_naics_summary_stats.parquet')

In [7]:
# NAICS mapping table saved under the configured temp path
naics_map = pd.read_parquet(Path(setup.temp_path) / '60_DATA_naics_map.parquet')

In [8]:
# Preview two rows; a fixed seed keeps the displayed sample stable across re-runs
naics_info.sample(2, random_state=0)

Unnamed: 0,NAICS,train_count,train_target_mean,NAICS_sector,menc_NAICS,cenc_NAICS,dset_naics_holdout,all_count,all_target_mean,NAICS_sector_sel
391,327991,472.0,0.182203,31-33,0.186393,0.001109,0,709,0.170663,1
809,445291,690.0,0.226087,44-45,0.220184,0.001621,0,1051,0.229305,1


##### Neural Network Embeddings
With cluster and NAICS info appended

In [9]:
# Neural-network embeddings table saved under the configured temp path
emb_nn_clus = pd.read_parquet(Path(setup.temp_path) / '60_DATA_embeddings_tsne_naics.parquet')

In [10]:
# Names of the embedding columns (those prefixed with 'emb_')
emb_nn_feat = [
    col for col in emb_nn_clus.columns
    if col.startswith('emb_')
]

##### DGI embeddings

In [11]:
# DGI embeddings table, plus the names of its 'emb_'-prefixed columns
emb_dgi_clus = pd.read_parquet(Path(setup.temp_path) / '72_DATA_embeddings_tsne_naics.parquet')
emb_dgi_feat = [
    col for col in emb_dgi_clus.columns
    if col.startswith('emb_')
]

## Functions

##### ANOVA with variation explained

In [12]:
def anova_var(data, eqn):
    """Fit an OLS model and return its ANOVA table with variance fractions.

    Parameters
    ----------
    data : pandas.DataFrame
        Data containing the columns referenced by ``eqn``.
    eqn : str
        Model formula passed to ``ols``, e.g. ``'target ~ C(NAICS_sector)'``.

    Returns
    -------
    pandas.DataFrame
        The table returned by ``sm.stats.anova_lm(..., typ=1)`` with one
        extra column, ``var_f``: each row's ``sum_sq`` divided by the total
        of ``sum_sq`` over all rows (residual row included).
    """
    lm = ols(eqn, data=data).fit()
    res = sm.stats.anova_lm(lm, typ=1)
    # Divide the column by its total directly instead of routing a
    # whole-Series lambda through Series.transform.
    var_f = (res['sum_sq'] / res['sum_sq'].sum()).rename('var_f')
    return pd.concat([res, var_f], axis=1)

## NAICS sector - Baseline
ANOVA results in the training data

##### Stats oneway summary

In [13]:
# One array of target values per NAICS sector
sector_groups = train_df.groupby('NAICS_sector')
sectors_list = [
    grp['target'].to_numpy()
    for _, grp in sector_groups
]

In [14]:
# One-way ANOVA F-test across the per-sector target arrays
f_sector, p_sector = stats.f_oneway(*sectors_list)

In [15]:
# Report the F statistic and p-value of the sector-level test
print(f'sector oneway f: {f_sector}, p: {p_sector}')

sector oneway f: 286.3593129860846, p: 0.0


In [16]:
# One array of target values per NAICS code
naics_groups = train_df.groupby('NAICS')
naics_list = [
    grp['target'].to_numpy()
    for _, grp in naics_groups
]

In [17]:
# One-way ANOVA F-test across the per-NAICS target arrays
f_naics, p_naics = stats.f_oneway(*naics_list)

In [18]:
# Report the F statistic and p-value of the NAICS-level test
print(f'naics oneway f: {f_naics}, p: {p_naics}')

naics oneway f: 21.61331024668941, p: 0.0


##### Residuals analysis - sector

In [19]:
# ANOVA table (with the var_f fraction column) for target by NAICS sector in the training data
res = anova_var(train_df, 'target ~ C(NAICS_sector)')

In [20]:
# Display the sector ANOVA table computed from train_df
res

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F),var_f
C(NAICS_sector),19.0,869.441236,45.760065,286.359313,0.0,0.012623
Residual,425574.0,68006.497591,0.159799,,,0.987377


##### NAICS level 
How much of the mean target by NAICS is accounted for by sector?

In [21]:
# NAICS rows with a positive training count
naics_info_train = naics_info.loc[naics_info['train_count'].gt(0)]

In [22]:
# ANOVA of the per-NAICS mean training target by sector (var_f = share of sum of squares)
anova_var(naics_info_train, 'train_target_mean ~ C(NAICS_sector)')

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F),var_f
C(NAICS_sector),19.0,2.196272,0.115593,6.241177,1.503126e-15,0.089121
Residual,1212.0,22.447534,0.018521,,,0.910879


## NN Embeddings

##### NAICS sector ANOVA

In [23]:
# One OLS formula per NN embedding feature, each regressed on NAICS sector
emb_str = [f'{feat} ~ C(NAICS_sector)' for feat in emb_nn_feat]

In [24]:
# Run the sector ANOVA for every NN embedding feature and stack the tables,
# keyed by feature name; reset_index turns the keys into columns
anova_nn_sector = (
    pd.concat(
        [anova_var(emb_nn_clus, formula) for formula in emb_str],
        keys=emb_nn_feat,
    )
    .reset_index()
)

In [25]:
# Show only the sector-effect rows (drop the residual rows)
anova_nn_sector.loc[anova_nn_sector['level_1'].eq('C(NAICS_sector)')]

Unnamed: 0,level_0,level_1,df,sum_sq,mean_sq,F,PR(>F),var_f
0,emb_000,C(NAICS_sector),19.0,29.41388,1.548099,4.594313,2.498359e-10,0.063333
2,emb_001,C(NAICS_sector),19.0,7.221122,0.380059,2.953026,2.103928e-05,0.04165
4,emb_002,C(NAICS_sector),19.0,13.784855,0.725519,5.854111,2.40528e-14,0.079322
6,emb_003,C(NAICS_sector),19.0,6.175088,0.325005,2.564529,0.0002446379,0.03637
8,emb_004,C(NAICS_sector),19.0,43.431819,2.285885,6.189425,1.973339e-15,0.083487
10,emb_005,C(NAICS_sector),19.0,9.936579,0.522978,3.557743,3.722568e-07,0.049755
12,emb_006,C(NAICS_sector),19.0,40.11696,2.111419,6.191398,1.944452e-15,0.083511
14,emb_007,C(NAICS_sector),19.0,8.898625,0.468349,3.359544,1.430425e-06,0.047114


##### NAICS level ANOVA (mean target)

In [32]:
# Attach the NN cluster label to each NAICS row that has a positive training count
naics_info_train_nn_cluster = (
    naics_info.loc[naics_info['train_count'].gt(0)]
    .merge(
        emb_nn_clus.drop(columns='NAICS')
                   .rename(columns={'NAICS_orig': 'NAICS'})
                   .loc[:, ['NAICS', 'cluster']],
        on='NAICS',
        how='left',
    )
)

In [34]:
# ANOVA of the per-NAICS mean training target by NN cluster
anova_var(naics_info_train_nn_cluster, 'train_target_mean ~ C(cluster)')

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F),var_f
C(cluster),8.0,11.913865,1.489233,143.074672,1.631658e-169,0.483443
Residual,1223.0,12.729941,0.010409,,,0.516557


##### Loan level ANOVA (raw target)

In [36]:
# Attach the NN cluster label to every training loan via its NAICS code
train_nn_cluster = train_df.merge(
    emb_nn_clus.drop(columns='NAICS')
               .rename(columns={'NAICS_orig': 'NAICS'})
               .loc[:, ['NAICS', 'cluster']],
    on='NAICS',
    how='left',
)

In [37]:
# ANOVA of the loan-level target by NN cluster
anova_var(train_nn_cluster, 'target ~ C(cluster)')

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F),var_f
C(cluster),8.0,3073.179506,384.147438,2484.506565,0.0,0.044619
Residual,425585.0,65802.75932,0.154617,,,0.955381


## DGI Embeddings

##### NAICS sector ANOVA

In [38]:
# One OLS formula per DGI embedding feature, each regressed on NAICS sector
emb_dgi_str = [f'{feat} ~ C(NAICS_sector)' for feat in emb_dgi_feat]

In [39]:
# All ANOVAs, keyed by the DGI feature names the formulas were built from
# (previously keyed by the NN feature list, a copy-paste slip)
anova_dgi_sector = pd.concat([anova_var(emb_dgi_clus, s) for s in emb_dgi_str],
                     keys=emb_dgi_feat) \
    .reset_index()

In [40]:
# Show only the sector-effect rows; the mask must come from the DGI table itself
anova_dgi_sector[anova_dgi_sector['level_1'] == 'C(NAICS_sector)']

Unnamed: 0,level_0,level_1,df,sum_sq,mean_sq,F,PR(>F),var_f
0,emb_000,C(NAICS_sector),19.0,0.007637,0.000402,2.409177,0.0006268709,0.034242
2,emb_001,C(NAICS_sector),19.0,0.015498,0.000816,2.409177,0.0006268718,0.034242
4,emb_002,C(NAICS_sector),19.0,0.007481,0.000394,2.409177,0.0006268732,0.034242
6,emb_003,C(NAICS_sector),19.0,0.014017,0.000738,2.409178,0.0006268692,0.034242
8,emb_004,C(NAICS_sector),19.0,19.231613,1.01219,8.203972,5.05117e-22,0.107732
10,emb_005,C(NAICS_sector),19.0,15.171987,0.798526,6.454871,2.705188e-16,0.086756
12,emb_006,C(NAICS_sector),19.0,51.820419,2.72739,28.530548,1.174964e-84,0.295721
14,emb_007,C(NAICS_sector),19.0,23.447497,1.234079,18.656589,1.4981590000000001e-55,0.215424


##### NAICS level ANOVA (mean target)

In [42]:
# Attach the DGI cluster label to each NAICS row that has a positive training count
naics_info_train_dgi_cluster = (
    naics_info.loc[naics_info['train_count'].gt(0)]
    .merge(
        emb_dgi_clus.rename(columns={'NAICS_orig': 'NAICS'})
                    .loc[:, ['NAICS', 'cluster']],
        on='NAICS',
        how='left',
    )
)

In [43]:
# ANOVA of the per-NAICS mean training target by DGI cluster
anova_var(naics_info_train_dgi_cluster, 'train_target_mean ~ C(cluster)')

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F),var_f
C(cluster),8.0,6.54042,0.817552,55.230918,8.988914e-77,0.265398
Residual,1223.0,18.103386,0.014802,,,0.734602


##### Loan level ANOVA (raw target)

In [45]:
# Attach the DGI cluster label to every training loan via its NAICS code
train_dgi_cluster = train_df.merge(
    emb_dgi_clus.rename(columns={'NAICS_orig': 'NAICS'})
                .loc[:, ['NAICS', 'cluster']],
    on='NAICS',
    how='left',
)

In [46]:
# ANOVA of the loan-level target by DGI cluster
anova_var(train_dgi_cluster, 'target ~ C(cluster)')

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F),var_f
C(cluster),8.0,2277.735288,284.716911,1819.437164,0.0,0.03307
Residual,425585.0,66598.203539,0.156486,,,0.96693
