In [3]:
import logging, warnings
import numpy as np, pandas as pd
from pathlib import Path
from scipy.stats import spearmanr, ConstantInputWarning
from tqdm import tqdm
from umap import UMAP
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
import pathlib
import importlib
# importlib.reload(src.X)
# from src.X import x,y,z

from src.utils import iter_dataset_dirs, load_json
from src.plot_style import apply_plot_style

# Notebook-wide setup, run once after the imports above.
# 1) Apply the project's matplotlib/seaborn styling helper.
apply_plot_style()

# 2) Emit INFO-and-above log records as "<timestamp> <level> <message>".
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)

# 3) Ignore SciPy's ConstantInputWarning for the rest of the session.
warnings.simplefilter(action="ignore", category=ConstantInputWarning)

##### **`Var(1)` vs `Wave1D`**

In [None]:
import pandas as pd
from pathlib import Path

classes = ("Var(1)", "Wave1D")
features_npz = "analysis/features_full_all.npz"
out_dir = Path("analysis/non-overlapping-distributions")
bandwidths = [0.25, 0.5, 0.75, 1.0, 1.25, 1.5]

pair_label = "_".join(sorted(classes))  # ensures ks_CML_GaussianNoise == ks_GaussianNoise_CML

!python find_non_overlapping_distributions.py \
    --features {features_npz} \
    --mts-classes {",".join(classes)} \
    --output-dir {out_dir} \
    --kde --kde-bandwidth {bandwidths}

for bw in bandwidths:
    csv_path = out_dir / f"kde-{bw}" / f"ks_{pair_label}.csv"
    df = pd.read_csv(csv_path)
    print(f"Top 5 KS for bw={bw}")
    display(df.sort_values("ks_statistic", ascending=False).head(5))


Top 5 KS for bw=0.25


Unnamed: 0,feature_index,SPI-1,SPI-2,ks_statistic,n_GaussianNoise,n_CML
1555,1555,cce_kozachenko,di_gaussian,1.0,64,64
17400,17400,mi_kernel_W-0.25,tlmi_kraskov_NN-4,1.0,64,64
1530,1530,cce_kozachenko,coint_johansen_trace_stat_order-1_ardiff-1,1.0,64,64
1512,1512,cce_kozachenko,cds,1.0,64,64
1526,1526,cce_kozachenko,coint_johansen_max_eig_stat_order-1_ardiff-1,1.0,64,64


Top 5 KS for bw=0.5


Unnamed: 0,feature_index,SPI-1,SPI-2,ks_statistic,n_GaussianNoise,n_CML
1555,1555,cce_kozachenko,di_gaussian,1.0,64,64
17400,17400,mi_kernel_W-0.25,tlmi_kraskov_NN-4,0.999715,64,64
8866,8866,ddtf_multitaper_max_fs-1_fmin-0-25_fmax-0-5,ddtf_multitaper_max_fs-1_fmin-0_fmax-0-5,0.999544,64,64
1530,1530,cce_kozachenko,coint_johansen_trace_stat_order-1_ardiff-1,0.999264,64,64
7801,7801,dcoh_multitaper_max_fs-1_fmin-0-25_fmax-0-5,dcoh_multitaper_max_fs-1_fmin-0_fmax-0-5,0.999204,64,64


Top 5 KS for bw=0.75


Unnamed: 0,feature_index,SPI-1,SPI-2,ks_statistic,n_GaussianNoise,n_CML
1555,1555,cce_kozachenko,di_gaussian,0.999919,64,64
8866,8866,ddtf_multitaper_max_fs-1_fmin-0-25_fmax-0-5,ddtf_multitaper_max_fs-1_fmin-0_fmax-0-5,0.99869,64,64
7801,7801,dcoh_multitaper_max_fs-1_fmin-0-25_fmax-0-5,dcoh_multitaper_max_fs-1_fmin-0_fmax-0-5,0.998094,64,64
17400,17400,mi_kernel_W-0.25,tlmi_kraskov_NN-4,0.997245,64,64
13337,13337,gc_gaussian_k-1_kt-1_l-1_lt-1,tlmi_gaussian,0.994583,64,64


Top 5 KS for bw=1.0


Unnamed: 0,feature_index,SPI-1,SPI-2,ks_statistic,n_GaussianNoise,n_CML
1555,1555,cce_kozachenko,di_gaussian,0.998498,64,64
8866,8866,ddtf_multitaper_max_fs-1_fmin-0-25_fmax-0-5,ddtf_multitaper_max_fs-1_fmin-0_fmax-0-5,0.997633,64,64
7801,7801,dcoh_multitaper_max_fs-1_fmin-0-25_fmax-0-5,dcoh_multitaper_max_fs-1_fmin-0_fmax-0-5,0.996868,64,64
13337,13337,gc_gaussian_k-1_kt-1_l-1_lt-1,tlmi_gaussian,0.991065,64,64
17400,17400,mi_kernel_W-0.25,tlmi_kraskov_NN-4,0.990359,64,64


Top 5 KS for bw=1.25


Unnamed: 0,feature_index,SPI-1,SPI-2,ks_statistic,n_GaussianNoise,n_CML
8866,8866,ddtf_multitaper_max_fs-1_fmin-0-25_fmax-0-5,ddtf_multitaper_max_fs-1_fmin-0_fmax-0-5,0.99658,64,64
7801,7801,dcoh_multitaper_max_fs-1_fmin-0-25_fmax-0-5,dcoh_multitaper_max_fs-1_fmin-0_fmax-0-5,0.995331,64,64
1555,1555,cce_kozachenko,di_gaussian,0.99284,64,64
13337,13337,gc_gaussian_k-1_kt-1_l-1_lt-1,tlmi_gaussian,0.986509,64,64
17630,17630,mi_kraskov_NN-4_DCE,tlmi_kraskov_NN-4_DCE,0.978341,64,64


Top 5 KS for bw=1.5


Unnamed: 0,feature_index,SPI-1,SPI-2,ks_statistic,n_GaussianNoise,n_CML
8866,8866,ddtf_multitaper_max_fs-1_fmin-0-25_fmax-0-5,ddtf_multitaper_max_fs-1_fmin-0_fmax-0-5,0.995318,64,64
7801,7801,dcoh_multitaper_max_fs-1_fmin-0-25_fmax-0-5,dcoh_multitaper_max_fs-1_fmin-0_fmax-0-5,0.993577,64,64
13337,13337,gc_gaussian_k-1_kt-1_l-1_lt-1,tlmi_gaussian,0.981544,64,64
1555,1555,cce_kozachenko,di_gaussian,0.981036,64,64
17630,17630,mi_kraskov_NN-4_DCE,tlmi_kraskov_NN-4_DCE,0.96567,64,64


##### **`GaussianNoise` vs `VAR(1)`**

In [None]:
# Load a result CSV and show the five features with the largest KS statistic.
# Lift pandas' row/column display limits (these are global, session-wide options).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Build the path with pathlib instead of hard-coded backslashes: the backslash
# form only resolves on Windows, while this resolves to the same file on every OS.
# NOTE(review): this directory is spelled "non-overlapping_distributions"
# (underscore) whereas `out_dir` in the KDE cell uses
# "non-overlapping-distributions" (hyphen) — confirm which one is intended.
ks_csv_path = Path("analysis") / "non-overlapping_distributions" / "ks_GaussianNoise_VAR(1).csv"
ks_df = pd.read_csv(ks_csv_path)
ks_df.sort_values("ks_statistic", ascending=False).head(5)

Unnamed: 0,feature_index,SPI-1,SPI-2,ks_statistic,n_GaussianNoise,n_VAR(1)
8827,8827,dcoh_multitaper_mean_fs-1_fmin-0_fmax-0-5,sgc_parametric_max_fs-1_fmin-1e-05_fmax-0-5_or...,1.0,64,64
22045,22045,prec_LedoitWolf,sgc_parametric_mean_fs-1_fmin-1e-05_fmax-0-5_o...,1.0,64,64
22102,22102,prec_OAS,sgc_parametric_mean_fs-1_fmin-0-25_fmax-0-5_or...,1.0,64,64
22099,22099,prec_OAS,sgc_parametric_max_fs-1_fmin-1e-05_fmax-0-5_or...,1.0,64,64
22096,22096,prec_OAS,sgc_parametric_max_fs-1_fmin-0_fmax-0-25_order...,1.0,64,64
22093,22093,prec_OAS,sgc_parametric_max_fs-1_fmin-0-25_fmax-0-5_ord...,1.0,64,64
14872,14872,hsic,sgc_parametric_max_fs-1_fmin-1e-05_fmax-0-5_or...,1.0,64,64
22062,22062,prec_LedoitWolf,xcorr-sq_max_sig-True,1.0,64,64
6103,6103,cov-sq_GraphicalLasso,sgc_parametric_mean_fs-1_fmin-0_fmax-0-25_orde...,1.0,64,64
22050,22050,prec_LedoitWolf,spearmanr-sq,1.0,64,64
