In [1]:
# Compare the feature columns of the SVD and VOICED feature tables
import pandas as pd

svd = pd.read_csv("data/flattened_features.csv")
voiced = pd.read_csv("data/voiced_features_8000_fft.csv")

# Column names as sets, so the comparison does not depend on column order
svd_cols, voiced_cols = (set(frame.columns.to_list()) for frame in (svd, voiced))

# Columns that exist in the SVD table but are missing from the VOICED table
# (one-directional: columns that exist only in VOICED are not reported here)
svd_cols.difference(voiced_cols)

{'spectral_contrast_7'}

In [2]:
# Check whether the freshly generated LaTeX tables are identical to the previous version
def _read_text(path):
    """Return the whole content of the text file at ``path``."""
    with open(path, "r") as handle:
        return handle.read()


content1 = _read_text("latex_tables.tex")
content2 = _read_text("latex_tables_old.tex")

# True only when both files match character for character
content1 == content2

False

In [3]:
# Normality check of the paired score differences (leakage - correct) of the "bcc" metric
# for every dataset / split / scaling transformer / model combination.
import pandas as pd
from scipy.stats import wilcoxon, normaltest
from pathlib import Path
datasets = ["svd", "voiced"]

# The result files are enumerated once from "svd_results"; the same file names are then
# looked up in every "<dataset>_results" and "<dataset>_results_stratified" directory.
# Sorted because Path.glob yields entries in arbitrary order, which would otherwise make
# the order of the printed rows depend on the filesystem.
filename_list = sorted(file.name for file in Path("svd_results").glob("*.json"))


def parse_result_filename(name):
    """Split a result file name into ``(transformer, model)``.

    The transformer is the second "_"-separated token of ``name``; the model is the
    text between "_1000_" and the first ".".

    Raises:
        IndexError: if ``name`` contains no "_" or no "_1000_" marker.
    """
    model = name.split(".")[0].split("_1000_")[1]
    transformer = name.split("_")[1]
    return transformer, model


def load_bcc_diff(path):
    """Load one result file and return its "bcc" records with a ``diff`` column.

    The JSON file is read, transposed, and the entries of its "bcc" column are flattened
    with ``pd.json_normalize``. The returned frame gets ``diff = leakage - correct``.
    """
    data = pd.read_json(path).transpose()
    data = pd.json_normalize(data["bcc"])
    data["diff"] = data["leakage"] - data["correct"]
    return data


def normality_p_value(diff):
    """Return the p-value of ``scipy.stats.normaltest`` for the series ``diff``.

    ``normaltest`` is undefined for zero-variance input (it yields nan and emits
    warnings), so constant differences are reported as nan explicitly. The printed
    value is the same as before; only the warnings are avoided.
    """
    if diff.min() == diff.max():
        return float("nan")
    _, p_value = normaltest(diff)
    return p_value


# Only filled by the disabled Wilcoxon step below; stays empty while that step is commented out.
results_table = pd.DataFrame(data=[], columns=["dataset", "transformer", "model", "p_value"])
for dataset in datasets:
    for name in filename_list:
        transformer, model = parse_result_filename(name)
        path_random = Path(f"{dataset}_results", name)
        path_stratified = Path(f"{dataset}_results_stratified", name)
        data_random = load_bcc_diff(path_random)
        data_stratified = load_bcc_diff(path_stratified)

        for split, data in zip(["random", "stratified"], [data_random, data_stratified]):
            # Test of normality - are the differences drawn from a normal distribution?
            p_value = normality_p_value(data["diff"])
            print(f"{dataset:<6} {split:<10} {transformer:<20} {model:<20} {p_value:.3f}")
        # Disabled follow-up: Wilcoxon signed-rank test between the random-split and the
        # stratified-split differences. Note that it tests whether the paired differences are
        # symmetric about zero (a median-type statistic), not whether the means are equal.
        # NOTE(review): it pairs row i of the random split with row i of the stratified split -
        # confirm the rows really are matched before re-enabling.
        # _, p_value = wilcoxon(data_random["diff"], data_stratified["diff"])
        # results_table.loc[results_table.shape[0], :] = [dataset, transformer, model, p_value]

# The p-values are not consistent across models (some are nan, some reject normality, some do not),
# so we cannot decide on the normality of the data or on the differences in results between the
# random split and the stratified split.

svd    random     MaxAbsScaler         adaboost             nan
svd    stratified MaxAbsScaler         adaboost             nan
svd    random     MaxAbsScaler         dt                   nan
svd    stratified MaxAbsScaler         dt                   nan
svd    random     MaxAbsScaler         gaussianNB           nan
svd    stratified MaxAbsScaler         gaussianNB           nan
svd    random     MaxAbsScaler         gaussian_process     0.000
svd    stratified MaxAbsScaler         gaussian_process     0.000
svd    random     MaxAbsScaler         knn                  0.049
svd    stratified MaxAbsScaler         knn                  0.046
svd    random     MaxAbsScaler         lda                  nan
svd    stratified MaxAbsScaler         lda                  nan
svd    random     MaxAbsScaler         mlp                  0.198
svd    stratified MaxAbsScaler         mlp                  0.633
svd    random     MaxAbsScaler         qda                  0.000
svd    stratified MaxAbsSc

In [4]:
# Testing the significance of the mean paired difference (leakage - correct) of the "bcc" metric.
# A permutation test is used because we assume the results do not follow a normal distribution.
import pandas as pd
import numpy as np
from scipy.stats import permutation_test
from pathlib import Path


def stat_func(a, b):
    """Test statistic: mean of the element-wise paired differences ``a - b``."""
    return np.mean(a - b)


def paired_permutation_test(leakage, correct):
    """Run the two-sided paired permutation test on ``leakage`` versus ``correct``.

    ``permutation_type='samples'`` permutes the sample membership of each paired
    observation, which is the scheme SciPy documents for paired-sample tests.
    ``random_state=0`` makes the resampling reproducible.

    Returns:
        The SciPy result object; its ``pvalue`` attribute holds the p-value.
    """
    return permutation_test((leakage, correct), statistic=stat_func,
                            permutation_type='samples', alternative='two-sided',
                            vectorized=False, n_resamples=10000, random_state=0)


def load_bcc_scores(path):
    """Read one result file and flatten the entries of its "bcc" column into a DataFrame."""
    data = pd.read_json(path).transpose()
    return pd.json_normalize(data["bcc"])


datasets = ["svd", "voiced"]
# File names are taken from "svd_results" and reused for every dataset directory.
# Sorted so that row order does not depend on the filesystem's directory order.
filename_list = sorted(file.name for file in Path("svd_results").glob("*.json"))

result_columns = ["dataset", "transformer", "model", "split", "p_value"]
# Rows are collected in a plain list and turned into a DataFrame once at the end.
# Enlarging a DataFrame row by row with .loc is quadratic and leaves every column as object dtype.
records = []
for dataset in datasets:
    for name in filename_list:
        model = name.split(".")[0].split("_1000_")[1]
        transformer = name.split("_")[1]
        path_random = Path(f"{dataset}_results", name)
        path_stratified = Path(f"{dataset}_results_stratified", name)
        data_random = load_bcc_scores(path_random)
        data_stratified = load_bcc_scores(path_stratified)

        for split, data in zip(["random", "stratified"], [data_random, data_stratified]):
            result = paired_permutation_test(data["leakage"], data["correct"])
            records.append([dataset, transformer, model, split, result.pvalue])
            print(f"{dataset:<6} {split:<10} {transformer:<20} {model:<20} {result.pvalue:.3f}")

results_table = pd.DataFrame(records, columns=result_columns)



svd    random     MaxAbsScaler         adaboost             1.000
svd    stratified MaxAbsScaler         adaboost             1.000
svd    random     MaxAbsScaler         dt                   1.000
svd    stratified MaxAbsScaler         dt                   1.000
svd    random     MaxAbsScaler         gaussianNB           1.000
svd    stratified MaxAbsScaler         gaussianNB           1.000
svd    random     MaxAbsScaler         gaussian_process     0.000
svd    stratified MaxAbsScaler         gaussian_process     0.000
svd    random     MaxAbsScaler         knn                  0.347
svd    stratified MaxAbsScaler         knn                  0.274
svd    random     MaxAbsScaler         lda                  1.000
svd    stratified MaxAbsScaler         lda                  1.000
svd    random     MaxAbsScaler         mlp                  0.669
svd    stratified MaxAbsScaler         mlp                  0.838
svd    random     MaxAbsScaler         qda                  0.062
svd    str

In [6]:
# Persist the permutation-test results (without the index column) so a later cell can reload them
permutation_csv = "permutation_pair_test_mean.csv"
results_table.to_csv(permutation_csv, index=False)

In [8]:
# Reload the stored permutation-test results
results_table = pd.read_csv("permutation_pair_test_mean.csv")

# Count the rows with p < 0.05 for each (dataset, transformer) pair, spread the counts into a
# transformer x dataset table (missing combinations become 0) and write that table to CSV.
(
    results_table.loc[results_table["p_value"] < 0.05]
    .groupby(["dataset", "transformer"])["p_value"]
    .count()
    .reset_index()
    .pivot(index="transformer", columns=["dataset"], values="p_value")
    .fillna(0)
    .astype(int)
    .to_csv("pivot_permutation_mean.csv")
)

In [9]:
# Show only the rows whose permutation-test p-value is below the 0.05 threshold
results_table.loc[results_table["p_value"].lt(0.05)]

Unnamed: 0,dataset,transformer,model,split,p_value
6,svd,MaxAbsScaler,gaussian_process,random,0.0002
7,svd,MaxAbsScaler,gaussian_process,stratified,0.0002
18,svd,MaxAbsScaler,svm,random,0.0002
19,svd,MaxAbsScaler,svm,stratified,0.0002
26,svd,MinMaxScaler,gaussian_process,random,0.0002
27,svd,MinMaxScaler,gaussian_process,stratified,0.0002
28,svd,MinMaxScaler,knn,random,0.0002
29,svd,MinMaxScaler,knn,stratified,0.0002
34,svd,MinMaxScaler,qda,random,0.0004
35,svd,MinMaxScaler,qda,stratified,0.006799
