Table maker

In [21]:
import json
from pathlib import Path

import pandas as pd
from matplotlib import pyplot as plt

# Directory holding the JSON result files, one per (scaler, classifier) experiment.
results_dir = Path(".", "results_minimal_stratified")

# The scaler name is the second "_"-separated field of each result file name.
# Parse the file stem instead of the full path string: splitting the full path
# only worked because the directory name itself contains exactly two underscores,
# so renaming or moving the directory would silently pick the wrong field.
scaling_transformations = {
    result_file.stem.split("_")[1] for result_file in results_dir.glob("*.json")
}
scaling_transformations

{'MaxAbsScaler',
 'MinMaxScaler',
 'QuantileTransformer',
 'RobustScaler',
 'StandardScaler'}

Table with results for each scaler

In [22]:
# Columns of the per-scaler summary table, in the order they are written to CSV.
SUMMARY_COLUMNS = (
    "classifier",
    "maxdiff",
    "mindiff",
    "meandiff",
    "stddiff",
    "bestdiff",
    "bestcorrect",
    "meancorrect",
    "worstcorrect",
)


def summarize_result_file(result_path):
    """Summarize the per-seed scores stored in one JSON result file.

    The file is read as a mapping from seed (a key convertible to ``int``) to a
    record exposing ``["bcc"]["correct"]`` and ``["bcc"]["leakage"]``.
    NOTE(review): presumably these are the scores of the leakage-free and the
    leaky pipeline for that seed -- confirm against the experiment code.

    Parameters
    ----------
    result_path : pathlib.Path
        Path of the JSON file; the classifier name is taken from the fifth
        "_"-separated field of its stem.

    Returns
    -------
    dict
        One value per entry of ``SUMMARY_COLUMNS``. The ``*diff`` statistics
        describe ``leakage - correct`` across seeds; ``bestdiff`` compares the
        best leakage score with the best correct score.
    """
    # Close the file as soon as it is parsed; the statistics do not need it.
    with open(result_path, "r") as f:
        clf_results = json.load(f)

    pd_data = pd.DataFrame(
        {
            "seed": [int(seed) for seed in clf_results],
            "bcc_correct": [run["bcc"]["correct"] for run in clf_results.values()],
            "bcc_leakage": [run["bcc"]["leakage"] for run in clf_results.values()],
        }
    )
    diff = pd_data["bcc_leakage"] - pd_data["bcc_correct"]

    return {
        "classifier": result_path.stem.split("_")[4],
        "maxdiff": diff.max(),
        "mindiff": diff.min(),
        "meandiff": diff.mean(),
        "stddiff": diff.std(),
        "bestdiff": pd_data["bcc_leakage"].max() - pd_data["bcc_correct"].max(),
        "bestcorrect": pd_data["bcc_correct"].max(),
        "meancorrect": pd_data["bcc_correct"].mean(),
        "worstcorrect": pd_data["bcc_correct"].min(),
    }


# Retained from the original cell; nothing in this cell fills it.
pandas_tables = []
for scaler in scaling_transformations:
    scaler_table = {column: [] for column in SUMMARY_COLUMNS}
    # Sort the matches so the row order of the CSV does not depend on the
    # order in which the filesystem happens to list the files.
    for result in sorted(results_dir.glob(f"*{scaler}*")):
        summary = summarize_result_file(result)
        for column in SUMMARY_COLUMNS:
            scaler_table[column].append(summary[column])

    # Write each scaler's table once, after all of its result files are
    # processed, instead of rewriting the CSV after every single file.
    pd.DataFrame(scaler_table).to_csv(f"scaler_{scaler}.csv")

In [23]:
# Preview of the summary table left over from the last iteration of the scaler
# loop in the previous cell. `scaling_transformations` is a set, so which scaler
# that is depends on the set's iteration order.
pd.DataFrame(scaler_table)

Unnamed: 0,classifier,maxdiff,mindiff,meandiff,stddiff,bestdiff,bestcorrect,meancorrect,worstcorrect
0,adaboost,0.003788,-0.003788,1e-05,0.000277,0.0,0.867115,0.813267,0.740414
1,dt,0.176098,-0.141698,0.017324,0.055235,0.013992,0.784246,0.620505,0.503092
2,gaussianNB,0.011441,-0.010127,0.000141,0.003295,0.0,0.750309,0.679216,0.619357
3,gaussian,0.01778,-0.022805,-0.001031,0.006503,0.005025,0.701917,0.632138,0.561224
4,knn,0.025356,-0.024041,0.000991,0.008156,0.007189,0.763142,0.697465,0.622372
5,lda,0.027829,-0.034168,0.000494,0.008284,-0.003788,0.855442,0.790537,0.718847
6,mlp,0.034246,-0.044372,9.2e-05,0.011103,-0.006107,0.838899,0.767466,0.684292
7,qda,0.032931,-0.035328,-0.000554,0.010017,-0.013915,0.771877,0.679933,0.602118
8,rf,0.011441,-0.012678,2.3e-05,0.003192,0.0,0.867038,0.801496,0.736704
9,svm,0.020176,-0.01778,0.000504,0.004831,0.0,0.840445,0.774032,0.692177


In [16]:
# Opening lines shared by every generated table. This is a plain string (it is
# never passed to str.format), so its braces are written singly.
TABLE_START = \
"""
\\begin{table}[]
\t\\centering
\t\\begin{tabular}{lcccc}
"""

# Maps the classifier keys found in the CSV files to their LaTeX display names.
estimator_names = {
    "adaboost": "AdaBoost",
    "dt": "\\gls{DT}",
    "gaussianNB": "Gaussian \\gls{NB}",
    "gaussian": "\\gls{GP}",
    "knn": "\\gls{KNN}",
    "lda": "\\gls{LDA}",
    "mlp": "\\gls{MLP}",
    "qda": "\\gls{QDA}",
    "rf": "\\gls{RF}",
    "svm": "\\gls{SVM}",
}

# Maps the CSV columns that go into the table to their LaTeX column headers.
# The key order is the column order of the generated table.
col_names = {
    "classifier": "Estimator",
    "maxdiff": "MAX",
    "mindiff": "MIN",
    "meandiff": "$\\mu$",
    "stddiff": "$\\sigma$",
}

# Closing lines of a table. This template is filled with str.format, so literal
# braces are doubled and {transformer} / {transformer_label} are placeholders.
table_end = \
"""
\t\\end{{tabular}}
\t\\caption{{Data leakage results for {transformer}}}
\t\\label{{tab:{transformer_label}}}
\\end{{table}}
"""

# Scalers whose "scaler_<name>.csv" summaries are rendered, in output order.
TRANSFORMERS = [
    "MaxAbsScaler",
    "MinMaxScaler",
    "QuantileTransformer",
    "RobustScaler",
    "StandardScaler",
]


def format_table_row(row):
    """Return the LaTeX cells of one table row joined with " & ".

    ``row`` is a sequence whose first item is a classifier key of
    ``estimator_names`` and whose remaining items are numbers, which are
    rendered with three decimals. An unknown classifier key raises ``KeyError``.
    """
    classifier, *values = row
    return " & ".join([estimator_names[classifier]] + [f"{val:.3f}" for val in values])


latex_tables = []
for transformer in TRANSFORMERS:
    # Select the columns by name rather than by position, so the table does not
    # depend on where the columns (or the saved index) sit in the CSV file.
    data = pd.read_csv(f"scaler_{transformer}.csv", usecols=list(col_names))[list(col_names)]

    # Header: top rule, encoded column names, mid rule.
    table_header = (
        "\t\t\\toprule\n"
        + "\t\t" + " & ".join(col_names[column] for column in data.columns) + " \\\\\n"
        + "\t\t\\midrule"
    )

    # Body: one line per classifier. Joining the rows puts the LaTeX row
    # terminator between them without relying on the index label of the first row.
    rows = [format_table_row(row) for row in data.itertuples(index=False, name=None)]
    table_body = "\n\t\t" + " \\\\\n\t\t".join(rows)
    # Terminate the last row and add the bottom rule.
    table_body += "\\\\\n\t\t\\bottomrule"

    latex_tables.append(
        TABLE_START
        + table_header
        + table_body
        + table_end.format(
            transformer=transformer,
            transformer_label=transformer.replace(" ", "_").lower(),
        )
    )

# All tables concatenated into a single LaTeX snippet.
table_latex = "".join(latex_tables)

with open("latex_tables.tex", "w", encoding="utf-8") as f:
    f.write(table_latex)