Table maker

In [3]:
import json
from pathlib import Path

import pandas as pd
from matplotlib import pyplot as plt

# Directory that contains the JSON result files.
results_dir = Path(".", "results_minimal")
# The scaler name is the second "_"-separated token of each result file name.
# Parse the file stem (not the full path string) so that the token position does
# not depend on underscores in the directory name or on the ".json" suffix.
scaling_transformations = {file.stem.split("_")[1] for file in results_dir.glob("*.json")}
scaling_transformations

{'MaxAbsScaler',
 'MinMaxScaler',
 'QuantileTransformer',
 'RobustScaler',
 'StandardScaler'}

Table with results for each scaler

In [4]:
# Column order of the per-scaler summary CSVs. Keep it stable: the CSV layout is
# consumed by the table-generation code that reads these files back.
SUMMARY_COLUMNS = [
    "classifier",
    "maxdiff",
    "mindiff",
    "meandiff",
    "stddiff",
    "bestdiff",
    "bestcorrect",
    "meancorrect",
    "worstcorrect",
]


def _summarise_result(result_path):
    """Summarise one JSON result file as a single row of the scaler table.

    The file is expected to map each seed (a key convertible with ``int``) to a
    dict whose ``["bcc"]["correct"]`` and ``["bcc"]["leakage"]`` entries hold the
    two scores that are compared.

    Parameters
    ----------
    result_path : pathlib.Path
        Path of the JSON result file; the fifth "_"-separated token of its stem
        is used as the classifier name.

    Returns
    -------
    dict
        One value per entry of ``SUMMARY_COLUMNS``.
    """
    # Only hold the file open while parsing it.
    with open(result_path, "r", encoding="utf-8") as f:
        clf_results = json.load(f)

    per_seed = pd.DataFrame(
        {
            "seed": [int(seed) for seed in clf_results],
            "bcc_correct": [run["bcc"]["correct"] for run in clf_results.values()],
            "bcc_leakage": [run["bcc"]["leakage"] for run in clf_results.values()],
        }
    )
    # Per-seed difference between the "leakage" and the "correct" score.
    diff = per_seed["bcc_leakage"] - per_seed["bcc_correct"]

    return {
        "classifier": result_path.stem.split("_")[4],
        "maxdiff": diff.max(),
        "mindiff": diff.min(),
        "meandiff": diff.mean(),
        "stddiff": diff.std(),
        # Difference of the two best scores (not the best per-seed difference).
        "bestdiff": per_seed["bcc_leakage"].max() - per_seed["bcc_correct"].max(),
        "bestcorrect": per_seed["bcc_correct"].max(),
        "meancorrect": per_seed["bcc_correct"].mean(),
        "worstcorrect": per_seed["bcc_correct"].min(),
    }


for scaler in scaling_transformations:
    # Sort the matches so the row order of the CSV is reproducible instead of
    # following the filesystem's listing order.
    rows = [_summarise_result(path) for path in sorted(results_dir.glob(f"*{scaler}*.json"))]
    # Write each CSV once, after all classifiers were collected; nothing is
    # written when no result file matches the scaler.
    if rows:
        pd.DataFrame(rows, columns=SUMMARY_COLUMNS).to_csv(f"scaler_{scaler}.csv")

Unnamed: 0.1,Unnamed: 0,classifier,maxdiff,mindiff,meandiff,stddiff,bestdiff,bestcorrect,meancorrect,worstcorrect
0,0,adaboost,0.0,0.0,0.0,0.0,0.0,0.879339,0.813902,0.751368
1,1,dt,0.0,0.0,0.0,0.0,0.0,0.774359,0.655292,0.562936
2,2,gaussianNB,0.0,0.0,0.0,0.0,0.0,0.708333,0.655398,0.588788
3,3,gaussian,0.01777,-0.015768,4e-06,0.004012,0.0,0.881159,0.82052,0.747188
4,4,knn,0.042629,-0.034776,0.001746,0.011062,-0.011811,0.847456,0.766959,0.701185
5,5,lda,0.0,0.0,0.0,0.0,0.0,0.8716,0.822992,0.745961
6,6,mlp,0.083005,-0.06748,-0.000927,0.022519,-0.017961,0.862918,0.781438,0.707452
7,7,qda,0.022325,-0.016618,0.000564,0.005075,0.005435,0.724034,0.668456,0.604277
8,8,rf,0.006618,-0.006243,-9e-06,0.000631,0.0,0.870128,0.801635,0.733758
9,9,svm,0.031872,-0.019871,0.000953,0.005715,0.000374,0.886424,0.829541,0.761733


In [16]:
# Opening of every table: table float, centring, and a five-column tabular
# (one left-aligned column followed by four centred ones).
TABLE_START = \
"""
\\begin{table}[]
\t\\centering
\t\\begin{tabular}{lcccc}
"""

# Maps the classifier identifiers stored in the CSV files to the LaTeX text
# printed in the first table column.
estimator_names = {
    "adaboost": "AdaBoost",
    "dt": "\\gls{DT}",
    "gaussianNB": "Gaussian \\gls{NB}",
    "gaussian": "\\gls{GP}",
    "knn": "\\gls{KNN}",
    "lda": "\\gls{LDA}",
    "mlp": "\\gls{MLP}",
    "qda": "\\gls{QDA}",
    "rf": "\\gls{RF}",
    "svm": "\\gls{SVM}",
}

# Maps CSV column names to table headings. The keys (in this order) are also
# the columns that end up in the table.
col_names = {
    "classifier": "Estimator",
    "maxdiff": "MAX",
    "mindiff": "MIN",
    "meandiff": "$\\mu$",
    "stddiff": "$\\sigma$",
}

# Closing of every table. This is a str.format template (hence the doubled
# braces) that receives the transformer name for the caption and the label.
table_end = \
"""
\t\\end{{tabular}}
\t\\caption{{Data leakage results for {transformer}}}
\t\\label{{tab:{transformer_label}}}
\\end{{table}}
"""

table_columns = list(col_names)

# Build one table per transformer and concatenate them into a single document.
table_latex = ""
for transformer in ["MaxAbsScaler", "MinMaxScaler", "QuantileTransformer", "RobustScaler", "StandardScaler"]:
    # Select the columns by name rather than by position, so that any number of
    # leading index columns in the CSV ("Unnamed: 0", "Unnamed: 0.1", ...) is
    # ignored instead of shifting the selection.
    data = pd.read_csv(f"scaler_{transformer}.csv")[table_columns]

    # Header: toprule, column headings, midrule.
    table_header = "\t\t\\toprule\n"
    table_header += "\t\t" + " & ".join(col_names[column] for column in table_columns) + " \\\\\n"
    table_header += "\t\t\\midrule"

    # One LaTeX row per classifier: encoded estimator name followed by the
    # values formatted as floats with 3 decimals.
    latex_rows = []
    for classifier, *values in data.itertuples(index=False, name=None):
        cells = [estimator_names[classifier]] + [f"{val:.3f}" for val in values]
        latex_rows.append(" & ".join(cells))

    # Rows are separated by a LaTeX line break; the last row is closed right
    # before the bottomrule.
    table_body = "\n\t\t" + " \\\\\n\t\t".join(latex_rows)
    table_body += "\\\\\n\t\t\\bottomrule"

    table_latex += (
        TABLE_START
        + table_header
        + table_body
        + table_end.format(
            transformer=transformer,
            transformer_label=transformer.replace(" ", "_").lower(),
        )
    )

with open("latex_tables.tex", "w") as f:
    f.write(table_latex)