Table maker

In [60]:
import json
from pathlib import Path

import pandas as pd
from matplotlib import pyplot as plt

results_dir = Path(".", "svd_results_minimal")
# results_stratified_dir = Path(".", "results_minimal_stratified")
datasets = ["svd", "voiced"]
scaling_transformations = set([str(file.name).split("_")[1] for file in list(results_dir.glob("*.json"))])
scaling_transformations

set()

Table with results for each scaler

In [56]:
# Iterate through datasets
for dataset in datasets:
    results_dir = Path(".", f"{dataset}_results_minimal")
    results_stratified_dir = Path(".", f"{dataset}_results_minimal_stratified")
    # Iterate through the scaler types
    for scaler in scaling_transformations:
        # Iterate through non-stratified and stratified results
        for split_type, split_type_name in zip([results_dir, results_stratified_dir], ["nonstratified", "stratified"]):
            scaler_results = pd.DataFrame(data=None, columns=None, index=["max", "min", "mean", "std"])
            # Iterate through individual model results for the given scaler type and split type
            for result in split_type.glob(f"*{scaler}*"):
                # Load the result
                data = pd.read_json(result).transpose()
                # Get the leakage and correct BCC values
                data = pd.json_normalize(data["bcc"])
                # Calculate the difference
                data["diff"] = data["leakage"] - data["correct"]
                # Get the metrics
                col = data[["diff"]].agg(func=["max", "min", "mean", "std"])
                # Name the column with the name of the classifier
                col.columns = [result.stem.split("_")[-1]]
                # Concatenate the column with the performance of previous results
                scaler_results = pd.concat([scaler_results, col], axis=1)
            # Transpose the results so classifiers are rows
            scaler_results = scaler_results.transpose().reset_index().rename(columns={"index": "classifier"})
            # Once all results for the split type and scaler are added, save them
            print(f"scaler_{scaler}_split_{split_type_name}_dataset_{dataset}.csv")
            scaler_results.to_csv(f"scaler_{scaler}_split_{split_type_name}_dataset_{dataset}.csv", index=False)


In [57]:
# Code for table setup
TABLE_START = \
"""
\\begin{table}[t]
\t\\centering
\t\\begin{tabular}{llcccccccc}
\t\t\\toprule
\t\t & & \\multicolumn{4}{c}{Unstratified split} & \\multicolumn{4}{c}{Stratified split} \\\\
"""

# Encoding estimator names
estimator_names = {
    "adaboost":"\\gls{AB}",
    "dt": "\\gls{DT}",
    "gaussianNB": "\\gls{GNB}",
    "process": "\\gls{GP}",
    "knn": "\\gls{KNN}",
    "lda": "\\gls{LDA}",
    "mlp": "\\gls{MLP}",
    "qda": "\\gls{QDA}",
    "rf": "\\gls{RF}",
    "svm": "\\gls{SVM}"
    }

# Encoding column names
col_names = {
    "classifier": "Model",
    "max": "Max",
    "min": "Min",
    "mean": "$\\mu$",
    "std": "$\\sigma$",
}
# Code for table end -> needs to be formatted to add the name of the tranformer
table_end = \
"""
\t\\end{{tabular}}
\t\\caption{{Data leakage results for {transformer}}}
\t\\label{{tab:{transformer_label}}}
\\end{{table}}
"""

# Getting the name of the transformer
table_latex = ""
for transformer in scaling_transformations:
    # Loading data
    table_body = ""
    for index, dataset in enumerate(datasets):
        data = pd.read_csv(f"scaler_{transformer}_split_nonstratified_dataset_{dataset}.csv")
        data_stratified = pd.read_csv(f"scaler_{transformer}_split_stratified_dataset_{dataset}.csv")
        if index == 0:
            # Creating a table header from the dataframe column names for the first dataset
            table_body += "\t\t & Model & " + " & ".join( [col_names[x] for x in data.columns.to_list()[1:]] * 2)  + " \\\\\n"
        else:
            # For the second dataset, only line break is added
            table_body += " \\\\\n"
        # Adding a midrule
        table_body += "\t\t\\midrule\n"
        # Adding the dataset name -> because of formatted print, all curly brackets in the string have to be doubled
        table_body += f"\t\t\\multirow{{10}}{{*}}{{\\rotatebox[origin=c]{{90}}{{\\gls{{{dataset.upper()}}}}}}}"
        # Creating the table body with values for each model
        for row1, row2 in zip(data.iterrows(), data_stratified.iterrows()):
          # Saving only the values (discarding indices) as a list
          row_as_list = row1[1].to_list() + row2[1].to_list()[1:]
          # Formatting the values as floats with 3 decimals
          row_formatted = [estimator_names[row_as_list[0]]] + [f"{val:.3f}" for val in row_as_list[1:]]
          # Creating a latex code for the row with values
          if row1[0] == 0:
            table_row = " & " + " & ".join(row_formatted)
          else:
            table_row = " \\\\\n\t\t & " + " & ".join(row_formatted)
          # Adding the code to the existing table body
          table_body += table_row
        # Adding a bottomrule
    table_body += " \\\\\n\t\t\\bottomrule"

    #
    table_latex += TABLE_START + table_body + table_end.format(transformer=transformer, transformer_label=transformer.replace(" ","_").lower())

with open("latex_tables.tex", "w") as f:
  f.write(table_latex)

In [58]:
svd = pd.read_csv("data/flattened_features.csv")
voiced = pd.read_csv("data/voiced_features_8000_fft.csv")

In [59]:
svd_cols = set(svd.columns.to_list())
voiced_cols = set(voiced.columns.to_list())
svd_cols - voiced_cols

{'spectral_contrast_7'}