# **Merge results**

This notebook merges the results of all notebooks/datasets into one table per evaluation method. Each table additionally gets an "Average" row containing the mean over all datasets.

**Difference in pairwise correlation:**

In [2]:
import pandas as pd

In [10]:
# Build the wide "difference in pairwise correlation" table: one row per
# dataset, one column per model, plus a final 'Average' row.
corr_diff = pd.read_csv('../data/results/tables/corr_diff.csv')
corr_diff_wide = corr_diff.pivot(index='Dataset', columns='Model', values='Pairwise Corr Diff')

# Per-model mean over all datasets, appended before rounding so the average
# is computed from the unrounded values.
avg_row = corr_diff_wide.mean(axis=0)
corr_diff_wide.loc['Average'] = avg_row
corr_diff_wide = corr_diff_wide.round(3)
corr_diff_wide.to_csv('../final_results/combined_tables/corr_diff_wide.csv')

# Print as latex table
print(corr_diff_wide.to_latex(
    index=True,                 # Print row names
    float_format="%.3f",        # Set precision
    escape=True,                # Escape special characters
    bold_rows=True,             # Bold row names
    # One left-aligned column for the row names plus one centred column per
    # model, so the spec stays valid if the number of models changes.
    column_format="l" + "c" * corr_diff_wide.shape[1],
    header=True,                # Print column names
    label="tab:corr_diff",      # Set label
    caption="Difference in Pairwise Correlation",  # Set caption
))
corr_diff_wide


\begin{table}
\caption{Difference in Pairwise Correlation}
\label{tab:corr_diff}
\begin{tabular}{lcccccc}
\toprule
Model & copula\_gan & ctgan & ds & gaussian\_copula & synthpop & tvae \\
Dataset &  &  &  &  &  &  \\
\midrule
\textbf{1} & 0.157 & 0.163 & 0.155 & 0.117 & 0.058 & 0.150 \\
\textbf{2} & 0.049 & 0.070 & 0.052 & 0.041 & 0.026 & 0.179 \\
\textbf{3} & 0.283 & 0.276 & 0.281 & 0.233 & 0.158 & 0.192 \\
\textbf{4} & 0.030 & 0.029 & 0.035 & 0.035 & 0.006 & 0.025 \\
\textbf{5} & 0.091 & 0.091 & 0.133 & 0.086 & 0.065 & 0.107 \\
\textbf{Average} & 0.122 & 0.126 & 0.131 & 0.102 & 0.063 & 0.131 \\
\bottomrule
\end{tabular}
\end{table}



Model,copula_gan,ctgan,ds,gaussian_copula,synthpop,tvae
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.157,0.163,0.155,0.117,0.058,0.15
2,0.049,0.07,0.052,0.041,0.026,0.179
3,0.283,0.276,0.281,0.233,0.158,0.192
4,0.03,0.029,0.035,0.035,0.006,0.025
5,0.091,0.091,0.133,0.086,0.065,0.107
Average,0.122,0.126,0.131,0.102,0.063,0.131


**Jensen-Shannon Divergence**

In [12]:
# Build the wide Jensen-Shannon divergence table: one row per dataset, one
# column per model, plus a final 'Average' row.
jsd = pd.read_csv('../data/results/tables/jsd.csv')
jsd_wide = jsd.pivot(index='Dataset', columns='Model', values='JSD Diff')

# Per-model mean over all datasets, appended before rounding so the average
# is computed from the unrounded values.
avg_row = jsd_wide.mean(axis=0)
jsd_wide.loc['Average'] = avg_row
jsd_wide = jsd_wide.round(3)
jsd_wide.to_csv('../final_results/combined_tables/jsd_wide.csv')

# Print as latex table
print(jsd_wide.to_latex(
    index=True,                 # Print row names
    float_format="%.3f",        # Set precision
    escape=True,                # Escape special characters
    bold_rows=True,             # Bold row names
    # One left-aligned column for the row names plus one centred column per
    # model, so the spec stays valid if the number of models changes.
    column_format="l" + "c" * jsd_wide.shape[1],
    header=True,                # Print column names
    label="tab:jsd",            # Set label
    caption="Jensen-Shannon Divergence",  # Set caption
))

jsd_wide

\begin{table}
\caption{Jensen-Shannon Divergence}
\label{tab:jsd}
\begin{tabular}{lcccccc}
\toprule
Model & copula\_gan & ctgan & ds & gaussian\_copula & synthpop & tvae \\
Dataset &  &  &  &  &  &  \\
\midrule
\textbf{1} & 0.045 & 0.064 & 0.089 & 0.051 & 0.030 & 0.152 \\
\textbf{2} & 0.007 & 0.015 & 0.021 & 0.001 & 0.000 & 0.087 \\
\textbf{3} & 0.182 & 0.168 & 0.201 & 0.185 & 0.040 & 0.196 \\
\textbf{4} & 0.074 & 0.066 & 0.113 & 0.107 & 0.013 & 0.075 \\
\textbf{5} & 0.011 & 0.016 & 0.091 & 0.005 & 0.002 & 0.181 \\
\textbf{Average} & 0.064 & 0.066 & 0.103 & 0.070 & 0.017 & 0.138 \\
\bottomrule
\end{tabular}
\end{table}



Model,copula_gan,ctgan,ds,gaussian_copula,synthpop,tvae
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.045,0.064,0.089,0.051,0.03,0.152
2,0.007,0.015,0.021,0.001,0.0,0.087
3,0.182,0.168,0.201,0.185,0.04,0.196
4,0.074,0.066,0.113,0.107,0.013,0.075
5,0.011,0.016,0.091,0.005,0.002,0.181
Average,0.064,0.066,0.103,0.07,0.017,0.138


**Wasserstein distance**

In [13]:
# Build the wide Wasserstein distance table: one row per dataset, one column
# per model, plus a final 'Average' row.
wd = pd.read_csv('../data/results/tables/wd.csv')
wd_wide = wd.pivot(index='Dataset', columns='Model', values='WD Diff')

# Per-model mean over all datasets, appended before rounding so the average
# is computed from the unrounded values.
avg_row = wd_wide.mean(axis=0)
wd_wide.loc['Average'] = avg_row
wd_wide = wd_wide.round(3)
wd_wide.to_csv('../final_results/combined_tables/wd_wide.csv')

# Print as latex table
print(wd_wide.to_latex(
    index=True,                 # Print row names
    float_format="%.3f",        # Set precision
    escape=True,                # Escape special characters
    bold_rows=True,             # Bold row names
    # One left-aligned column for the row names plus one centred column per
    # model, so the spec stays valid if the number of models changes.
    column_format="l" + "c" * wd_wide.shape[1],
    header=True,                # Print column names
    label="tab:wd",             # Set label
    caption="Wasserstein Distance",  # Set caption
))

wd_wide

\begin{table}
\caption{Wasserstein Distance}
\label{tab:wd}
\begin{tabular}{lcccccc}
\toprule
Model & copula\_gan & ctgan & ds & gaussian\_copula & synthpop & tvae \\
Dataset &  &  &  &  &  &  \\
\midrule
\textbf{1} & 0.027 & 0.014 & 0.055 & 0.015 & 0.021 & 0.120 \\
\textbf{2} & 0.050 & 0.043 & 0.077 & 0.015 & 0.010 & 0.126 \\
\textbf{3} & 0.036 & 0.037 & 0.080 & 0.035 & 0.022 & 0.135 \\
\textbf{4} & 0.023 & 0.019 & 0.016 & 0.005 & 0.002 & 0.026 \\
\textbf{5} & 0.047 & 0.039 & 0.130 & 0.017 & 0.023 & 0.292 \\
\textbf{Average} & 0.037 & 0.030 & 0.072 & 0.018 & 0.015 & 0.140 \\
\bottomrule
\end{tabular}
\end{table}



Model,copula_gan,ctgan,ds,gaussian_copula,synthpop,tvae
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.027,0.014,0.055,0.015,0.021,0.12
2,0.05,0.043,0.077,0.015,0.01,0.126
3,0.036,0.037,0.08,0.035,0.022,0.135
4,0.023,0.019,0.016,0.005,0.002,0.026
5,0.047,0.039,0.13,0.017,0.023,0.292
Average,0.037,0.03,0.072,0.018,0.015,0.14


**Distance to Closest Record**

In [13]:
# Build the long-format DCR table for LaTeX: for every model one row per
# dataset followed by an 'Average' row; the model name is printed only once
# per group.
dcr_test = pd.read_csv('../data/results/tables/dcr.csv')

# Metric columns of the input table, in the order they appear in the output.
dcr_metric_cols = [
    'DCR 5th Percentile',
    'DCR 5th Percentile (within Real)',
    'DCR 5th Percentile (within Synthetic)',
]

# Calculate average values for each model
average_values = dcr_test.groupby('Model')[dcr_metric_cols].mean().reset_index()

final_data = []

for model in dcr_test['Model'].unique():

    model_data = dcr_test[dcr_test['Model'] == model]

    # Only add model name once, on the middle row of the group. Deriving the
    # position from the group size (instead of a fixed index[2]) avoids an
    # IndexError for models with fewer than three datasets; with five
    # datasets this is still the third row.
    label_idx = model_data.index[len(model_data) // 2]

    for idx, row in model_data.iterrows():
        model_label = row['Model'] if idx == label_idx else ''
        final_data.append([model_label, row['Dataset'], *(row[col] for col in dcr_metric_cols)])

    # Add average values
    avg_row = average_values[average_values['Model'] == model].iloc[0]
    final_data.append(['', 'Average', *(avg_row[col] for col in dcr_metric_cols)])

# Create final dataframe
dcr_df = pd.DataFrame(final_data, columns=['Model', 'Dataset', 'Between Synth/Real', 'Within Real', 'Within Synth']).round(3)
# Print as latex table
print(dcr_df.to_latex(
    index=False,                # Do not print the row index
    float_format="%.3f",        # Set precision
    escape=True,                # Escape special characters
    bold_rows=True,             # Bold row names (the row index is hidden here)
    column_format="ccccc",      # Align columns
    header=True,                # Print column names
    label="tab:dcr",            # Set label
    caption="Distance to Closest Record",  # Set caption
))

\begin{table}
\caption{Distance to Closest Record}
\label{tab:dcr}
\begin{tabular}{ccccc}
\toprule
Model & Dataset & Synth/Real (5th perc) & Within Real (5th perc) & Within Synth (5th perc) \\
\midrule
 & 1 & 6.220 & 5.568 & 6.481 \\
 & 2 & 0.000 & 0.000 & 0.000 \\
synthpop & 3 & 6.022 & 6.009 & 5.847 \\
 & 4 & 2.449 & 2.236 & 2.449 \\
 & 5 & 2.828 & 3.000 & 3.000 \\
 & Average & 3.504 & 3.363 & 3.555 \\
 & 1 & 12.153 & 5.568 & 11.314 \\
 & 2 & 0.000 & 0.000 & 0.000 \\
ds & 3 & 47.448 & 6.009 & 28.485 \\
 & 4 & 10.536 & 2.236 & 9.798 \\
 & 5 & 5.099 & 3.000 & 3.427 \\
 & Average & 15.047 & 3.363 & 10.605 \\
 & 1 & 7.507 & 5.568 & 5.385 \\
 & 2 & 0.000 & 0.000 & 0.000 \\
tvae & 3 & 7.246 & 6.009 & 6.156 \\
 & 4 & 5.831 & 2.236 & 5.000 \\
 & 5 & 3.570 & 3.000 & 0.000 \\
 & Average & 4.831 & 3.363 & 3.308 \\
 & 1 & 8.449 & 5.568 & 7.937 \\
 & 2 & 0.000 & 0.000 & 0.000 \\
gaussian\_copula & 3 & 10.515 & 6.009 & 11.069 \\
 & 4 & 13.038 & 2.236 & 12.166 \\
 & 5 & 3.317 & 3.000 & 3.317 \\
 & 

In [5]:
# Wide DCR table: rows are datasets, columns are (metric, model) pairs.
dcr_value_cols = [
    'DCR 5th Percentile',
    'DCR 5th Percentile (within Real)',
    'DCR 5th Percentile (within Synthetic)',
]
dcr = pd.read_csv('../data/results/tables/dcr.csv')
dcr_wide = dcr.pivot(columns='Model', index='Dataset', values=dcr_value_cols)

# Column-wise mean over the datasets, appended as the last row.
avg_row = dcr_wide.mean()
dcr_wide.loc['Average'] = avg_row

# Round only after averaging, then persist and display the result.
dcr_wide = dcr_wide.round(decimals=3)
dcr_wide.to_csv('../final_results/combined_tables/dcr_wide.csv')
dcr_wide

Unnamed: 0_level_0,DCR 5th Percentile,DCR 5th Percentile,DCR 5th Percentile,DCR 5th Percentile,DCR 5th Percentile,DCR 5th Percentile,DCR 5th Percentile (within Real),DCR 5th Percentile (within Real),DCR 5th Percentile (within Real),DCR 5th Percentile (within Real),DCR 5th Percentile (within Real),DCR 5th Percentile (within Real),DCR 5th Percentile (within Synthetic),DCR 5th Percentile (within Synthetic),DCR 5th Percentile (within Synthetic),DCR 5th Percentile (within Synthetic),DCR 5th Percentile (within Synthetic),DCR 5th Percentile (within Synthetic)
Model,copula_gan,ctgan,ds,gaussian_copula,synthpop,tvae,copula_gan,ctgan,ds,gaussian_copula,synthpop,tvae,copula_gan,ctgan,ds,gaussian_copula,synthpop,tvae
Dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
1,12.016,12.457,12.153,8.449,6.22,7.507,5.568,5.568,5.568,5.568,5.568,5.568,11.563,8.832,11.314,7.937,6.481,5.385
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10.583,12.599,47.448,10.515,6.022,7.246,6.009,6.009,6.009,6.009,6.009,6.009,12.87,10.389,28.485,11.069,5.847,6.156
4,6.164,6.325,10.536,13.038,2.449,5.831,2.236,2.236,2.236,2.236,2.236,2.236,5.477,6.325,9.798,12.166,2.449,5.0
5,3.317,3.606,5.099,3.317,2.828,3.57,3.0,3.0,3.0,3.0,3.0,3.0,3.162,3.464,3.427,3.317,3.0,0.0
Average,6.416,6.997,15.047,7.064,3.504,4.831,3.363,3.363,3.363,3.363,3.363,3.363,6.614,5.802,10.605,6.898,3.555,3.308


**Nearest Neighbour Distance Ratio**

In [6]:
# Wide NNDR table: rows are datasets, columns are (metric, model) pairs.
nndr_value_cols = [
    'NNDR 5th percentile',
    'NNDR 5th percentile (within Real)',
    'NNDR 5th percentile (within Synthetic)',
]
nndr = pd.read_csv('../data/results/tables/nndr.csv')
nndr_wide = nndr.pivot(columns='Model', index='Dataset', values=nndr_value_cols)

# Column-wise mean over the datasets, appended as the last row.
avg_row = nndr_wide.mean()
nndr_wide.loc['Average'] = avg_row

# Round only after averaging, then persist and display the result.
nndr_wide = nndr_wide.round(decimals=3)
nndr_wide.to_csv('../final_results/combined_tables/nndr_wide.csv')
nndr_wide

Unnamed: 0_level_0,NNDR 5th percentile,NNDR 5th percentile,NNDR 5th percentile,NNDR 5th percentile,NNDR 5th percentile,NNDR 5th percentile,NNDR 5th percentile (within Real),NNDR 5th percentile (within Real),NNDR 5th percentile (within Real),NNDR 5th percentile (within Real),NNDR 5th percentile (within Real),NNDR 5th percentile (within Real),NNDR 5th percentile (within Synthetic),NNDR 5th percentile (within Synthetic),NNDR 5th percentile (within Synthetic),NNDR 5th percentile (within Synthetic),NNDR 5th percentile (within Synthetic),NNDR 5th percentile (within Synthetic)
Model,copula_gan,ctgan,ds,gaussian_copula,synthpop,tvae,copula_gan,ctgan,ds,gaussian_copula,synthpop,tvae,copula_gan,ctgan,ds,gaussian_copula,synthpop,tvae
Dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
1,0.572,0.558,0.56,0.557,0.439,0.564,0.493,0.493,0.493,0.493,0.493,0.493,0.614,0.523,0.569,0.499,0.449,0.522
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.494,0.513,0.497,0.568,0.426,0.557,0.583,0.583,0.583,0.583,0.583,0.583,0.559,0.393,0.517,0.637,0.449,0.479
4,0.179,0.181,0.223,0.13,0.046,0.188,0.073,0.073,0.073,0.073,0.073,0.073,0.186,0.204,0.216,0.207,0.072,0.18
5,0.837,0.5,0.403,0.799,0.75,0.858,0.771,0.771,0.771,0.771,0.771,0.771,0.832,0.775,0.663,0.794,0.791,0.0
Average,0.416,0.35,0.337,0.411,0.332,0.433,0.384,0.384,0.384,0.384,0.384,0.384,0.438,0.379,0.393,0.427,0.352,0.236


**Random Forest**

In [6]:
# Build the long-format Random Forest table for LaTeX: for every model one
# row per dataset followed by an 'Average' row; the model name is printed
# only once per group.
rf_test = pd.read_csv('../data/results/tables/rf.csv')

# Metric columns of the input table, in the order they appear in the output.
rf_metric_cols = ['Accuracy Difference', 'F1 Score Difference', 'ROC AUC Difference']

# Calculate average values for each model
average_values = rf_test.groupby('Model')[rf_metric_cols].mean().reset_index()

final_data = []

for model in rf_test['Model'].unique():

    model_data = rf_test[rf_test['Model'] == model]

    # Only add model name once, on the middle row of the group. Deriving the
    # position from the group size (instead of a fixed index[2]) avoids an
    # IndexError for models with fewer than three datasets; with five
    # datasets this is still the third row.
    label_idx = model_data.index[len(model_data) // 2]

    for idx, row in model_data.iterrows():
        model_label = row['Model'] if idx == label_idx else ''
        final_data.append([model_label, row['Dataset'], *(row[col] for col in rf_metric_cols)])

    # Add average values
    avg_row = average_values[average_values['Model'] == model].iloc[0]
    final_data.append(['', 'Average', *(avg_row[col] for col in rf_metric_cols)])

# Create final dataframe
rf_df = pd.DataFrame(final_data, columns=['Model', 'Dataset', 'Accuracy Diff', 'F1 Score Diff', 'ROC AUC Diff'])

# Print as latex table
print(rf_df.to_latex(
    index=False,                # Do not print the row index
    float_format="%.3f",        # Set precision
    escape=True,                # Escape special characters
    bold_rows=True,             # Bold row names (the row index is hidden here)
    column_format="ccccc",      # Align columns
    header=True,                # Print column names
    label="tab:rf",             # Set label
    caption="Random Forest",    # Set caption
))



\begin{table}
\caption{Random Forest}
\label{tab:rf}
\begin{tabular}{ccccc}
\toprule
Model & Dataset & Accuracy Diff & F1 Score Diff & ROC AUC Diff \\
\midrule
 & 1 & 0.110 & 0.100 & 0.060 \\
 & 2 & -0.020 & -0.030 & -0.020 \\
synthpop & 3 & -0.040 & -0.040 & 0.010 \\
 & 4 & 0.020 & 0.020 & 0.020 \\
 & 5 & 0.040 & 0.080 & 0.200 \\
 & Average & 0.022 & 0.026 & 0.054 \\
 & 1 & 0.460 & 0.440 & 0.460 \\
 & 2 & 0.030 & 0.050 & 0.040 \\
ds & 3 & 0.080 & 0.090 & 0.290 \\
 & 4 & 0.020 & 0.030 & 0.050 \\
 & 5 & 0.030 & 0.080 & 0.140 \\
 & Average & 0.124 & 0.138 & 0.196 \\
 & 1 & 0.230 & 0.320 & 0.070 \\
 & 2 & 0.000 & -0.020 & -0.060 \\
tvae & 3 & 0.000 & 0.050 & 0.180 \\
 & 4 & 0.060 & 0.040 & 0.030 \\
 & 5 & 0.040 & 0.030 & 0.160 \\
 & Average & 0.066 & 0.084 & 0.076 \\
 & 1 & 0.340 & 0.390 & 0.320 \\
 & 2 & 0.010 & 0.010 & 0.030 \\
gaussian\_copula & 3 & 0.000 & 0.050 & 0.140 \\
 & 4 & 0.060 & 0.090 & 0.110 \\
 & 5 & 0.120 & 0.070 & 0.170 \\
 & Average & 0.106 & 0.122 & 0.154 \\
 & 1 & 0.37

In [19]:
# Wide Random Forest table: rows are datasets, columns are (metric, model)
# pairs, plus a final 'Average' row.
rf = pd.read_csv('../data/results/tables/rf.csv')
rf_wide = rf.pivot(index="Dataset", columns="Model", values=["Accuracy Difference", "F1 Score Difference", "ROC AUC Difference"])

# Per-column mean over all datasets, appended as the last row.
avg_row = rf_wide.mean(axis=0)
rf_wide.loc['Average'] = avg_row

# Round to three decimals like the other combined tables so the exported CSV
# does not carry floating-point noise from the averaging.
rf_wide = rf_wide.round(3)
rf_wide.to_csv('../final_results/combined_tables/rf_wide.csv')

rf_wide

Unnamed: 0_level_0,Accuracy Difference,Accuracy Difference,Accuracy Difference,Accuracy Difference,Accuracy Difference,Accuracy Difference,F1 Score Difference,F1 Score Difference,F1 Score Difference,F1 Score Difference,F1 Score Difference,F1 Score Difference,ROC AUC Difference,ROC AUC Difference,ROC AUC Difference,ROC AUC Difference,ROC AUC Difference,ROC AUC Difference
Model,copula_gan,ctgan,ds,gaussian_copula,synthpop,tvae,copula_gan,ctgan,ds,gaussian_copula,synthpop,tvae,copula_gan,ctgan,ds,gaussian_copula,synthpop,tvae
Dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
1,0.37,0.39,0.46,0.34,0.11,0.23,0.44,0.38,0.44,0.39,0.1,0.32,0.32,0.28,0.46,0.32,0.06,0.07
2,0.05,0.02,0.03,0.01,-0.02,0.0,0.06,0.06,0.05,0.01,-0.03,-0.02,0.06,0.01,0.04,0.03,-0.02,-0.06
3,0.0,0.0,0.08,0.0,-0.04,0.0,0.05,0.05,0.09,0.05,-0.04,0.05,0.21,0.28,0.29,0.14,0.01,0.18
4,0.03,0.04,0.02,0.06,0.02,0.06,0.06,0.03,0.03,0.09,0.02,0.04,0.04,0.02,0.05,0.11,0.02,0.03
5,0.0,0.3,0.03,0.12,0.04,0.04,0.08,0.27,0.08,0.07,0.08,0.03,0.26,0.24,0.14,0.17,0.2,0.16
Average,0.09,0.15,0.124,0.106,0.022,0.066,0.138,0.158,0.138,0.122,0.026,0.084,0.178,0.166,0.196,0.154,0.054,0.076


**Logistic Regression**

In [5]:
# Build the long-format Logistic Regression table for LaTeX: for every model
# one row per dataset followed by an 'Average' row; the model name is printed
# only once per group.
lr_test = pd.read_csv('../data/results/tables/lr.csv')

# Metric columns of the input table, in the order they appear in the output.
lr_metric_cols = ['Accuracy Difference', 'F1 Score Difference', 'ROC AUC Difference']

# Calculate average values for each model
average_values = lr_test.groupby('Model')[lr_metric_cols].mean().reset_index()

final_data = []

for model in lr_test['Model'].unique():

    model_data = lr_test[lr_test['Model'] == model]

    # Only add model name once, on the middle row of the group. Deriving the
    # position from the group size (instead of a fixed index[2]) avoids an
    # IndexError for models with fewer than three datasets; with five
    # datasets this is still the third row.
    label_idx = model_data.index[len(model_data) // 2]

    for idx, row in model_data.iterrows():
        model_label = row['Model'] if idx == label_idx else ''
        final_data.append([model_label, row['Dataset'], *(row[col] for col in lr_metric_cols)])

    # Add average values
    avg_row = average_values[average_values['Model'] == model].iloc[0]
    final_data.append(['', 'Average', *(avg_row[col] for col in lr_metric_cols)])

# Create final dataframe
lr_df = pd.DataFrame(final_data, columns=['Model', 'Dataset', 'Accuracy Diff', 'F1 Score Diff', 'ROC AUC Diff'])

# Print as latex table
print(lr_df.to_latex(
    index=False,                # Do not print the row index
    float_format="%.3f",        # Set precision
    escape=True,                # Escape special characters
    bold_rows=True,             # Bold row names (the row index is hidden here)
    column_format="ccccc",      # Align columns
    header=True,                # Print column names
    label="tab:lr",             # Set label
    caption="Logistic Regression",  # Set caption
))

\begin{table}
\caption{Logistic Regression}
\label{tab:lr}
\begin{tabular}{ccccc}
\toprule
Model & Dataset & Accuracy Diff & F1 Score Diff & ROC AUC Diff \\
\midrule
 & 1 & 0.090 & 0.090 & 0.040 \\
 & 2 & 0.010 & 0.000 & 0.020 \\
synthpop & 3 & 0.000 & 0.000 & -0.030 \\
 & 4 & 0.070 & 0.060 & 0.010 \\
 & 5 & 0.140 & 0.140 & 0.160 \\
 & Average & 0.062 & 0.058 & 0.040 \\
 & 1 & 0.460 & 0.430 & 0.440 \\
 & 2 & 0.040 & 0.150 & 0.110 \\
ds & 3 & 0.210 & 0.120 & -0.080 \\
 & 4 & -0.080 & -0.070 & 0.000 \\
 & 5 & 0.310 & 0.260 & 0.130 \\
 & Average & 0.188 & 0.178 & 0.120 \\
 & 1 & 0.160 & 0.230 & 0.150 \\
 & 2 & -0.010 & -0.010 & 0.000 \\
tvae & 3 & 0.000 & 0.000 & 0.000 \\
 & 4 & -0.080 & -0.080 & 0.000 \\
 & 5 & 0.250 & 0.240 & 0.120 \\
 & Average & 0.064 & 0.076 & 0.054 \\
 & 1 & 0.230 & 0.240 & 0.200 \\
 & 2 & 0.030 & 0.030 & 0.060 \\
gaussian\_copula & 3 & 0.000 & 0.000 & 0.010 \\
 & 4 & 0.070 & 0.060 & 0.010 \\
 & 5 & 0.270 & 0.250 & 0.150 \\
 & Average & 0.120 & 0.116 & 0.086 \\
 & 1

In [8]:
# Wide Logistic Regression table: rows are datasets, columns are
# (metric, model) pairs, plus a final 'Average' row.
lr = pd.read_csv('../data/results/tables/lr.csv')
lr_wide = lr.pivot(index="Dataset", columns="Model", values=["Accuracy Difference", "F1 Score Difference", "ROC AUC Difference"])

# Per-column mean over all datasets, appended as the last row.
avg_row = lr_wide.mean(axis=0)
lr_wide.loc['Average'] = avg_row

# Round to three decimals like the other combined tables so the exported CSV
# does not carry floating-point noise from the averaging.
lr_wide = lr_wide.round(3)
lr_wide.to_csv('../final_results/combined_tables/lr_wide.csv')
lr_wide

Unnamed: 0_level_0,Accuracy Difference,Accuracy Difference,Accuracy Difference,Accuracy Difference,Accuracy Difference,Accuracy Difference,F1 Score Difference,F1 Score Difference,F1 Score Difference,F1 Score Difference,F1 Score Difference,F1 Score Difference,ROC AUC Difference,ROC AUC Difference,ROC AUC Difference,ROC AUC Difference,ROC AUC Difference,ROC AUC Difference
Model,copula_gan,ctgan,ds,gaussian_copula,synthpop,tvae,copula_gan,ctgan,ds,gaussian_copula,synthpop,tvae,copula_gan,ctgan,ds,gaussian_copula,synthpop,tvae
Dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
1,0.16,0.25,0.46,0.23,0.09,0.16,0.17,0.23,0.43,0.24,0.09,0.23,0.22,0.27,0.44,0.2,0.04,0.15
2,0.14,0.08,0.04,0.03,0.01,-0.01,0.18,0.16,0.15,0.03,0.0,-0.01,0.17,0.09,0.11,0.06,0.02,0.0
3,0.0,0.04,0.21,0.0,0.0,0.0,0.0,0.02,0.12,0.0,0.0,0.0,0.01,-0.02,-0.08,0.01,-0.03,0.0
4,0.0,0.07,-0.08,0.07,0.07,-0.08,0.0,0.06,-0.07,0.06,0.06,-0.08,0.01,0.01,0.0,0.01,0.01,0.0
5,0.08,0.33,0.31,0.27,0.14,0.25,0.2,0.33,0.26,0.25,0.14,0.24,0.24,0.2,0.13,0.15,0.16,0.12
Average,0.076,0.154,0.188,0.12,0.062,0.064,0.11,0.16,0.178,0.116,0.058,0.076,0.13,0.11,0.12,0.086,0.04,0.054


**Multilayer Perceptron**

In [8]:
# Build the long-format Multilayer Perceptron table for LaTeX: for every
# model one row per dataset followed by an 'Average' row; the model name is
# printed only once per group.
mlp_test = pd.read_csv('../data/results/tables/mlp.csv')

# Metric columns of the input table, in the order they appear in the output.
mlp_metric_cols = ['Accuracy Difference', 'F1 Score Difference', 'ROC AUC Difference']

# Calculate average values for each model
average_values = mlp_test.groupby('Model')[mlp_metric_cols].mean().reset_index()

final_data = []

for model in mlp_test['Model'].unique():

    model_data = mlp_test[mlp_test['Model'] == model]

    # Only add model name once, on the middle row of the group. Deriving the
    # position from the group size (instead of a fixed index[2]) avoids an
    # IndexError for models with fewer than three datasets; with five
    # datasets this is still the third row.
    label_idx = model_data.index[len(model_data) // 2]

    for idx, row in model_data.iterrows():
        model_label = row['Model'] if idx == label_idx else ''
        final_data.append([model_label, row['Dataset'], *(row[col] for col in mlp_metric_cols)])

    # Add average values
    avg_row = average_values[average_values['Model'] == model].iloc[0]
    final_data.append(['', 'Average', *(avg_row[col] for col in mlp_metric_cols)])

# Create final dataframe
mlp_df = pd.DataFrame(final_data, columns=['Model', 'Dataset', 'Accuracy Diff', 'F1 Score Diff', 'ROC AUC Diff'])

# Print as latex table
print(mlp_df.to_latex(
    index=False,                # Do not print the row index
    float_format="%.3f",        # Set precision
    escape=True,                # Escape special characters
    bold_rows=True,             # Bold row names (the row index is hidden here)
    column_format="ccccc",      # Align columns
    header=True,                # Print column names
    label="tab:mlp",            # Set label
    caption="Multilayer Perceptron",  # Set caption
))
mlp_df

\begin{table}
\caption{Multilayer Perceptron}
\label{tab:mlp}
\begin{tabular}{ccccc}
\toprule
Model & Dataset & Accuracy Diff & F1 Score Diff & ROC AUC Diff \\
\midrule
 & 1 & 0.090 & 0.090 & 0.040 \\
 & 2 & 0.010 & 0.000 & 0.020 \\
synthpop & 3 & 0.000 & 0.000 & -0.030 \\
 & 4 & 0.070 & 0.060 & 0.010 \\
 & 5 & 0.150 & 0.150 & 0.150 \\
 & Average & 0.064 & 0.060 & 0.038 \\
 & 1 & 0.460 & 0.430 & 0.440 \\
 & 2 & 0.040 & 0.150 & 0.110 \\
ds & 3 & 0.210 & 0.120 & -0.080 \\
 & 4 & -0.080 & -0.070 & 0.000 \\
 & 5 & 0.140 & 0.140 & 0.140 \\
 & Average & 0.154 & 0.154 & 0.122 \\
 & 1 & 0.160 & 0.230 & 0.150 \\
 & 2 & -0.010 & -0.010 & 0.000 \\
tvae & 3 & 0.000 & 0.000 & 0.000 \\
 & 4 & -0.080 & -0.080 & 0.000 \\
 & 5 & 0.180 & 0.170 & 0.090 \\
 & Average & 0.050 & 0.062 & 0.048 \\
 & 1 & 0.230 & 0.240 & 0.200 \\
 & 2 & 0.030 & 0.030 & 0.060 \\
gaussian\_copula & 3 & 0.000 & 0.000 & 0.010 \\
 & 4 & 0.070 & 0.060 & 0.010 \\
 & 5 & 0.260 & 0.230 & 0.200 \\
 & Average & 0.118 & 0.112 & 0.096 \\
 

Unnamed: 0,Model,Dataset,Accuracy Diff,F1 Score Diff,ROC AUC Diff
0,,1,0.09,0.09,0.04
1,,2,0.01,0.0,0.02
2,synthpop,3,0.0,0.0,-0.03
3,,4,0.07,0.06,0.01
4,,5,0.15,0.15,0.15
5,,Average,0.064,0.06,0.038
6,,1,0.46,0.43,0.44
7,,2,0.04,0.15,0.11
8,ds,3,0.21,0.12,-0.08
9,,4,-0.08,-0.07,0.0


In [9]:
# Wide Multilayer Perceptron table: rows are datasets, columns are
# (metric, model) pairs, plus a final 'Average' row.
mlp = pd.read_csv('../data/results/tables/mlp.csv')
mlp_wide = mlp.pivot(index="Dataset", columns="Model", values=["Accuracy Difference", "F1 Score Difference", "ROC AUC Difference"])

# Per-column mean over all datasets, appended as the last row.
avg_row = mlp_wide.mean(axis=0)
mlp_wide.loc['Average'] = avg_row

# Round to three decimals like the other combined tables so the exported CSV
# does not carry floating-point noise from the averaging.
mlp_wide = mlp_wide.round(3)
mlp_wide.to_csv('../final_results/combined_tables/mlp_wide.csv')
mlp_wide

Unnamed: 0_level_0,Accuracy Difference,Accuracy Difference,Accuracy Difference,Accuracy Difference,Accuracy Difference,Accuracy Difference,F1 Score Difference,F1 Score Difference,F1 Score Difference,F1 Score Difference,F1 Score Difference,F1 Score Difference,ROC AUC Difference,ROC AUC Difference,ROC AUC Difference,ROC AUC Difference,ROC AUC Difference,ROC AUC Difference
Model,copula_gan,ctgan,ds,gaussian_copula,synthpop,tvae,copula_gan,ctgan,ds,gaussian_copula,synthpop,tvae,copula_gan,ctgan,ds,gaussian_copula,synthpop,tvae
Dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
1,0.16,0.25,0.46,0.23,0.09,0.16,0.17,0.23,0.43,0.24,0.09,0.23,0.22,0.27,0.44,0.2,0.04,0.15
2,0.14,0.08,0.04,0.03,0.01,-0.01,0.18,0.16,0.15,0.03,0.0,-0.01,0.17,0.09,0.11,0.06,0.02,0.0
3,0.0,0.04,0.21,0.0,0.0,0.0,0.0,0.02,0.12,0.0,0.0,0.0,0.01,-0.02,-0.08,0.01,-0.03,0.0
4,0.0,0.07,-0.08,0.07,0.07,-0.08,0.0,0.06,-0.07,0.06,0.06,-0.08,0.01,0.01,0.0,0.01,0.01,0.0
5,0.02,0.26,0.14,0.26,0.15,0.18,0.15,0.25,0.14,0.23,0.15,0.17,0.25,0.17,0.14,0.2,0.15,0.09
Average,0.064,0.14,0.154,0.118,0.064,0.05,0.1,0.144,0.154,0.112,0.06,0.062,0.132,0.104,0.122,0.096,0.038,0.048


In [10]:
# Save plots also in Final_Results: mirror the plot directory into the final
# results folder, replacing whatever was exported there before.
import shutil
import os

# Define the source and destination directories
source_dir = '../data/results/plots'
destination_dir = '../final_results/plots'

# Validate the source BEFORE deleting anything: otherwise a missing source
# directory would wipe the previously exported plots and only then fail in
# copytree, leaving no plots at all.
if not os.path.isdir(source_dir):
    raise FileNotFoundError(f"Plot source directory not found: {source_dir}")

# Remove the destination directory if it exists (copytree requires that the
# destination does not exist yet, and stale plots should not survive)
if os.path.exists(destination_dir):
    shutil.rmtree(destination_dir)

# Copy all files from source to destination
shutil.copytree(source_dir, destination_dir)


'../final_results/plots'