# **Merge results**

In this notebook the results of all notebooks/datasets will be merged into one table for each evaluation method. Additionally, the average over all datasets will be added.

In [1]:
# Loads the required libraries
import pandas as pd

## **Resemblance**

**Difference in pairwise correlation:**

In [8]:
# Load the PCD results
corr_diff = pd.read_csv('../data/results/tables/corr_diff.csv')

# Pivot the data to wide format
corr_diff_wide = corr_diff.pivot(index='Dataset', columns='Model', values='Pairwise Corr Diff')

# Calculate the average correlation difference for each model and add it as a new row
avg_row = corr_diff_wide.mean(axis=0)
corr_diff_wide.loc['Average'] = avg_row

# Round the values to 3 decimal places
corr_diff_wide = corr_diff_wide.round(3)

# Save the wide format table to a CSV file
corr_diff_wide.to_csv('../final_results/combined_tables/corr_diff.csv')

# Print as latex table
print(corr_diff_wide.to_latex(
    index=True,                 # Print row names
    float_format="%.3f",        # Set precision
    escape=True,                # Escape special characters
    bold_rows=True,             # Bold row names
    column_format="lcccccc",    # Align columns
    header=True,                # Print column names
    label="tab:corr_diff",        # Set label
    caption="Difference in Pairwise Correlation", # Set caption       
))
corr_diff_wide


\begin{table}
\caption{Difference in Pairwise Correlation}
\label{tab:corr_diff}
\begin{tabular}{lcccccc}
\toprule
Model & copula\_gan & ctgan & ds & gaussian\_copula & synthpop & tvae \\
Dataset &  &  &  &  &  &  \\
\midrule
\textbf{1} & 0.157 & 0.163 & 0.155 & 0.117 & 0.058 & 0.150 \\
\textbf{2} & 0.198 & 0.194 & 0.177 & 0.069 & 0.025 & 0.166 \\
\textbf{3} & 0.283 & 0.276 & 0.281 & 0.233 & 0.158 & 0.192 \\
\textbf{4} & 0.030 & 0.029 & 0.035 & 0.035 & 0.006 & 0.025 \\
\textbf{5} & 0.099 & 0.099 & 0.136 & 0.088 & 0.069 & 0.112 \\
\textbf{Average} & 0.153 & 0.152 & 0.157 & 0.109 & 0.063 & 0.129 \\
\bottomrule
\end{tabular}
\end{table}



Model,copula_gan,ctgan,ds,gaussian_copula,synthpop,tvae
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.157,0.163,0.155,0.117,0.058,0.15
2,0.198,0.194,0.177,0.069,0.025,0.166
3,0.283,0.276,0.281,0.233,0.158,0.192
4,0.03,0.029,0.035,0.035,0.006,0.025
5,0.099,0.099,0.136,0.088,0.069,0.112
Average,0.153,0.152,0.157,0.109,0.063,0.129


**Jensen-Shannon Divergence**

In [9]:
# Load JSD results
jsd = pd.read_csv('../data/results/tables/jsd.csv')

# Pivot the data to create a wide format table
jsd_wide = jsd.pivot(index='Dataset', columns='Model', values='JSD Diff')

# Calculate the average for each model and add it as a new row
avg_row = jsd_wide.mean(axis=0)
jsd_wide.loc['Average'] = avg_row

# Round the values to 3 decimal places
jsd_wide = jsd_wide.round(3)

# Save the wide format table to a CSV file
jsd_wide.to_csv('../final_results/combined_tables/jsd.csv')

# Print as latex table
print(jsd_wide.to_latex(
    index=True,                 # Print row names
    float_format="%.3f",        # Set precision
    escape=True,                # Escape special characters
    bold_rows=True,             # Bold row names
    column_format="lcccccc",    # Align columns
    header=True,                # Print column names
    label="tab:jsd",        # Set label
    caption="Jensen-Shannon Divergence", # Set caption       
))

jsd_wide

\begin{table}
\caption{Jensen-Shannon Divergence}
\label{tab:jsd}
\begin{tabular}{lcccccc}
\toprule
Model & copula\_gan & ctgan & ds & gaussian\_copula & synthpop & tvae \\
Dataset &  &  &  &  &  &  \\
\midrule
\textbf{1} & 0.045 & 0.064 & 0.089 & 0.051 & 0.030 & 0.152 \\
\textbf{2} & 0.039 & 0.050 & 0.060 & 0.011 & 0.006 & 0.076 \\
\textbf{3} & 0.182 & 0.168 & 0.201 & 0.185 & 0.040 & 0.196 \\
\textbf{4} & 0.074 & 0.066 & 0.113 & 0.107 & 0.013 & 0.075 \\
\textbf{5} & 0.025 & 0.026 & 0.094 & 0.010 & 0.003 & 0.173 \\
\textbf{Average} & 0.073 & 0.075 & 0.111 & 0.073 & 0.018 & 0.135 \\
\bottomrule
\end{tabular}
\end{table}



Model,copula_gan,ctgan,ds,gaussian_copula,synthpop,tvae
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.045,0.064,0.089,0.051,0.03,0.152
2,0.039,0.05,0.06,0.011,0.006,0.076
3,0.182,0.168,0.201,0.185,0.04,0.196
4,0.074,0.066,0.113,0.107,0.013,0.075
5,0.025,0.026,0.094,0.01,0.003,0.173
Average,0.073,0.075,0.111,0.073,0.018,0.135


**Wasserstein distance**

In [10]:
# Load WD results
wd = pd.read_csv('../data/results/tables/wd.csv')

# Pivot the data to create a wide format table
wd_wide = wd.pivot(index='Dataset', columns='Model', values='WD Diff')

# Calculate the average for each model and add it as a new row
avg_row = wd_wide.mean(axis=0)
wd_wide.loc['Average'] = avg_row

# Round the values to 3 decimal places
wd_wide = wd_wide.round(3)

# Save the wide format table to a CSV file
wd_wide.to_csv('../final_results/combined_tables/wd.csv')

# Print as latex table
print(wd_wide.to_latex(
    index=True,                 # Print row names
    float_format="%.3f",        # Set precision
    escape=True,                # Escape special characters
    bold_rows=True,             # Bold row names
    column_format="lcccccc",    # Align columns
    header=True,                # Print column names
    label="tab:wd",        # Set label
    caption="Wasserstein Distance", # Set caption       
))

wd_wide

\begin{table}
\caption{Wasserstein Distance}
\label{tab:wd}
\begin{tabular}{lcccccc}
\toprule
Model & copula\_gan & ctgan & ds & gaussian\_copula & synthpop & tvae \\
Dataset &  &  &  &  &  &  \\
\midrule
\textbf{1} & 0.027 & 0.014 & 0.055 & 0.015 & 0.021 & 0.120 \\
\textbf{2} & 0.031 & 0.019 & 0.047 & 0.009 & 0.007 & 0.092 \\
\textbf{3} & 0.036 & 0.037 & 0.080 & 0.035 & 0.022 & 0.135 \\
\textbf{4} & 0.023 & 0.019 & 0.016 & 0.005 & 0.002 & 0.026 \\
\textbf{5} & 0.040 & 0.036 & 0.124 & 0.016 & 0.021 & 0.274 \\
\textbf{Average} & 0.031 & 0.025 & 0.064 & 0.016 & 0.014 & 0.130 \\
\bottomrule
\end{tabular}
\end{table}



Model,copula_gan,ctgan,ds,gaussian_copula,synthpop,tvae
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.027,0.014,0.055,0.015,0.021,0.12
2,0.031,0.019,0.047,0.009,0.007,0.092
3,0.036,0.037,0.08,0.035,0.022,0.135
4,0.023,0.019,0.016,0.005,0.002,0.026
5,0.04,0.036,0.124,0.016,0.021,0.274
Average,0.031,0.025,0.064,0.016,0.014,0.13


## **Privacy**

**Distance to Closest Record**

In [12]:
# Load DCR results
dcr_test = pd.read_csv('../data/results/tables/dcr.csv')

# Calculate average values for each model
average_values = dcr_test.groupby('Model')[['DCR 5th Percentile', 'DCR 5th Percentile (within Real)', 'DCR 5th Percentile (within Synthetic)']].mean().reset_index()

final_data = []

for model in dcr_test['Model'].unique():
    
    model_data = dcr_test[dcr_test['Model'] == model]
    
    for idx, row in model_data.iterrows():
        if idx == model_data.index[2]:  # Only add model name once
            final_data.append([row['Model'], row['Dataset'], row['DCR 5th Percentile'], row['DCR 5th Percentile (within Real)'], row['DCR 5th Percentile (within Synthetic)']])
        else:
            final_data.append(['', row['Dataset'], row['DCR 5th Percentile'], row['DCR 5th Percentile (within Real)'], row['DCR 5th Percentile (within Synthetic)']])
    
    # Add average values
    avg_row = average_values[average_values['Model'] == model].iloc[0]
    final_data.append(['', 'Average', avg_row['DCR 5th Percentile'], avg_row['DCR 5th Percentile (within Real)'], avg_row['DCR 5th Percentile (within Synthetic)']])

# Create final dataframe
dcr_df = pd.DataFrame(final_data, columns=['Model', 'Dataset', 'Between Synth/Real', 'Within Real', 'Within Synth']).round(3)
dcr_df.to_csv('../final_results/combined_tables/dcr.csv', index=False)

# Print as latex table
print(dcr_df.to_latex(
    index=False,                 # Print row names
    float_format="%.3f",        # Set precision
    escape=True,                # Escape special characters
    bold_rows=True,             # Bold row names
    column_format="ccccc",    # Align columns
    header=True,                # Print column names
    label="tab:dcr",        # Set label
    caption="Distance to Closest Record", # Set caption       
))

\begin{table}
\caption{Distance to Closest Record}
\label{tab:dcr}
\begin{tabular}{ccccc}
\toprule
Model & Dataset & Between Synth/Real & Within Real & Within Synth \\
\midrule
 & 1 & 6.220 & 5.568 & 6.481 \\
 & 2 & 1.414 & 1.732 & 1.414 \\
synthpop & 3 & 6.022 & 6.009 & 5.847 \\
 & 4 & 2.449 & 2.236 & 2.449 \\
 & 5 & 3.708 & 3.708 & 3.742 \\
 & Average & 3.963 & 3.851 & 3.987 \\
 & 1 & 12.153 & 5.568 & 11.314 \\
 & 2 & 3.873 & 1.732 & 2.449 \\
ds & 3 & 47.448 & 6.009 & 28.485 \\
 & 4 & 10.536 & 2.236 & 9.798 \\
 & 5 & 6.671 & 3.708 & 4.975 \\
 & Average & 16.136 & 3.851 & 11.404 \\
 & 1 & 7.507 & 5.568 & 5.385 \\
 & 2 & 2.000 & 1.732 & 1.414 \\
tvae & 3 & 7.246 & 6.009 & 6.156 \\
 & 4 & 5.831 & 2.236 & 5.000 \\
 & 5 & 4.243 & 3.708 & 2.000 \\
 & Average & 5.365 & 3.851 & 3.991 \\
 & 1 & 8.449 & 5.568 & 7.937 \\
 & 2 & 1.732 & 1.732 & 1.732 \\
gaussian\_copula & 3 & 10.515 & 6.009 & 11.069 \\
 & 4 & 13.038 & 2.236 & 12.166 \\
 & 5 & 4.243 & 3.708 & 4.243 \\
 & Average & 7.595 & 3.851 &

**Nearest Neighbour Distance Ratio**

In [13]:
# Load NNDR results
nndr_test = pd.read_csv('../data/results/tables/nndr.csv')

# Calculate average values for each model
average_values = nndr_test.groupby('Model')[['NNDR 5th percentile', 'NNDR 5th percentile (within Real)', 'NNDR 5th percentile (within Synthetic)']].mean().reset_index()

final_data = []

for model in nndr_test['Model'].unique():
    
    model_data = nndr_test[nndr_test['Model'] == model]
    
    for idx, row in model_data.iterrows():
        if idx == model_data.index[2]:  # Only add model name once
            final_data.append([row['Model'], row['Dataset'], row['NNDR 5th percentile'], row['NNDR 5th percentile (within Real)'], row['NNDR 5th percentile (within Synthetic)']])
        else:
            final_data.append(['', row['Dataset'], row['NNDR 5th percentile'], row['NNDR 5th percentile (within Real)'], row['NNDR 5th percentile (within Synthetic)']])
    
    # Add average values
    avg_row = average_values[average_values['Model'] == model].iloc[0]
    final_data.append(['', 'Average', avg_row['NNDR 5th percentile'], avg_row['NNDR 5th percentile (within Real)'], avg_row['NNDR 5th percentile (within Synthetic)']])

# Create final dataframe
nndr_df = pd.DataFrame(final_data, columns=['Model', 'Dataset', 'Between Synth/Real', 'Within Real', 'Within Synth']).round(3)
nndr_df.to_csv('../final_results/combined_tables/nndr.csv', index=False)

# Print as latex table
print(nndr_df.to_latex(
    index=False,                 # Print row names
    float_format="%.3f",        # Set precision
    escape=True,                # Escape special characters
    bold_rows=True,             # Bold row names
    column_format="ccccc",    # Align columns
    header=True,                # Print column names
    label="tab:nndr",        # Set label
    caption="Nearest Neighbour Distance Ratio", # Set caption       
))

\begin{table}
\caption{Nearest Neighbour Distance Ratio}
\label{tab:nndr}
\begin{tabular}{ccccc}
\toprule
Model & Dataset & Between Synth/Real & Within Real & Within Synth \\
\midrule
 & 1 & 0.439 & 0.493 & 0.449 \\
 & 2 & 0.408 & 0.555 & 0.425 \\
synthpop & 3 & 0.426 & 0.583 & 0.449 \\
 & 4 & 0.046 & 0.073 & 0.072 \\
 & 5 & 0.750 & 0.771 & 0.791 \\
 & Average & 0.414 & 0.495 & 0.437 \\
 & 1 & 0.560 & 0.493 & 0.569 \\
 & 2 & 0.756 & 0.555 & 0.371 \\
ds & 3 & 0.497 & 0.583 & 0.517 \\
 & 4 & 0.223 & 0.073 & 0.216 \\
 & 5 & 0.403 & 0.771 & 0.663 \\
 & Average & 0.488 & 0.495 & 0.467 \\
 & 1 & 0.564 & 0.493 & 0.522 \\
 & 2 & 0.577 & 0.555 & 0.458 \\
tvae & 3 & 0.557 & 0.583 & 0.479 \\
 & 4 & 0.188 & 0.073 & 0.180 \\
 & 5 & 0.858 & 0.771 & 0.000 \\
 & Average & 0.549 & 0.495 & 0.328 \\
 & 1 & 0.557 & 0.493 & 0.499 \\
 & 2 & 0.555 & 0.555 & 0.577 \\
gaussian\_copula & 3 & 0.568 & 0.583 & 0.637 \\
 & 4 & 0.130 & 0.073 & 0.207 \\
 & 5 & 0.799 & 0.771 & 0.794 \\
 & Average & 0.522 & 0.495 & 0.5

**Membership inference attack**

In [None]:
# Load MIA results
df_mia_large = pd.read_csv('../data/results/tables/mia.csv')

# Calculate average value of all proportions for each threshold
large_test = df_mia_large.groupby(['Dataset', 'Model', 'Metric', 'Threshold'])[ 'Value'].median().round(3).reset_index()

# Calculate average for each model over all datasets
avg = large_test.groupby(['Model', 'Metric', 'Threshold'])['Value'].mean().reset_index().assign(Dataset='Average')
large_test = pd.concat([large_test, avg])

# Pivot table
wide_test = large_test.pivot(index=['Model', 'Dataset'], columns=['Threshold', 'Metric'], values='Value').round(3)
wide_test = wide_test.sort_index(axis=1, level=0)
wide_test.to_csv('../final_results/combined_tables/mia.csv')

# Print as latex table
print(wide_test.to_latex(
    index=True,                 # Print row names
    float_format="%.2f",        # Set precision
    escape=True,                # Escape special characters            # Bold row names
    column_format="cccccccccc",     # Align columns
    header=True,                # Print column names
    multicolumn=True,
    multirow=True,
    label="tab:mia",        # Set label
    caption="Membership Inference Attack", # Set caption       
))

wide_test


\begin{table}
\caption{Membership Inference Attack}
\label{tab:mia}
\begin{tabular}{cccccccccc}
\toprule
 & Threshold & \multicolumn{2}{r}{0.100000} & \multicolumn{2}{r}{0.200000} & \multicolumn{2}{r}{0.300000} & \multicolumn{2}{r}{0.400000} \\
 & Metric & accuracy & precision & accuracy & precision & accuracy & precision & accuracy & precision \\
Model & Dataset &  &  &  &  &  &  &  &  \\
\midrule
\multirow[t]{6}{*}{copula\_gan} & 1 & 0.40 & 0.00 & 0.40 & 0.00 & 0.41 & 0.57 & 0.43 & 0.56 \\
 & 2 & 0.28 & 0.00 & 0.28 & 0.00 & 0.40 & 0.82 & 0.72 & 0.72 \\
 & 3 & 0.27 & 0.00 & 0.27 & 0.00 & 0.27 & 0.00 & 0.27 & 0.00 \\
 & 4 & 0.33 & 0.69 & 0.56 & 0.72 & 0.68 & 0.70 & 0.70 & 0.70 \\
 & 5 & 0.30 & 0.00 & 0.30 & 0.00 & 0.35 & 0.94 & 0.72 & 0.76 \\
 & Average & 0.32 & 0.14 & 0.36 & 0.14 & 0.42 & 0.61 & 0.57 & 0.55 \\
\cline{1-10}
\multirow[t]{6}{*}{ctgan} & 1 & 0.40 & 0.00 & 0.40 & 0.00 & 0.40 & 0.36 & 0.46 & 0.64 \\
 & 2 & 0.28 & 0.00 & 0.28 & 0.50 & 0.37 & 0.73 & 0.70 & 0.71 \\
 & 3 & 0.27

Unnamed: 0_level_0,Threshold,0.1,0.1,0.2,0.2,0.3,0.3,0.4,0.4
Unnamed: 0_level_1,Metric,accuracy,precision,accuracy,precision,accuracy,precision,accuracy,precision
Model,Dataset,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
copula_gan,1,0.4,0.0,0.4,0.0,0.41,0.57,0.43,0.56
copula_gan,2,0.28,0.0,0.28,0.0,0.4,0.82,0.72,0.72
copula_gan,3,0.27,0.0,0.27,0.0,0.27,0.0,0.27,0.0
copula_gan,4,0.33,0.69,0.56,0.72,0.68,0.7,0.7,0.7
copula_gan,5,0.3,0.0,0.3,0.0,0.35,0.94,0.72,0.76
copula_gan,Average,0.316,0.138,0.362,0.144,0.422,0.606,0.568,0.548
ctgan,1,0.4,0.0,0.4,0.0,0.4,0.36,0.46,0.64
ctgan,2,0.28,0.0,0.28,0.5,0.37,0.73,0.7,0.71
ctgan,3,0.27,0.0,0.27,0.0,0.27,0.0,0.27,0.0
ctgan,4,0.32,0.6,0.52,0.67,0.68,0.7,0.7,0.7


## **Utility**

**Random Forest**

In [14]:
# Load RF results
rf_test = pd.read_csv('../data/results/tables/rf.csv')

# Calculate average values for each model
average_values = rf_test.groupby('Model')[['Accuracy Difference', 'F1 Score Difference', 'ROC AUC Difference']].mean().reset_index()

final_data = []

for model in rf_test['Model'].unique():
    
    model_data = rf_test[rf_test['Model'] == model]
    
    for idx, row in model_data.iterrows():
        if idx == model_data.index[2]:  # Only add model name once
            final_data.append([row['Model'], row['Dataset'], row['Accuracy Difference'], row['F1 Score Difference'], row['ROC AUC Difference']])
        else:
            final_data.append(['', row['Dataset'], row['Accuracy Difference'], row['F1 Score Difference'], row['ROC AUC Difference']])
    
    # Add average values
    avg_row = average_values[average_values['Model'] == model].iloc[0]
    final_data.append(['', 'Average', avg_row['Accuracy Difference'], avg_row['F1 Score Difference'], avg_row['ROC AUC Difference']])

# Create final dataframe
rf_df = pd.DataFrame(final_data, columns=['Model', 'Dataset', 'Accuracy Diff', 'F1 Score Diff', 'ROC AUC Diff']).round(2)
rf_df.to_csv('../final_results/combined_tables/rf.csv', index=False)

# Print as latex table
print(rf_df.to_latex(
    index=False,                 # Print row names
    float_format="%.2f",        # Set precision
    escape=True,                # Escape special characters
    bold_rows=True,             # Bold row names
    column_format="ccccc",    # Align columns
    header=True,                # Print column names
    label="tab:rf",        # Set label
    caption="Random Forest", # Set caption       
))

rf_df


\begin{table}
\caption{Random Forest}
\label{tab:rf}
\begin{tabular}{ccccc}
\toprule
Model & Dataset & Accuracy Diff & F1 Score Diff & ROC AUC Diff \\
\midrule
 & 1 & 0.11 & 0.10 & 0.06 \\
 & 2 & -0.02 & -0.03 & -0.02 \\
synthpop & 3 & 0.00 & -0.03 & -0.13 \\
 & 4 & 0.02 & 0.02 & 0.02 \\
 & 5 & 0.04 & 0.08 & 0.20 \\
 & Average & 0.03 & 0.03 & 0.03 \\
 & 1 & 0.46 & 0.44 & 0.46 \\
 & 2 & 0.03 & 0.05 & 0.04 \\
ds & 3 & 0.08 & 0.09 & 0.29 \\
 & 4 & 0.02 & 0.03 & 0.05 \\
 & 5 & 0.03 & 0.08 & 0.14 \\
 & Average & 0.12 & 0.14 & 0.20 \\
 & 1 & 0.23 & 0.32 & 0.07 \\
 & 2 & 0.00 & -0.02 & -0.06 \\
tvae & 3 & 0.00 & 0.05 & 0.18 \\
 & 4 & 0.06 & 0.04 & 0.03 \\
 & 5 & 0.04 & 0.03 & 0.16 \\
 & Average & 0.07 & 0.08 & 0.08 \\
 & 1 & 0.34 & 0.39 & 0.32 \\
 & 2 & 0.01 & 0.01 & 0.03 \\
gaussian\_copula & 3 & 0.00 & 0.05 & 0.14 \\
 & 4 & 0.06 & 0.09 & 0.11 \\
 & 5 & 0.12 & 0.07 & 0.17 \\
 & Average & 0.11 & 0.12 & 0.15 \\
 & 1 & 0.37 & 0.44 & 0.32 \\
 & 2 & 0.05 & 0.06 & 0.06 \\
copula\_gan & 3 & 0.00 & 

Unnamed: 0,Model,Dataset,Accuracy Diff,F1 Score Diff,ROC AUC Diff
0,,1,0.11,0.1,0.06
1,,2,-0.02,-0.03,-0.02
2,synthpop,3,0.0,-0.03,-0.13
3,,4,0.02,0.02,0.02
4,,5,0.04,0.08,0.2
5,,Average,0.03,0.03,0.03
6,,1,0.46,0.44,0.46
7,,2,0.03,0.05,0.04
8,ds,3,0.08,0.09,0.29
9,,4,0.02,0.03,0.05


**Logistic Regression**

In [15]:
# Load LR results
lr_test = pd.read_csv('../data/results/tables/lr.csv')

# Calculate average values for each model
average_values = lr_test.groupby('Model')[['Accuracy Difference', 'F1 Score Difference', 'ROC AUC Difference']].mean().reset_index()

final_data = []

for model in lr_test['Model'].unique():
    
    model_data = lr_test[lr_test['Model'] == model]
    
    for idx, row in model_data.iterrows():
        if idx == model_data.index[2]:  # Only add model name once
            final_data.append([row['Model'], row['Dataset'], row['Accuracy Difference'], row['F1 Score Difference'], row['ROC AUC Difference']])
        else:
            final_data.append(['', row['Dataset'], row['Accuracy Difference'], row['F1 Score Difference'], row['ROC AUC Difference']])
    
    # Add average values
    avg_row = average_values[average_values['Model'] == model].iloc[0]
    final_data.append(['', 'Average', avg_row['Accuracy Difference'], avg_row['F1 Score Difference'], avg_row['ROC AUC Difference']])

# Create final dataframe
lr_df = pd.DataFrame(final_data, columns=['Model', 'Dataset', 'Accuracy Diff', 'F1 Score Diff', 'ROC AUC Diff']).round(2)
lr_df.to_csv('../final_results/combined_tables/lr.csv', index=False)

# Print as latex table
print(lr_df.to_latex(
    index=False,                 # Print row names
    float_format="%.2f",        # Set precision
    escape=True,                # Escape special characters
    bold_rows=True,             # Bold row names
    column_format="ccccc",     # Align columns
    header=True,                # Print column names
    label="tab:lr",        # Set label
    caption="Logistic Regression", # Set caption       
))
lr_df

\begin{table}
\caption{Logistic Regression}
\label{tab:lr}
\begin{tabular}{ccccc}
\toprule
Model & Dataset & Accuracy Diff & F1 Score Diff & ROC AUC Diff \\
\midrule
 & 1 & 0.08 & 0.09 & 0.03 \\
 & 2 & 0.04 & 0.04 & 0.01 \\
synthpop & 3 & 0.04 & 0.01 & 0.01 \\
 & 4 & 0.00 & 0.00 & -0.01 \\
 & 5 & 0.14 & 0.14 & 0.16 \\
 & Average & 0.06 & 0.06 & 0.04 \\
 & 1 & 0.38 & 0.38 & 0.46 \\
 & 2 & 0.07 & 0.21 & 0.11 \\
ds & 3 & 0.09 & 0.09 & 0.16 \\
 & 4 & 0.01 & 0.02 & 0.05 \\
 & 5 & 0.31 & 0.26 & 0.13 \\
 & Average & 0.17 & 0.19 & 0.18 \\
 & 1 & 0.12 & 0.21 & 0.16 \\
 & 2 & 0.00 & 0.01 & -0.01 \\
tvae & 4 & 0.04 & 0.00 & 0.00 \\
 & 5 & 0.25 & 0.24 & 0.12 \\
 & Average & 0.10 & 0.12 & 0.07 \\
 & 1 & 0.26 & 0.32 & 0.27 \\
 & 2 & 0.05 & 0.07 & 0.05 \\
gaussian\_copula & 3 & -0.17 & -0.15 & 0.03 \\
 & 4 & 0.03 & 0.12 & 0.05 \\
 & 5 & 0.27 & 0.25 & 0.15 \\
 & Average & 0.09 & 0.12 & 0.11 \\
 & 1 & 0.23 & 0.29 & 0.31 \\
 & 2 & 0.16 & 0.20 & 0.18 \\
copula\_gan & 3 & -0.04 & -0.02 & 0.11 \\
 & 4 & 0.

Unnamed: 0,Model,Dataset,Accuracy Diff,F1 Score Diff,ROC AUC Diff
0,,1,0.08,0.09,0.03
1,,2,0.04,0.04,0.01
2,synthpop,3,0.04,0.01,0.01
3,,4,0.0,0.0,-0.01
4,,5,0.14,0.14,0.16
5,,Average,0.06,0.06,0.04
6,,1,0.38,0.38,0.46
7,,2,0.07,0.21,0.11
8,ds,3,0.09,0.09,0.16
9,,4,0.01,0.02,0.05


**Multilayer Perceptron**

In [16]:
# Load MLP results
mlp_test = pd.read_csv('../data/results/tables/mlp.csv')

# Calculate average values for each model
average_values = mlp_test.groupby('Model')[['Accuracy Difference', 'F1 Score Difference', 'ROC AUC Difference']].mean().reset_index()

final_data = []


for model in mlp_test['Model'].unique():
    
    model_data = mlp_test[mlp_test['Model'] == model]
    
    for idx, row in model_data.iterrows():
        if idx == model_data.index[2]:  # Only add model name once
            final_data.append([row['Model'], row['Dataset'], row['Accuracy Difference'], row['F1 Score Difference'], row['ROC AUC Difference']])
        else:
            final_data.append(['', row['Dataset'], row['Accuracy Difference'], row['F1 Score Difference'], row['ROC AUC Difference']])
    
    # Add average values
    avg_row = average_values[average_values['Model'] == model].iloc[0]
    final_data.append(['', 'Average', avg_row['Accuracy Difference'], avg_row['F1 Score Difference'], avg_row['ROC AUC Difference']])

# Create final dataframe
mlp_df = pd.DataFrame(final_data, columns=['Model', 'Dataset', 'Accuracy Diff', 'F1 Score Diff', 'ROC AUC Diff']).round(2)
mlp_df.to_csv('../final_results/combined_tables/mlp.csv', index=False)

# Print as latex table
print(mlp_df.to_latex(
    index=False,                 # Print row names
    float_format="%.2f",        # Set precision
    escape=True,                # Escape special characters
    bold_rows=True,             # Bold row names
    column_format="ccccc",     # Align columns
    header=True,                # Print column names
    label="tab:mlp",        # Set label
    caption="Multilayer Perceptron", # Set caption       
))
mlp_df

\begin{table}
\caption{Multilayer Perceptron}
\label{tab:mlp}
\begin{tabular}{ccccc}
\toprule
Model & Dataset & Accuracy Diff & F1 Score Diff & ROC AUC Diff \\
\midrule
 & 1 & 0.09 & 0.09 & 0.04 \\
 & 2 & 0.01 & 0.00 & 0.02 \\
synthpop & 3 & 0.00 & 0.00 & -0.03 \\
 & 4 & 0.07 & 0.06 & 0.01 \\
 & 5 & 0.15 & 0.15 & 0.15 \\
 & Average & 0.06 & 0.06 & 0.04 \\
 & 1 & 0.46 & 0.43 & 0.44 \\
 & 2 & 0.04 & 0.15 & 0.11 \\
ds & 3 & 0.21 & 0.12 & -0.08 \\
 & 4 & -0.08 & -0.07 & 0.00 \\
 & 5 & 0.14 & 0.14 & 0.14 \\
 & Average & 0.15 & 0.15 & 0.12 \\
 & 1 & 0.16 & 0.23 & 0.15 \\
 & 2 & -0.01 & -0.01 & 0.00 \\
tvae & 3 & 0.00 & 0.00 & 0.00 \\
 & 4 & -0.08 & -0.08 & 0.00 \\
 & 5 & 0.18 & 0.17 & 0.09 \\
 & Average & 0.05 & 0.06 & 0.05 \\
 & 1 & 0.23 & 0.24 & 0.20 \\
 & 2 & 0.03 & 0.03 & 0.06 \\
gaussian\_copula & 3 & 0.00 & 0.00 & 0.01 \\
 & 4 & 0.07 & 0.06 & 0.01 \\
 & 5 & 0.26 & 0.23 & 0.20 \\
 & Average & 0.12 & 0.11 & 0.10 \\
 & 1 & 0.16 & 0.17 & 0.22 \\
 & 2 & 0.14 & 0.18 & 0.17 \\
copula\_gan & 3

Unnamed: 0,Model,Dataset,Accuracy Diff,F1 Score Diff,ROC AUC Diff
0,,1,0.09,0.09,0.04
1,,2,0.01,0.0,0.02
2,synthpop,3,0.0,0.0,-0.03
3,,4,0.07,0.06,0.01
4,,5,0.15,0.15,0.15
5,,Average,0.06,0.06,0.04
6,,1,0.46,0.43,0.44
7,,2,0.04,0.15,0.11
8,ds,3,0.21,0.12,-0.08
9,,4,-0.08,-0.07,0.0


**Compute the average of all utility averages**

In [17]:
for df in [rf_df, lr_df, mlp_df]:
    df["Model"].replace("", None, inplace=True)
    df["Model"].ffill(inplace=True)

# Cut out the average rows
rf_avgs = rf_df[rf_df['Dataset'] == 'Average']
lr_avgs = lr_df[lr_df['Dataset'] == 'Average']
mlp_avgs = mlp_df[mlp_df['Dataset'] == 'Average']

# Combine the average rows into one dataframe
utility_avg = pd.concat([rf_avgs, lr_avgs, mlp_avgs])

# Calculate average values for each model
result = utility_avg.groupby("Model")[["Accuracy Diff", "F1 Score Diff", "ROC AUC Diff"]].mean().round(2)

# Print as latex table
print(result.to_latex(
    index=True,                 # Print row names
    float_format="%.2f",        # Set precision
    escape=True,                # Escape special characters
    column_format="lc",    # Align columns
    header=True,                # Print column names
    label="tab:utility_avg",        # Set label
    caption="Average Utility", # Set caption
    multicolumn=True,
    multirow=True
))

\begin{table}
\caption{Average Utility}
\label{tab:utility_avg}
\begin{tabular}{lc}
\toprule
 & Accuracy Diff & F1 Score Diff & ROC AUC Diff \\
Model &  &  &  \\
\midrule
copula\_gan & 0.08 & 0.13 & 0.16 \\
ctgan & 0.14 & 0.15 & 0.12 \\
ds & 0.15 & 0.16 & 0.17 \\
gaussian\_copula & 0.11 & 0.12 & 0.12 \\
synthpop & 0.05 & 0.05 & 0.04 \\
tvae & 0.07 & 0.09 & 0.07 \\
\bottomrule
\end{tabular}
\end{table}



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Model"].replace("", None, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Model"].ffill(inplace=True)


**Save plots in _final_results_**

In [1]:
# Save plots also in Final_Results
import shutil
import os

# Define the source and destination directories
source_dir = '../data/results/plots'
destination_dir = '../final_results/plots'

# Remove the destination directory if it exists
if os.path.exists(destination_dir):
    shutil.rmtree(destination_dir)

# Copy all files from source to destination
shutil.copytree(source_dir, destination_dir)


'../final_results/plots'