In [13]:
import pandas as pd
import numpy as np
from scipy.stats import gaussian_kde, entropy

In [14]:
# Hyper-parameter grid for the model sweep.
# Encoder layer pairs shrink by 2 units: [10, 8], [12, 10], ..., [22, 20].
encoder_dense_layers_trial = [[width, width - 2] for width in range(10, 23, 2)]
# Candidate bottleneck sizes: 8, 10, ..., 18.
bottle_neck_trial = list(range(8, 19, 2))
# Decoder layer pairs grow by 2 units: [8, 10], [10, 12], ..., [20, 22].
decoder_dense_layers_trial = [[width, width + 2] for width in range(8, 21, 2)]

In [15]:
def plot_density(df1, df2):
    """Print summary density-distance metrics between two data frames.

    Note: despite its name, this function does not draw anything; it only
    prints two numbers.

    For every column of ``df1`` a Gaussian KDE is fitted to that column in
    both frames and evaluated on a fixed grid of 1000 points over [0, 1].
    Two per-column metrics are then accumulated:

    * the "highlighted area": a Riemann-sum approximation of the area where
      the ``df1`` density lies above the ``df2`` density;
    * the KL divergence ``entropy(y1, y2)`` of the two evaluated curves.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Reference (original) data. Its columns drive the comparison.
    df2 : pandas.DataFrame
        Data to compare (synthetic). Must contain every column of ``df1``;
        extra columns are ignored.

    Returns
    -------
    None
        The total highlighted area and the average KL divergence (over the
        columns actually compared) are printed to stdout. With no columns
        the average is reported as ``nan``.
    """
    # The grid is loop-invariant, so build it (and its spacing) once.
    # NOTE(review): the [0, 1] range presumably assumes min-max scaled
    # features -- confirm against the data preparation step.
    x = np.linspace(0, 1, 1000)
    dx = np.diff(x)[0]

    highlighted_areas = {}
    kl_divergences = {}

    for column in df1.columns:
        y1 = gaussian_kde(df1[column])(x)
        y2 = gaussian_kde(df2[column])(x)

        # Area where the original density exceeds the synthetic one.
        highlighted_areas[column] = np.sum(np.maximum(y1 - y2, 0) * dx)

        # KL divergence of the evaluated curves.
        # REF https://www.kaggle.com/code/nhan1212/some-statistical-distances
        kl_divergences[column] = entropy(y1, y2)

    total_highlighted_area = np.sum(list(highlighted_areas.values()))
    total_kl_divergence = np.sum(list(kl_divergences.values()))

    # Average over the columns that were actually compared, not over the
    # column count of df2 (which may carry extra columns).
    num_columns = len(kl_divergences)
    if num_columns:
        average_kl_divergence = total_kl_divergence / num_columns
    else:
        average_kl_divergence = float("nan")

    print(f"Total highlighted area: {total_highlighted_area:.2f}")
    print(f"Average KL divergence: {average_kl_divergence:.6f}")

In [16]:
def calculate(original_data, synthetic_data):
    """Compute density-distance metrics between original and synthetic data.

    For every column of ``original_data`` a Gaussian KDE is fitted to that
    column in both frames and evaluated on a fixed grid of 1000 points over
    [0, 1]. Two per-column metrics are accumulated:

    * the "highlighted area": a Riemann-sum approximation of the area where
      the original density lies above the synthetic density;
    * the KL divergence ``entropy(y1, y2)`` of the two evaluated curves.

    Parameters
    ----------
    original_data : pandas.DataFrame
        Reference data. Its columns drive the comparison.
    synthetic_data : pandas.DataFrame
        Data to compare. Must contain every column of ``original_data``;
        extra columns are ignored.

    Returns
    -------
    tuple
        ``(total_highlighted_area, average_kl_divergence)`` where the total
        is summed over the compared columns and the average is taken over
        the number of columns actually compared. When there are no columns
        to compare the result is ``(0.0, nan)``.
    """
    # The grid is loop-invariant, so build it (and its spacing) once.
    # NOTE(review): the [0, 1] range presumably assumes min-max scaled
    # features -- confirm against the data preparation step.
    x = np.linspace(0, 1, 1000)
    dx = np.diff(x)[0]

    highlighted_areas = {}
    kl_divergences = {}

    for column in original_data.columns:
        y1 = gaussian_kde(original_data[column])(x)
        y2 = gaussian_kde(synthetic_data[column])(x)

        # Area where the original density exceeds the synthetic one.
        highlighted_areas[column] = np.sum(np.maximum(y1 - y2, 0) * dx)

        # KL divergence of the evaluated curves.
        kl_divergences[column] = entropy(y1, y2)

    # Totals are computed once, after the loop, so they are always defined
    # (the sum of an empty list is 0.0) even when no column was processed.
    total_highlighted_area = np.sum(list(highlighted_areas.values()))
    total_kl_divergence = np.sum(list(kl_divergences.values()))

    # Average over the columns that were actually compared, not over the
    # column count of synthetic_data (which may carry extra columns).
    num_columns = len(kl_divergences)
    if num_columns == 0:
        return total_highlighted_area, float("nan")

    return total_highlighted_area, total_kl_divergence / num_columns

In [17]:
# Score every (bottleneck, encoder, decoder) combination: load the saved
# original/synthetic CSV pair for that model, discard the 'class' column,
# and record the two metrics returned by calculate().
result = []

for bn in bottle_neck_trial:
    for enc_layers in encoder_dense_layers_trial:
        for dec_layers in decoder_dense_layers_trial:
            model_name = f"L27_E{enc_layers[0]}_{enc_layers[1]}_B{bn}_D{dec_layers[0]}_{dec_layers[1]}"

            # Read each file and drop the label column in a single chained step.
            original_df = pd.read_csv(
                f"{model_name}_Original_minority_data.csv"
            ).drop(columns="class")
            synthetic_df = pd.read_csv(
                f"{model_name}_Synthetic_minority_data.csv"
            ).drop(columns="class")

            total_highlighted_area, average_kl_divergence = calculate(
                original_df, synthetic_df
            )

            row = [model_name, total_highlighted_area, average_kl_divergence]
            result.append(row)
            # Emits "name , area , kl" -- the same text as printing the
            # values interleaved with "," under the default separator.
            print(*row, sep=" , ")

L27_E10_8_B8_D8_10 , 2.972247260348869 , 0.3749207312931829
L27_E10_8_B8_D10_12 , 3.7560698810307414 , 2.3500098212484115
L27_E10_8_B8_D12_14 , 2.919864521935129 , 0.37607376402721765
L27_E10_8_B8_D14_16 , 3.274742462567329 , 0.4872367964681068
L27_E10_8_B8_D16_18 , 3.0885261866367646 , 0.4057240013349049
L27_E10_8_B8_D18_20 , 3.279007825574798 , 0.6434079379684363
L27_E10_8_B8_D20_22 , 2.68844492561233 , 0.2975298872284236
L27_E12_10_B8_D8_10 , 3.3223511466476237 , 1.183564360365124
L27_E12_10_B8_D10_12 , 2.3617189281915643 , 0.1900833930622959
L27_E12_10_B8_D12_14 , 3.2342845631624977 , 1.114234536339237
L27_E12_10_B8_D14_16 , 3.1608418966757132 , 1.335328599038798
L27_E12_10_B8_D16_18 , 3.2024072163453563 , 0.47188125465337916
L27_E12_10_B8_D18_20 , 2.3218833789598925 , 0.19355957970640225
L27_E12_10_B8_D20_22 , 3.1975034209235256 , 0.35326403991035693
L27_E14_12_B8_D8_10 , 3.54744893323935 , 1.6851653434485319
L27_E14_12_B8_D10_12 , 2.88734734317718 , 0.38365291537312557
L27_E14_12

In [18]:
# Display the collected [model_name, total_highlighted_area, average_kl_divergence] rows.
result

[['L27_E10_8_B8_D8_10', 2.972247260348869, 0.3749207312931829],
 ['L27_E10_8_B8_D10_12', 3.7560698810307414, 2.3500098212484115],
 ['L27_E10_8_B8_D12_14', 2.919864521935129, 0.37607376402721765],
 ['L27_E10_8_B8_D14_16', 3.274742462567329, 0.4872367964681068],
 ['L27_E10_8_B8_D16_18', 3.0885261866367646, 0.4057240013349049],
 ['L27_E10_8_B8_D18_20', 3.279007825574798, 0.6434079379684363],
 ['L27_E10_8_B8_D20_22', 2.68844492561233, 0.2975298872284236],
 ['L27_E12_10_B8_D8_10', 3.3223511466476237, 1.183564360365124],
 ['L27_E12_10_B8_D10_12', 2.3617189281915643, 0.1900833930622959],
 ['L27_E12_10_B8_D12_14', 3.2342845631624977, 1.114234536339237],
 ['L27_E12_10_B8_D14_16', 3.1608418966757132, 1.335328599038798],
 ['L27_E12_10_B8_D16_18', 3.2024072163453563, 0.47188125465337916],
 ['L27_E12_10_B8_D18_20', 2.3218833789598925, 0.19355957970640225],
 ['L27_E12_10_B8_D20_22', 3.1975034209235256, 0.35326403991035693],
 ['L27_E14_12_B8_D8_10', 3.54744893323935, 1.6851653434485319],
 ['L27_E14_1