In [1]:
import pandas as pd
import numpy as np

from sdv.metadata import SingleTableMetadata
from sdv.evaluation.single_table import evaluate_quality
from sdv.single_table import GaussianCopulaSynthesizer

import warnings

warnings.filterwarnings('ignore')

# COVTYPE

In [2]:
# Real covtype table followed by one synthetic table per generator under comparison.
# NOTE(review): 'mydata.csv' is presumably the GCKM-varies output (it is bound to
# gckm_varies_cov_syn) -- confirm, and consider a more descriptive file name.
(
    covtype_org,
    y_data_cov_syn,
    gckm_cov_syn,
    gckm_varies_cov_syn,
    sdv_cov_syn,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'covtype.csv',
        'y_data_syn_covtype.csv',
        'covtype_gckm.csv',
        'mydata.csv',
        'sdv_covtype.csv',
    )
)



In [10]:
# Soil-type columns whose real values are copied into the synthetic tables in the next cell.
# NOTE(review): the name looks like a typo of "discard_column"; it is kept because the
# following cell references it by this exact spelling.
dsicard_column = ['Soil_Type15', 'Soil_Type7', 'Soil_Type25']

In [11]:
# Overwrite the listed soil-type columns in each synthetic table with the real values.
# NOTE(review): this places real data inside the synthetic tables before they are
# scored -- confirm this is intended, since it affects the metrics for those columns.
for synthetic_table in (gckm_cov_syn, sdv_cov_syn, gckm_varies_cov_syn):
    synthetic_table[dsicard_column] = covtype_org[dsicard_column].values.copy()

In [9]:
# Fit SDV's Gaussian copula synthesizer on the real covtype table and save one
# synthetic sample to 'sdv_covtype.csv' (the file the loading cell reads back).
metadata1 = SingleTableMetadata()
metadata1.detect_from_dataframe(data = covtype_org)

synthesier = GaussianCopulaSynthesizer(metadata1)
synthesier.fit(covtype_org)

# Sample as many rows as the real table instead of hard-coding the row count.
synthetic_data = synthesier.sample(num_rows = len(covtype_org))

# index=False keeps the row index out of the CSV, so reloading it does not produce
# a spurious 'Unnamed: 0' column (same convention as the MNIST export cell).
synthetic_data.to_csv('sdv_covtype.csv', index = False)

## SDV SCORE FOR COVTYPE

### Y-data

In [10]:
# Nested results store: sdv_score_dict[model_name][dataset_name] -> overall quality score.
sdv_score_dict = {}
# Keys under which the next evaluation's score is stored.
dataset_name = 'cov_type'
model_name = 'Y-data_regular'

In [11]:
# Build the quality report for the Y-data synthetic table against the real covtype table.
metadata1 = SingleTableMetadata()
metadata1.detect_from_dataframe(data = covtype_org)

baseline_quality_report = evaluate_quality(
    covtype_org,  # real data
    y_data_cov_syn,  # synthetic data loaded from 'y_data_syn_covtype.csv'
    metadata1
)

Creating report:   0%|          | 0/4 [00:00<?, ?it/s]

Creating report: 100%|██████████| 4/4 [01:08<00:00, 17.10s/it]



Overall Quality Score: 95.76%

Properties:
Column Shapes: 94.28%
Column Pair Trends: 97.24%


In [12]:
# Record the overall score under sdv_score_dict[model_name][dataset_name],
# creating the per-model dict on first use.
score = baseline_quality_report.get_score()
sdv_score_dict.setdefault(model_name, {})[dataset_name] = score

In [13]:
sdv_score_dict

{'Y-data_regular': {'cov_type': 0.9576128119235201}}

### SDV

In [14]:
# Keys under which the SDV Gaussian-copula score is stored by the following cells.
dataset_name = 'cov_type'
model_name = 'sdv_gaussian_copula'

In [15]:
# Build the quality report for the SDV Gaussian-copula synthetic table against the real covtype table.
metadata1 = SingleTableMetadata()
metadata1.detect_from_dataframe(data = covtype_org)

baseline_quality_report = evaluate_quality(
    covtype_org,  # real data
    sdv_cov_syn,  # synthetic data loaded from 'sdv_covtype.csv'
    metadata1
)

Creating report:   0%|          | 0/4 [00:00<?, ?it/s]

Creating report: 100%|██████████| 4/4 [00:58<00:00, 14.70s/it]



Overall Quality Score: 68.38%

Properties:
Column Shapes: 39.35%
Column Pair Trends: 97.41%


In [16]:
# Record the overall score under sdv_score_dict[model_name][dataset_name],
# creating the per-model dict on first use.
score = baseline_quality_report.get_score()
sdv_score_dict.setdefault(model_name, {})[dataset_name] = score

In [17]:
sdv_score_dict

{'Y-data_regular': {'cov_type': 0.9576128119235201},
 'sdv_gaussian_copula': {'cov_type': 0.6838269560351897}}

### GCKM

In [18]:
# Drop the saved row-index column, then cast every remaining column to int.
gckm_cov_syn1 = gckm_cov_syn.drop(columns=['Unnamed: 0']).astype(int)

In [19]:
dataset_name, model_name = 'cov_type', 'gckm'

metadata1 = SingleTableMetadata()
metadata1.detect_from_dataframe(data = covtype_org)

# Arguments: real table, GCKM synthetic table (int-cast, index column removed), metadata.
baseline_quality_report = evaluate_quality(covtype_org, gckm_cov_syn1, metadata1)

# Store the overall score under sdv_score_dict[model_name][dataset_name].
score = baseline_quality_report.get_score()
sdv_score_dict.setdefault(model_name, {})[dataset_name] = score

sdv_score_dict

Creating report:   0%|          | 0/4 [00:00<?, ?it/s]

Creating report: 100%|██████████| 4/4 [01:00<00:00, 15.06s/it]



Overall Quality Score: 96.02%

Properties:
Column Shapes: 94.87%
Column Pair Trends: 97.17%


{'Y-data_regular': {'cov_type': 0.9576128119235201},
 'sdv_gaussian_copula': {'cov_type': 0.6838269560351897},
 'gckm': {'cov_type': 0.9602017584912201}}

### GCKM VARIES

In [20]:
dataset_name, model_name = 'cov_type', 'gckm_varies'

metadata1 = SingleTableMetadata()
metadata1.detect_from_dataframe(data = covtype_org)

# Cast every column to int before scoring; the name is rebound to the cast copy.
gckm_varies_cov_syn = gckm_varies_cov_syn.astype(int)

# Arguments: real table, GCKM-varies synthetic table, metadata.
baseline_quality_report = evaluate_quality(covtype_org, gckm_varies_cov_syn, metadata1)

# Store the overall score under sdv_score_dict[model_name][dataset_name].
score = baseline_quality_report.get_score()
sdv_score_dict.setdefault(model_name, {})[dataset_name] = score

sdv_score_dict

Creating report: 100%|██████████| 4/4 [01:00<00:00, 15.19s/it]



Overall Quality Score: 95.25%

Properties:
Column Shapes: 93.62%
Column Pair Trends: 96.88%


{'Y-data_regular': {'cov_type': 0.9576128119235201},
 'sdv_gaussian_copula': {'cov_type': 0.6838269560351897},
 'gckm': {'cov_type': 0.9602017584912201},
 'gckm_varies': {'cov_type': 0.952498451413039}}

In [46]:
# Hand-copied snapshot of the covtype scores, used for the summary tables below.
# NOTE(review): the 'gckm_varies' value here (0.9598792682498309) differs from the value
# printed by the evaluation cell above (0.952498451413039); presumably it comes from a
# different run -- confirm which one should be reported.
sdv_score_dict_table_1 = {'Y-data_regular': {'cov_type': 0.9576128119235201},
 'sdv_gaussian_copula': {'cov_type': 0.6838269560351897},
 'gckm': {'cov_type': 0.9602017584912201},
 'gckm_varies': {'cov_type': 0.9598792682498309}}

In [47]:
## This dict is directly copied from covtyep_mnist_data_generation.ipynb
# NOTE(review): the notebook name above looks misspelled ("covtyep"); verify it against
# the actual file name. The values are hand-copied and are not recomputed here.
# Layout: {model_name: {dataset_name: overall SDV quality score}}.

sdv_score_previous_dataset = {'gckm_varies': {'American_Income': 0.8078128628994186,
  'fraud_insurance': 0.749636716580705,
  'travel_insurance': 0.7337016942147853},
 'Y-data_regular': {'American_Income': 0.8197638911830831,
  'fraud_insurance': 0.8060678312888359,
  'travel_insurance': 0.9049101361577259}}

In [48]:
# Start from the previous-dataset scores, then let the covtype scores add or replace
# whole per-model entries.
merged_dict = dict(sdv_score_previous_dataset)
merged_dict.update(sdv_score_dict_table_1)

# Models present in both inputs need their per-dataset dicts combined rather than
# replaced; on a clash the covtype table's value wins.
for key in sdv_score_previous_dataset:
    if key not in sdv_score_dict_table_1:
        continue
    combined_scores = dict(sdv_score_previous_dataset[key])
    combined_scores.update(sdv_score_dict_table_1[key])
    merged_dict[key] = combined_scores

merged_dict

{'gckm_varies': {'American_Income': 0.8078128628994186,
  'fraud_insurance': 0.749636716580705,
  'travel_insurance': 0.7337016942147853,
  'cov_type': 0.9598792682498309},
 'Y-data_regular': {'American_Income': 0.8197638911830831,
  'fraud_insurance': 0.8060678312888359,
  'travel_insurance': 0.9049101361577259,
  'cov_type': 0.9576128119235201},
 'sdv_gaussian_copula': {'cov_type': 0.6838269560351897},
 'gckm': {'cov_type': 0.9602017584912201}}

In [53]:
# Rows are datasets and columns are models (built from the nested merged_dict).
sdv_score_table_summary = pd.DataFrame(merged_dict)

# Scores transcribed by hand from previous files, as (dataset row, model column, score).
# Assigning through .loc creates the 'gmcm' column the first time it is written.
copied_scores = [
    ('American_Income', 'sdv_gaussian_copula', 0.7242),
    ('American_Income', 'gckm', 0.7859),
    ('fraud_insurance', 'sdv_gaussian_copula', 0.7820),
    ('fraud_insurance', 'gckm', 0.7217),
    ('travel_insurance', 'sdv_gaussian_copula', 0.6021),
    ('travel_insurance', 'gckm', 0.8385),
    ('travel_insurance', 'gmcm', 0.9154),
    ('fraud_insurance', 'gmcm', 0.8073),
    ('American_Income', 'gmcm', 0.9064),
]
for row_label, column_label, copied_value in copied_scores:
    sdv_score_table_summary.loc[row_label, column_label] = copied_value

sdv_score_table_summary

Unnamed: 0,gckm_varies,Y-data_regular,sdv_gaussian_copula,gckm,gmcm
American_Income,0.807813,0.819764,0.7242,0.7859,0.9064
fraud_insurance,0.749637,0.806068,0.782,0.7217,0.8073
travel_insurance,0.733702,0.90491,0.6021,0.8385,0.9154
cov_type,0.959879,0.957613,0.683827,0.960202,


## MNIST_12

In [22]:
import tensorflow as tf
import numpy as np
import pandas as pd

In [23]:
# Build the 12x12 binarised MNIST table and save it as 'mnist12.csv'.
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Scale pixel values by 1/255 (x_train / x_test are rebound to the scaled arrays)
x_train, x_test = x_train / 255.0, x_test / 255.0

# Stack the training and test images into one array (train rows first)
x_combined = np.concatenate((x_train, x_test), axis=0)

# Add a trailing channel axis, then resize every image to 12x12
x_resized = tf.image.resize(x_combined[..., tf.newaxis], [12, 12])

# Flatten each image into a row of 12*12 = 144 values
x_flattened = tf.reshape(x_resized, (-1, 12*12))

# Threshold at 0.5 and store the result as integers (1 for pixels > 0.5, 0 otherwise)
x_boolean = (x_flattened.numpy() > 0.5).astype(int)

# One DataFrame row per image, one column per pixel
df_mnist_boolean = pd.DataFrame(x_boolean)

# Name the pixel columns pixel_1 ... pixel_144
column_names = [f'pixel_{i}' for i in range(1, 145)]
df_mnist_boolean.columns = column_names

# Labels concatenated in the same train-then-test order as the images
y_combined = np.concatenate((y_train, y_test), axis=0)

# Digit label stored alongside the pixels in the 'Number' column
df_mnist_boolean['Number'] = y_combined

# index = False: do not write the row index to the CSV
df_mnist_boolean.to_csv('mnist12.csv', index = False)

In [24]:
# Real MNIST-12 table followed by one synthetic table per generator.
# The file names (including the 'minist' / 'minst' spellings) are the names on disk
# and must match what the generating code wrote.
(
    mnist_12,
    y_data_mnist,
    gmcm_mnist,
    gckm_mnist,
    gckm_varies_mnist,
    sdv_mnist,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'mnist12.csv',
        'minist_12_y_data.csv',
        'gmcm_minist_12.csv',
        'gckm_minst_12.csv',
        'gckm_varies_mnist_12_new.csv',
        'sdv_minst_12.csv',
    )
)

In [25]:
# Remove the saved row-index column from the GMCM and GCKM synthetic tables.
gmcm_mnist1 = gmcm_mnist.drop(columns=['Unnamed: 0'])
gckm_mnist1 = gckm_mnist.drop(columns=['Unnamed: 0'])
# gckm_varies_mnist is used as loaded; no such drop is applied to it.

In [26]:
gckm_mnist1.head()

Unnamed: 0,pixel_1,pixel_2,pixel_3,pixel_4,pixel_5,pixel_6,pixel_7,pixel_8,pixel_9,pixel_10,...,pixel_136,pixel_137,pixel_138,pixel_139,pixel_140,pixel_141,pixel_142,pixel_143,pixel_144,Number
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,8


In [27]:
gckm_varies_mnist

Unnamed: 0,pixel_1,pixel_2,pixel_3,pixel_4,pixel_5,pixel_6,pixel_7,pixel_8,pixel_9,pixel_10,...,pixel_136,pixel_137,pixel_138,pixel_139,pixel_140,pixel_141,pixel_142,pixel_143,pixel_144,Number
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
69997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6
69998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### GMCM

In [28]:
dataset_name, model_name = 'mnist_12', 'gmcm'

metadata1 = SingleTableMetadata()
metadata1.detect_from_dataframe(data = mnist_12)

# Arguments: real MNIST-12 table, GMCM synthetic table, metadata.
baseline_quality_report = evaluate_quality(mnist_12, gmcm_mnist1, metadata1)

# Store the overall score under sdv_score_dict_table_1[model_name][dataset_name].
score = baseline_quality_report.get_score()
sdv_score_dict_table_1.setdefault(model_name, {})[dataset_name] = score

sdv_score_dict_table_1

Creating report:   0%|          | 0/4 [00:00<?, ?it/s]

Creating report: 100%|██████████| 4/4 [01:04<00:00, 16.03s/it]



Overall Quality Score: 94.04%

Properties:
Column Shapes: 89.83%
Column Pair Trends: 98.25%


{'Y-data_regular': {'cov_type': 0.9576128119235201},
 'sdv_gaussian_copula': {'cov_type': 0.6838269560351897},
 'gckm': {'cov_type': 0.6334059166169852},
 'gckm_varies': {'cov_type': 0.9598792682498309},
 'gmcm': {'mnist_12': 0.9403969842667326}}

### GCKM

In [29]:
dataset_name, model_name = 'mnist_12', 'gckm'

metadata1 = SingleTableMetadata()
metadata1.detect_from_dataframe(data = mnist_12)

# Arguments: real MNIST-12 table, GCKM synthetic table, metadata.
baseline_quality_report = evaluate_quality(mnist_12, gckm_mnist1, metadata1)

# Store the overall score under sdv_score_dict_table_1[model_name][dataset_name].
score = baseline_quality_report.get_score()
sdv_score_dict_table_1.setdefault(model_name, {})[dataset_name] = score

sdv_score_dict_table_1

Creating report:   0%|          | 0/4 [00:00<?, ?it/s]

Creating report: 100%|██████████| 4/4 [01:04<00:00, 16.01s/it]



Overall Quality Score: 94.1%

Properties:
Column Shapes: 89.83%
Column Pair Trends: 98.36%


{'Y-data_regular': {'cov_type': 0.9576128119235201},
 'sdv_gaussian_copula': {'cov_type': 0.6838269560351897},
 'gckm': {'cov_type': 0.6334059166169852, 'mnist_12': 0.9409525675203124},
 'gckm_varies': {'cov_type': 0.9598792682498309},
 'gmcm': {'mnist_12': 0.9403969842667326}}

### SDV

In [30]:
# NOTE(review): these three imports repeat the ones in the notebook's first cell.
from sdv.metadata import SingleTableMetadata
from sdv.evaluation.single_table import evaluate_quality
from sdv.single_table import GaussianCopulaSynthesizer


# Fit the Gaussian copula synthesizer on the real MNIST-12 table and draw 70000 synthetic rows.
metadata1 = SingleTableMetadata()
metadata1.detect_from_dataframe(data = mnist_12)
synthesier = GaussianCopulaSynthesizer(metadata1)
synthesier.fit(mnist_12)
synthetic_data = synthesier.sample(num_rows = 70000)

# Saved without the row index; this is the file the MNIST loading cell reads into sdv_mnist.
synthetic_data.to_csv('sdv_minst_12.csv',  index = False)

In [31]:
synthetic_data

Unnamed: 0,pixel_1,pixel_2,pixel_3,pixel_4,pixel_5,pixel_6,pixel_7,pixel_8,pixel_9,pixel_10,...,pixel_136,pixel_137,pixel_138,pixel_139,pixel_140,pixel_141,pixel_142,pixel_143,pixel_144,Number
0,0,0,0,1,1,1,1,0,0,0,...,1,1,0,1,0,1,0,0,0,9
1,0,0,0,1,1,1,1,0,0,0,...,1,1,0,1,0,1,0,0,0,1
2,0,0,0,1,1,1,1,0,0,0,...,1,1,1,1,0,1,0,0,0,3
3,0,0,0,1,1,1,1,0,0,0,...,1,1,0,1,0,1,0,0,0,4
4,0,0,0,1,1,1,1,0,0,0,...,1,1,0,1,0,1,0,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,0,0,0,1,1,1,1,0,0,0,...,1,1,0,1,0,1,0,0,0,7
69996,0,0,0,1,1,1,1,0,0,0,...,1,1,1,1,0,1,0,0,0,7
69997,0,0,0,1,1,1,1,0,0,0,...,1,1,1,1,0,1,0,0,0,9
69998,0,0,0,1,1,1,1,0,0,0,...,1,1,1,1,0,1,0,0,0,9


In [32]:
dataset_name, model_name = 'mnist_12', 'sdv_gaussian_copula'

metadata1 = SingleTableMetadata()
metadata1.detect_from_dataframe(data = mnist_12)

# Arguments: real MNIST-12 table, the in-memory SDV sample (synthetic_data, not the
# sdv_mnist table read from disk), metadata.
baseline_quality_report = evaluate_quality(mnist_12, synthetic_data, metadata1)

# Store the overall score under sdv_score_dict_table_1[model_name][dataset_name].
score = baseline_quality_report.get_score()
sdv_score_dict_table_1.setdefault(model_name, {})[dataset_name] = score

sdv_score_dict_table_1

Creating report:   0%|          | 0/4 [00:00<?, ?it/s]

Creating report: 100%|██████████| 4/4 [00:59<00:00, 14.98s/it]



Overall Quality Score: 79.88%

Properties:
Column Shapes: 62.12%
Column Pair Trends: 97.64%


{'Y-data_regular': {'cov_type': 0.9576128119235201},
 'sdv_gaussian_copula': {'cov_type': 0.6838269560351897,
  'mnist_12': 0.7988264088131569},
 'gckm': {'cov_type': 0.6334059166169852, 'mnist_12': 0.9409525675203124},
 'gckm_varies': {'cov_type': 0.9598792682498309},
 'gmcm': {'mnist_12': 0.9403969842667326}}

### GCKM VARIES

In [33]:
dataset_name, model_name = 'mnist_12', 'gckm_varies'

metadata1 = SingleTableMetadata()
metadata1.detect_from_dataframe(data = mnist_12)

# Arguments: real MNIST-12 table, GCKM-varies synthetic table (as loaded), metadata.
baseline_quality_report = evaluate_quality(mnist_12, gckm_varies_mnist, metadata1)

# Store the overall score under sdv_score_dict_table_1[model_name][dataset_name].
score = baseline_quality_report.get_score()
sdv_score_dict_table_1.setdefault(model_name, {})[dataset_name] = score

sdv_score_dict_table_1

Creating report:   0%|          | 0/4 [00:00<?, ?it/s]

Creating report: 100%|██████████| 4/4 [01:02<00:00, 15.70s/it]



Overall Quality Score: 94.05%

Properties:
Column Shapes: 89.81%
Column Pair Trends: 98.28%


{'Y-data_regular': {'cov_type': 0.9576128119235201},
 'sdv_gaussian_copula': {'cov_type': 0.6838269560351897,
  'mnist_12': 0.7988264088131569},
 'gckm': {'cov_type': 0.6334059166169852, 'mnist_12': 0.9409525675203124},
 'gckm_varies': {'cov_type': 0.9598792682498309,
  'mnist_12': 0.9404632530999799},
 'gmcm': {'mnist_12': 0.9403969842667326}}

### Y-data

In [34]:
dataset_name, model_name = 'mnist_12', 'Y-data_regular'

metadata1 = SingleTableMetadata()
metadata1.detect_from_dataframe(data = mnist_12)

# Arguments: real MNIST-12 table, Y-data synthetic table, metadata.
baseline_quality_report = evaluate_quality(mnist_12, y_data_mnist, metadata1)

# Store the overall score under sdv_score_dict_table_1[model_name][dataset_name].
score = baseline_quality_report.get_score()
sdv_score_dict_table_1.setdefault(model_name, {})[dataset_name] = score

sdv_score_dict_table_1

Creating report:   0%|          | 0/4 [00:00<?, ?it/s]

Creating report: 100%|██████████| 4/4 [00:59<00:00, 14.82s/it]



Overall Quality Score: 93.36%

Properties:
Column Shapes: 90.07%
Column Pair Trends: 96.65%


{'Y-data_regular': {'cov_type': 0.9576128119235201,
  'mnist_12': 0.9335927265758175},
 'sdv_gaussian_copula': {'cov_type': 0.6838269560351897,
  'mnist_12': 0.7988264088131569},
 'gckm': {'cov_type': 0.6334059166169852, 'mnist_12': 0.9409525675203124},
 'gckm_varies': {'cov_type': 0.9598792682498309,
  'mnist_12': 0.9404632530999799},
 'gmcm': {'mnist_12': 0.9403969842667326}}

In [54]:
# Hand-copied snapshot of the covtype and MNIST-12 scores; it replaces the dict that the
# evaluation cells filled in.
# NOTE(review): 'gckm_varies' / 'mnist_12' here (0.9414358660840151) differs from the value
# printed by the evaluation cell above (0.9404632530999799); presumably it comes from a
# different run -- confirm which one should be reported.
sdv_score_dict_table_1 = {'Y-data_regular': {'cov_type': 0.9576128119235201,
  'mnist_12': 0.9335927265758175},
 'sdv_gaussian_copula': {'cov_type': 0.6838269560351897,
  'mnist_12': 0.7988264088131569},
 'gckm': {'cov_type': 0.9602017584912201, 'mnist_12': 0.9409525675203124},
 'gckm_varies': {'cov_type': 0.9598792682498309,
  'mnist_12': 0.9414358660840151},
 'gmcm': {'mnist_12': 0.9403969842667326}}

In [55]:
sdv_score_table_summary

Unnamed: 0,gckm_varies,Y-data_regular,sdv_gaussian_copula,gckm,gmcm
American_Income,0.807813,0.819764,0.7242,0.7859,0.9064
fraud_insurance,0.749637,0.806068,0.782,0.7217,0.8073
travel_insurance,0.733702,0.90491,0.6021,0.8385,0.9154
cov_type,0.959879,0.957613,0.683827,0.960202,


In [56]:
# Turn the nested {model: {dataset: score}} dict into a table: datasets as rows, models as columns.
sdv_score_df = pd.DataFrame(data = sdv_score_dict_table_1)

sdv_score_df

Unnamed: 0,Y-data_regular,sdv_gaussian_copula,gckm,gckm_varies,gmcm
cov_type,0.957613,0.683827,0.960202,0.959879,
mnist_12,0.933593,0.798826,0.940953,0.941436,0.940397


## Table Summary for All Datasets (SDV Score)

In [57]:
# Combine both score tables: values from sdv_score_df take precedence, and cells it lacks
# (including whole rows such as the earlier datasets) are filled from sdv_score_table_summary.
sdv_score_table_summary_all_data = sdv_score_df.combine_first(sdv_score_table_summary)

sdv_score_table_summary_all_data

Unnamed: 0,Y-data_regular,gckm,gckm_varies,gmcm,sdv_gaussian_copula
American_Income,0.819764,0.7859,0.807813,0.9064,0.7242
cov_type,0.957613,0.960202,0.959879,,0.683827
fraud_insurance,0.806068,0.7217,0.749637,0.8073,0.782
mnist_12,0.933593,0.940953,0.941436,0.940397,0.798826
travel_insurance,0.90491,0.8385,0.733702,0.9154,0.6021


## CUSTOM ANALYSIS

In [8]:
def boundary_adherence(original_data, synthetic_data, column_names):
    """
    Fraction of synthetic rows whose value lies inside the real data's range.

    For each requested column, the real column's minimum and maximum define a closed
    interval; the score is the share of synthetic rows falling inside it. Values that
    fail both comparisons (for example NaN) count as not adhering.

    Parameters:
    - original_data: DataFrame containing the original data.
    - synthetic_data: DataFrame containing the synthetic data.
    - column_names: List of numerical column names to score.

    Returns:
    - Dictionary mapping each column name to its Boundary Adherence score.

    Raises:
    - ZeroDivisionError if synthetic_data has no rows and column_names is not empty.
    """
    total_rows = len(synthetic_data)
    scores = {}

    for col in column_names:
        real_values = original_data[col]
        # Inclusive on both ends, i.e. lower <= value <= upper.
        inside = synthetic_data[col].between(real_values.min(), real_values.max())
        scores[col] = int(inside.sum()) / total_rows

    return scores


def category_coverage(original_data, synthetic_data, column_names):
    """
    Share of the real data's categories that also occur in the synthetic data.

    Parameters:
    - original_data: DataFrame containing the original data.
    - synthetic_data: DataFrame containing the synthetic data.
    - column_names: List of categorical column names to score.

    Returns:
    - Dictionary mapping each column name to
      (number of real categories also present in the synthetic column)
      / (number of real categories).
    """
    def _coverage(col):
        # Distinct values seen in the real column, and which of them the synthetic column reproduces.
        real_categories = set(original_data[col].unique())
        reproduced = real_categories & set(synthetic_data[col].unique())
        return len(reproduced) / len(real_categories)

    return {col: _coverage(col) for col in column_names}


def range_coverage(original_data, synthetic_data, column_names):
    """
    Compute the Range Coverage for multiple numerical columns.

    For each column the score is

        max(1 - max((min_s - min_r) / (max_r - min_r), 0)
              - max((max_r - max_s) / (max_r - min_r), 0), 0)

    where r is the real column and s the synthetic one. It is 1.0 when the synthetic
    values reach both real extremes and 0.0 when they cover none of the real range.

    Parameters:
    - original_data: DataFrame containing the original data.
    - synthetic_data: DataFrame containing the synthetic data.
    - column_names: List of numerical column names to compute Range Coverage for.

    Returns:
    - Dictionary with Range Coverage scores for the specified columns. A column whose
      real range has zero width (constant) or is undefined (all NaN) gets NaN, because
      the score is a fraction of that width; NaN entries are skipped by DataFrame.mean().
    """
    coverage_results = {}

    for column_name in column_names:
        # Support (range) of the original data
        min_r = original_data[column_name].min()
        max_r = original_data[column_name].max()
        real_width = max_r - min_r

        # `not (width > 0)` is also true for a NaN width, so both the constant and the
        # all-NaN case are reported as NaN instead of dividing by zero (which could
        # otherwise produce +/-inf and distort any average taken afterwards).
        if not (real_width > 0):
            coverage_results[column_name] = float('nan')
            continue

        # Support (range) of the synthetic data
        min_s = synthetic_data[column_name].min()
        max_s = synthetic_data[column_name].max()

        # Portion of the real range missed at the low end and at the high end
        missed_low = max((min_s - min_r) / real_width, 0)
        missed_high = max((max_r - max_s) / real_width, 0)

        # Clamp at 0: when the synthetic range lies entirely outside the real one the
        # raw value is negative, which is not a meaningful coverage.
        coverage_results[column_name] = max(1 - missed_low - missed_high, 0.0)

    return coverage_results

### COVTYPE

In [12]:
# only select the numerical columns
numerical_column = covtype_org.columns[(covtype_org.dtypes =='int64') | (covtype_org.dtypes=='float64')].tolist()
# List of model names
model_names = ["sdv_copula", "gckm", "gckm_varies", 'Y-data']

# List of datasets corresponding to the model names
datasets = [sdv_cov_syn, gckm_cov_syn, gckm_varies_cov_syn, y_data_cov_syn]

final_result_boundary_adherence = {}

# Iterate over model names and datasets
for model_name, dataset in zip(model_names, datasets):
    result = boundary_adherence(covtype_org, dataset, numerical_column)
    final_result_boundary_adherence[model_name] = result

final_result_boundary_adherence_df = pd.DataFrame(final_result_boundary_adherence)
avg_col = final_result_boundary_adherence_df.mean()
final_result_boundary_adherence_df = pd.concat([final_result_boundary_adherence_df, avg_col.to_frame().T], ignore_index=True)
final_result_boundary_adherence_df.index= numerical_column + ["Average"]
final_result_boundary_adherence_df.index.name = "Boundary Adherence For Feature"
# final_result_boundary_adherence_df

In [38]:
# Keep only the last row of the table, which is the "Average" row appended above.
ba_covtype = final_result_boundary_adherence_df.iloc[-1:]

ba_covtype

Unnamed: 0_level_0,sdv_copula,gckm,gckm_varies,Y-data
Boundary Adherence For Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Average,1.0,1.0,1.0,0.992283


In [13]:
# Range coverage of every synthetic covtype table, with one column per model.
range_coverage_column = covtype_org.columns[(covtype_org.dtypes =='int64') | (covtype_org.dtypes=='float64')].tolist()
# Model labels and their synthetic tables, paired by position.
model_names = ["sdv_copula", "gckm", "gckm_varies", 'Y-data']
datasets = [sdv_cov_syn, gckm_cov_syn, gckm_varies_cov_syn, y_data_cov_syn]

# A comprehension keeps the loop variables out of the notebook namespace, so the
# global `model_name` used by the score cells is not overwritten.
final_result_range_coverage = {
    name: range_coverage(covtype_org, synthetic_table, range_coverage_column)
    for name, synthetic_table in zip(model_names, datasets)
}

# Align rows to the feature list by label, instead of discarding the index and
# relabelling by position, so a row can never end up under the wrong feature name.
final_result_range_coverage_df = pd.DataFrame(final_result_range_coverage).reindex(range_coverage_column)
avg_col = final_result_range_coverage_df.mean()
final_result_range_coverage_df.loc["Average"] = avg_col
final_result_range_coverage_df.index.name = 'Range Coverage For Feature'
final_result_range_coverage_df

Unnamed: 0_level_0,sdv_copula,gckm,gckm_varies,Y-data
Range Coverage For Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Elevation,0.944472,1.0,0.976988,1.0
Aspect,1.0,1.0,1.0,1.0
Slope,1.0,0.756296,0.939394,0.848485
Horizontal_Distance_To_Hydrology,1.0,0.908529,0.981389,0.930565
Vertical_Distance_To_Hydrology,0.667959,0.614279,0.627429,0.640827
Horizontal_Distance_To_Roadways,0.995925,1.0,1.0,1.0
Hillshade_9am,0.80315,0.661957,1.0,0.811024
Hillshade_Noon,0.677165,0.489375,0.484844,0.531496
Hillshade_3pm,1.0,1.0,0.980315,1.0
Horizontal_Distance_To_Fire_Points,1.0,1.0,0.988568,1.0


In [14]:
# Keep only the last row of the table, which is the "Average" row appended above.
rc_covtype = final_result_range_coverage_df.iloc[-1:]

In [15]:
rc_covtype


Unnamed: 0_level_0,sdv_copula,gckm,gckm_varies,Y-data
Range Coverage For Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Average,0.634946,0.698942,0.70996,0.632044


In [42]:
# Summary table for covtype: one row per custom metric, one column per model.
custom_ana = ['Range Coverage', 'Boundary Adherence']
model_name = ['gckm', 'gckm_varies', 'gmcm', 'sdv_copula', 'Y-data']


custom_table_covtype = pd.DataFrame(index=custom_ana, columns=model_name)

# Average range coverage per model; models that were not evaluated (no column in
# rc_covtype) get a real NaN rather than the string "nan".
range_coverage_average = rc_covtype.loc['Average']
for model in model_name:
    # Single .loc[row, col] assignment: the chained form df.loc[row][col] = value
    # may write to a temporary copy and leave the table unchanged.
    custom_table_covtype.loc['Range Coverage', model] = range_coverage_average.get(model, np.nan)


In [43]:
custom_ana = ['Range Coverage', 'Boundary Adherence']
model_name = ['gckm', 'gckm_varies', 'gmcm', 'sdv_copula', 'Y-data']

# custom_table_covtype is created by the previous cell; this cell only fills its
# 'Boundary Adherence' row.

# Average boundary adherence per model; models that were not evaluated (no column in
# ba_covtype) get a real NaN rather than the string "nan".
boundary_adherence_average = ba_covtype.loc['Average']
for model in model_name:
    # Single .loc[row, col] assignment: the chained form df.loc[row][col] = value
    # may write to a temporary copy and leave the table unchanged.
    custom_table_covtype.loc['Boundary Adherence', model] = boundary_adherence_average.get(model, np.nan)

In [44]:
custom_table_covtype

Unnamed: 0,gckm,gckm_varies,gmcm,sdv_copula,Y-data
Range Coverage,0.698942,0.581481,,0.580065,0.632044
Boundary Adherence,1.0,1.0,,1.0,0.992283


### MNIST

In [45]:
# only select the numerical columns
numerical_column = mnist_12.columns[(mnist_12.dtypes =='int64') | (mnist_12.dtypes=='float64')].tolist()
# List of model names
model_names = ["sdv_copula", "gckm", "gckm_varies", 'gmcm', 'Y-data']

# List of datasets corresponding to the model names
datasets = [sdv_mnist, gckm_mnist1, gckm_varies_mnist, gmcm_mnist1, y_data_mnist]

final_result_boundary_adherence = {}

# Iterate over model names and datasets
for model_name, dataset in zip(model_names, datasets):
    result = boundary_adherence(mnist_12, dataset, numerical_column)
    final_result_boundary_adherence[model_name] = result

final_result_boundary_adherence_df = pd.DataFrame(final_result_boundary_adherence)
avg_col = final_result_boundary_adherence_df.mean()
final_result_boundary_adherence_df = pd.concat([final_result_boundary_adherence_df, avg_col.to_frame().T], ignore_index=True)
final_result_boundary_adherence_df.index= numerical_column + ["Average"]
final_result_boundary_adherence_df.index.name = "Boundary Adherence For Feature"
final_result_boundary_adherence_df

ba_mnist = final_result_boundary_adherence_df.tail(1)

In [46]:
# Range coverage of every synthetic MNIST-12 table, with one column per model.
range_coverage_column = mnist_12.columns[(mnist_12.dtypes =='int64') | (mnist_12.dtypes=='float64')].tolist()
# Model labels and their synthetic tables, paired by position.
model_names = ["sdv_copula", "gckm", "gckm_varies", 'gmcm', 'Y-data']
datasets = [sdv_mnist, gckm_mnist1, gckm_varies_mnist, gmcm_mnist1, y_data_mnist]

# A comprehension keeps the loop variables out of the notebook namespace, so the
# global `model_name` used by the score cells is not overwritten.
final_result_range_coverage = {
    name: range_coverage(mnist_12, synthetic_table, range_coverage_column)
    for name, synthetic_table in zip(model_names, datasets)
}

# Align rows to the feature list by label, instead of discarding the index and
# relabelling by position, so a row can never end up under the wrong feature name.
final_result_range_coverage_df = pd.DataFrame(final_result_range_coverage).reindex(range_coverage_column)
avg_col = final_result_range_coverage_df.mean()
final_result_range_coverage_df.loc["Average"] = avg_col
final_result_range_coverage_df.index.name = 'Range Coverage For Feature'

# Only the "Average" row (the last one) is carried into the summary table.
rc_mnist = final_result_range_coverage_df.tail(1)

In [47]:
# Summary table for MNIST-12: one row per custom metric, one column per model.
custom_ana = ['Range Coverage', 'Boundary Adherence']
model_name = ['gckm', 'gckm_varies', 'gmcm', 'sdv_copula', 'Y-data']


custom_table_mnist = pd.DataFrame(index=custom_ana, columns=model_name)

# Per-model averages for each metric; a model with no column in the source table gets
# a real NaN rather than the string "nan".
metric_averages = {
    'Range Coverage': rc_mnist.loc['Average'],
    'Boundary Adherence': ba_mnist.loc['Average'],
}
for metric, averages in metric_averages.items():
    for model in model_name:
        # Single .loc[row, col] assignment: the chained form df.loc[row][col] = value
        # may write to a temporary copy and leave the table unchanged.
        custom_table_mnist.loc[metric, model] = averages.get(model, np.nan)

In [48]:
custom_table_covtype

Unnamed: 0,gckm,gckm_varies,gmcm,sdv_copula,Y-data
Range Coverage,0.698942,0.581481,,0.580065,0.632044
Boundary Adherence,1.0,1.0,,1.0,0.992283


In [49]:
custom_table_mnist

Unnamed: 0,gckm,gckm_varies,gmcm,sdv_copula,Y-data
Range Coverage,0.951613,0.967742,0.830645,0.669355,0.556452
Boundary Adherence,1.0,1.0,1.0,1.0,0.998833


### American Income

In [71]:
# American Income: real table plus the two synthetic tables evaluated in this notebook.
AMI_org = pd.read_csv('American_Income.csv')
AMI_y_data = pd.read_csv('American_Income_y_data.csv')
AMI_GCKM_VARIES = pd.read_csv('gckm_varies_AMI.csv')
# only select the numerical columns
numerical_column = AMI_org.columns[(AMI_org.dtypes =='int64') | (AMI_org.dtypes=='float64')].tolist()
# Model labels and their synthetic tables, paired by position.
model_names = ['Y-data','gckm_varies']
datasets = [AMI_y_data, AMI_GCKM_VARIES]

# A comprehension keeps the loop variables out of the notebook namespace, so the
# global `model_name` used by the score cells is not overwritten.
final_result_boundary_adherence = {
    name: boundary_adherence(AMI_org, synthetic_table, numerical_column)
    for name, synthetic_table in zip(model_names, datasets)
}

# Align rows to the feature list by label, instead of discarding the index and
# relabelling by position, so a row can never end up under the wrong feature name.
final_result_boundary_adherence_df = pd.DataFrame(final_result_boundary_adherence).reindex(numerical_column)
avg_col = final_result_boundary_adherence_df.mean()
final_result_boundary_adherence_df.loc["Average"] = avg_col
final_result_boundary_adherence_df.index.name = "Boundary Adherence For Feature"
final_result_boundary_adherence_df

Unnamed: 0_level_0,Y-data,gckm_varies
Boundary Adherence For Feature,Unnamed: 1_level_1,Unnamed: 2_level_1
age,0.944903,1.0
fnlwgt,0.950984,1.0
education-num,0.996468,1.0
capital-gain,0.555388,1.0
capital-loss,0.58871,1.0
hours-per-week,0.999386,1.0
Average,0.839307,1.0


In [72]:
# NOTE(review): this is an alias, not a copy -- the columns added below also appear on
# final_result_boundary_adherence_df.
ba_custom_table_AMI = final_result_boundary_adherence_df

# Hard-coded scores of 1.0 for all 7 rows (6 features plus "Average") for models that
# are not evaluated in this notebook.
# NOTE(review): presumably transcribed from earlier runs; the source is not shown here -- confirm.
ba_custom_table_AMI['gmcm'] = [1.0]*7

ba_custom_table_AMI['gckm'] = [1.0]*7

ba_custom_table_AMI['sdv_copula'] = [1.0]*7

ba_custom_table_AMI

Unnamed: 0_level_0,Y-data,gckm_varies,gmcm,gckm,sdv_copula
Boundary Adherence For Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
age,0.944903,1.0,1.0,1.0,1.0
fnlwgt,0.950984,1.0,1.0,1.0,1.0
education-num,0.996468,1.0,1.0,1.0,1.0
capital-gain,0.555388,1.0,1.0,1.0,1.0
capital-loss,0.58871,1.0,1.0,1.0,1.0
hours-per-week,0.999386,1.0,1.0,1.0,1.0
Average,0.839307,1.0,1.0,1.0,1.0


In [73]:
# Category coverage of the American Income synthetic tables, with one column per model.
# Categorical features are the columns stored with the 'object' dtype.
category_colum = [col for col in AMI_org.columns if AMI_org[col].dtypes == 'object']

# Model labels and their synthetic tables, paired by position.
model_names = ['Y-data', 'gckm_varies']
datasets = [AMI_y_data, AMI_GCKM_VARIES]

# NOTE: despite its name, this dict holds *category* coverage results in this cell; the
# variable name is kept unchanged so any other cell that refers to it still works.
# A comprehension keeps the loop variables out of the notebook namespace, so the
# global `model_name` used by the score cells is not overwritten.
final_result_range_coverage = {
    name: category_coverage(AMI_org, synthetic_table, category_colum)
    for name, synthetic_table in zip(model_names, datasets)
}

# Align rows to the feature list by label, instead of discarding the index and
# relabelling by position, so a row can never end up under the wrong feature name.
final_result_category_coverage_df = pd.DataFrame(final_result_range_coverage).reindex(category_colum)
avg_col = final_result_category_coverage_df.mean()
final_result_category_coverage_df.loc["Average"] = avg_col
final_result_category_coverage_df.index.name = 'Categorical Features'
final_result_category_coverage_df

Unnamed: 0_level_0,Y-data,gckm_varies
Categorical Features,Unnamed: 1_level_1,Unnamed: 2_level_1
workclass,0.777778,0.777778
education,0.9375,1.0
marital-status,0.857143,1.0
occupation,0.933333,1.0
relationship,1.0,1.0
race,1.0,1.0
sex,1.0,1.0
native-country,0.404762,0.857143
salary,1.0,1.0
Average,0.878946,0.959436


In [74]:
# NOTE(review): this is an alias, not a copy -- the columns added below also appear on
# final_result_category_coverage_df.
cc_custom_table_AMI = final_result_category_coverage_df

# Hard-coded scores for models not evaluated in this notebook: 10 values each, in row
# order (9 categorical features, then "Average").
# NOTE(review): presumably transcribed from earlier runs; the source is not shown here -- confirm.
cc_custom_table_AMI['gmcm'] = [1.0]*10

cc_custom_table_AMI['gckm'] = [1.0000, 1.0000, 0.8571, 1.0000, 1.0000, 1.0000, 1.0000, 0.9048, 1.0000, 0.9735]

cc_custom_table_AMI['sdv_copula'] = [1.0000, 1.0000, 1.0000,1.0000, 1.0000, 1.0000, 1.0000, 0.6667, 1.0000, 0.9630]

cc_custom_table_AMI

Unnamed: 0_level_0,Y-data,gckm_varies,gmcm,gckm,sdv_copula
Categorical Features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
workclass,0.777778,0.777778,1.0,1.0,1.0
education,0.9375,1.0,1.0,1.0,1.0
marital-status,0.857143,1.0,1.0,0.8571,1.0
occupation,0.933333,1.0,1.0,1.0,1.0
relationship,1.0,1.0,1.0,1.0,1.0
race,1.0,1.0,1.0,1.0,1.0
sex,1.0,1.0,1.0,1.0,1.0
native-country,0.404762,0.857143,1.0,0.9048,0.6667
salary,1.0,1.0,1.0,1.0,1.0
Average,0.878946,0.959436,1.0,0.9735,0.963


In [75]:
# Range coverage of the American Income synthetic tables, with one column per model.
range_coverage_column = AMI_org.columns[(AMI_org.dtypes =='int64') | (AMI_org.dtypes=='float64')].tolist()
# Model labels and their synthetic tables, paired by position.
model_names = ['Y-data','gckm_varies']
datasets = [AMI_y_data, AMI_GCKM_VARIES]

# A comprehension keeps the loop variables out of the notebook namespace, so the
# global `model_name` used by the score cells is not overwritten.
final_result_range_coverage = {
    name: range_coverage(AMI_org, synthetic_table, range_coverage_column)
    for name, synthetic_table in zip(model_names, datasets)
}

# Align rows to the feature list by label, instead of discarding the index and
# relabelling by position, so a row can never end up under the wrong feature name.
final_result_range_coverage_df = pd.DataFrame(final_result_range_coverage).reindex(range_coverage_column)
avg_col = final_result_range_coverage_df.mean()
final_result_range_coverage_df.loc["Average"] = avg_col
final_result_range_coverage_df.index.name = 'Range Coverage For Feature'
final_result_range_coverage_df

Unnamed: 0_level_0,Y-data,gckm_varies
Range Coverage For Feature,Unnamed: 1_level_1,Unnamed: 2_level_1
age,1.0,1.0
fnlwgt,0.429001,0.414447
education-num,1.0,1.0
capital-gain,0.328643,1.0
capital-loss,0.447888,1.0
hours-per-week,0.867347,0.987034
Average,0.678813,0.900247


In [76]:
# NOTE(review): this is an alias, not a copy -- the columns added below also appear on
# final_result_range_coverage_df.
rc_custom_table_AMI = final_result_range_coverage_df

# Hard-coded scores for models not evaluated in this notebook: 7 values each, in row
# order (6 numerical features, then "Average").
# NOTE(review): presumably transcribed from earlier runs; the source is not shown here -- confirm.
rc_custom_table_AMI['gmcm'] = [1.00000, 0.811228, 1.00000, 0.150242, 0.433114, 1.00000, 0.732597]

rc_custom_table_AMI['gckm'] = [0.896174, 1.000000, 1.000000, 0.597281, 0.332263, 0.954808, 0.805138]

rc_custom_table_AMI['sdv_copula'] = [1.000000, 0.54984, 1.000000, 1.000000, 0.82438, 0.949808, 0.88720]

rc_custom_table_AMI

Unnamed: 0_level_0,Y-data,gckm_varies,gmcm,gckm,sdv_copula
Range Coverage For Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
age,1.0,1.0,1.0,0.896174,1.0
fnlwgt,0.429001,0.414447,0.811228,1.0,0.54984
education-num,1.0,1.0,1.0,1.0,1.0
capital-gain,0.328643,1.0,0.150242,0.597281,1.0
capital-loss,0.447888,1.0,0.433114,0.332263,0.82438
hours-per-week,0.867347,0.987034,1.0,0.954808,0.949808
Average,0.678813,0.900247,0.732597,0.805138,0.8872


### Travel Insurance

In [56]:
# Import the synthesizer class directly. The previous form aliased the module as
# `G` and then rebound `G` to an instance, shadowing the module it came from.
from GCKM import GaussianCopulaKmeansSynthesizer

TI_org = pd.read_csv('travel insurance.csv')

# `G` stays bound to the synthesizer instance because a later cell reads
# G.synthetic_data.
G = GaussianCopulaKmeansSynthesizer(TI_org)

# The pipeline stages are invoked one by one, in this order.
G._identify_columns()
G.convert_datetime_to_numerical()
G.handle_missing_values()
G.assign_intervals()
G.preprocess_data()
G.get_distribution()
G.calculate_cdfs()
G.standard_gaussian_all()
G.optimal_clusters_dynamic()
G.get_Kmeans()
G.generate_data()
# NOTE(review): 63326 is hard-coded; presumably the row count of TI_org - confirm.
G.generate_synthetic_data(63326)
# Last expression of the cell, so its return value is what the notebook displays.
G.post_process()

  0%|          | 0/7 [00:00<?, ?it/s]

100%|██████████| 7/7 [02:45<00:00, 23.67s/it]
100%|██████████| 63326/63326 [01:27<00:00, 723.93it/s]


Unnamed: 0,Distribution Channel,Destination,Agency Type,Product Name,Gender,Claim,Duration,Net Sales,Agency,Age,Commision (in value)
0,Online,SINGAPORE,Travel Agency,Comprehensive Plan,F,No,28.551769,48.209894,JZI,44.059965,3.041092e+01
1,Online,UNITED ARAB EMIRATES,Travel Agency,2 way Comprehensive Plan,F,No,96.771106,82.908352,EPX,22.987774,2.074032e+00
2,Online,INDIA,Airlines,Rental Vehicle Excess Insurance,,No,-2.000000,52.764148,EPX,50.986516,3.405688e+01
3,Online,UNITED KINGDOM,Travel Agency,Cancellation Plan,,No,4.935964,24.268610,EPX,15.281905,1.722834e-02
4,Online,SINGAPORE,Travel Agency,2 way Comprehensive Plan,,No,81.995165,36.276668,EPX,40.037429,2.938991e+01
...,...,...,...,...,...,...,...,...,...,...,...
63321,Online,INDONESIA,Travel Agency,Cancellation Plan,F,No,112.610888,85.662025,C2B,21.905140,1.892702e+01
63322,Online,SINGAPORE,Travel Agency,Cancellation Plan,F,No,27.628936,-50.862684,EPX,17.329980,4.973799e-14
63323,Online,INDIA,Travel Agency,Rental Vehicle Excess Insurance,M,No,33.446542,-34.097021,EPX,39.851720,1.334590e+01
63324,Online,MALAYSIA,Travel Agency,1 way Comprehensive Plan,M,No,174.333868,35.692013,JZI,53.924408,6.042425e+00


In [57]:
# Re-read the original Travel Insurance data and display its column labels.
TI_org = pd.read_csv('travel insurance.csv')
TI_org.columns

Index(['Agency', 'Agency Type', 'Distribution Channel', 'Product Name',
       'Claim', 'Duration', 'Destination', 'Net Sales', 'Commision (in value)',
       'Gender', 'Age'],
      dtype='object')

In [17]:
# Take the synthesizer's generated frame, put its columns in the same order as
# the original data, and write it to disk without the index column.
gckm_TI = G.synthetic_data[TI_org.columns]
gckm_TI.to_csv('travel_insurance_gckm.csv', index=False)

In [19]:
# Load the original Travel Insurance data and the synthetic datasets to score.
TI_org = pd.read_csv('travel insurance.csv')
TI_y_data = pd.read_csv('travel_insurance_y_data.csv')
gckm_TI = pd.read_csv('travel_insurance_gckm.csv')
gckm_varies_TI = pd.read_csv('gckm_varies_TI.csv')

# Only int64 / float64 columns are scored.
numerical_column = [
    col for col, dtype in TI_org.dtypes.items() if dtype in ('int64', 'float64')
]

# Model labels and their synthetic datasets, paired by position.
model_names = ['Y-data', 'gckm', 'gckm_varies']
datasets = [TI_y_data, gckm_TI, gckm_varies_TI]

# One entry per model, holding whatever boundary_adherence returns for it.
final_result_boundary_adherence = {
    label: boundary_adherence(TI_org, synthetic, numerical_column)
    for label, synthetic in zip(model_names, datasets)
}

# Append the per-model mean as a final "Average" row, then label rows by feature.
per_feature_scores = pd.DataFrame(final_result_boundary_adherence)
final_result_boundary_adherence_df = pd.concat(
    [per_feature_scores, per_feature_scores.mean().to_frame().T],
    ignore_index=True,
)
final_result_boundary_adherence_df.index = pd.Index(
    numerical_column + ["Average"], name="Boundary Adherence For Feature"
)
final_result_boundary_adherence_df

Unnamed: 0_level_0,Y-data,gckm,gckm_varies
Boundary Adherence For Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Duration,0.820027,1.0,1.0
Net Sales,1.0,1.0,1.0
Commision (in value),0.691406,1.0,1.0
Age,0.999305,1.0,1.0
Average,0.877685,1.0,1.0


In [20]:
# Extend the computed Travel Insurance boundary-adherence table with hard-coded
# scores for additional models. Work on a copy so the frame displayed by the
# previous cell is not mutated through an alias.
ba_custom_table_TI = final_result_boundary_adherence_df.copy()

# 'gckm' is already a computed column of this table, so it is no longer
# overwritten with hard-coded values here.
# A scalar is broadcast to every row (features plus "Average"), which avoids
# hard-coding the number of rows.
ba_custom_table_TI['gmcm'] = 1.0

ba_custom_table_TI['sdv_copula'] = 1.0

ba_custom_table_TI

Unnamed: 0_level_0,Y-data,gckm,gckm_varies,gmcm,sdv_copula
Boundary Adherence For Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Duration,0.820027,1.0,1.0,1.0,1.0
Net Sales,1.0,1.0,1.0,1.0,1.0
Commision (in value),0.691406,1.0,1.0,1.0,1.0
Age,0.999305,1.0,1.0,1.0,1.0
Average,0.877685,1.0,1.0,1.0,1.0


In [21]:
# Only int64 / float64 columns are scored.
range_coverage_column = [
    col for col, dtype in TI_org.dtypes.items() if dtype in ('int64', 'float64')
]

# Model labels and their synthetic datasets, paired by position.
model_names = ['Y-data', 'gckm', 'gckm_varies']
datasets = [TI_y_data, gckm_TI, gckm_varies_TI]

# One entry per model, holding whatever range_coverage returns for it.
final_result_range_coverage = {
    label: range_coverage(TI_org, synthetic, range_coverage_column)
    for label, synthetic in zip(model_names, datasets)
}

# Append the per-model mean as a final "Average" row, then label rows by feature.
per_feature_scores = pd.DataFrame(final_result_range_coverage)
final_result_range_coverage_df = pd.concat(
    [per_feature_scores, per_feature_scores.mean().to_frame().T],
    ignore_index=True,
)
final_result_range_coverage_df.index = pd.Index(
    range_coverage_column + ["Average"], name='Range Coverage For Feature'
)
final_result_range_coverage_df

Unnamed: 0_level_0,Y-data,gckm,gckm_varies
Range Coverage For Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Duration,0.356134,0.079746,4.6e-05
Net Sales,0.601135,0.372667,0.241156
Commision (in value),0.97011,1.0,0.261143
Age,1.0,0.834738,0.803903
Average,0.731845,0.571788,0.326562


In [22]:
# Extend the computed Travel Insurance range-coverage table with hard-coded
# scores for additional models. Work on a copy so the frame displayed by the
# previous cell is not mutated through an alias.
rc_custom_table_TI = final_result_range_coverage_df.copy()

# Each list follows the table's row order: Duration, Net Sales,
# Commision (in value), Age, Average.
# 'gckm' is already a computed column of this table, so it is not set here.
rc_custom_table_TI['gmcm'] = [1.000000, 0.861719, 0.937867, 0.920262, 0.929962]

rc_custom_table_TI['sdv_copula'] = [0.137825, 0.292744, 0.631146, 0.855932, 0.479412]

rc_custom_table_TI

Unnamed: 0_level_0,Y-data,gckm,gckm_varies,gmcm,sdv_copula
Range Coverage For Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Duration,0.356134,0.079746,4.6e-05,1.0,0.137825
Net Sales,0.601135,0.372667,0.241156,0.861719,0.292744
Commision (in value),0.97011,1.0,0.261143,0.937867,0.631146
Age,1.0,0.834738,0.803903,0.920262,0.855932
Average,0.731845,0.571788,0.326562,0.929962,0.479412


In [23]:
# Columns stored with object dtype are the ones scored for category coverage.
category_colum = [col for col in TI_org.columns if TI_org[col].dtypes == 'object']

model_names = ['Y-data', 'gckm', 'gckm_varies']

# List of datasets corresponding to the model names
datasets = [TI_y_data, gckm_TI, gckm_varies_TI]

# Accumulate under a category-specific name: reusing final_result_range_coverage
# here overwrote the range-coverage results with category-coverage scores.
final_result_category_coverage = {}

# Iterate over model names and datasets
for model_name, dataset in zip(model_names, datasets):
    final_result_category_coverage[model_name] = category_coverage(TI_org, dataset, category_colum)

# Append the per-model mean as a final "Average" row, then label rows by feature.
final_result_category_coverage_df = pd.DataFrame(final_result_category_coverage)
avg_col = final_result_category_coverage_df.mean()
final_result_category_coverage_df = pd.concat(
    [final_result_category_coverage_df, avg_col.to_frame().T], ignore_index=True
)
final_result_category_coverage_df.index = category_colum + ["Average"]
final_result_category_coverage_df.index.name = 'Categorical Features'
final_result_category_coverage_df

Unnamed: 0_level_0,Y-data,gckm,gckm_varies
Categorical Features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Agency,1.0,1.0,0.8125
Agency Type,1.0,1.0,1.0
Distribution Channel,1.0,1.0,1.0
Product Name,0.884615,1.0,0.576923
Claim,1.0,1.0,1.0
Destination,0.328859,0.85906,0.187919
Gender,1.0,1.0,1.0
Average,0.887639,0.979866,0.796763


In [24]:
# Extend the computed Travel Insurance category-coverage table with hard-coded
# scores for additional models. Work on a copy so the frame displayed by the
# previous cell is not mutated through an alias.
cc_custom_table_TI = final_result_category_coverage_df.copy()

# Each list follows the table's row order: Agency, Agency Type,
# Distribution Channel, Product Name, Claim, Destination, Gender, Average.
# 'gckm' is already a computed column of this table, so it is not set here.
cc_custom_table_TI['gmcm'] = [1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 0.791946, 1.000000, 0.970278]

cc_custom_table_TI['sdv_copula'] = [1.000000, 1.000000, 0.500000, 1.000000, 1.000000, 0.738255, 1.000000, 0.891179]

cc_custom_table_TI

Unnamed: 0_level_0,Y-data,gckm,gckm_varies,gmcm,sdv_copula
Categorical Features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Agency,1.0,1.0,0.8125,1.0,1.0
Agency Type,1.0,1.0,1.0,1.0,1.0
Distribution Channel,1.0,1.0,1.0,1.0,0.5
Product Name,0.884615,1.0,0.576923,1.0,1.0
Claim,1.0,1.0,1.0,1.0,1.0
Destination,0.328859,0.85906,0.187919,0.791946,0.738255
Gender,1.0,1.0,1.0,1.0,1.0
Average,0.887639,0.979866,0.796763,0.970278,0.891179


### Fraud Insurance

In [25]:
# Load the original fraud-insurance data and the synthetic datasets to score.
FI_org = pd.read_csv('fraud_insurance_claims.csv')
FI_y_data = pd.read_csv('fraud_insurance_y_data.csv')
FI_gckm_varies = pd.read_csv('gckm_varies_FI.csv')

# Only int64 / float64 columns are scored.
numerical_column = [
    col for col, dtype in FI_org.dtypes.items() if dtype in ('int64', 'float64')
]

# Model labels and their synthetic datasets, paired by position.
model_names = ['Y-data', 'gckm_varies']
datasets = [FI_y_data, FI_gckm_varies]

# One entry per model, holding whatever boundary_adherence returns for it.
final_result_boundary_adherence = {
    label: boundary_adherence(FI_org, synthetic, numerical_column)
    for label, synthetic in zip(model_names, datasets)
}

# Append the per-model mean as a final "Average" row, then label rows by feature.
per_feature_scores = pd.DataFrame(final_result_boundary_adherence)
final_result_boundary_adherence_df = pd.concat(
    [per_feature_scores, per_feature_scores.mean().to_frame().T],
    ignore_index=True,
)
final_result_boundary_adherence_df.index = pd.Index(
    numerical_column + ["Average"], name="Boundary Adherence For Feature"
)
final_result_boundary_adherence_df

Unnamed: 0_level_0,Y-data,gckm_varies
Boundary Adherence For Feature,Unnamed: 1_level_1,Unnamed: 2_level_1
months_as_customer,0.956,1.0
age,0.984,1.0
policy_number,0.921,1.0
policy_deductable,0.788,1.0
policy_annual_premium,1.0,1.0
umbrella_limit,0.827,1.0
insured_zip,0.783,1.0
capital-gains,0.809,1.0
capital-loss,0.835,1.0
incident_hour_of_the_day,0.935,1.0


In [26]:
# Extend the computed fraud-insurance boundary-adherence table with hard-coded
# scores for additional models. Work on a copy so the frame displayed by the
# previous cell is not mutated through an alias.
ba_custom_table_FI = final_result_boundary_adherence_df.copy()

# A scalar is broadcast to every row of the table, replacing the hard-coded
# `[1.0] * 19`, which only worked while the table had exactly 19 rows.
ba_custom_table_FI['gmcm'] = 1.0
ba_custom_table_FI['gckm'] = 1.0

ba_custom_table_FI['sdv_copula'] = 1.0

ba_custom_table_FI

Unnamed: 0_level_0,Y-data,gckm_varies,gmcm,gckm,sdv_copula
Boundary Adherence For Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
months_as_customer,0.956,1.0,1.0,1.0,1.0
age,0.984,1.0,1.0,1.0,1.0
policy_number,0.921,1.0,1.0,1.0,1.0
policy_deductable,0.788,1.0,1.0,1.0,1.0
policy_annual_premium,1.0,1.0,1.0,1.0,1.0
umbrella_limit,0.827,1.0,1.0,1.0,1.0
insured_zip,0.783,1.0,1.0,1.0,1.0
capital-gains,0.809,1.0,1.0,1.0,1.0
capital-loss,0.835,1.0,1.0,1.0,1.0
incident_hour_of_the_day,0.935,1.0,1.0,1.0,1.0


In [27]:
# Only int64 / float64 columns are scored.
range_coverage_column = [
    col for col, dtype in FI_org.dtypes.items() if dtype in ('int64', 'float64')
]

# Model labels and their synthetic datasets, paired by position.
model_names = ['Y-data', 'gckm_varies']
datasets = [FI_y_data, FI_gckm_varies]

# One entry per model, holding whatever range_coverage returns for it.
final_result_range_coverage = {
    label: range_coverage(FI_org, synthetic, range_coverage_column)
    for label, synthetic in zip(model_names, datasets)
}

# Append the per-model mean as a final "Average" row, then label rows by feature.
per_feature_scores = pd.DataFrame(final_result_range_coverage)
final_result_range_coverage_df = pd.concat(
    [per_feature_scores, per_feature_scores.mean().to_frame().T],
    ignore_index=True,
)
final_result_range_coverage_df.index = pd.Index(
    range_coverage_column + ["Average"], name='Range Coverage For Feature'
)
final_result_range_coverage_df

Unnamed: 0_level_0,Y-data,gckm_varies
Range Coverage For Feature,Unnamed: 1_level_1,Unnamed: 2_level_1
months_as_customer,1.0,1.0
age,1.0,1.0
policy_number,1.0,0.990419
policy_deductable,1.0,1.0
policy_annual_premium,0.908384,0.902342
umbrella_limit,0.721002,0.909091
insured_zip,1.0,0.0
capital-gains,1.0,1.0
capital-loss,1.0,1.0
incident_hour_of_the_day,1.0,0.992229


In [28]:
# Extend the computed fraud-insurance range-coverage table with hard-coded
# scores for additional models. Work on a copy so the frame displayed by the
# previous cell is not mutated through an alias.
rc_custom_table_FI = final_result_range_coverage_df.copy()

# Each list holds one value per table row; the trailing comment on every entry
# names the row it is meant for. The list length must equal the number of rows.
rc_custom_table_FI['gmcm'] = [
    1.000000,  # months_as_customer
    0.971694,  # age
    0.999313,  # policy_number
    1.000000,  # policy_deductable
    0.871414,  # policy_annual_premium
    0.727272,  # umbrella_limit
    1.000000,  # insured_zip
    0.916203,  # capital-gains
    0.842261,  # capital-loss
    1.000000,  # incident_hour_of_the_day
    1.000000,  # number_of_vehicles_involved
    1.000000,  # bodily_injuries
    1.000000,  # witnesses
    0.996271,  # total_claim_amount
    0.989513,  # injury_claim
    0.938543,  # property_claim
    0.986754,  # vehicle_claim
    1.000000,  # auto_year
    0.957736   # Average
]
rc_custom_table_FI['gckm'] = [
    1.000000,  # months_as_customer
    1.000000,  # age
    0.993246,  # policy_number
    1.000000,  # policy_deductable
    0.871939,  # policy_annual_premium
    0.909091,  # umbrella_limit
    0.000000,  # insured_zip
    1.000000,  # capital-gains
    1.000000,  # capital-loss
    1.000000,  # incident_hour_of_the_day
    1.000000,  # number_of_vehicles_involved
    1.000000,  # bodily_injuries
    1.000000,  # witnesses
    1.000000,  # total_claim_amount
    0.963599,  # injury_claim
    0.932288,  # property_claim
    1.000000,  # vehicle_claim
    1.000000,  # auto_year
    0.926120   # Average
]

rc_custom_table_FI['sdv_copula'] = [
    0.997912,  # months_as_customer
    0.955556,  # age
    0.999499,  # policy_number
    0.976000,  # policy_deductable
    0.991625,  # policy_annual_premium
    1.000000,  # umbrella_limit
    1.000000,  # insured_zip
    1.000000,  # capital-gains
    1.000000,  # capital-loss
    1.000000,  # incident_hour_of_the_day
    1.000000,  # number_of_vehicles_involved
    1.000000,  # bodily_injuries
    1.000000,  # witnesses
    0.978601,  # total_claim_amount
    0.944336,  # injury_claim
    1.000000,  # property_claim
    0.980010,  # vehicle_claim
    1.000000,  # auto_year
    0.990197   # Average
]

rc_custom_table_FI

Unnamed: 0_level_0,Y-data,gckm_varies,gmcm,gckm,sdv_copula
Range Coverage For Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
months_as_customer,1.0,1.0,1.0,1.0,0.997912
age,1.0,1.0,0.971694,1.0,0.955556
policy_number,1.0,0.990419,0.999313,0.993246,0.999499
policy_deductable,1.0,1.0,1.0,1.0,0.976
policy_annual_premium,0.908384,0.902342,0.871414,0.871939,0.991625
umbrella_limit,0.721002,0.909091,0.727272,0.909091,1.0
insured_zip,1.0,0.0,1.0,0.0,1.0
capital-gains,1.0,1.0,0.916203,1.0,1.0
capital-loss,1.0,1.0,0.842261,1.0,1.0
incident_hour_of_the_day,1.0,0.992229,1.0,1.0,1.0


In [29]:
# Columns stored with object dtype are the ones scored for category coverage.
category_colum = [col for col in FI_org.columns if FI_org[col].dtypes == 'object']

model_names = ['Y-data', 'gckm_varies']

# List of datasets corresponding to the model names
datasets = [FI_y_data, FI_gckm_varies]

# Accumulate under a category-specific name: reusing final_result_range_coverage
# here overwrote the range-coverage results with category-coverage scores.
final_result_category_coverage = {}

# Iterate over model names and datasets
for model_name, dataset in zip(model_names, datasets):
    final_result_category_coverage[model_name] = category_coverage(FI_org, dataset, category_colum)

# Append the per-model mean as a final "Average" row, then label rows by feature.
final_result_category_coverage_df = pd.DataFrame(final_result_category_coverage)
avg_col = final_result_category_coverage_df.mean()
final_result_category_coverage_df = pd.concat(
    [final_result_category_coverage_df, avg_col.to_frame().T], ignore_index=True
)
final_result_category_coverage_df.index = category_colum + ["Average"]
final_result_category_coverage_df.index.name = 'Categorical Features'
final_result_category_coverage_df

Unnamed: 0_level_0,Y-data,gckm_varies
Categorical Features,Unnamed: 1_level_1,Unnamed: 2_level_1
policy_bind_date,0.507886,0.0
policy_state,1.0,1.0
policy_csl,1.0,1.0
insured_sex,1.0,1.0
insured_education_level,1.0,1.0
insured_occupation,1.0,1.0
insured_hobbies,1.0,1.0
insured_relationship,1.0,1.0
incident_date,1.0,0.0
incident_type,1.0,1.0


In [30]:
# Extend the computed fraud-insurance category-coverage table with hard-coded
# scores for additional models. Work on a copy so the frame displayed by the
# previous cell is not mutated through an alias.
cc_custom_table_FI = final_result_category_coverage_df.copy()

# Each list holds one value per table row; the trailing comment on every entry
# names the row it is meant for. The list length must equal the number of rows.
cc_custom_table_FI['gmcm'] = [
    0.000000,  # policy_bind_date
    1.000000,  # policy_state
    1.000000,  # policy_csl
    1.000000,  # insured_sex
    1.000000,  # insured_education_level
    1.000000,  # insured_occupation
    1.000000,  # insured_hobbies
    1.000000,  # insured_relationship
    0.000000,  # incident_date
    1.000000,  # incident_type
    1.000000,  # collision_type
    1.000000,  # incident_severity
    1.000000,  # authorities_contacted
    1.000000,  # incident_state
    1.000000,  # incident_city
    0.621000,  # incident_location
    1.000000,  # property_damage
    1.000000,  # police_report_available
    1.000000,  # auto_make
    1.000000,  # auto_model
    1.000000,  # fraud_reported
    0.886714   # Average
]
cc_custom_table_FI['gckm'] = [
    0.0,        # policy_bind_date
    1.0,        # policy_state
    1.0,        # policy_csl
    1.0,        # insured_sex
    1.0,        # insured_education_level
    1.0,        # insured_occupation
    1.0,        # insured_hobbies
    1.0,        # insured_relationship
    0.0,        # incident_date
    1.0,        # incident_type
    1.0,        # collision_type
    1.0,        # incident_severity
    1.0,        # authorities_contacted
    1.0,        # incident_state
    1.0,        # incident_city
    0.61600,    # incident_location
    1.0,        # property_damage
    1.0,        # police_report_available
    1.0,        # auto_make
    1.0,        # auto_model
    1.0,        # fraud_reported
    0.886476    # Average
]

# NOTE(review): the mean of the 21 feature values listed below is about
# 0.966051, not the pasted Average of 0.965856 - one entry may be
# mis-transcribed; verify against the original run.
cc_custom_table_FI['sdv_copula'] = [
    0.631966,   # policy_bind_date
    1.000000,   # policy_state
    1.000000,   # policy_csl
    1.000000,   # insured_sex
    1.000000,   # insured_education_level
    1.000000,   # insured_occupation
    1.000000,   # insured_hobbies
    1.000000,   # insured_relationship
    1.000000,   # incident_date
    1.000000,   # incident_type
    1.000000,   # collision_type
    1.000000,   # incident_severity
    1.000000,   # authorities_contacted
    1.000000,   # incident_state
    1.000000,   # incident_city
    0.655100,   # incident_location
    1.000000,   # property_damage
    1.000000,   # police_report_available
    1.000000,   # auto_make
    1.000000,   # auto_model
    1.000000,   # fraud_reported
    0.965856    # Average
]

cc_custom_table_FI

Unnamed: 0_level_0,Y-data,gckm_varies,gmcm,gckm,sdv_copula
Categorical Features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
policy_bind_date,0.507886,0.0,0.0,0.0,0.631966
policy_state,1.0,1.0,1.0,1.0,1.0
policy_csl,1.0,1.0,1.0,1.0,1.0
insured_sex,1.0,1.0,1.0,1.0,1.0
insured_education_level,1.0,1.0,1.0,1.0,1.0
insured_occupation,1.0,1.0,1.0,1.0,1.0
insured_hobbies,1.0,1.0,1.0,1.0,1.0
insured_relationship,1.0,1.0,1.0,1.0,1.0
incident_date,1.0,0.0,0.0,0.0,1.0
incident_type,1.0,1.0,1.0,1.0,1.0
