In [6]:
import pandas as pd
import GMCM_class as G

In [26]:
# Load the original (real) travel-insurance table; the path is relative to the
# notebook's working directory.
data = pd.read_csv("travel insurance.csv")
# Show the first ten rows as a quick sanity check of the parsed columns.
data.head(10)

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Claim,Duration,Destination,Net Sales,Commision (in value),Gender,Age
0,CBH,Travel Agency,Offline,Comprehensive Plan,No,186,MALAYSIA,-29.0,9.57,F,81
1,CBH,Travel Agency,Offline,Comprehensive Plan,No,186,MALAYSIA,-29.0,9.57,F,71
2,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,65,AUSTRALIA,-49.5,29.7,,32
3,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,60,AUSTRALIA,-39.6,23.76,,32
4,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,79,ITALY,-19.8,11.88,,41
5,JZI,Airlines,Online,Value Plan,No,66,UNITED STATES,-121.0,42.35,F,44
6,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,47,UNITED STATES,-39.6,23.76,,32
7,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,63,AUSTRALIA,-108.9,65.34,,29
8,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,57,THAILAND,-19.8,11.88,,44
9,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,186,AUSTRALIA,-99.0,59.4,,37


In [8]:
# Run the GMCM_class synthesizer pipeline on the CSV and collect the result in `df`.
#
# The instance is bound to its own name rather than to `G`: assigning it to `G`
# replaced the imported `GMCM_class` module alias, so re-running this cell looked
# up `GaussianCopulaSynthesizer` on the previous instance instead of the module.
N_SYNTHETIC_ROWS = 63326  # number of synthetic rows to generate

gmcm_synthesizer = G.GaussianCopulaSynthesizer("travel insurance.csv")

# The steps are invoked in the order used by the original cell.
# NOTE(review): the required ordering is defined by GMCM_class, not visible here.
gmcm_synthesizer._identify_columns()
gmcm_synthesizer.convert_datetime_to_numerical()
gmcm_synthesizer.handle_missing_values()
gmcm_synthesizer.assign_intervals()
gmcm_synthesizer.preprocess_data()
gmcm_synthesizer.fit_distributions()
gmcm_synthesizer.compute_gmm_cdf()
gmcm_synthesizer.standard_gaussian_all()
gmcm_synthesizer.generate_synthetic_data(N_SYNTHETIC_ROWS)

# `df` is the synthetic table used by the evaluation cells below.
df = gmcm_synthesizer.post_process()

100%|██████████| 7/7 [02:47<00:00, 23.93s/it]
100%|██████████| 63326/63326 [1:31:49<00:00, 11.49it/s]


In [9]:
# Select the synthetic columns named like the real table's columns, in the real
# table's column order, so both frames can be compared column by column.
modify_df = df[data.columns]

In [27]:
# Preview the first ten synthetic rows for comparison with the real-data preview.
modify_df.head(10)

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Claim,Duration,Destination,Net Sales,Commision (in value),Gender,Age
0,EPX,Travel Agency,Online,Cancellation Plan,No,37.095747,"TAIWAN, PROVINCE OF CHINA",9.999608,0.001328,F,36.000738
1,CWT,Travel Agency,Online,1 way Comprehensive Plan,No,17.099519,SINGAPORE,39.843473,10.184918,F,37.337357
2,EPX,Travel Agency,Online,Cancellation Plan,No,36.159906,PHILIPPINES,69.491595,18.223734,,31.878533
3,CWT,Travel Agency,Online,Bronze Plan,No,14.13169,SPAIN,18.630898,10.878142,,67.35579
4,JZI,Travel Agency,Online,Rental Vehicle Excess Insurance,No,63.251502,FRANCE,184.133853,0.001281,,30.586207
5,JWT,Airlines,Online,Value Plan,No,370.180818,ITALY,70.443199,41.732763,M,117.999004
6,EPX,Travel Agency,Offline,1 way Comprehensive Plan,No,14.012415,SINGAPORE,12.774985,0.0005,,44.136614
7,C2B,Travel Agency,Online,Rental Vehicle Excess Insurance,No,66.808467,SINGAPORE,36.256568,16.654798,,56.784525
8,C2B,Travel Agency,Online,Value Plan,No,35.509475,INDONESIA,9.99929,0.001021,,33.617303
9,JWT,Travel Agency,Online,1 way Comprehensive Plan,No,11.827586,HONG KONG,17.880542,10.358062,,37.655865


## Evaluation

In [10]:
from sdv.metadata import SingleTableMetadata
from sdv.evaluation.single_table import evaluate_quality
from sdv.single_table import GaussianCopulaSynthesizer

In [15]:
def boundary_adherence(original_data, synthetic_data, column_names):
    """
    Score how well synthetic values stay inside the real data's [min, max] range.

    Parameters:
    - original_data: DataFrame holding the real observations.
    - synthetic_data: DataFrame holding the generated observations.
    - column_names: List of numerical column names to score.

    Returns:
    - Dictionary mapping each column name to the fraction of synthetic rows whose
      value lies within the real column's inclusive minimum/maximum.
    """
    total_rows = len(synthetic_data)
    scores = {}

    for name in column_names:
        real_column = original_data[name]
        lower, upper = real_column.min(), real_column.max()

        # Boolean mask of the synthetic rows that fall inside the real bounds.
        synthetic_column = synthetic_data[name]
        inside = (synthetic_column >= lower) & (synthetic_column <= upper)

        scores[name] = int(inside.sum()) / total_rows

    return scores


def _distinct_categories(column):
    """Return (set of distinct non-missing values, True if any value is missing)."""
    return set(column.dropna().unique()), bool(column.isna().any())


def category_coverage(original_data, synthetic_data, column_names):
    """
    Compute the Category Coverage for multiple categorical columns.

    Missing values (NaN/None) are treated as one extra category. They are matched
    explicitly rather than through set membership, because NaN never compares
    equal to itself and a set lookup would only succeed when both frames happen
    to hold the very same NaN object.

    Parameters:
    - original_data: DataFrame containing the original data.
    - synthetic_data: DataFrame containing the synthetic data.
    - column_names: List of categorical column names to compute category coverage for.

    Returns:
    - Dictionary with Category Coverage scores for the specified columns: the
      fraction of the original column's categories that also occur in the
      synthetic column. A column with no values at all in the original data
      scores NaN (coverage is undefined) instead of raising ZeroDivisionError.
    """
    coverage_results = {}

    for column_name in column_names:
        original_categories, original_has_missing = _distinct_categories(original_data[column_name])
        synthetic_categories, synthetic_has_missing = _distinct_categories(synthetic_data[column_name])

        # Denominator: every distinct original category, counting "missing" once.
        total_categories = len(original_categories) + int(original_has_missing)
        if total_categories == 0:
            coverage_results[column_name] = float('nan')
            continue

        # Numerator: shared non-missing categories, plus "missing" if both sides have it.
        covered_categories = len(original_categories & synthetic_categories)
        covered_categories += int(original_has_missing and synthetic_has_missing)

        coverage_results[column_name] = covered_categories / total_categories

    return coverage_results


def range_coverage(original_data, synthetic_data, column_names):
    """
    Compute the Range Coverage for multiple numerical columns.

    For each column the score is
        max(1 - max((min_s - min_r) / (max_r - min_r), 0)
              - max((max_r - max_s) / (max_r - min_r), 0), 0)
    where r is the original data and s the synthetic data.

    Parameters:
    - original_data: DataFrame containing the original data.
    - synthetic_data: DataFrame containing the synthetic data.
    - column_names: List of numerical column names to compute Range Coverage for.

    Returns:
    - Dictionary with Range Coverage scores for the specified columns. Scores are
      clamped at 0, so a synthetic range lying entirely outside the original range
      scores 0 rather than a negative number. A column whose original values are
      all identical has a zero-width range and scores NaN.
    """
    coverage_results = {}

    for column_name in column_names:
        # Determine the support (range) of the original data
        min_r = original_data[column_name].min()
        max_r = original_data[column_name].max()
        span_r = max_r - min_r

        # A constant original column would divide by zero below; coverage is undefined.
        if span_r == 0:
            coverage_results[column_name] = float('nan')
            continue

        # Determine the support (range) of the synthetic data
        min_s = synthetic_data[column_name].min()
        max_s = synthetic_data[column_name].max()

        # Share of the original range missed at the low end and at the high end.
        # Synthetic values beyond the original bounds do not count against coverage.
        component_1 = max((min_s - min_r) / span_r, 0)
        component_2 = max((max_r - max_s) / span_r, 0)

        # Clamp: each miss can exceed 1 when the two ranges do not overlap at all.
        coverage_results[column_name] = max(1 - component_1 - component_2, 0.0)

    return coverage_results


def compute_descriptive_stats(data):
    """
    Compute descriptive statistics for a given dataset.

    Parameters:
    - data: A pandas Series, or any one-dimensional array-like (numpy array, list,
      tuple) that `pd.Series` can wrap.

    Returns:
    - pandas Series indexed by statistic name: 'Mean', 'Median',
      'Standard Deviation', 'Range', 'IQR', 'Skewness', 'Kurtosis'.
    """
    # Wrap anything that is not already a pandas object. This check deliberately
    # avoids `np.ndarray`: the notebook never imports numpy, so referencing `np`
    # here raised NameError on every call.
    if not isinstance(data, (pd.Series, pd.DataFrame)):
        data = pd.Series(data)

    stats = {
        'Mean': data.mean(),
        'Median': data.median(),
        'Standard Deviation': data.std(),
        'Range': data.max() - data.min(),
        'IQR': data.quantile(0.75) - data.quantile(0.25),
        'Skewness': data.skew(),
        'Kurtosis': data.kurtosis()
    }

    return pd.Series(stats)

In [11]:
# Detect column types from the real table; the quality report needs this metadata.
metadata1 = SingleTableMetadata()
metadata1.detect_from_dataframe(data = data)

# Quality report for the GMCM_class output (`modify_df`) against the real data.
# It is stored under its own name because a later cell assigns
# `baseline_quality_report` for the SDV baseline, which would overwrite this
# report and make the two models impossible to compare afterwards.
gmcm_quality_report = evaluate_quality(
    data,       # real data
    modify_df,  # synthetic data produced by the GMCM_class synthesizer
    metadata1
)

Creating report: 100%|██████████| 4/4 [00:00<00:00,  4.24it/s]



Overall Quality Score: 91.54%

Properties:
Column Shapes: 93.97%
Column Pair Trends: 89.11%


In [12]:
# Baseline for comparison: SDV's own GaussianCopulaSynthesizer (imported from
# sdv.single_table), fitted on the same real table.
metadata1 = SingleTableMetadata()

metadata1.detect_from_dataframe(data = data)
synthesier = GaussianCopulaSynthesizer(metadata1)
synthesier.fit(data)
# NOTE(review): no random seed is set in this cell — confirm whether repeated runs
# are expected to reproduce the same sample.
synthetic_data = synthesier.sample(num_rows = 63326)  # same row count as requested from the GMCM synthesizer (63326)
# Display the sampled baseline table.
synthetic_data

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Claim,Duration,Destination,Net Sales,Commision (in value),Gender,Age
0,EPX,Travel Agency,Offline,Silver Plan,No,8,JAPAN,21.61,5.29,,28
1,JZI,Airlines,Offline,Rental Vehicle Excess Insurance,No,149,VIET NAM,92.36,79.83,,51
2,CWT,Travel Agency,Offline,Comprehensive Plan,No,57,DENMARK,107.28,33.11,,37
3,RAB,Airlines,Offline,Single Trip Travel Protect Silver,No,148,INDONESIA,20.63,12.76,,48
4,JZI,Travel Agency,Offline,Bronze Plan,No,83,VIET NAM,53.62,18.63,,38
...,...,...,...,...,...,...,...,...,...,...,...
63321,C2B,Airlines,Offline,Basic Plan,No,113,COSTA RICA,57.11,20.05,F,14
63322,KML,Airlines,Offline,Value Plan,No,59,NORWAY,90.19,16.70,,38
63323,RAB,Travel Agency,Offline,Ticket Protector,No,59,SINGAPORE,60.51,10.20,M,30
63324,KML,Travel Agency,Offline,1 way Comprehensive Plan,No,17,"TAIWAN, PROVINCE OF CHINA",96.52,30.79,F,46


In [13]:
# Metadata is detected again from the same real table before scoring the baseline.
metadata1 = SingleTableMetadata()
metadata1.detect_from_dataframe(data = data)

# Quality report for the SDV GaussianCopulaSynthesizer sample against the real data.
baseline_quality_report = evaluate_quality(
    data,  # real data
    synthetic_data, # synthetic data sampled from the SDV GaussianCopulaSynthesizer baseline
    metadata1
)

Creating report: 100%|██████████| 4/4 [00:00<00:00,  4.67it/s]



Overall Quality Score: 60.21%

Properties:
Column Shapes: 67.94%
Column Pair Trends: 52.48%


In [16]:
# Numerical features: columns whose dtype is exactly int64 or float64.
numerical_column = [
    name for name, dtype in data.dtypes.items()
    if dtype == 'int64' or dtype == 'float64'
]

# Synthesizer labels and the tables they produced, in matching order.
model_names = ["GaussianMixtureCopulaSynthesizer", "sdv_synthetic_data"]
datasets = [modify_df, synthetic_data]

# One {feature: score} mapping per synthesizer.
final_result_boundary_adherence = {
    label: boundary_adherence(data, table, numerical_column)
    for label, table in zip(model_names, datasets)
}

# Features as rows, synthesizers as columns, with the per-synthesizer mean appended
# as a final "Average" row.
per_feature_scores = pd.DataFrame(final_result_boundary_adherence)
avg_col = per_feature_scores.mean()
final_result_boundary_adherence_df = pd.concat(
    [per_feature_scores, avg_col.to_frame().T],
    ignore_index=True,
)
final_result_boundary_adherence_df.index = pd.Index(
    numerical_column + ["Average"],
    name="Boundary Adherence For Feature",
)
final_result_boundary_adherence_df

Unnamed: 0_level_0,GaussianMixtureCopulaSynthesizer,sdv_synthetic_data
Boundary Adherence For Feature,Unnamed: 1_level_1,Unnamed: 2_level_1
Duration,1.0,1.0
Net Sales,1.0,1.0
Commision (in value),1.0,1.0
Age,1.0,1.0
Average,1.0,1.0


In [17]:
# Categorical features: columns stored with the pandas 'object' dtype.
category_colum = list(filter(lambda col: data[col].dtypes == 'object', data.columns))

model_names = ["GaussianMixtureCopulaSynthesizer", "sdv_synthetic_data"]

# List of datasets corresponding to the model names
datasets = [modify_df, synthetic_data]

# Category-coverage scores per synthesizer. This dict used to be called
# `final_result_range_coverage` (a copy-paste leftover), which left category
# scores sitting under a range-coverage name.
final_result_category_coverage = {}

# Iterate over model names and datasets
for model_name, dataset in zip(model_names, datasets):
    result = category_coverage(data, dataset, category_colum)
    final_result_category_coverage[model_name] = result

# Features as rows, synthesizers as columns, plus a final per-synthesizer mean row.
final_result_category_coverage_df = pd.DataFrame(final_result_category_coverage)
avg_col = final_result_category_coverage_df.mean()
final_result_category_coverage_df = pd.concat([final_result_category_coverage_df, avg_col.to_frame().T], ignore_index=True)
final_result_category_coverage_df.index = category_colum + ["Average"]
final_result_category_coverage_df.index.name = 'Categorical Features'
final_result_category_coverage_df

Unnamed: 0_level_0,GaussianMixtureCopulaSynthesizer,sdv_synthetic_data
Categorical Features,Unnamed: 1_level_1,Unnamed: 2_level_1
Agency,1.0,1.0
Agency Type,1.0,1.0
Distribution Channel,1.0,0.5
Product Name,1.0,1.0
Claim,1.0,1.0
Destination,0.791946,0.738255
Gender,1.0,1.0
Average,0.970278,0.891179


In [18]:
# Numerical features: columns whose dtype is exactly int64 or float64.
range_coverage_column = [
    name for name, dtype in data.dtypes.items()
    if dtype == 'int64' or dtype == 'float64'
]

# Synthesizer labels and the tables they produced, in matching order.
model_names = ["GaussianMixtureCopulaSynthesizer", "sdv_synthetic_data"]
datasets = [modify_df, synthetic_data]

# One {feature: score} mapping per synthesizer.
final_result_range_coverage = {
    label: range_coverage(data, table, range_coverage_column)
    for label, table in zip(model_names, datasets)
}

# Features as rows, synthesizers as columns, with the per-synthesizer mean appended
# as a final "Average" row.
per_feature_scores = pd.DataFrame(final_result_range_coverage)
avg_col = per_feature_scores.mean()
final_result_range_coverage_df = pd.concat(
    [per_feature_scores, avg_col.to_frame().T],
    ignore_index=True,
)
final_result_range_coverage_df.index = pd.Index(
    range_coverage_column + ["Average"],
    name='Range Coverage For Feature',
)
final_result_range_coverage_df

Unnamed: 0_level_0,GaussianMixtureCopulaSynthesizer,sdv_synthetic_data
Range Coverage For Feature,Unnamed: 1_level_1,Unnamed: 2_level_1
Duration,1.0,0.137825
Net Sales,0.861719,0.292744
Commision (in value),0.937867,0.631146
Age,0.920262,0.855932
Average,0.929962,0.479412


In [22]:
# Columns to plot in the real-vs-synthetic distribution comparisons below
# (two categorical and two numerical features).
display_column= ['Agency', 'Distribution Channel', 'Net Sales', 'Commision (in value)']

In [23]:
import matplotlib.pyplot as plt
from sdv.evaluation.single_table import get_column_plot

# NOTE(review): `all_columns` is not referenced by any cell visible in this
# notebook — confirm whether it is still needed.
all_columns = data.columns.tolist()


# Real vs. GMCM_class synthetic data: one comparison figure per selected column.
for column_name in display_column:
    fig = get_column_plot(
        real_data=data,
        synthetic_data=modify_df,
        column_name=column_name,
        metadata=metadata1
    )

    fig.show()

In [24]:
# Real vs. SDV baseline synthetic data: the same per-column comparison figures,
# this time for the SDV GaussianCopulaSynthesizer sample.
for column_name in display_column:
    fig = get_column_plot(
        real_data=data,
        synthetic_data=synthetic_data,
        column_name=column_name,
        metadata=metadata1
    )

    fig.show()