In [1]:
import numpy as np

import matplotlib.pyplot as plt

import pandas as pd

from collections import defaultdict
from scipy.stats import kstest, norm, uniform, beta, expon, truncnorm

from scipy.optimize import brentq

from tqdm import tqdm
from sklearn.mixture import GaussianMixture
import scipy.stats as stats
import sdv
from sdv.metadata import SingleTableMetadata
from sdv.evaluation.single_table import evaluate_quality
from sdv.single_table import GaussianCopulaSynthesizer
tqdm.pandas()

import warnings
warnings.filterwarnings(action='ignore')
import seaborn as sns

In [2]:
data = pd.read_csv('American_Income.csv')

In [3]:
data_info = data.dtypes.to_dict()
num_column = []
category_column = []
for key in data_info.keys():
    if data_info[key] == 'int64':
        num_column.append(key)
    if data_info[key] == 'object':
        category_column.append(key)

In [4]:
## all functions

def findBestComp(data):
    bic = []
    n_components_range = range(1, 11)
    for n_components in n_components_range:
        gmm = GaussianMixture(n_components=n_components)
        gmm.fit(data.values.reshape(-1, 1))
        bic.append(gmm.bic(data.values.reshape(-1, 1)))

    optimal_components = n_components_range[np.argmin(bic)]

    return optimal_components


def gmm_cdf(x, gmm):
    """
    Compute the CDF of the GMM at point x.
    
    Parameters:
    - x: Point at which to evaluate the CDF
    - gmm: Fitted GaussianMixture object
    
    Returns:
    - CDF value at point x
    """
    cdf_value = 0
    for weight, mean, covariance in zip(gmm.weights_, gmm.means_, gmm.covariances_):
        cdf_value += weight * stats.norm.cdf(x, mean, np.sqrt(covariance)) 
    return cdf_value[0][0]


def gaussian_inverse_cdf(p_values):
    return norm.ppf(p_values)


def get_truncated_normal(mean=0, sd=1, low=0, upp=10):
    """ Return a truncated normal distribution. """
    return truncnorm((low - mean) / sd, (upp - mean) / sd, loc=mean, scale=sd)

def assign_intervals(categories):
    """ Assign intervals to categories based on their frequency. """
    freq = categories.value_counts(normalize=True)
    intervals = freq.cumsum()
    category_intervals = {}
    a = 0
    for category, cum_freq in intervals.items():
        b = cum_freq
        category_intervals[category] = (a, b)
        a = b
    return category_intervals

def sample_from_category(category, category_intervals):
    """ Sample a numerical value for a given category. """
    a, b = category_intervals[category]
    mean = (a + b) / 2
    sd = (b - a) / 6
    dist = get_truncated_normal(mean, sd, a, b)
    return dist.rvs()


def gmm_cdf_dict(x, gmm):
    """
    Compute the CDF of the GMM at point x using a dictionary to represent the GMM parameters.
    
    Parameters:
    - x: Point at which to evaluate the CDF
    - gmm: Dictionary with GMM parameters
    
    Returns:
    - CDF value at point x
    """
    cdf_value = 0
    for weight, mean, covariance in zip(gmm['weights_'], gmm['means_'], gmm['covariances_']):
        cdf_value += weight * norm.cdf(x, mean, np.sqrt(covariance))
    return cdf_value

def check_cdf_bounds(gmm, bounds, probabilities):
    results = []
    for p in probabilities:
        cdf_lower = gmm_cdf_dict(bounds[0], gmm)
        cdf_upper = gmm_cdf_dict(bounds[1], gmm)
        results.append((p, cdf_lower, cdf_upper, np.sign(cdf_lower - p), np.sign(cdf_upper - p)))
    return results


def compute_gmm_bounds(gmm, num_std=3):
    """
    Compute bounds for the GMM based on standard deviations from the mean.
    
    Parameters:
    - gmm: Dictionary with GMM parameters
    - num_std: Number of standard deviations to use for bounds
    
    Returns:
    - A tuple (lower_bound, upper_bound)
    """
    means = gmm['means_'].flatten()
    stds = np.sqrt(gmm['covariances_'].flatten())
    
    lower_bounds = means - num_std * stds
    upper_bounds = means + num_std * stds
    
    lower_bound = min(lower_bounds)
    upper_bound = max(upper_bounds)
    
    return lower_bound, upper_bound

def sample(F_inv, Sigma):
    """
    Sample numerical values from the distribution and covariances of the columns.
    
    Parameters:
    - F_inv: A list of inverse CDF functions for the marginals.
    - Sigma: The covariance matrix.
    
    Returns:
    - A sample vector x in the original space.
    """
    # Step 2: Random n-dimensional Gaussian vector
    n = Sigma.shape[0]
    v = np.random.randn(n)
    
    # Step 3: Cholesky decomposition
    L = np.linalg.cholesky(Sigma)
    
    # Step 4: u ← Lv
    u = L.dot(v)
    
    # Step 5: x ← [F_0^(-1)(Φ(u_0)), F_1^(-1)(Φ(u_1)), ..., F_n^(-1)(Φ(u_n))]
    x = [F_inv_i(norm.cdf(u_i)) for F_inv_i, u_i in zip(F_inv, u)]
    
    return x



In [5]:
data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [6]:
category_column

[' workclass',
 ' education',
 ' marital-status',
 ' occupation',
 ' relationship',
 ' race',
 ' sex',
 ' native-country',
 ' salary']

In [7]:
num_column #eduaction_num belongs to category

['age',
 ' fnlwgt',
 ' education-num',
 ' capital-gain',
 ' capital-loss',
 ' hours-per-week']

In [9]:
age_components = findBestComp(data['age'])

fnlwgt_components = findBestComp(data[' fnlwgt'])

gain_components = findBestComp(data[' capital-gain'])

loss_components = findBestComp(data[' capital-loss'])

hours_components = findBestComp(data[' hours-per-week'])

In [11]:
#inital the gmm to fit
gmm_age = GaussianMixture(n_components=age_components)
gmm_age.fit(data['age'].values.reshape(-1, 1))

# Fit a GMM to the 'hours-per-week' column
gmm_hours = GaussianMixture(n_components=hours_components)  
gmm_hours.fit(data[' hours-per-week'].values.reshape(-1, 1))

gmm_fnlwgt = GaussianMixture(n_components=fnlwgt_components)
gmm_fnlwgt.fit(data[' fnlwgt'].values.reshape(-1, 1))

gmm_gain = GaussianMixture(n_components=gain_components)
gmm_gain.fit(data[' capital-gain'].values.reshape(-1, 1))

gmm_loss = GaussianMixture(n_components=loss_components)
gmm_loss.fit(data[' capital-loss'].values.reshape(-1, 1))

In [13]:
categories_workclass = data[' workclass']
category_intervals_workclass = assign_intervals(categories_workclass)
numerical_values_workclass = categories_workclass.apply(lambda x: sample_from_category(x, category_intervals_workclass))

categories_education = data[' education']
category_intervals_education = assign_intervals(categories_education)
numerical_values_education = categories_education.apply(lambda x: sample_from_category(x, category_intervals_education))

categories_marital = data[' marital-status']
category_intervals_marital = assign_intervals(categories_marital)
numerical_values_marital = categories_marital.apply(lambda x: sample_from_category(x, category_intervals_marital))

categories_occupation = data[' occupation']
category_intervals_occupation = assign_intervals(categories_occupation)
numerical_values_occupation = categories_occupation.apply(lambda x: sample_from_category(x, category_intervals_occupation))

categories_relationship = data[' relationship']
category_intervals_relationship = assign_intervals(categories_relationship)
numerical_values_relationship = categories_relationship.apply(lambda x: sample_from_category(x, category_intervals_relationship))

categories_race = data[' race']
category_intervals_race = assign_intervals(categories_race)
numerical_values_race = categories_race.apply(lambda x: sample_from_category(x, category_intervals_race))

categories_sex = data[' sex']
category_intervals_sex = assign_intervals(categories_sex)
numerical_values_sex = categories_sex.apply(lambda x: sample_from_category(x, category_intervals_sex))

categories_country = data[' native-country']
category_intervals_country = assign_intervals(categories_country)
numerical_values_country = categories_country.apply(lambda x: sample_from_category(x, category_intervals_country))

categories_salary = data[' salary']
category_intervals_salary = assign_intervals(categories_salary)
numerical_values_salary = categories_salary.apply(lambda x: sample_from_category(x, category_intervals_salary))

In [14]:
data[' workclass'] = numerical_values_workclass
data[' education'] = numerical_values_education
data[' marital-status'] = numerical_values_marital
data[' occupation'] = numerical_values_occupation
data[' relationship'] = numerical_values_relationship
data[' race'] = numerical_values_race
data[' sex'] = numerical_values_sex
data[' native-country'] = numerical_values_country
data[' salary'] = numerical_values_salary

In [15]:
data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,0.906194,77516,0.633112,13,0.607033,0.442110,0.529146,0.436993,0.346928,2174,0,40,0.695305,0.489264
1,50,0.732790,83311,0.643058,13,0.327116,0.299604,0.152220,0.333890,0.342774,0,0,13,0.554602,0.446327
2,38,0.409777,215646,0.166287,9,0.841400,0.891692,0.574400,0.543855,0.272486,0,0,40,0.520416,0.462242
3,53,0.520768,234721,0.822816,7,0.161458,0.897766,0.163139,0.881013,0.245039,0,0,40,0.381167,0.355028
4,28,0.409488,338409,0.630088,13,0.157326,0.042700,0.945183,0.935365,0.749773,0,0,40,0.959067,0.392157
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,0.329630,257302,0.862439,12,0.200397,0.953115,0.940848,0.357642,0.851211,0,0,38,0.433700,0.480626
32557,40,0.589032,154374,0.185625,9,0.244058,0.721171,0.277247,0.686494,0.459192,0,0,40,0.260463,0.862169
32558,58,0.457204,151910,0.257307,9,0.975380,0.414197,0.905031,0.396496,0.865927,0,0,40,0.461434,0.590923
32559,22,0.337993,201490,0.174143,9,0.594160,0.413011,0.753158,0.540187,0.251806,0,0,20,0.323667,0.181488


In [16]:
workclass_components = findBestComp(data[' workclass'])

education_components = findBestComp(data[' education'])

marital_components = findBestComp(data[' marital-status'])

occupation_components = findBestComp(data[' occupation'])

relationship_components = findBestComp(data[' relationship'])

race_components = findBestComp(data[' race'])

sex_components = findBestComp(data[' sex'])

country_components = findBestComp(data[' native-country'])

salary_components = findBestComp(data[' salary'])

In [17]:
gmm_workclass = GaussianMixture(n_components=workclass_components)
gmm_workclass.fit(data[' workclass'].values.reshape(-1, 1))

gmm_education = GaussianMixture(n_components=education_components)
gmm_education.fit(data[' education'].values.reshape(-1, 1))

gmm_marital = GaussianMixture(n_components=marital_components)
gmm_marital.fit(data[' marital-status'].values.reshape(-1, 1))

gmm_occupation = GaussianMixture(n_components=occupation_components)
gmm_occupation.fit(data[' occupation'].values.reshape(-1, 1))

gmm_relationship = GaussianMixture(n_components= relationship_components)
gmm_relationship.fit(data[' relationship'].values.reshape(-1, 1))

gmm_race = GaussianMixture(n_components=race_components)
gmm_race.fit(data[' race'].values.reshape(-1, 1))

gmm_sex = GaussianMixture(n_components=sex_components)
gmm_sex.fit(data[' sex'].values.reshape(-1, 1))

gmm_country = GaussianMixture(n_components=country_components)
gmm_country.fit(data[' native-country'].values.reshape(-1, 1))

gmm_salary = GaussianMixture(n_components=salary_components)
gmm_salary.fit(data[' salary'].values.reshape(-1, 1))

In [18]:
GMM_cdf_table = pd.DataFrame()
GMM_cdf_table['age'] = data['age'].apply(lambda x: gmm_cdf(x, gmm_age))
GMM_cdf_table[' workclass'] = data[' workclass'].apply(lambda x: gmm_cdf(x, gmm_workclass))
GMM_cdf_table[' fnlwgt'] = data[' fnlwgt'].apply(lambda x: gmm_cdf(x, gmm_fnlwgt))
GMM_cdf_table[' education'] = data[' education'].apply(lambda x: gmm_cdf(x, gmm_education))
GMM_cdf_table[' marital-status'] = data[' marital-status'].apply(lambda x: gmm_cdf(x, gmm_marital))
GMM_cdf_table[' occupation'] = data[' occupation'].apply(lambda x: gmm_cdf(x, gmm_occupation))
GMM_cdf_table[' relationship'] = data[' relationship'].apply(lambda x: gmm_cdf(x, gmm_relationship))
GMM_cdf_table[' race'] = data[' race'].apply(lambda x: gmm_cdf(x, gmm_race))
GMM_cdf_table[' sex'] = data[' sex'].apply(lambda x: gmm_cdf(x, gmm_sex))
GMM_cdf_table[' capital-gain'] = data[' capital-gain'].apply(lambda x: gmm_cdf(x, gmm_gain))
GMM_cdf_table[' capital-loss'] = data[' capital-loss'].apply(lambda x: gmm_cdf(x, gmm_loss))
GMM_cdf_table[' hours-per-week'] = data[' hours-per-week'].apply(lambda x: gmm_cdf(x, gmm_hours))
GMM_cdf_table[' native-country'] = data[' native-country'].apply(lambda x: gmm_cdf(x, gmm_country))
GMM_cdf_table[' salary'] = data[' salary'].apply(lambda x: gmm_cdf(x, gmm_salary))


In [22]:
GMM_invers_cdf_table = pd.DataFrame()

GMM_invers_cdf_table['age'] = GMM_cdf_table['age'].apply(lambda x: gaussian_inverse_cdf(x))
GMM_invers_cdf_table[' workclass'] = GMM_cdf_table[' workclass'].apply(lambda x: gaussian_inverse_cdf(x))
GMM_invers_cdf_table[' fnlwgt'] = GMM_cdf_table[' fnlwgt'].apply(lambda x: gaussian_inverse_cdf(x))
GMM_invers_cdf_table[' education'] = GMM_cdf_table[' education'].apply(lambda x: gaussian_inverse_cdf(x))
GMM_invers_cdf_table[' marital-status'] = GMM_cdf_table[' marital-status'].apply(lambda x: gaussian_inverse_cdf(x))
GMM_invers_cdf_table[' occupation'] = GMM_cdf_table[' occupation'].apply(lambda x: gaussian_inverse_cdf(x))
GMM_invers_cdf_table[' relationship'] = GMM_cdf_table[' relationship'].apply(lambda x: gaussian_inverse_cdf(x))
GMM_invers_cdf_table[' race'] = GMM_cdf_table[' race'].apply(lambda x: gaussian_inverse_cdf(x))
GMM_invers_cdf_table[' sex'] = GMM_cdf_table[' sex'].apply(lambda x: gaussian_inverse_cdf(x))
GMM_invers_cdf_table[' capital-gain'] = GMM_cdf_table[' capital-gain'].apply(lambda x: gaussian_inverse_cdf(x))
GMM_invers_cdf_table[' capital-loss'] = GMM_cdf_table[' capital-loss'].apply(lambda x: gaussian_inverse_cdf(x))
GMM_invers_cdf_table[' hours-per-week'] = GMM_cdf_table[' hours-per-week'].apply(lambda x: gaussian_inverse_cdf(x))
GMM_invers_cdf_table[' native-country'] = GMM_cdf_table[' native-country'].apply(lambda x: gaussian_inverse_cdf(x))
GMM_invers_cdf_table[' salary'] = GMM_cdf_table[' salary'].apply(lambda x: gaussian_inverse_cdf(x))


In [25]:
covariance_matrix = GMM_invers_cdf_table.cov()

In [28]:
gmm_age_dict = {
    'weights_': gmm_age.weights_, 
    'means_': gmm_age.means_, 
    'covariances_': gmm_age.covariances_ 
}
gmm_workclass_dict = {
    'weights_': gmm_workclass.weights_, 
    'means_': gmm_workclass.means_, 
    'covariances_': gmm_workclass.covariances_ 
}
gmm_fnlwgt_dict = {
    'weights_': gmm_fnlwgt.weights_, 
    'means_': gmm_fnlwgt.means_, 
    'covariances_': gmm_fnlwgt.covariances_ 
}
gmm_education_dict = {
    'weights_': gmm_education.weights_, 
    'means_': gmm_education.means_, 
    'covariances_': gmm_education.covariances_ 
}
gmm_marital_dict = {
    'weights_': gmm_marital.weights_, 
    'means_': gmm_marital.means_, 
    'covariances_': gmm_marital.covariances_ 
}
gmm_occupation_dict = {
    'weights_': gmm_occupation.weights_, 
    'means_': gmm_occupation.means_, 
    'covariances_': gmm_occupation.covariances_ 
}
gmm_relationship_dict = {
    'weights_': gmm_relationship.weights_, 
    'means_': gmm_relationship.means_, 
    'covariances_': gmm_relationship.covariances_ 
}
gmm_race_dict = {
    'weights_': gmm_race.weights_, 
    'means_': gmm_race.means_, 
    'covariances_': gmm_race.covariances_ 
}
gmm_sex_dict = {
    'weights_': gmm_sex.weights_, 
    'means_': gmm_sex.means_, 
    'covariances_': gmm_sex.covariances_ 
}
gmm_gain_dict = {
    'weights_': gmm_gain.weights_, 
    'means_': gmm_gain.means_, 
    'covariances_': gmm_gain.covariances_ 
}
gmm_loss_dict = {
    'weights_': gmm_loss.weights_, 
    'means_': gmm_loss.means_, 
    'covariances_': gmm_loss.covariances_ 
}
gmm_country_dict = {
    'weights_': gmm_country.weights_, 
    'means_': gmm_country.means_, 
    'covariances_': gmm_country.covariances_ 
}
gmm_salary_dict = {
    'weights_': gmm_salary.weights_, 
    'means_': gmm_salary.means_, 
    'covariances_': gmm_salary.covariances_ 
}

gmm_hours_dict = {
    'weights_': gmm_hours.weights_,
    'means_': gmm_hours.means_, 
    'covariances_': gmm_hours.covariances_  
}


age_bounds = (GMM_cdf_table['age'].min(), GMM_cdf_table['age'].max())
workclass_bounds = (GMM_cdf_table[' workclass'].min(), GMM_cdf_table[' workclass'].max())
fnlwgt_bounds = (GMM_cdf_table[' fnlwgt'].min(), GMM_cdf_table[' fnlwgt'].max())
education_bounds = (GMM_cdf_table[' education'].min(), GMM_cdf_table[' education'].max())
marital_bounds = (GMM_cdf_table[' marital-status'].min(), GMM_cdf_table[' marital-status'].max())
occupation_bounds = (GMM_cdf_table[' occupation'].min(), GMM_cdf_table[' occupation'].max())
relationship_bounds = (GMM_cdf_table[' relationship'].min(), GMM_cdf_table[' relationship'].max())
race_bounds = (GMM_cdf_table[' race'].min(), GMM_cdf_table[' race'].max())
sex_bounds = (GMM_cdf_table[' sex'].min(), GMM_cdf_table[' sex'].max())
gain_bounds = (GMM_cdf_table[' capital-gain'].min(), GMM_cdf_table[' capital-gain'].max())
loss_bounds = (GMM_cdf_table[' capital-loss'].min(), GMM_cdf_table[' capital-loss'].max())
country_bounds = (GMM_cdf_table[' native-country'].min(), GMM_cdf_table[' native-country'].max())
salary_bounds = (GMM_cdf_table[' salary'].min(), GMM_cdf_table[' salary'].max())
hours_bounds = (GMM_cdf_table[' hours-per-week'].min(), GMM_cdf_table[' hours-per-week'].max())   
probabilities = [0.1, 0.5, 0.9]

age_cdf_check = check_cdf_bounds(gmm_age_dict, age_bounds, probabilities)
workclass_cdf_check = check_cdf_bounds(gmm_workclass_dict, workclass_bounds, probabilities)
fnlwgt_cdf_check = check_cdf_bounds(gmm_fnlwgt_dict, fnlwgt_bounds, probabilities)
education_cdf_check = check_cdf_bounds(gmm_education_dict, education_bounds, probabilities)
marital_cdf_check = check_cdf_bounds(gmm_marital_dict, marital_bounds, probabilities)
occupation_cdf_check = check_cdf_bounds(gmm_occupation_dict, occupation_bounds, probabilities)
relationship_cdf_check = check_cdf_bounds(gmm_relationship_dict, relationship_bounds, probabilities)
race_cdf_check = check_cdf_bounds(gmm_race_dict, race_bounds, probabilities)
sex_cdf_check = check_cdf_bounds(gmm_sex_dict, sex_bounds, probabilities)
gain_cdf_check = check_cdf_bounds(gmm_gain_dict, gain_bounds, probabilities)
loss_cdf_check = check_cdf_bounds(gmm_loss_dict, loss_bounds, probabilities)
country_cdf_check = check_cdf_bounds(gmm_country_dict, country_bounds, probabilities)
salary_cdf_check = check_cdf_bounds(gmm_salary_dict, salary_bounds, probabilities)
hours_cdf_check = check_cdf_bounds(gmm_hours_dict, hours_bounds, probabilities)
# (age_cdf_check, hours_cdf_check)

In [29]:
new_age_bounds = compute_gmm_bounds(gmm_age_dict)
new_workclass_bounds = compute_gmm_bounds(gmm_workclass_dict)
new_fnlwgt_bounds = compute_gmm_bounds(gmm_fnlwgt_dict)
new_marital_bounds = compute_gmm_bounds(gmm_marital_dict)
new_education_bounds = compute_gmm_bounds(gmm_education_dict)
new_occupation_bounds = compute_gmm_bounds(gmm_occupation_dict)
new_relationship_bounds = compute_gmm_bounds(gmm_relationship_dict)
new_race_bounds = compute_gmm_bounds(gmm_race_dict)
new_sex_bounds = compute_gmm_bounds(gmm_sex_dict)
new_gain_bounds = compute_gmm_bounds(gmm_gain_dict)
new_loss_bounds = compute_gmm_bounds(gmm_loss_dict)
new_country_bounds = compute_gmm_bounds(gmm_country_dict)
new_salary_bounds = compute_gmm_bounds(gmm_salary_dict)
new_hours_bounds = compute_gmm_bounds(gmm_hours_dict)

# # Check the new CDF values at the computed bounds
# new_age_cdf_check = check_cdf_bounds(gmm_age_dict, new_age_bounds, probabilities)
# new_hours_cdf_check = check_cdf_bounds(gmm_hours_dict, new_hours_bounds, probabilities)

In [32]:
def inverse_cdf_gmm_age(p):
    lower_bound, upper_bound = new_age_bounds
    for _ in range(5):
        f_lower = gmm_cdf(lower_bound, gmm_age) - p
        f_upper = gmm_cdf(upper_bound, gmm_age) - p
        # Check if the signs are different
        if np.sign(f_lower) != np.sign(f_upper):
            try:
                return brentq(lambda x: gmm_cdf(x, gmm_age) - p, lower_bound, upper_bound, xtol=1e-6, rtol=1e-6)
            except ValueError as e:
                print(f"Failed to converge for p={p} with bounds ({lower_bound}, {upper_bound}): {e}")
                return np.nan  # or some other sentinel value indicating failure
        else:
            # Adjust the bounds
            lower_bound *= 0.9  # shrink the bounds
            upper_bound *= 1.1  # expand the bounds
    print(f"Failed to find valid bounds for p={p} after {5} retries")
    return np.nan  # or some other sentinel value indicating failure

def inverse_cdf_gmm_workclass(p):
    lower_bound, upper_bound = new_workclass_bounds
    for _ in range(5):
        f_lower = gmm_cdf(lower_bound, gmm_workclass) - p
        f_upper = gmm_cdf(upper_bound, gmm_workclass) - p
        # Check if the signs are different
        if np.sign(f_lower) != np.sign(f_upper):
            try:
                return brentq(lambda x: gmm_cdf(x, gmm_workclass) - p, lower_bound, upper_bound, xtol=1e-6, rtol=1e-6)
            except ValueError as e:
                print(f"Failed to converge for p={p} with bounds ({lower_bound}, {upper_bound}): {e}")
                return np.nan  # or some other sentinel value indicating failure
        else:
            # Adjust the bounds
            lower_bound *= 0.9  # shrink the bounds
            upper_bound *= 1.1  # expand the bounds
    print(f"Failed to find valid bounds for p={p} after {5} retries")
    return np.nan  # or some other sentinel value indicating failure

def inverse_cdf_gmm_fnlwgt(p):
    lower_bound, upper_bound = new_fnlwgt_bounds
    for _ in range(5):
        f_lower = gmm_cdf(lower_bound, gmm_fnlwgt) - p
        f_upper = gmm_cdf(upper_bound, gmm_fnlwgt) - p
        # Check if the signs are different
        if np.sign(f_lower) != np.sign(f_upper):
            try:
                return brentq(lambda x: gmm_cdf(x, gmm_fnlwgt) - p, lower_bound, upper_bound, xtol=1e-6, rtol=1e-6)
            except ValueError as e:
                print(f"Failed to converge for p={p} with bounds ({lower_bound}, {upper_bound}): {e}")
                return np.nan  # or some other sentinel value indicating failure
        else:
            # Adjust the bounds
            lower_bound *= 0.9  # shrink the bounds
            upper_bound *= 1.1  # expand the bounds
    print(f"Failed to find valid bounds for p={p} after {5} retries")
    return np.nan  # or some other sentinel value indicating failure

def inverse_cdf_gmm_education(p):
    lower_bound, upper_bound = new_education_bounds
    for _ in range(5):
        f_lower = gmm_cdf(lower_bound, gmm_education) - p
        f_upper = gmm_cdf(upper_bound, gmm_education) - p
        # Check if the signs are different
        if np.sign(f_lower) != np.sign(f_upper):
            try:
                return brentq(lambda x: gmm_cdf(x, gmm_education) - p, lower_bound, upper_bound, xtol=1e-6, rtol=1e-6)
            except ValueError as e:
                print(f"Failed to converge for p={p} with bounds ({lower_bound}, {upper_bound}): {e}")
                return np.nan  # or some other sentinel value indicating failure
        else:
            # Adjust the bounds
            lower_bound *= 0.9  # shrink the bounds
            upper_bound *= 1.1  # expand the bounds
    print(f"Failed to find valid bounds for p={p} after {5} retries")
    return np.nan  # or some other sentinel value indicating failure


def inverse_cdf_gmm_marital(p):
    lower_bound, upper_bound = new_marital_bounds
    for _ in range(5):
        f_lower = gmm_cdf(lower_bound, gmm_marital) - p
        f_upper = gmm_cdf(upper_bound, gmm_marital) - p
        # Check if the signs are different
        if np.sign(f_lower) != np.sign(f_upper):
            try:
                return brentq(lambda x: gmm_cdf(x, gmm_marital) - p, lower_bound, upper_bound, xtol=1e-6, rtol=1e-6)
            except ValueError as e:
                print(f"Failed to converge for p={p} with bounds ({lower_bound}, {upper_bound}): {e}")
                return np.nan  # or some other sentinel value indicating failure
        else:
            # Adjust the bounds
            lower_bound *= 0.9  # shrink the bounds
            upper_bound *= 1.1  # expand the bounds
    print(f"Failed to find valid bounds for p={p} after {5} retries")
    return np.nan  # or some other sentinel value indicating failure


def inverse_cdf_gmm_occupation(p):
    lower_bound, upper_bound = new_occupation_bounds
    for _ in range(5):
        f_lower = gmm_cdf(lower_bound, gmm_occupation) - p
        f_upper = gmm_cdf(upper_bound, gmm_occupation) - p
        # Check if the signs are different
        if np.sign(f_lower) != np.sign(f_upper):
            try:
                return brentq(lambda x: gmm_cdf(x, gmm_occupation) - p, lower_bound, upper_bound, xtol=1e-6, rtol=1e-6)
            except ValueError as e:
                print(f"Failed to converge for p={p} with bounds ({lower_bound}, {upper_bound}): {e}")
                return np.nan  # or some other sentinel value indicating failure
        else:
            # Adjust the bounds
            lower_bound *= 0.9  # shrink the bounds
            upper_bound *= 1.1  # expand the bounds
    print(f"Failed to find valid bounds for p={p} after {5} retries")
    return np.nan  # or some other sentinel value indicating failure


def inverse_cdf_gmm_relationship(p):
    lower_bound, upper_bound = new_relationship_bounds
    for _ in range(5):
        f_lower = gmm_cdf(lower_bound, gmm_relationship) - p
        f_upper = gmm_cdf(upper_bound, gmm_relationship) - p
        # Check if the signs are different
        if np.sign(f_lower) != np.sign(f_upper):
            try:
                return brentq(lambda x: gmm_cdf(x, gmm_relationship) - p, lower_bound, upper_bound, xtol=1e-6, rtol=1e-6)
            except ValueError as e:
                print(f"Failed to converge for p={p} with bounds ({lower_bound}, {upper_bound}): {e}")
                return np.nan  # or some other sentinel value indicating failure
        else:
            # Adjust the bounds
            lower_bound *= 0.9  # shrink the bounds
            upper_bound *= 1.1  # expand the bounds
    print(f"Failed to find valid bounds for p={p} after {5} retries")
    return np.nan  # or some other sentinel value indicating failure


def inverse_cdf_gmm_race(p):
    lower_bound, upper_bound = new_race_bounds
    for _ in range(5):
        f_lower = gmm_cdf(lower_bound, gmm_race) - p
        f_upper = gmm_cdf(upper_bound, gmm_race) - p
        # Check if the signs are different
        if np.sign(f_lower) != np.sign(f_upper):
            try:
                return brentq(lambda x: gmm_cdf(x, gmm_race) - p, lower_bound, upper_bound, xtol=1e-6, rtol=1e-6)
            except ValueError as e:
                print(f"Failed to converge for p={p} with bounds ({lower_bound}, {upper_bound}): {e}")
                return np.nan  # or some other sentinel value indicating failure
        else:
            # Adjust the bounds
            lower_bound *= 0.9  # shrink the bounds
            upper_bound *= 1.1  # expand the bounds
    print(f"Failed to find valid bounds for p={p} after {5} retries")
    return np.nan  # or some other sentinel value indicating failure


def inverse_cdf_gmm_sex(p):
    lower_bound, upper_bound = new_sex_bounds
    for _ in range(5):
        f_lower = gmm_cdf(lower_bound, gmm_sex) - p
        f_upper = gmm_cdf(upper_bound, gmm_sex) - p
        # Check if the signs are different
        if np.sign(f_lower) != np.sign(f_upper):
            try:
                return brentq(lambda x: gmm_cdf(x, gmm_sex) - p, lower_bound, upper_bound, xtol=1e-6, rtol=1e-6)
            except ValueError as e:
                print(f"Failed to converge for p={p} with bounds ({lower_bound}, {upper_bound}): {e}")
                return np.nan  # or some other sentinel value indicating failure
        else:
            # Adjust the bounds
            lower_bound *= 0.9  # shrink the bounds
            upper_bound *= 1.1  # expand the bounds
    print(f"Failed to find valid bounds for p={p} after {5} retries")
    return np.nan  # or some other sentinel value indicating failure


def inverse_cdf_gmm_gain(p):
    lower_bound, upper_bound = new_gain_bounds
    for _ in range(5):
        f_lower = gmm_cdf(lower_bound, gmm_gain) - p
        f_upper = gmm_cdf(upper_bound, gmm_gain) - p
        # Check if the signs are different
        if np.sign(f_lower) != np.sign(f_upper):
            try:
                return brentq(lambda x: gmm_cdf(x, gmm_gain) - p, lower_bound, upper_bound, xtol=1e-6, rtol=1e-6)
            except ValueError as e:
                print(f"Failed to converge for p={p} with bounds ({lower_bound}, {upper_bound}): {e}")
                return np.nan  # or some other sentinel value indicating failure
        else:
            # Adjust the bounds
            lower_bound *= 0.9  # shrink the bounds
            upper_bound *= 1.1  # expand the bounds
    print(f"Failed to find valid bounds for p={p} after {5} retries")
    return np.nan  # or some other sentinel value indicating failure



def inverse_cdf_gmm_loss(p):
    lower_bound, upper_bound = new_loss_bounds
    for _ in range(5):
        f_lower = gmm_cdf(lower_bound, gmm_loss) - p
        f_upper = gmm_cdf(upper_bound, gmm_loss) - p
        # Check if the signs are different
        if np.sign(f_lower) != np.sign(f_upper):
            try:
                return brentq(lambda x: gmm_cdf(x, gmm_loss) - p, lower_bound, upper_bound, xtol=1e-6, rtol=1e-6)
            except ValueError as e:
                print(f"Failed to converge for p={p} with bounds ({lower_bound}, {upper_bound}): {e}")
                return np.nan  # or some other sentinel value indicating failure
        else:
            # Adjust the bounds
            lower_bound *= 0.9  # shrink the bounds
            upper_bound *= 1.1  # expand the bounds
    print(f"Failed to find valid bounds for p={p} after {5} retries")
    return np.nan  # or some other sentinel value indicating failure

def inverse_cdf_gmm_hours(p):
    lower_bound, upper_bound = new_hours_bounds
    for _ in range(5):
        f_lower = gmm_cdf(lower_bound, gmm_hours) - p
        f_upper = gmm_cdf(upper_bound, gmm_hours) - p
        if np.sign(f_lower) != np.sign(f_upper):
            try:
                return brentq(lambda x: gmm_cdf(x, gmm_hours) - p, lower_bound, upper_bound, xtol=1e-6, rtol=1e-6)
            except ValueError as e:
                print(f"Failed to converge for p={p} with bounds ({lower_bound}, {upper_bound}): {e}")
                return np.nan  # or some other sentinel value indicating failure
        else:
            lower_bound *= 0.9  # shrink the bounds
            upper_bound *= 1.1  # expand the bounds
    print(f"Failed to find valid bounds for p={p} after {5} retries")
    return np.nan  # or some other sentinel value indicating failure

def inverse_cdf_gmm_country(p):
    lower_bound, upper_bound = new_country_bounds
    for _ in range(5):
        f_lower = gmm_cdf(lower_bound, gmm_country) - p
        f_upper = gmm_cdf(upper_bound, gmm_country) - p
        # Check if the signs are different
        if np.sign(f_lower) != np.sign(f_upper):
            try:
                return brentq(lambda x: gmm_cdf(x, gmm_country) - p, lower_bound, upper_bound, xtol=1e-6, rtol=1e-6)
            except ValueError as e:
                print(f"Failed to converge for p={p} with bounds ({lower_bound}, {upper_bound}): {e}")
                return np.nan  # or some other sentinel value indicating failure
        else:
            # Adjust the bounds
            lower_bound *= 0.9  # shrink the bounds
            upper_bound *= 1.1  # expand the bounds
    print(f"Failed to find valid bounds for p={p} after {5} retries")
    return np.nan  # or some other sentinel value indicating failure


def inverse_cdf_gmm_salary(p):
    lower_bound, upper_bound = new_salary_bounds
    for _ in range(5):
        f_lower = gmm_cdf(lower_bound, gmm_salary) - p
        f_upper = gmm_cdf(upper_bound, gmm_salary) - p
        # Check if the signs are different
        if np.sign(f_lower) != np.sign(f_upper):
            try:
                return brentq(lambda x: gmm_cdf(x, gmm_salary) - p, lower_bound, upper_bound, xtol=1e-6, rtol=1e-6)
            except ValueError as e:
                print(f"Failed to converge for p={p} with bounds ({lower_bound}, {upper_bound}): {e}")
                return np.nan  # or some other sentinel value indicating failure
        else:
            # Adjust the bounds
            lower_bound *= 0.9  # shrink the bounds
            upper_bound *= 1.1  # expand the bounds
    print(f"Failed to find valid bounds for p={p} after {5} retries")
    return np.nan  # or some other sentinel value indicating failure


F_inv = [inverse_cdf_gmm_age, inverse_cdf_gmm_workclass, inverse_cdf_gmm_fnlwgt, inverse_cdf_gmm_education, inverse_cdf_gmm_marital, inverse_cdf_gmm_occupation, 
         inverse_cdf_gmm_relationship, inverse_cdf_gmm_race, inverse_cdf_gmm_sex, inverse_cdf_gmm_gain, inverse_cdf_gmm_loss, inverse_cdf_gmm_hours, inverse_cdf_gmm_country, 
         inverse_cdf_gmm_salary]

In [31]:
num_rows = data.shape[0]  # Number of rows you want to generate
synthetic_dataset = [sample(F_inv, covariance_matrix) for _ in tqdm(range(num_rows))]
synthetic_df_full = pd.DataFrame(synthetic_dataset, columns=['age', ' workclass', ' fnlwgt', ' education', ' marital-status', ' occupation', ' relationship', ' race',
                                                             ' sex', ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country', ' salary'])

  1%|          | 246/32561 [00:17<37:03, 14.53it/s]

Failed to find valid bounds for p=0.0007253601583202063 after 5 retries


  2%|▏         | 688/32561 [00:48<37:52, 14.02it/s]

Failed to find valid bounds for p=4.1729042973430085e-05 after 5 retries


  4%|▎         | 1200/32561 [01:24<38:45, 13.49it/s]

Failed to find valid bounds for p=0.0005669449353195798 after 5 retries


  5%|▍         | 1504/32561 [01:46<36:38, 14.13it/s]

Failed to find valid bounds for p=0.00010533326511879899 after 5 retries


  5%|▌         | 1768/32561 [02:05<35:08, 14.60it/s]

Failed to find valid bounds for p=0.0007410494830575356 after 5 retries


  6%|▌         | 1826/32561 [02:09<36:23, 14.07it/s]

Failed to find valid bounds for p=0.0004111051832920867 after 5 retries


  7%|▋         | 2200/32561 [02:35<37:47, 13.39it/s]

Failed to find valid bounds for p=0.000622871243406263 after 5 retries


  7%|▋         | 2276/32561 [02:41<37:23, 13.50it/s]

Failed to find valid bounds for p=0.0005267260917386511 after 5 retries


  8%|▊         | 2492/32561 [02:56<37:33, 13.34it/s]

Failed to find valid bounds for p=4.5526195198410066e-05 after 5 retries


  8%|▊         | 2498/32561 [02:57<37:04, 13.52it/s]

Failed to find valid bounds for p=0.0008245542664046808 after 5 retries


  9%|▉         | 3012/32561 [03:33<35:30, 13.87it/s]

Failed to find valid bounds for p=0.000633729664077352 after 5 retries


 10%|▉         | 3134/32561 [03:42<33:39, 14.57it/s]

Failed to find valid bounds for p=0.0008247529465429228 after 5 retries


 10%|▉         | 3212/32561 [03:47<34:14, 14.29it/s]

Failed to find valid bounds for p=5.4938821616719655e-05 after 5 retries


 11%|█         | 3578/32561 [04:13<34:04, 14.17it/s]

Failed to find valid bounds for p=0.0003177095054298412 after 5 retries


 12%|█▏        | 3780/32561 [04:27<32:44, 14.65it/s]

Failed to find valid bounds for p=1.3461269134303462e-05 after 5 retries


 12%|█▏        | 3792/32561 [04:28<32:51, 14.59it/s]

Failed to find valid bounds for p=6.311932578221315e-05 after 5 retries


 12%|█▏        | 3816/32561 [04:30<32:25, 14.77it/s]

Failed to find valid bounds for p=2.864634907778797e-05 after 5 retries


 12%|█▏        | 4048/32561 [04:46<33:08, 14.34it/s]

Failed to find valid bounds for p=3.772777067560869e-06 after 5 retries
Failed to find valid bounds for p=9.928950191597698e-05 after 5 retries


 14%|█▎        | 4400/32561 [05:11<33:09, 14.16it/s]

Failed to find valid bounds for p=3.214301327598467e-05 after 5 retries


 14%|█▎        | 4468/32561 [05:16<32:49, 14.26it/s]

Failed to find valid bounds for p=0.0008004006280693336 after 5 retries


 14%|█▍        | 4700/32561 [05:32<33:09, 14.00it/s]

Failed to find valid bounds for p=0.0006204911140173591 after 5 retries


 16%|█▌        | 5060/32561 [05:57<31:37, 14.49it/s]

Failed to find valid bounds for p=0.00013739925020204606 after 5 retries


 16%|█▌        | 5118/32561 [06:01<32:15, 14.18it/s]

Failed to find valid bounds for p=0.0007645914745780199 after 5 retries


 17%|█▋        | 5436/32561 [06:23<31:09, 14.51it/s]

Failed to find valid bounds for p=0.0009377773390180355 after 5 retries


 18%|█▊        | 5916/32561 [06:57<30:27, 14.58it/s]

Failed to find valid bounds for p=0.0005920702429857345 after 5 retries


 18%|█▊        | 5930/32561 [06:58<31:25, 14.12it/s]

Failed to find valid bounds for p=0.0006646748879187801 after 5 retries
Failed to find valid bounds for p=0.0006761143301349398 after 5 retries


 20%|█▉        | 6396/32561 [07:31<29:51, 14.61it/s]

Failed to find valid bounds for p=0.0004132086570129064 after 5 retries


 20%|█▉        | 6472/32561 [07:36<31:33, 13.78it/s]

Failed to find valid bounds for p=0.0010996541821474163 after 5 retries


 20%|██        | 6612/32561 [07:46<30:35, 14.14it/s]

Failed to find valid bounds for p=0.0005866636608078007 after 5 retries


 20%|██        | 6652/32561 [07:49<30:29, 14.16it/s]

Failed to find valid bounds for p=5.783945364301526e-05 after 5 retries


 20%|██        | 6668/32561 [07:50<29:28, 14.64it/s]

Failed to find valid bounds for p=0.00047240934455075535 after 5 retries


 21%|██        | 6680/32561 [07:51<31:06, 13.87it/s]

Failed to find valid bounds for p=0.0007712470968405631 after 5 retries


 21%|██        | 6746/32561 [07:56<29:50, 14.41it/s]

Failed to find valid bounds for p=0.00021143438061024697 after 5 retries


 21%|██        | 6836/32561 [08:02<32:15, 13.29it/s]

Failed to find valid bounds for p=0.0003011719015821437 after 5 retries


 22%|██▏       | 7026/32561 [08:16<30:07, 14.13it/s]

Failed to find valid bounds for p=0.0006440563011220373 after 5 retries


 22%|██▏       | 7212/32561 [08:29<29:53, 14.14it/s]

Failed to find valid bounds for p=0.000181484545165522 after 5 retries


 22%|██▏       | 7282/32561 [08:34<29:25, 14.32it/s]

Failed to find valid bounds for p=0.000526948647252116 after 5 retries


 23%|██▎       | 7518/32561 [08:50<29:03, 14.36it/s]

Failed to find valid bounds for p=5.441197626252106e-05 after 5 retries


 23%|██▎       | 7526/32561 [08:51<29:05, 14.34it/s]

Failed to find valid bounds for p=0.0004927227104920518 after 5 retries


 23%|██▎       | 7618/32561 [08:57<28:14, 14.72it/s]

Failed to find valid bounds for p=0.00036124971096092296 after 5 retries


 25%|██▍       | 8006/32561 [09:25<29:00, 14.11it/s]

Failed to find valid bounds for p=3.472217141968874e-05 after 5 retries


 25%|██▌       | 8154/32561 [09:35<27:14, 14.93it/s]

Failed to find valid bounds for p=0.00027421579939209713 after 5 retries


 26%|██▌       | 8312/32561 [09:46<28:41, 14.08it/s]

Failed to find valid bounds for p=6.246740496224468e-05 after 5 retries


 26%|██▌       | 8470/32561 [09:58<27:41, 14.50it/s]

Failed to find valid bounds for p=0.0004272208766576482 after 5 retries


 27%|██▋       | 8692/32561 [10:13<28:56, 13.75it/s]

Failed to find valid bounds for p=0.00031894794649871245 after 5 retries


 27%|██▋       | 8734/32561 [10:16<27:38, 14.37it/s]

Failed to find valid bounds for p=0.0001945640240470321 after 5 retries


 28%|██▊       | 9132/32561 [10:44<28:24, 13.75it/s]

Failed to find valid bounds for p=0.0001680559402508826 after 5 retries


 29%|██▉       | 9444/32561 [11:06<26:22, 14.60it/s]

Failed to find valid bounds for p=0.00047597373794066683 after 5 retries


 29%|██▉       | 9500/32561 [11:10<27:23, 14.03it/s]

Failed to find valid bounds for p=0.0006348498208583202 after 5 retries


 30%|██▉       | 9758/32561 [11:28<26:58, 14.09it/s]

Failed to find valid bounds for p=0.00011619150853468952 after 5 retries


 30%|███       | 9862/32561 [11:36<28:11, 13.42it/s]

Failed to find valid bounds for p=0.000819853739800524 after 5 retries


 30%|███       | 9924/32561 [11:40<26:15, 14.37it/s]

Failed to find valid bounds for p=0.0011142254512563736 after 5 retries


 31%|███▏      | 10228/32561 [12:02<25:56, 14.35it/s]

Failed to find valid bounds for p=3.698727477639975e-05 after 5 retries


 32%|███▏      | 10274/32561 [12:05<26:01, 14.27it/s]

Failed to find valid bounds for p=0.00042506402198468795 after 5 retries


 32%|███▏      | 10366/32561 [12:12<26:13, 14.11it/s]

Failed to find valid bounds for p=0.0005909896534984187 after 5 retries


 33%|███▎      | 10644/32561 [12:31<26:41, 13.68it/s]

Failed to find valid bounds for p=0.000246537586706559 after 5 retries


 33%|███▎      | 10772/32561 [12:40<25:17, 14.36it/s]

Failed to find valid bounds for p=0.0001924987347881548 after 5 retries


 36%|███▋      | 11866/32561 [13:57<23:47, 14.50it/s]

Failed to find valid bounds for p=0.00011276849855776234 after 5 retries


 37%|███▋      | 12032/32561 [14:09<23:27, 14.59it/s]

Failed to find valid bounds for p=0.0007539671127691785 after 5 retries


 37%|███▋      | 12166/32561 [14:18<24:07, 14.09it/s]

Failed to find valid bounds for p=0.000271263448821426 after 5 retries


 38%|███▊      | 12212/32561 [14:22<24:12, 14.01it/s]

Failed to find valid bounds for p=0.0005464141918473419 after 5 retries


 38%|███▊      | 12420/32561 [14:36<23:06, 14.53it/s]

Failed to find valid bounds for p=6.483892898249822e-05 after 5 retries


 38%|███▊      | 12494/32561 [14:42<24:35, 13.60it/s]

Failed to find valid bounds for p=1.8144622797328708e-05 after 5 retries


 38%|███▊      | 12516/32561 [14:43<23:31, 14.20it/s]

Failed to find valid bounds for p=8.702897982440852e-05 after 5 retries


 39%|███▊      | 12596/32561 [14:49<24:06, 13.81it/s]

Failed to find valid bounds for p=0.0006152204754117967 after 5 retries


 39%|███▉      | 12826/32561 [15:05<23:32, 13.97it/s]

Failed to find valid bounds for p=0.0002070711984523423 after 5 retries


 39%|███▉      | 12848/32561 [15:07<23:30, 13.97it/s]

Failed to find valid bounds for p=0.00019067292036926476 after 5 retries


 40%|███▉      | 12904/32561 [15:11<22:55, 14.29it/s]

Failed to find valid bounds for p=0.0004125849701442496 after 5 retries


 41%|████      | 13326/32561 [15:41<22:27, 14.27it/s]

Failed to find valid bounds for p=4.924983257065442e-05 after 5 retries


 42%|████▏     | 13622/32561 [16:01<21:56, 14.39it/s]

Failed to find valid bounds for p=0.00023549960902915931 after 5 retries


 42%|████▏     | 13764/32561 [16:11<22:33, 13.89it/s]

Failed to find valid bounds for p=0.00018857691960266152 after 5 retries


 43%|████▎     | 14088/32561 [16:34<20:38, 14.92it/s]

Failed to find valid bounds for p=0.000257222082895549 after 5 retries


 44%|████▍     | 14306/32561 [16:49<22:36, 13.46it/s]

Failed to find valid bounds for p=0.0006318106059952888 after 5 retries


 44%|████▍     | 14394/32561 [16:56<21:21, 14.17it/s]

Failed to find valid bounds for p=0.00013789433749947965 after 5 retries


 45%|████▍     | 14584/32561 [17:09<21:10, 14.15it/s]

Failed to find valid bounds for p=0.0009145002448607873 after 5 retries


 48%|████▊     | 15548/32561 [18:18<20:22, 13.91it/s]

Failed to find valid bounds for p=0.0007831209867653694 after 5 retries


 48%|████▊     | 15792/32561 [18:35<20:02, 13.95it/s]

Failed to find valid bounds for p=0.00023248867003120274 after 5 retries


 49%|████▊     | 15808/32561 [18:36<19:30, 14.32it/s]

Failed to find valid bounds for p=0.00014269115329943848 after 5 retries


 50%|████▉     | 16206/32561 [19:05<20:15, 13.46it/s]

Failed to find valid bounds for p=0.0005508371436458684 after 5 retries


 51%|█████     | 16604/32561 [19:34<18:16, 14.55it/s]

Failed to find valid bounds for p=0.0003443150166910012 after 5 retries


 52%|█████▏    | 16906/32561 [19:55<18:46, 13.89it/s]

Failed to find valid bounds for p=0.0001963593718119818 after 5 retries


 54%|█████▍    | 17524/32561 [20:39<17:47, 14.08it/s]

Failed to find valid bounds for p=2.413043291581226e-07 after 5 retries


 54%|█████▍    | 17656/32561 [20:48<17:07, 14.51it/s]

Failed to find valid bounds for p=0.0002330053671912227 after 5 retries


 55%|█████▍    | 17790/32561 [20:57<17:33, 14.02it/s]

Failed to find valid bounds for p=5.33293981800192e-05 after 5 retries


 55%|█████▍    | 17826/32561 [21:00<18:09, 13.53it/s]

Failed to find valid bounds for p=0.0010146918683533264 after 5 retries


 55%|█████▌    | 18022/32561 [21:14<16:32, 14.64it/s]

Failed to find valid bounds for p=0.00020023782574219652 after 5 retries


 56%|█████▌    | 18086/32561 [21:18<17:15, 13.98it/s]

Failed to find valid bounds for p=0.00026859593371048767 after 5 retries


 56%|█████▌    | 18140/32561 [21:22<16:37, 14.46it/s]

Failed to find valid bounds for p=0.0002601040852545741 after 5 retries


 57%|█████▋    | 18442/32561 [21:44<16:24, 14.35it/s]

Failed to find valid bounds for p=2.5173879291005906e-05 after 5 retries


 57%|█████▋    | 18530/32561 [21:50<16:45, 13.95it/s]

Failed to find valid bounds for p=0.0002192539262182408 after 5 retries


 58%|█████▊    | 18808/32561 [22:09<16:29, 13.90it/s]

Failed to find valid bounds for p=8.829655212768848e-05 after 5 retries


 58%|█████▊    | 18856/32561 [22:13<15:54, 14.36it/s]

Failed to find valid bounds for p=0.001056277169132273 after 5 retries


 58%|█████▊    | 19046/32561 [22:26<15:43, 14.32it/s]

Failed to find valid bounds for p=0.0008425490596578966 after 5 retries


 59%|█████▉    | 19296/32561 [22:44<15:31, 14.25it/s]

Failed to find valid bounds for p=0.0004511697465967947 after 5 retries


 60%|██████    | 19590/32561 [23:04<15:00, 14.41it/s]

Failed to find valid bounds for p=8.94652526158192e-05 after 5 retries


 61%|██████    | 19802/32561 [23:19<14:45, 14.41it/s]

Failed to find valid bounds for p=0.00045727324642961094 after 5 retries


 61%|██████    | 19912/32561 [23:27<15:03, 14.00it/s]

Failed to find valid bounds for p=0.00016541358544974063 after 5 retries


 62%|██████▏   | 20160/32561 [23:44<15:02, 13.73it/s]

Failed to find valid bounds for p=0.00024302490539792804 after 5 retries


 62%|██████▏   | 20308/32561 [23:55<14:30, 14.07it/s]

Failed to find valid bounds for p=0.00020290554836629407 after 5 retries


 63%|██████▎   | 20364/32561 [23:59<14:22, 14.14it/s]

Failed to find valid bounds for p=1.096023887902333e-05 after 5 retries


 63%|██████▎   | 20624/32561 [24:17<14:51, 13.39it/s]

Failed to find valid bounds for p=0.0005130885326282179 after 5 retries


 64%|██████▎   | 20740/32561 [24:25<13:41, 14.39it/s]

Failed to find valid bounds for p=3.3422596802593286e-05 after 5 retries


 64%|██████▍   | 20856/32561 [24:33<13:26, 14.51it/s]

Failed to find valid bounds for p=0.00036333077950845304 after 5 retries


 65%|██████▌   | 21196/32561 [24:57<13:31, 14.00it/s]

Failed to find valid bounds for p=0.0003179549458597553 after 5 retries


 66%|██████▌   | 21356/32561 [25:08<13:28, 13.87it/s]

Failed to find valid bounds for p=5.53008961122126e-05 after 5 retries


 66%|██████▋   | 21576/32561 [25:24<13:45, 13.30it/s]

Failed to find valid bounds for p=2.3832222628974424e-05 after 5 retries
Failed to find valid bounds for p=0.00021607674454394592 after 5 retries


 67%|██████▋   | 21700/32561 [25:32<13:11, 13.73it/s]

Failed to find valid bounds for p=0.0009468963303169472 after 5 retries


 67%|██████▋   | 21898/32561 [25:46<12:37, 14.08it/s]

Failed to find valid bounds for p=0.0007194288073127041 after 5 retries


 68%|██████▊   | 22158/32561 [26:05<12:12, 14.19it/s]

Failed to find valid bounds for p=0.000928067246927004 after 5 retries


 69%|██████▉   | 22584/32561 [26:35<11:32, 14.42it/s]

Failed to find valid bounds for p=0.0003443570177076149 after 5 retries


 71%|███████   | 23030/32561 [27:06<11:19, 14.02it/s]

Failed to find valid bounds for p=0.0006360531713147661 after 5 retries


 72%|███████▏  | 23356/32561 [27:29<10:25, 14.72it/s]

Failed to find valid bounds for p=0.00010585591342786286 after 5 retries


 73%|███████▎  | 23666/32561 [27:51<10:58, 13.51it/s]

Failed to find valid bounds for p=0.0006306639206108511 after 5 retries


 73%|███████▎  | 23782/32561 [28:00<10:40, 13.71it/s]

Failed to find valid bounds for p=4.430887471351807e-05 after 5 retries


 73%|███████▎  | 23840/32561 [28:04<10:30, 13.83it/s]

Failed to find valid bounds for p=0.0005476625162174482 after 5 retries


 73%|███████▎  | 23926/32561 [28:10<09:53, 14.54it/s]

Failed to find valid bounds for p=7.275861388376508e-05 after 5 retries


 74%|███████▎  | 24006/32561 [28:15<10:04, 14.16it/s]

Failed to find valid bounds for p=0.0004605887942931165 after 5 retries


 74%|███████▍  | 24160/32561 [28:26<09:55, 14.10it/s]

Failed to find valid bounds for p=0.00033806448955112095 after 5 retries


 75%|███████▍  | 24354/32561 [28:40<09:58, 13.70it/s]

Failed to find valid bounds for p=0.0006253488285819 after 5 retries


 76%|███████▌  | 24618/32561 [28:58<09:12, 14.37it/s]

Failed to find valid bounds for p=0.0007670982080618311 after 5 retries


 76%|███████▌  | 24630/32561 [28:59<09:12, 14.35it/s]

Failed to find valid bounds for p=0.0006898622076775043 after 5 retries


 76%|███████▌  | 24732/32561 [29:06<09:03, 14.40it/s]

Failed to find valid bounds for p=0.0008715244377807955 after 5 retries


 76%|███████▌  | 24744/32561 [29:07<09:08, 14.26it/s]

Failed to find valid bounds for p=0.0009162547061552709 after 5 retries


 77%|███████▋  | 25114/32561 [29:33<08:35, 14.45it/s]

Failed to find valid bounds for p=0.0008352688602898638 after 5 retries


 77%|███████▋  | 25206/32561 [29:40<08:36, 14.23it/s]

Failed to find valid bounds for p=0.0002568323919339813 after 5 retries


 78%|███████▊  | 25356/32561 [29:50<08:32, 14.07it/s]

Failed to find valid bounds for p=0.00048142356863549945 after 5 retries


 79%|███████▉  | 25746/32561 [30:18<08:00, 14.17it/s]

Failed to find valid bounds for p=2.2388574831976506e-05 after 5 retries


 79%|███████▉  | 25790/32561 [30:21<07:41, 14.66it/s]

Failed to find valid bounds for p=5.3894866833533876e-05 after 5 retries


 79%|███████▉  | 25820/32561 [30:23<07:56, 14.16it/s]

Failed to find valid bounds for p=0.0005341206282432513 after 5 retries


 79%|███████▉  | 25844/32561 [30:25<07:46, 14.39it/s]

Failed to find valid bounds for p=1.4219796431240157e-05 after 5 retries


 80%|████████  | 26102/32561 [30:43<07:33, 14.23it/s]

Failed to find valid bounds for p=0.0011330337812115328 after 5 retries
Failed to find valid bounds for p=0.00100635650692005 after 5 retries


 80%|████████  | 26172/32561 [30:48<07:44, 13.77it/s]

Failed to find valid bounds for p=0.00015010644447854116 after 5 retries


 81%|████████  | 26340/32561 [31:00<07:29, 13.85it/s]

Failed to find valid bounds for p=0.0009730175558391058 after 5 retries


 81%|████████▏ | 26456/32561 [31:08<06:48, 14.95it/s]

Failed to find valid bounds for p=0.00018024333747170603 after 5 retries


 82%|████████▏ | 26568/32561 [31:16<07:05, 14.07it/s]

Failed to find valid bounds for p=0.0005761781940214943 after 5 retries


 83%|████████▎ | 26972/32561 [31:44<06:36, 14.09it/s]

Failed to find valid bounds for p=0.000553492330176131 after 5 retries


 83%|████████▎ | 27102/32561 [31:54<06:40, 13.63it/s]

Failed to find valid bounds for p=0.00031734989048519163 after 5 retries


 83%|████████▎ | 27184/32561 [32:00<06:17, 14.24it/s]

Failed to find valid bounds for p=4.232442072026133e-05 after 5 retries


 84%|████████▍ | 27470/32561 [32:20<06:10, 13.73it/s]

Failed to find valid bounds for p=0.000795127100012527 after 5 retries


 84%|████████▍ | 27500/32561 [32:22<05:57, 14.14it/s]

Failed to find valid bounds for p=0.0009794072628889678 after 5 retries


 85%|████████▌ | 27712/32561 [32:37<05:35, 14.47it/s]

Failed to find valid bounds for p=1.744168419048966e-05 after 5 retries


 85%|████████▌ | 27804/32561 [32:43<05:38, 14.04it/s]

Failed to find valid bounds for p=0.00022102792139513223 after 5 retries


 87%|████████▋ | 28192/32561 [33:11<05:05, 14.28it/s]

Failed to find valid bounds for p=1.6606023010530987e-05 after 5 retries


 87%|████████▋ | 28210/32561 [33:12<05:14, 13.85it/s]

Failed to find valid bounds for p=0.0005183107820522254 after 5 retries


 89%|████████▉ | 29132/32561 [34:17<04:07, 13.83it/s]

Failed to find valid bounds for p=2.1324820433003017e-06 after 5 retries


 91%|█████████ | 29476/32561 [34:41<03:39, 14.06it/s]

Failed to find valid bounds for p=8.329284372069825e-05 after 5 retries


 91%|█████████ | 29564/32561 [34:47<03:26, 14.52it/s]

Failed to find valid bounds for p=0.0004634113747975906 after 5 retries


 91%|█████████ | 29584/32561 [34:48<03:20, 14.83it/s]

Failed to find valid bounds for p=0.00022954685685045082 after 5 retries


 91%|█████████▏| 29736/32561 [34:59<03:23, 13.88it/s]

Failed to find valid bounds for p=0.0009094008217947847 after 5 retries


 92%|█████████▏| 29808/32561 [35:04<03:13, 14.26it/s]

Failed to find valid bounds for p=0.0005198580275447286 after 5 retries


 93%|█████████▎| 30284/32561 [35:38<02:35, 14.64it/s]

Failed to find valid bounds for p=0.00020457017709974015 after 5 retries


 95%|█████████▍| 30778/32561 [36:13<02:08, 13.84it/s]

Failed to find valid bounds for p=0.0005021112064876235 after 5 retries


 96%|█████████▋| 31374/32561 [36:54<01:22, 14.42it/s]

Failed to find valid bounds for p=0.00011150616665033406 after 5 retries


 97%|█████████▋| 31494/32561 [37:03<01:14, 14.26it/s]

Failed to find valid bounds for p=0.00040202530399509865 after 5 retries


 97%|█████████▋| 31694/32561 [37:17<01:00, 14.32it/s]

Failed to find valid bounds for p=0.00014835199045632721 after 5 retries


 98%|█████████▊| 32040/32561 [37:41<00:34, 15.05it/s]

Failed to find valid bounds for p=9.386278521379035e-05 after 5 retries


 99%|█████████▉| 32296/32561 [37:59<00:18, 14.28it/s]

Failed to find valid bounds for p=0.0002468406735222498 after 5 retries


100%|█████████▉| 32412/32561 [38:07<00:10, 13.77it/s]

Failed to find valid bounds for p=9.02682897938149e-05 after 5 retries


100%|██████████| 32561/32561 [38:18<00:00, 14.17it/s]


In [62]:
synthetic_df_full.isnull().sum()

age                 0
 workclass          5
 fnlwgt             0
 education          5
 marital-status    13
 occupation         2
 relationship       8
 race               4
 sex               42
 capital-gain       0
 capital-loss       0
 hours-per-week     8
 native-country    41
 salary            34
dtype: int64

In [38]:
synthetic_df_full.to_csv('synthetic_data_full.csv')

In [63]:
column_mean = synthetic_df_full.mean()

synthetic_df_full_filled = synthetic_df_full.fillna(column_mean)

In [64]:
synthetic_df_full_filled

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,47.602875,0.856669,296566.030725,0.984768,0.233210,0.649474,0.222499,0.409044,0.816470,0.000527,0.000145,40.001350,0.369657,0.809970
1,32.574126,0.420624,59019.315568,0.759543,0.332804,0.065237,0.850943,0.605872,0.738007,-0.001290,-0.000386,24.542558,0.384696,0.136709
2,41.354508,0.801740,246596.280778,0.932366,0.641960,0.206858,0.561668,0.357424,0.395557,-0.000174,0.000599,40.000271,0.391150,0.843468
3,19.452579,0.795551,177906.366159,0.731019,0.618765,0.743068,0.549303,0.189635,0.912975,0.000118,-0.000587,39.999642,0.276982,0.798659
4,51.995030,0.731495,108521.661446,0.826903,0.114574,0.540635,0.036591,0.584521,0.073733,-0.000094,0.000276,67.932632,0.418206,0.479894
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,30.505639,0.368888,76959.644739,0.412932,0.871281,0.884887,0.726212,0.643755,0.454662,-0.000152,-0.000032,39.998362,0.130690,0.162942
32557,17.055637,0.144870,420215.922994,0.067869,0.183269,0.052438,0.865840,0.863900,0.583839,0.000706,0.000365,46.752890,0.626080,0.787263
32558,36.403838,0.661343,46522.091577,0.459857,0.850907,0.678214,0.230242,0.476655,0.267591,-0.000971,0.000149,40.001383,0.371916,0.128849
32559,23.737994,0.278785,633733.796930,0.477951,0.866243,0.800750,0.876695,0.908915,0.525720,-0.000374,0.000621,40.001594,0.230323,0.507307


In [65]:
def numerical_to_category(num_value, category_intervals):
    """ Convert a numerical value back to its corresponding category. """
    for category, (a, b) in category_intervals.items():
        if a <= num_value < b:
            return category
        elif num_value > 1 and round(b) == 1:
            return category
        elif num_value < 0 and round(b) == 0:
            return category

In [66]:
inversed_categories_workclass = synthetic_df_full_filled[' workclass'].apply(lambda x: numerical_to_category(x, category_intervals_workclass))
inversed_categories_education = synthetic_df_full_filled[' education'].apply(lambda x: numerical_to_category(x, category_intervals_education))
inversed_categories_marital = synthetic_df_full_filled[' marital-status'].apply(lambda x: numerical_to_category(x, category_intervals_marital))
inversed_categories_occupation = synthetic_df_full_filled[' occupation'].apply(lambda x: numerical_to_category(x, category_intervals_occupation))
inversed_categories_relationship = synthetic_df_full_filled[' relationship'].apply(lambda x: numerical_to_category(x, category_intervals_relationship))
inversed_categories_race = synthetic_df_full_filled[' race'].apply(lambda x: numerical_to_category(x, category_intervals_race))
inversed_categories_sex = synthetic_df_full_filled[' sex'].apply(lambda x: numerical_to_category(x, category_intervals_sex))
inversed_categories_country = synthetic_df_full_filled[' native-country'].apply(lambda x: numerical_to_category(x, category_intervals_country))
inversed_categories_salary = synthetic_df_full_filled[' salary'].apply(lambda x: numerical_to_category(x, category_intervals_salary))

In [67]:
synthetic_df_full_filled[' workclass'] = inversed_categories_workclass

synthetic_df_full_filled[' education'] = inversed_categories_education

synthetic_df_full_filled[' marital-status'] = inversed_categories_marital

synthetic_df_full_filled[' occupation'] = inversed_categories_occupation

synthetic_df_full_filled[' relationship'] = inversed_categories_relationship

synthetic_df_full_filled[' race'] = inversed_categories_race

synthetic_df_full_filled[' sex'] = inversed_categories_sex

synthetic_df_full_filled[' native-country'] = inversed_categories_country

synthetic_df_full_filled[' salary'] = inversed_categories_salary

In [68]:
synthetic_df_full_filled

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,47.602875,?,296566.030725,5th-6th,Married-civ-spouse,Other-service,Husband,White,Female,0.000527,0.000145,40.001350,United-States,>50K
1,32.574126,Private,59019.315568,Masters,Married-civ-spouse,Prof-specialty,Unmarried,White,Female,-0.001290,-0.000386,24.542558,United-States,<=50K
2,41.354508,Local-gov,246596.280778,Prof-school,Never-married,Craft-repair,Not-in-family,White,Male,-0.000174,0.000599,40.000271,United-States,>50K
3,19.452579,Local-gov,177906.366159,Masters,Never-married,Machine-op-inspct,Not-in-family,White,Female,0.000118,-0.000587,39.999642,United-States,>50K
4,51.995030,Self-emp-not-inc,108521.661446,11th,Married-civ-spouse,Sales,Husband,White,Male,-0.000094,0.000276,67.932632,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,30.505639,Private,76959.644739,Some-college,Divorced,Handlers-cleaners,Own-child,White,Male,-0.000152,-0.000032,39.998362,United-States,<=50K
32557,17.055637,Private,420215.922994,HS-grad,Married-civ-spouse,Prof-specialty,Unmarried,Black,Male,0.000706,0.000365,46.752890,United-States,>50K
32558,36.403838,Private,46522.091577,Some-college,Divorced,Other-service,Husband,White,Male,-0.000971,0.000149,40.001383,United-States,<=50K
32559,23.737994,Private,633733.796930,Some-college,Divorced,?,Unmarried,Black,Male,-0.000374,0.000621,40.001594,United-States,<=50K


In [69]:
data1 = pd.read_csv("American_Income.csv")
data_for_test = data1.drop(columns=[' education-num'])

data_for_test

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [70]:
metadata1 = SingleTableMetadata()
metadata1.detect_from_dataframe(data = data_for_test)
baseline_quality_report = evaluate_quality(
    data_for_test,  #real data
    synthetic_df_full_filled, #synthetic data(perfect synthesizer generated(holdout))
    metadata1
)

Creating report: 100%|██████████| 4/4 [00:00<00:00,  5.14it/s]



Overall Quality Score: 87.67%

Properties:
Column Shapes: 90.2%
Column Pair Trends: 85.14%


In [57]:
metadata1 = SingleTableMetadata()

metadata1.detect_from_dataframe(data = data_for_test)
synthesier = GaussianCopulaSynthesizer(metadata1)
synthesier.fit(data_for_test)
synthetic_data = synthesier.sample(num_rows = num_rows)  #keep the number of rows equal to the data hold
synthetic_data 

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,44,Private,137738,HS-grad,Divorced,Transport-moving,Own-child,White,Female,87580,23,55,United-States,>50K
1,33,Federal-gov,144753,11th,Married-civ-spouse,Exec-managerial,Unmarried,White,Female,5255,0,60,United-States,<=50K
2,22,Private,263046,Prof-school,Married-civ-spouse,Transport-moving,Husband,White,Male,51338,0,49,United-States,>50K
3,47,Local-gov,142107,9th,Divorced,Prof-specialty,Own-child,Black,Female,49060,0,32,United-States,<=50K
4,24,Federal-gov,342515,Bachelors,Never-married,Transport-moving,Unmarried,Black,Male,3978,62,39,India,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,42,Federal-gov,74028,Masters,Married-spouse-absent,Transport-moving,Unmarried,White,Female,99592,0,38,United-States,<=50K
32557,21,Private,142426,Assoc-acdm,Never-married,Exec-managerial,Not-in-family,Black,Female,536,0,33,United-States,<=50K
32558,33,State-gov,106801,Masters,Never-married,Handlers-cleaners,Husband,White,Female,26961,0,24,United-States,<=50K
32559,49,?,148553,7th-8th,Married-civ-spouse,?,Own-child,White,Male,62266,112,39,Cuba,>50K


In [59]:
metadata1

{
    "columns": {
        "age": {
            "sdtype": "numerical"
        },
        " workclass": {
            "sdtype": "categorical"
        },
        " fnlwgt": {
            "sdtype": "numerical"
        },
        " education": {
            "sdtype": "categorical"
        },
        " marital-status": {
            "sdtype": "categorical"
        },
        " occupation": {
            "sdtype": "categorical"
        },
        " relationship": {
            "sdtype": "categorical"
        },
        " race": {
            "sdtype": "categorical"
        },
        " sex": {
            "sdtype": "categorical"
        },
        " capital-gain": {
            "sdtype": "numerical"
        },
        " capital-loss": {
            "sdtype": "numerical"
        },
        " hours-per-week": {
            "sdtype": "numerical"
        },
        " native-country": {
            "sdtype": "categorical"
        },
        " salary": {
            "sdtype": "categorical"
       

In [58]:
metadata1 = SingleTableMetadata()
metadata1.detect_from_dataframe(data = data_for_test)
baseline_quality_report = evaluate_quality(
    data_for_test,  #real data
    synthetic_data, #synthetic data(perfect synthesizer generated(holdout))
    metadata1
)

Creating report: 100%|██████████| 4/4 [00:00<00:00,  4.01it/s]



Overall Quality Score: 71.81%

Properties:
Column Shapes: 74.55%
Column Pair Trends: 69.07%
