In [23]:
from sdmetrics.single_table import BinaryLogisticRegression
from sdv.datasets.demo import download_demo
from sdv.metadata import SingleTableMetadata
from sklearn.model_selection import train_test_split

# Load the demo table. The second return value is ignored here because the
# metadata is re-detected below, after some columns have been removed.
real_data, _ = download_demo(
    modality='single_table',
    dataset_name='fake_hotel_guests'
)

# define columns to be dropped - for simplicity
columns_to_drop = ['guest_email', 'billing_address', 'credit_card_number', 'checkin_date', 'checkout_date']


# drop the specified columns (a new frame is produced; real_data stays as loaded)
real_data_cleaned = real_data.drop(columns=columns_to_drop)

# generate updated metadata that matches the reduced set of columns
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=real_data_cleaned)

# split data into training and test sets
real_data_train, real_data_test = train_test_split(
    real_data_cleaned,
    test_size=0.2, # reserves 20% for testing
    random_state=42  # fixed seed so the split is reproducible
)

# define target columns
target_column = 'has_rewards'

# compute baseline: train on the real training split, score on the real test split.
# The metadata is passed explicitly so this call uses the same arguments as the
# synthetic-data evaluation it is later compared against.
baseline_score = BinaryLogisticRegression.compute(
    test_data=real_data_test,
    train_data=real_data_train,
    target=target_column,
    metadata=metadata,
)

print(f'Baseline Logistic Regression F1 Score: {baseline_score}')

Baseline Logistic Regression F1 Score: 0.761904761904762


In [24]:
# Build, fit and sample the synthesizer (FAST_ML preset).
from sdv.lite import SingleTablePreset

# Number of synthetic rows to draw from the fitted synthesizer.
NUM_SYNTHETIC_ROWS = 500

synthesizer = SingleTablePreset(metadata, name='FAST_ML')

# Fit on the training split only; the test split must stay unseen so that
# the later evaluation is not affected by data leakage.
synthesizer.fit(data=real_data_train)

synthetic_data = synthesizer.sample(num_rows=NUM_SYNTHETIC_ROWS)

In [27]:
# Efficacy of the synthetic data for binary classification:
# train on the synthetic rows, score on the held-out real test split.
# The target reuses `target_column` so that this evaluation and the baseline
# always predict the same column.
logistic_regression_score = BinaryLogisticRegression.compute(
    test_data=real_data_test,
    train_data=synthetic_data,
    target=target_column,
    metadata=metadata
)

print(f'Logistic regression score is {logistic_regression_score}')

Logistic regression score is 0.45714285714285713


We could derive other metrics to evaluate the synthetic data:

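- Ratio (synthetic score / baseline score): close to 1 means a model trained on synthetic data is almost as good as one trained on real data. >1 could indicate overfitting. <1 indicates the synthetic data is missing characteristics of the real data. Here: 0.457 / 0.762 ≈ 0.60.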
- Absolute difference (baseline score − synthetic score): measures the performance decrease in F1 points. Here: 0.762 − 0.457 ≈ 0.30.
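- Cross-validation: repeat the evaluation over several train/test folds and average the scores, so the comparison does not depend on a single 20% test split.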
- Learning curves: Plot learning curves by training models on increasing amounts of synthetic data and plotting the performance on the real test set
- Model interpretation: Use tools like SHAP or LIME to compare the explanations of predictions made by models trained on real data and on synthetic data