In [23]:
from sdmetrics.single_table import BinaryLogisticRegression
from sdv.datasets.demo import download_demo
from sdv.metadata import SingleTableMetadata
from sklearn.model_selection import train_test_split

# Load the demo table. The second return value of download_demo is not used
# here because the metadata is re-detected below, after columns are dropped.
real_data, _ = download_demo(
    modality='single_table',
    dataset_name='fake_hotel_guests'
)

# Columns removed to keep the example simple.
columns_to_drop = [
    'guest_email',
    'billing_address',
    'credit_card_number',
    'checkin_date',
    'checkout_date',
]

# drop() returns a new frame, so real_data itself is left untouched and this
# cell can be re-run safely.
real_data_cleaned = real_data.drop(columns=columns_to_drop)

# Re-detect the metadata so that it describes only the remaining columns.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=real_data_cleaned)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
real_data_train, real_data_test = train_test_split(
    real_data_cleaned,
    test_size=0.2,
    random_state=42
)

# Column the classifier has to predict.
target_column = 'has_rewards'

# Baseline: train on real data and evaluate on the held-out real data.
# The metadata is passed here as well so that this call uses the same
# arguments as the synthetic-data evaluation it is compared against.
baseline_score = BinaryLogisticRegression.compute(
    test_data=real_data_test,
    train_data=real_data_train,
    target=target_column,
    metadata=metadata
)

print(f'Baseline Logistic Regression F1 Score: {baseline_score}')

Baseline Logistic Regression F1 Score: 0.761904761904762


In [24]:
# Synthesis step: build a preset synthesizer from the detected metadata,
# fit it, and draw a synthetic table.
from sdv.lite import SingleTablePreset

synthesizer = SingleTablePreset(metadata=metadata, name='FAST_ML')

# Fit on the training split only; the test split must stay unseen so that
# the later evaluation is free of data leakage.
synthesizer.fit(real_data_train)

synthetic_data = synthesizer.sample(num_rows=500)

In [27]:
# Efficacy of the synthetic data for binary classification: train on the
# synthetic rows and score on the held-out real test split.
# target_column is reused (instead of repeating the literal column name) so
# this evaluation always predicts the same target as the baseline.
logistic_regression_score = BinaryLogisticRegression.compute(
    test_data=real_data_test,
    train_data=synthetic_data,
    target=target_column,
    metadata=metadata
)

print(f'Logistic regression score is {logistic_regression_score}')

Logistic regression score is 0.45714285714285713


We could derive other metrics to evaluate the synthetic data:

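- Ratio: Synthetic-data score divided by the baseline score. Close to 1 means the synthetic data is almost as good as the real data; >1 could indicate overfitting; <1 indicates the synthetic data is missing characteristics of the real data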
- Absolute Difference: Baseline score minus the synthetic-data score; measures the performance decrease
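- Cross validation: Repeat the evaluation over several train/test splits to see how much the scores vary between splits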
- Learning curves: Plot learning curves by training models on increasing amounts of synthetic data and plotting the performance on the real test set
- Model interpretation: Use tools like SHAP or LIME to compare explanations for predictions on real and synthetic data

ValueError: Found unknown categories ['ramirezroy@example.org', 'lewisamanda@example.org', 'zjames@example.org', 'moondavid@example.com', 'ericalandry@example.org', 'julie49@example.com', 'ingramemily@example.net', 'daniel67@example.net', 'sean31@example.net', 'stevenirwin@example.com', 'longgloria@example.net', 'steven59@example.org', 'colerobert@example.com', 'creid@example.org', 'johnsonmary@example.net', 'ugates@example.net', 'pfarmer@example.com', 'ryan53@example.org', 'xturner@example.net', 'prios@example.org', 'blairsteven@example.com', 'ivillegas@example.net', 'michelewalters@example.org', 'smithpatty@example.org', 'nwolf@example.com', 'sarah11@example.net', 'sheltonkyle@example.net', 'steven96@example.org', 'johnsonsteven@example.net', 'williamschristina@example.com', 'david32@example.org', 'gwilson@example.net', 'hudsondavid@example.org', 'rpage@example.net', 'karensullivan@example.net', 'denisenelson@example.com', 'rachel49@example.net', 'lwells@example.net', 'allison28@example.net', 'jamie94@example.com', 'oliverjessica@example.net', 'michaelmitchell@example.net', 'bentleyamanda@example.com', 'racheldaniel@example.org', 'erikfisher@example.org', 'nicole57@example.net', 'melissamcintosh@example.com', 'sandra49@example.org', 'warrenpatricia@example.org', 'ruizcameron@example.org', 'nealbailey@example.org', 'owensdavid@example.net', 'wilsonmatthew@example.net', 'amanda81@example.com', 'jesse07@example.net', 'ejones@example.net', 'gosborn@example.com', 'thomas24@example.org', 'lynchsteven@example.net', 'aray@example.com', 'iharper@example.net', 'lisa61@example.com', 'josephwalker@example.org', 'meghanharrington@example.com', 'bobbyreid@example.com', 'nsaunders@example.com', 'samantha34@example.net', 'shawn44@example.org', 'donaldsontina@example.com', 'amberlewis@example.com', 'ramossamuel@example.org', 'tyang@example.org', 'amoyer@example.com', 'edwinherrera@example.org', 'christopher48@example.com', 'murraymelissa@example.com', 'cfox@example.com', 
'rstephens@example.net', 'jason43@example.org', 'jrobinson@example.net', 'edwin11@example.com', 'apena@example.com', 'kimlance@example.net', 'katrinamills@example.org', 'matthewrogers@example.org', 'autumn41@example.com', 'edwardstammy@example.com', 'james20@example.net', 'dsullivan@example.net', 'kellerelizabeth@example.com', 'rachel98@example.com', 'jessica33@example.org', 'ztodd@example.com', 'klinealexander@example.com', 'laurabarber@example.net', 'jenniferbarrett@example.com', 'amy89@example.com', 'michaelharris@example.org', 'aadams@example.com', 'helenwilson@example.org', 'jmcconnell@example.com', 'ogarcia@example.net', 'smiller@example.com', 'omartin@example.org', 'woodsjasmine@example.net', 'vincenthamilton@example.net', 'jeannephillips@example.com', 'dwilliams@example.net', 'james08@example.net', 'emily36@example.com', 'ksweeney@example.net', 'jeffrey87@example.net', 'campbellaaron@example.com', 'carpenterdonald@example.com', 'brendabarrett@example.com', 'aaronvillegas@example.net', 'sanchezangela@example.net', 'bettyzuniga@example.org', 'russellwendy@example.org', 'bryanphelps@example.net', 'martinsamuel@example.org', 'orrsusan@example.org', 'pclark@example.net', 'ilopez@example.com', 'patty61@example.com', 'pschroeder@example.org', 'jeffrey21@example.org', 'chughes@example.net', 'robert41@example.net', 'vanessaacevedo@example.org', 'mitchellvictor@example.com', 'tracy60@example.com', 'michellehill@example.net', 'rsmith@example.com', 'mcknightstephanie@example.net', 'muellerdaniel@example.net', 'gravessteven@example.org', 'diaztheresa@example.net', 'dtaylor@example.org', 'xramirez@example.net', 'robertblair@example.org', 'kennedybryan@example.org', 'garydavid@example.net', 'ysmith@example.com', 'brennankathy@example.org', 'lcarey@example.org', 'alvarezalice@example.net', 'kyle23@example.net', 'xcole@example.com', 'cbarron@example.net', 'brianarobinson@example.org', 'david04@example.com', 'jonesashley@example.net', 'simmonsashley@example.org', 
'castillomaria@example.com', 'julie49@example.net', 'gwebb@example.org', 'edward52@example.org', 'catherinemccormick@example.net', 'rfletcher@example.net', 'tmercado@example.net', 'josephknight@example.com', 'joshuabrown@example.net', 'qrodriguez@example.org', 'amanda10@example.com', 'webstersara@example.com', 'jameshenry@example.com', 'gkey@example.org', 'imcneil@example.org', 'robertpope@example.org', 'espinozaricky@example.net', 'gwalton@example.com', 'qwebster@example.net', 'hwaller@example.com', 'ccollins@example.com', 'josephhoffman@example.com', 'barnesjeremy@example.com', 'lynnjohnson@example.org', 'sandrachavez@example.net', 'perrymichael@example.com', 'gabriel70@example.net', 'matthewsdavid@example.net', 'marcusduran@example.net', 'udavidson@example.com', 'gloria15@example.org', 'vcrane@example.com', 'ihuffman@example.com', 'annesherman@example.org', 'kyle22@example.org', 'timothyfigueroa@example.com', 'latoya51@example.org', 'melissawilliams@example.net', 'cassie32@example.org', 'elizabethgutierrez@example.com', 'thomas72@example.com', 'kyledudley@example.org', 'morganmegan@example.org', 'floreskristine@example.org', 'wmills@example.com', 'tylerwilliams@example.org', 'anthonyrebecca@example.net', 'coloncatherine@example.net', 'kjones@example.org', 'lynchjill@example.com', 'qosborne@example.com', 'james04@example.com', 'xbaker@example.org', 'adamsmith@example.org', 'zmurray@example.com', 'ariana98@example.org', 'caitlincasey@example.com', 'icameron@example.org', 'johnsilva@example.org', 'dschmidt@example.org', 'candice62@example.net', 'uwilliams@example.org', 'hector56@example.com', 'edickerson@example.org', 'montoyarobert@example.com', 'michael95@example.net', 'bradleymartin@example.com', 'ule@example.org', 'lynnyoung@example.org', 'toddwallace@example.org', 'reynoldspatrick@example.org', 'tespinoza@example.com', 'kristenmassey@example.com', 'donnaodom@example.net', 'paula75@example.org', 'oliviaking@example.org', 'durhamamanda@example.net', 
'spencejohn@example.org', 'maria89@example.net', 'omitchell@example.com', 'robertmills@example.net', 'robinsonvictoria@example.net', 'sarahwarren@example.org', 'adamslaura@example.net', 'loganosborne@example.org', 'bakerstephen@example.com', 'zharper@example.com', 'browntiffany@example.net', 'heidi58@example.com', 'yorkdestiny@example.org', 'blairjaime@example.net', 'tracey36@example.net', 'mmiller@example.com', 'maryacosta@example.net', 'jessicaturner@example.com', 'khancock@example.com', 'bmorris@example.org', 'warnerkristopher@example.org', 'baileyhoward@example.com', 'kendra08@example.com', 'donaldmoran@example.org', 'stephanie76@example.org', 'lauren41@example.com', 'amartinez@example.org', 'donald09@example.com', 'whiteanne@example.net', 'mcconnelllisa@example.com', 'ycabrera@example.net', 'humphreyjennifer@example.net', 'ahall@example.com', 'mackcrystal@example.net', 'oglass@example.net', 'gomezelizabeth@example.org', 'nguyenmolly@example.org', 'gcarroll@example.com', 'vsutton@example.net', 'dennisbennett@example.net', 'fschultz@example.com', 'rachelroach@example.com', 'johnstonmary@example.net', 'annafarmer@example.com', 'jorge85@example.org', 'martinjasmine@example.com', 'ballardcourtney@example.com', 'hamptonemma@example.net', 'carlnovak@example.com', 'kristineschmitt@example.org', 'xmccarthy@example.org', 'smithhenry@example.net', 'lee18@example.com', 'barry53@example.org', 'emilycurtis@example.org', 'huangpatrick@example.net', 'lopezsarah@example.org', 'amy62@example.org', 'thompsonsamantha@example.org', 'glenn22@example.com', 'dennismorse@example.org', 'scottbrandon@example.com', 'selena47@example.com', 'nguyenjeremy@example.net', 'kimberlytran@example.org', 'radams@example.com', 'patrick66@example.com', 'michelewright@example.com', 'swansondeborah@example.org', 'olsensharon@example.net', 'edward04@example.com', 'andrewhill@example.net', 'mendezscott@example.org', 'alexanderevans@example.org', 'cbruce@example.com', 'sherrycook@example.org', 
'kathleen35@example.org', 'tdaniels@example.com', 'matthewlamb@example.com', 'kenneth17@example.org', 'smithsara@example.org', 'ilewis@example.org', 'raymond78@example.org', 'margaretperkins@example.net', 'stephenaustin@example.org', 'hernandeztheresa@example.net', 'allen45@example.net', 'singletonmegan@example.org', 'reginajohnston@example.net', 'mollylee@example.com', 'gruiz@example.org', 'mooneypatty@example.org', 'chavezhannah@example.net', 'virginia58@example.net', 'broman@example.com', 'ebruce@example.net', 'nicholas86@example.com', 'maykimberly@example.com', 'louis16@example.com', 'dennisgray@example.org', 'ohenderson@example.com', 'rosariodan@example.net', 'orodriguez@example.org', 'ryan15@example.org', 'justincervantes@example.org', 'colestephen@example.org', 'westjoseph@example.com', 'payala@example.com', 'rhodesjames@example.org', 'bsmith@example.org', 'ashleyrivera@example.org', 'kimberly45@example.org', 'mannrichard@example.org', 'andrea02@example.net', 'nmoses@example.com', 'sroberts@example.com', 'vlee@example.org', 'dmatthews@example.net', 'oconnellcatherine@example.org', 'williamwilson@example.org', 'jill26@example.org', 'john87@example.com', 'sarahernandez@example.com', 'ejordan@example.org', 'andrewgeorge@example.net', 'xkelley@example.net', 'teresajones@example.com', 'rodriguezmichael@example.net', 'hestertyler@example.com', 'xknapp@example.com', 'galvanjennifer@example.net', 'robleswillie@example.com', 'vcox@example.org', 'michaelwheeler@example.com', 'elizabethcoleman@example.com', 'richard66@example.com', 'vvance@example.org', 'owallace@example.net', 'qcook@example.com', 'tammycarey@example.net', 'goodwinwayne@example.com', 'druiz@example.com', 'katherinewilson@example.com', 'jason33@example.org', 'beltranelizabeth@example.com', 'edward69@example.org', 'sarahreed@example.org', 'whitejames@example.org', 'timothyhansen@example.net', 'ashleymccall@example.com', 'joshuadavis@example.com', 'kennethlara@example.net', 'denise69@example.net', 
'owilliams@example.org', 'johnsonbenjamin@example.net', 'bryanwong@example.com', 'justinwilson@example.org', 'pharris@example.org', 'lwilliams@example.org', 'ashleycurry@example.org', 'fadkins@example.org', 'batestimothy@example.org', 'hayesjeffery@example.net', 'david65@example.org', 'ljones@example.net', 'patrickangela@example.org', 'wmiller@example.net', 'maria62@example.com', 'julie28@example.org', 'williamdaniels@example.com', 'urobertson@example.net', 'lancewatkins@example.net', 'walkerallen@example.com', 'wendyjohnson@example.org', 'bryancooper@example.org', 'andreabrewer@example.com', 'joseph37@example.org', 'asmith@example.org', 'ggregory@example.com', 'julia69@example.net', 'evanskelly@example.org', 'nmarquez@example.net', 'charlesemily@example.org', 'duncanmegan@example.net', 'jessicawright@example.com', 'simpsonwilliam@example.org', 'josephrichardson@example.com', 'vreynolds@example.com', 'grantdeborah@example.com', 'lydiabenjamin@example.net', 'thomas30@example.net', 'xschneider@example.org', 'egonzales@example.com', 'victorreese@example.com', 'owensjoseph@example.com', 'kelly08@example.com', 'john29@example.net', 'sperkins@example.org', 'gabriellekelly@example.net', 'omoss@example.com', 'shawn99@example.com', 'olivia80@example.org', 'matthewjenkins@example.net', 'paynescott@example.net', 'dpace@example.com', 'ucurtis@example.org', 'brianzamora@example.net', 'arianafritz@example.org', 'jsmith@example.org', 'jaclyn28@example.com', 'xsmith@example.org', 'jmartin@example.com', 'mleach@example.net', 'brandonallen@example.net', 'owright@example.com', 'valenciacatherine@example.com', 'gregoryjarvis@example.net', 'hillcourtney@example.com', 'calderonallen@example.net', 'woodsdaniel@example.com', 'lonnie61@example.com', 'edward86@example.com', 'zgreen@example.net', 'vsteele@example.com', 'murrayjavier@example.org', 'deborah80@example.com', 'abigailmata@example.org', 'isanchez@example.com', 'natashaanderson@example.com', 'oscar50@example.org', 
'osborncaleb@example.org', 'reedjennifer@example.org', 'jacksonjennifer@example.org', 'randy76@example.com', 'williamcurry@example.org', 'nicolasodom@example.net', 'elizabeth92@example.org', 'kevin21@example.org', 'amanda14@example.com', 'brittneyrose@example.com', 'ggonzalez@example.org', 'charles30@example.org', 'hebertaaron@example.net', 'johnathannewman@example.com', 'vflores@example.net', 'jillianwilliams@example.net', 'larsenjessica@example.com', 'beasleyjillian@example.org', 'greeneerik@example.org', 'wdavidson@example.com', 'lindsey88@example.com', 'brandon15@example.net', 'amberbutler@example.net', 'nkelly@example.com', 'feliciamedina@example.net', 'andrew74@example.net', 'peterhoffman@example.com', 'lisa72@example.org', 'taylorlewis@example.org', 'robert27@example.com', 'romanmonica@example.org', 'krystal86@example.org', 'gmeyers@example.org', 'wilsonpatricia@example.org', 'sgray@example.net', 'jasmine35@example.com', 'williamsmichelle@example.net', 'derrickrice@example.net', 'jimmy72@example.net'] in column 0 during transform