# Using data generation internally

In [5]:
import pandas as pd
from lsynth import compute_upsilon

# Draw a fixed random subset of the real data. keep_default_na=False stops
# pandas from turning its default NA markers (e.g. "NA", empty string) into NaN,
# so those cells stay as literal strings. random_state pins which rows are
# drawn so Restart & Run All evaluates against the same real sample each time.
# NOTE(review): the generators below may still be stochastic on their own —
# confirm whether compute_upsilon exposes a seed.
df_real = pd.read_csv("../datasets/gss_2018.csv", keep_default_na=False).sample(
    100, random_state=42
)

# Arguments shared by both internally-generated runs in this cell; only the
# generation algorithm differs between them.
upsilon_kwargs = dict(
    num=100,
    model_path="../datasets/gss_2018.joblib",
    generate=True,
    orig_df=df_real,
    n_workers=11,
)

# Baseline independent-column generator
ups_baseline, syn_baseline = compute_upsilon(gen_algorithm="BASELINE", **upsilon_kwargs)
print("Baseline mean Upsilon:", ups_baseline.mean())

# LSM generator using qsample
ups_lsm, syn_lsm = compute_upsilon(gen_algorithm="LSM", **upsilon_kwargs)
print("LSM mean Upsilon:", ups_lsm.mean())

Loading model from ../datasets/gss_2018.joblib ...
Generating 100 samples via BASELINE (independent columns).
Computing Upsilon via model.average_fidelity ...


average_fidelity(threads=11): 100%|███████████| 100/100 [00:28<00:00,  3.55it/s]


Baseline mean Upsilon: 0.7457296565896354
Loading model from ../datasets/gss_2018.joblib ...
Generating 100 samples via LSM (qsample).


qsample(LSM, threads=11): 100%|███████████████| 100/100 [00:58<00:00,  1.72it/s]


Computing Upsilon via model.average_fidelity ...


average_fidelity(threads=11): 100%|███████████| 100/100 [00:26<00:00,  3.74it/s]

LSM mean Upsilon: 0.7977742912334123





In [None]:
# CTGAN-based generation. This cell only works when the optional `sdv`
# package is installed.
ups_ctgan, syn_ctgan = compute_upsilon(
    # what to generate and how
    generate=True,
    gen_algorithm="CTGAN",
    num=100,
    # inputs
    model_path="../datasets/gss_2018.joblib",
    orig_df=df_real,
    # parallelism
    n_workers=11,
)
print("CTGAN mean Upsilon:", ups_ctgan.mean())

# Use any externally generated synthetic data

In [7]:
# Stand-in for synthetic data produced outside lsynth: take a copy of the
# baseline output generated in the earlier cell.
df_external = syn_baseline.copy()

In [10]:
# Score externally supplied synthetic data instead of generating it here
# (generate=False with syndata=...). The second return value is bound to its
# own name so that `syn_baseline` from the generation cell is not overwritten
# and the `df_external` cell stays safe to re-run.
ups_baseline2, syn_external = compute_upsilon(
    model_path="../datasets/gss_2018.joblib",
    generate=False,
    syndata=df_external,
    n_workers=11,
)
# df_external is a copy of the BASELINE output, so label the result as such
# (the previous "LSM" label was misleading).
print("Baseline (external) mean Upsilon:", ups_baseline2.mean())

Loading model from ../datasets/gss_2018.joblib ...
Using external syndata iterable with 100 samples.
Computing Upsilon via model.average_fidelity ...


average_fidelity(threads=11): 100%|███████████| 100/100 [00:28<00:00,  3.45it/s]

LSM mean Upsilon: 0.7457296565896354



