# Tutorial 0: Basic examples

In [1]:
# Silence all Python warnings for the rest of the session. The filter is
# installed before the third-party imports below (presumably so that any
# import-time warnings are hidden as well).
import warnings
import sys
warnings.filterwarnings('ignore')

from sklearn.datasets import load_diabetes
from synthcity.plugins import Plugins
import synthcity.logger as log

# Send synthcity log records of level INFO and above to stderr so that
# progress messages show up in the notebook output.
log.add(sink=sys.stderr, level="INFO")

# Load the scikit-learn diabetes dataset as pandas objects:
# X holds the feature columns, y the regression target.
X, y = load_diabetes(return_X_y=True, as_frame=True)
# Append the target as an extra column, so X carries features and outcome
# together. NOTE: this mutates X in place; later cells use this augmented frame.
X["target"] = y

# Display the resulting frame.
X

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068330,-0.092204,75.0
2,0.085299,0.050680,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.025930,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0
...,...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207,178.0
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018118,0.044485,104.0
439,0.041708,0.050680,-0.015906,0.017282,-0.037344,-0.013840,-0.024993,-0.011080,-0.046879,0.015491,132.0
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044528,-0.025930,220.0


## List the available generative models

In [2]:
from synthcity.plugins import Plugins

# Show the names of the generative-model plugins that the loader exposes;
# these names are what gets passed to `Plugins().get(...)`.
Plugins().list()

['dummy_sampler', 'random_noise']

## Load and train a generative model

In [3]:
from synthcity.plugins import Plugins

# Look up a generative model by its plugin name.
syn_model = Plugins().get("dummy_sampler")

# Train on the full frame (feature columns plus the "target" column).
# The call is the cell's last expression, so its return value is displayed.
syn_model.fit(X)

<plugin_dummy_sampler.py.DummySamplerPlugin at 0x7efff936efd0>

## Generate new data using the model

In [4]:
# Sample 10 synthetic rows from the fitted model, with no constraints applied.
syn_model.generate(count = 10)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.005383,0.05068,-0.02884,-0.009113,-0.03184,-0.028871,0.008142,-0.039493,-0.018118,0.007207,88.0
1,-0.023677,-0.044642,0.03044,-0.005671,0.082364,0.092004,-0.017629,0.07121,0.033047,0.003064,122.0
2,-0.023677,-0.044642,0.04014,-0.012556,-0.009825,-0.001001,-0.002903,-0.002592,-0.011901,-0.038357,147.0
3,-0.049105,-0.044642,0.004572,0.011544,-0.037344,-0.018537,-0.017629,-0.002592,-0.03981,-0.021788,200.0
4,-0.020045,-0.044642,-0.046085,-0.098628,-0.07587,-0.059873,-0.017629,-0.039493,-0.051401,-0.046641,74.0
5,-0.023677,-0.044642,-0.069797,-0.064199,-0.059359,-0.050478,0.019187,-0.039493,-0.089137,-0.050783,63.0
6,-0.052738,-0.044642,0.03044,-0.074528,-0.023584,-0.011335,-0.002903,-0.002592,-0.030751,-0.001078,172.0
7,0.009016,0.05068,0.069241,0.059744,0.017694,-0.023234,-0.047082,0.034309,0.103292,0.07348,277.0
8,-0.0709,-0.044642,-0.057941,-0.081414,-0.045599,-0.028871,-0.043401,-0.002592,0.001144,-0.00522,162.0
9,0.016281,-0.044642,0.001339,0.008101,0.005311,0.010899,0.030232,-0.039493,-0.045421,0.032059,49.0


## Generate new data under some constraints

In [5]:
# Constraint: target <= 100
from synthcity.plugins.core.constraints import Constraints

# Each rule is a (column, operator, value) triple.
constraints = Constraints(rules = [("target", "<=", 100)])

generated = syn_model.generate(count = 10, constraints = constraints)

# The constraint must hold for EVERY generated row. `.any()` would pass as
# soon as a single row complied, so it could not detect a violated constraint.
# `.all()` is vacuously true on empty input, hence the explicit non-empty check.
assert len(generated) > 0, "no rows were generated"
assert (generated["target"] <= 100).all(), "constraint target <= 100 violated"

generated

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.005383,0.05068,-0.02884,-0.009113,-0.03184,-0.028871,0.008142,-0.039493,-0.018118,0.007207,88.0
1,0.070769,0.05068,-0.030996,0.021872,-0.037344,-0.047034,0.033914,-0.039493,-0.014956,-0.001078,66.0
2,0.038076,0.05068,-0.018062,0.06663,-0.051103,-0.016658,-0.076536,0.034309,-0.011901,-0.013504,77.0
3,0.001751,0.05068,-0.057941,-0.043542,-0.09651,-0.047034,-0.098625,0.034309,-0.061177,-0.071494,88.0
4,0.056239,-0.044642,-0.068719,-0.06879,-0.000193,-0.001001,0.044958,-0.037648,-0.048362,-0.001078,72.0
5,0.009016,-0.044642,-0.024529,-0.026328,0.098876,0.094196,0.07073,-0.002592,-0.021394,0.007207,84.0
6,-0.02731,0.05068,-0.023451,-0.015999,0.013567,0.012778,0.02655,-0.002592,-0.010904,-0.021788,71.0
7,0.063504,0.05068,-0.001895,0.06663,0.09062,0.108914,0.022869,0.017703,-0.035817,0.003064,63.0
8,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,75.0
9,-0.0709,-0.044642,-0.00405,-0.040099,-0.066239,-0.078662,0.052322,-0.076395,-0.051401,-0.034215,61.0


In [6]:
# Constraint: target > 150

from synthcity.plugins.core.constraints import Constraints

# Each rule is a (column, operator, value) triple.
constraints = Constraints(rules = [("target", ">", 150)])

generated = syn_model.generate(count = 10, constraints = constraints)

# The constraint must hold for EVERY generated row. `.any()` would pass as
# soon as a single row complied, so it could not detect a violated constraint.
# `.all()` is vacuously true on empty input, hence the explicit non-empty check.
assert len(generated) > 0, "no rows were generated"
assert (generated["target"] > 150).all(), "constraint target > 150 violated"

generated

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.016281,-0.044642,0.020739,0.021872,-0.013953,-0.013214,-0.006584,-0.002592,0.013316,0.040343,281.0
1,0.067136,-0.044642,0.056307,0.073515,-0.013953,-0.039205,-0.032356,-0.002592,0.075738,0.036201,303.0
2,-0.045472,0.05068,0.063852,0.070073,0.133274,0.131461,-0.039719,0.108111,0.075738,0.085907,217.0
3,-0.052738,-0.044642,0.03044,-0.074528,-0.023584,-0.011335,-0.002903,-0.002592,-0.030751,-0.001078,172.0
4,-0.020045,-0.044642,0.085408,-0.036656,0.091996,0.089499,-0.061809,0.145012,0.080948,0.05277,306.0
5,-0.030942,0.05068,0.001339,-0.005671,0.064477,0.049416,-0.047082,0.108111,0.083797,0.003064,229.0
6,-0.020045,-0.044642,0.085408,-0.036656,0.091996,0.089499,-0.061809,0.145012,0.080948,0.05277,306.0
7,-0.052738,0.05068,-0.018062,0.080401,0.089244,0.107662,-0.039719,0.108111,0.036056,-0.042499,171.0
8,0.012648,0.05068,0.000261,-0.011409,0.03971,0.057245,-0.039719,0.056081,0.024053,0.032059,259.0
9,-0.04184,-0.044642,0.128521,0.063187,-0.033216,-0.032629,0.011824,-0.039493,-0.015998,-0.050783,259.0


## Benchmark the quality of plugins

In [7]:
from synthcity.benchmark import Benchmarks
# Import Constraints here as well, so this cell does not silently depend on
# one of the constraint cells having been executed first.
from synthcity.plugins.core.constraints import Constraints

# Restrict the synthetic data used for benchmarking with a rule on "target"
# ("ge" is presumably the textual form of ">=" -- confirm against the
# Constraints documentation).
constraints = Constraints(rules = [("target", "ge", 150)])

# Evaluate both plugins on the same data. Each plugin is run `repeats` times;
# judging by the recorded log timestamps, the whole cell takes a few minutes.
score = Benchmarks.evaluate(
    ["dummy_sampler", "random_noise"],  # plugin names to compare
    X, y,
    sensitive_columns = ["sex"],
    synthetic_size = 1000,              # requested number of synthetic rows
    synthetic_constraints = constraints,
    repeats = 5,
)


[2022-03-23T16:18:50.822806+0000][867385][INFO] Benchmarking plugin : dummy_sampler
[2022-03-23T16:18:50.823837+0000][867385][INFO]  Experiment repeat: 0
[2022-03-23T16:19:32.878858+0000][867385][INFO]  Experiment repeat: 1
[2022-03-23T16:19:59.541048+0000][867385][INFO]  Experiment repeat: 2
[2022-03-23T16:20:25.332344+0000][867385][INFO]  Experiment repeat: 3
[2022-03-23T16:20:51.022226+0000][867385][INFO]  Experiment repeat: 4
[2022-03-23T16:21:16.823552+0000][867385][INFO] Benchmarking plugin : random_noise
[2022-03-23T16:21:16.824173+0000][867385][INFO]  Experiment repeat: 0
[2022-03-23T16:21:40.556853+0000][867385][INFO]  Experiment repeat: 1
[2022-03-23T16:22:05.129332+0000][867385][INFO]  Experiment repeat: 2
[2022-03-23T16:22:28.590225+0000][867385][INFO]  Experiment repeat: 3
[2022-03-23T16:22:54.460083+0000][867385][INFO]  Experiment repeat: 4


In [8]:
# Render the benchmark results: one table per plugin, with summary statistics
# (min, max, mean, stddev, median, iqr, ...) for every evaluated metric.
Benchmarks.print(score)


[4m[1mPlugin : dummy_sampler[0m[0m


Unnamed: 0,min,max,mean,stddev,median,iqr,rounds,errors,durations
sanity.data_mismatch_score,0.0,0.0,0.0,0.0,0.0,0.0,5,0,0.0
sanity.common_rows_proportion,0.454751,0.461538,0.459276,0.002478,0.459276,0.002262,5,0,0.02
sanity.avg_distance_nearest_neighbor,0.253647,0.253666,0.253653,7e-06,0.253651,5e-06,5,0,0.01
sanity.inlier_probability,0.565611,0.565611,0.565611,0.0,0.565611,0.0,5,0,0.01
sanity.outlier_probability,0.047511,0.047511,0.047511,0.0,0.047511,0.0,5,0,0.01
statistical.inverse_kl_divergence,0.599884,0.784303,0.688541,0.06409,0.686409,0.08355,5,0,0.02
statistical.kolmogorov_smirnov_test,0.716474,0.812791,0.76269,0.032559,0.764935,0.035515,5,0,0.04
statistical.chi_squared_test,0.264784,0.362665,0.309745,0.034414,0.307609,0.046438,5,0,0.03
statistical.maximum_mean_discrepancy,0.007676,0.010673,0.008607,0.001091,0.008017,0.000799,5,0,0.09
statistical.inverse_cdf_distance,2.300052,3.658739,2.955573,0.479563,2.911384,0.669023,5,0,9.27




[4m[1mPlugin : random_noise[0m[0m


Unnamed: 0,min,max,mean,stddev,median,iqr,rounds,errors,durations
sanity.data_mismatch_score,0.0,0.0,0.0,0.0,0.0,0.0,5,0,0.0
sanity.common_rows_proportion,0.0,0.0,0.0,0.0,0.0,0.0,5,0,0.02
sanity.avg_distance_nearest_neighbor,0.396054,0.396054,0.396054,0.0,0.396054,0.0,5,0,0.02
sanity.inlier_probability,0.266968,0.266968,0.266968,0.0,0.266968,0.0,5,0,0.03
sanity.outlier_probability,0.052036,0.052036,0.052036,0.0,0.052036,0.0,5,0,0.02
statistical.inverse_kl_divergence,0.049714,0.051348,0.050485,0.000578,0.05044,0.000809,5,0,0.03
statistical.kolmogorov_smirnov_test,0.225113,0.281391,0.251686,0.019904,0.250126,0.027878,5,0,0.08
statistical.chi_squared_test,0.0,0.0,0.0,0.0,0.0,0.0,5,0,0.03
statistical.maximum_mean_discrepancy,1.00592,1.005938,1.005926,7e-06,1.005922,6e-06,5,0,0.05
statistical.inverse_cdf_distance,0.0,0.0,0.0,0.0,0.0,0.0,5,0,0.0





# 