# Tutorial 2: Benchmarks

In [1]:
import warnings
import sys

warnings.filterwarnings("ignore")

from sklearn.datasets import load_iris
from synthcity.plugins import Plugins
import synthcity.logger as log
from synthcity.plugins.core.dataloader import GenericDataLoader

# Fetch the iris features and labels as pandas objects, then attach the labels
# to the feature frame under the "target" column.
X, y = load_iris(as_frame=True, return_X_y=True)
X = X.assign(target=y)

# Wrap the frame in a synthcity data loader; "target" is declared as the
# target column and no column is flagged as sensitive.
loader = GenericDataLoader(
    X,
    target_column="target",
    sensitive_columns=[],
)

# Bare last expression so the notebook renders the loader's DataFrame.
loader.dataframe()



Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


## List the available generative models

In [3]:
from synthcity.plugins import Plugins

# Ask the synthcity plugin registry for the names of the available generators.
plugins = Plugins().list()

# Bare last expression so the notebook displays the list of names.
plugins

['tvae',
 'pategan',
 'rtvae',
 'copulagan',
 'privbayes',
 'bayesian_network',
 'adsgan',
 'nflow',
 'ctgan',
 'gaussian_copula']

## Benchmark the quality of plugins

In [4]:
from synthcity.benchmark import Benchmarks

# Run the benchmark for the listed plugin(s) on the iris loader and keep the
# result in `score` for the reporting cells that follow.
# NOTE(review): "uniform_sampler" does not appear in the plugin list shown in
# the saved output of the previous cell — confirm it is available in the
# installed synthcity version.
score = Benchmarks.evaluate(
    ["uniform_sampler"],  # names of the plugins to benchmark
    loader,  # data loader built in the first cell
    synthetic_size=len(X),  # presumably the number of synthetic rows to generate (here: same as the real data) — confirm
    repeats=1,  # presumably the number of benchmark rounds; the saved report shows rounds == 1
)

In [5]:
# Print, for each benchmarked plugin, the table of per-metric summary statistics.
Benchmarks.print(score)


[4m[1mPlugin : uniform_sampler[0m[0m


Unnamed: 0,min,max,mean,stddev,median,iqr,rounds,errors,durations
sanity.data_mismatch.score,0.166667,0.166667,0.166667,0.0,0.166667,0.0,1,0,0.0
sanity.common_rows_proportion.score,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0.01
sanity.nearest_syn_neighbor_distance.mean,0.547869,0.547869,0.547869,0.0,0.547869,0.0,1,0,0.0
sanity.close_values_probability.score,0.053333,0.053333,0.053333,0.0,0.053333,0.0,1,0,0.0
sanity.distant_values_probability.score,0.106667,0.106667,0.106667,0.0,0.106667,0.0,1,0,0.0
stats.jensenshannon_dist.marginal,0.236573,0.236573,0.236573,0.0,0.236573,0.0,1,0,0.07
stats.chi_squared_test.marginal,0.998573,0.998573,0.998573,0.0,0.998573,0.0,1,0,0.01
stats.feature_corr.joint,3.222532,3.222532,3.222532,0.0,3.222532,0.0,1,0,0.02
stats.inv_kl_divergence.marginal,0.807273,0.807273,0.807273,0.0,0.807273,0.0,1,0,0.01
stats.ks_test.marginal,0.814667,0.814667,0.814667,0.0,0.814667,0.0,1,0,0.01





In [6]:
import pandas as pd
import numpy as np

# Gather the "mean" column of every plugin's benchmark report, and remember the
# first non-empty metric -> direction mapping that any plugin provides.
# `directions` stays None when no plugin reports a direction.
means = []
directions = None
for plugin in score:
    means.append(score[plugin]["mean"])

    if directions is None:
        plugin_directions = score[plugin]["direction"].to_dict()
        if len(plugin_directions) > 0:
            directions = plugin_directions

# One column per plugin, one row per metric.
out = pd.concat(means, axis=1)
# `set_axis(..., inplace=True)` was removed in pandas 2.0; reassigning the
# returned frame works on both old and new pandas versions.
out = out.set_axis(list(score.keys()), axis=1)

# CSS snippets used to colour the best / worst plugin for each metric.
bad_highlight = "background-color: lightcoral;"
ok_highlight = "background-color: green;"
default = ""


def highlights(row):
    """Return one CSS style string per cell of a metric row.

    Intended for ``Styler.apply(highlights, axis=1)``: ``row`` is a Series
    whose ``name`` is the metric and whose values are the per-plugin scores.

    The module-level ``directions`` mapping says whether a metric should be
    minimized (``"minimize"``) or maximized (any other value). The best score
    gets ``ok_highlight``, the worst gets ``bad_highlight`` and every other
    cell gets ``default``. When best and worst coincide (single plugin or
    all-equal scores) the cells are marked as best.

    NaN scores are ignored when looking for the best/worst value and are
    never highlighted. If the metric's direction is unknown (``directions``
    is None or has no entry for the metric), or the row holds no usable
    score, the row is left unstyled instead of raising.

    Args:
        row: pandas Series of numeric scores for a single metric.

    Returns:
        list[str]: one style string per value of ``row``, in order.
    """
    values = row.to_numpy(dtype=float)

    # `directions` stays None when no plugin reported any direction.
    direction = (directions or {}).get(row.name)
    if direction is None or np.isnan(values).all():
        return [default] * len(values)

    if direction == "minimize":
        best_val, worst_val = np.nanmin(values), np.nanmax(values)
    else:
        best_val, worst_val = np.nanmax(values), np.nanmin(values)

    styles = []
    for val in values:
        # NaN never compares equal, so missing scores fall through to default.
        if val == best_val:
            styles.append(ok_highlight)
        elif val == worst_val:
            styles.append(bad_highlight)
        else:
            styles.append(default)

    return styles


# Style the table row by row (axis=1): `highlights` receives one metric row
# holding the scores of all plugins and returns one CSS string per cell.
out.style.apply(highlights, axis=1)

Unnamed: 0,uniform_sampler
sanity.data_mismatch.score,0.166667
sanity.common_rows_proportion.score,0.0
sanity.nearest_syn_neighbor_distance.mean,0.547869
sanity.close_values_probability.score,0.053333
sanity.distant_values_probability.score,0.106667
stats.jensenshannon_dist.marginal,0.236573
stats.chi_squared_test.marginal,0.998573
stats.feature_corr.joint,3.222532
stats.inv_kl_divergence.marginal,0.807273
stats.ks_test.marginal,0.814667


# 