# Load toy dataset
For this tutorial we will use the breast cancer dataset from scikit-learn.

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# One seed, reused for the split and for the later cells, so runs are repeatable.
seed = 42

# `.frame` is a single dataframe that includes the "target" column used below.
X = load_breast_cancer(as_frame=True).frame

# Hold out 30% of the rows as a test set, stratified on the target column.
X_train, X_test = train_test_split(
    X,
    test_size=0.3,
    random_state=seed,
    stratify=X["target"],
)
X_train

# Generate synthetic data
Simply import a generator, train it using the fit method, and sample a synthetic dataset using the generate method.

For tabular data, the fit method requires the column names of discrete features as well. Discrete data typically needs to be handled differently than numerical data.

All preprocessing and postprocessing are performed under the hood: there is no need to scale, encode, or handle missing values yourself.

The output synthetic dataframe contains the same columns as the input dataframe, with the same numerical precision and data types.

In [None]:
from synthyverse.generators import ARFGenerator

# Fit on the training split only; "target" is the one column declared discrete.
generator = ARFGenerator(random_state=seed)
generator.fit(X_train, discrete_features=["target"])

# Request len(X) synthetic samples, i.e. the size of the full real dataset.
n_samples = len(X)
syn = generator.generate(n_samples)
syn

# Evaluate synthetic data
The evaluation module contains the MetricEvaluator object, which can be used to evaluate the quality of synthetic data.

Simply pass your desired metrics (see the docs for all included metrics) and some additional info on your dataset. 

Then calculate the metrics using the evaluate method. Pass the original data that was used to train the generator, an independent holdout/test set of real data, and the synthetic data. Many metrics require an independent holdout set to evaluate generalization capacity.

In [None]:
from synthyverse.evaluation import MetricEvaluator

# Metrics are requested by name here, without per-metric parameters.
metric_names = ["mle", "dcr", "similarity"]

evaluator = MetricEvaluator(
    metrics=metric_names,
    target_column="target",
    discrete_features=["target"],
    random_state=seed,
)

# Argument order: training data, real holdout data, synthetic data.
results = evaluator.evaluate(X_train, X_test, syn)
results

# Evaluate synthetic data: continued
We allow fine control over metric parameters by passing them as dictionaries. See the docs for all metric parameters.

You can also calculate several configurations of the same metric: append a dash and a configuration label to the metric name (`{metric_name}-{configuration}`, e.g. `mle-trts` and `mle-tstr`) and give each entry its own parameters, as shown below.


In [None]:
# Each key is a metric name, optionally suffixed with "-<configuration>";
# each value is the parameter dictionary for that entry.
metric_configs = {
    "mle-trts": {"train_set": "real"},
    "mle-tstr": {"train_set": "synthetic"},
    "dcr": {"estimates": ["mean", 0.01, 0.05]},
}

evaluator = MetricEvaluator(
    metrics=metric_configs,
    target_column="target",
    discrete_features=["target"],
    random_state=seed,
)

# Argument order: training data, real holdout data, synthetic data.
results = evaluator.evaluate(X_train, X_test, syn)
results

# Benchmarking
To simplify the pipeline of synthetic data generation and evaluation, we also include a benchmarking module. 

This module abstracts a pipeline of synthetic data generation and evaluation in a few lines of code. 

Specify the name of the generator, its parameters, the number of random train-test splits to fit the generator on, the number of random initializations to fit the generator with, the number of synthetic sets to sample and evaluate for each fitted generator, the metrics to evaluate, and the size of the test set.

Then run the benchmark on a dataframe (including some information on which columns are discrete, and which is the target column). 

In [None]:
from synthyverse.benchmark import TabularBenchmark

# Parameters handed to the generator that is selected by name below.
arf_params = {"random_state": seed, "num_trees": 20}

# One split, one initialization and one generated dataset.
benchmark = TabularBenchmark(
    generator_name="arf",
    generator_params=arf_params,
    metrics=["classifier_test", "mle", "dcr"],
    test_size=0.3,
    n_random_splits=1,
    n_inits=1,
    n_generated_datasets=1,
)

# The benchmark receives the full dataframe X rather than a pre-made split.
results = benchmark.run(
    X,
    target_column="target",
    discrete_columns=["target"],
)
results