# Load toy dataset

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Shared settings reused by the later cells.
seed = 42
target_column = "target"
discrete_features = [target_column]

# `.frame` is the single DataFrame holding the feature columns plus the
# "target" column.
X = load_breast_cancer(as_frame=True).frame

# 80/20 split, stratified on the target so both parts keep the same class
# balance; the fixed seed makes the split reproducible.
X_train, X_test = train_test_split(
    X,
    test_size=0.2,
    stratify=X[target_column],
    random_state=seed,
)
X_train.head()

# Import and train synthetic data generator

In [None]:
from synthyverse.generators import ARFGenerator

# Fit on the training split only. The evaluation cell scores `syn` against
# X_test, so fitting on the full frame X would let the generator see the
# held-out rows and leak them into the synthetic data.
generator = ARFGenerator(num_trees=20, random_state=0)
generator.fit(X_train, discrete_features=discrete_features)

# Draw as many synthetic rows as there are real training rows.
syn = generator.generate(len(X_train))
syn.head()

# Evaluate synthetic data quality

In [None]:
from synthyverse.evaluation import MetricEvaluator

# Metric configuration passed to MetricEvaluator: one entry per metric, each
# mapped to a dict of options for it ({} = no options given). The two
# "mle-*" entries differ only in their `train_set` option
# ("real" vs. "synthetic").
# NOTE(review): the "-trts"/"-tstr" suffixes presumably allow the same metric
# to be listed twice with different options — confirm against the synthyverse
# documentation.
metrics = {
    "mle-trts": {"train_set": "real"},
    "mle-tstr": {"train_set": "synthetic"},
    "dcr": {"estimates": ["mean", 0.01, 0.05]},
    "similarity": {},
}
evaluator = MetricEvaluator(metrics, discrete_features, target_column, seed)

# Score the synthetic sample against the real train and test splits.
results = evaluator.evaluate(X_train, X_test, syn)
results

# Unified pipeline for synthetic data generation and evaluation

In [None]:
from synthyverse.benchmark import TabularBenchmark

# Benchmark configuration, collected in one place and unpacked into the
# constructor below.
benchmark_settings = {
    "generator_name": "arf",
    "generator_params": {"num_trees": 20},
    "n_random_splits": 1,
    "n_inits": 1,
    "n_generated_datasets": 10,
    "metrics": ["classifier_test", "mle", "dcr"],
    "test_size": 0.2,
    "val_size": 0.2,
}
benchmark = TabularBenchmark(**benchmark_settings)

# The full frame X is passed here.
# NOTE(review): since test_size/val_size are given to the benchmark, it
# presumably performs its own splitting — confirm against the synthyverse
# documentation.
results = benchmark.run(X, target_column="target", discrete_columns=["target"])
results