### Standard Benchmarks for tabular data

Imports

In [5]:
import pickle
import os
from pathlib import Path

from synthcity.plugins.core.dataloader import GenericDataLoader
from synthcity.benchmark import Benchmarks
import synthcity.logger as log


Set up logging and the parameters for the models

In [6]:
# Configure synthcity logging: "synthcity_logs" target, INFO level.
log.add("synthcity_logs", "INFO")

# Keyword arguments attached to every model entry passed to the benchmark.
KWARGS = dict(n_iter=100)
# "key:value" pairs of KWARGS joined by "-"; used to tag saved result files.
KWARGS_str = "-".join(f"{name}:{value}" for name, value in KWARGS.items())

Main functions

In [7]:
def run_dataset(X, workspace_path, models, task_type="regression"):
    """Benchmark the given models on a single dataset.

    Args:
        X: Tabular data whose target lives in a column named ``"y"``. It must
            expose ``.shape`` (presumably a pandas DataFrame -- confirm
            against the caller).
        workspace_path: Forwarded to ``Benchmarks.evaluate`` as ``workspace``.
        models: Iterable of model names to benchmark.
        task_type: Forwarded to ``Benchmarks.evaluate`` as ``task_type``.

    Returns:
        Whatever ``Benchmarks.evaluate`` returns for this run.
    """
    # Wrap the data in a synthcity data loader with "y" as the target column.
    data_loader = GenericDataLoader(X, target_column="y")

    # One (name, name, KWARGS) triple per model; presumably
    # (label, plugin name, plugin kwargs) -- confirm against Benchmarks.evaluate.
    model_specs = [(name, name, KWARGS) for name in models]

    # Metrics requested from the benchmark, grouped by category.
    selected_metrics = {
        "stats": ["alpha_precision"],
        "detection": ["detection_xgb", "detection_mlp", "detection_linear"],
        "performance": ["linear_model", "mlp", "xgb"],
    }

    return Benchmarks.evaluate(
        model_specs,
        data_loader.train(),
        data_loader.test(),
        task_type=task_type,
        # Ask for as many synthetic rows as the input has.
        synthetic_size=X.shape[0],
        metrics=selected_metrics,
        workspace=workspace_path,
        repeats=1,
        synthetic_reuse_if_exists=False,
        augmented_reuse_if_exists=False,
        device="cpu",
    )

def create_absolute_path(cwd, path):
    """Resolve ``path`` against ``cwd`` and return it as a ``/``-joined string.

    Args:
        cwd: Directory to resolve from, as a ``pathlib.Path``.
        path: Relative path (may contain ``..``) appended to ``cwd``.

    Returns:
        str: The resolved absolute path, with every repeated path component
        after its first occurrence dropped.
    """
    # Unless cwd is one of the known directory names, step up one level first.
    if cwd.name not in ["tutorials", "tests", "synthcity-benckmarking"]:
        cwd = cwd / Path("../")
    resolved = (cwd / path).resolve()

    # `resolve()` returns a Path, which has no `.split`; go through `str`.
    components = str(resolved).split("/")

    # Keep the first occurrence of each component, preserving order.
    # NOTE(review): this also collapses legitimately repeated directory names
    # (e.g. /a/data/b/data -> /a/data/b) -- confirm that is intended.
    return "/".join(dict.fromkeys(components))

def run_synthcity(data_type="num", task_type="regression", models=None, save=False):
    """Benchmark models on the first dataset file for a data/task type.

    Looks in ``../data/{data_type}/{task_type}/`` (resolved relative to the
    current working directory via ``create_absolute_path``), loads the first
    file in sorted order, runs ``run_dataset`` on it and prints the scores.

    Args:
        data_type: Sub-directory name under ``data``/``workspace``/``results``.
        task_type: Sub-directory name below ``data_type``; also forwarded to
            ``run_dataset``.
        models: List of model names to benchmark. Defaults to ``["ctgan"]``.
        save: When true, pickle the scores into the results directory.

    Raises:
        FileNotFoundError: If the data directory contains no files.
    """
    # Avoid a shared mutable default argument.
    if models is None:
        models = ["ctgan"]

    cwd = Path.cwd()

    file_path = create_absolute_path(cwd, f"../data/{data_type}/{task_type}/")
    workspace_path = create_absolute_path(cwd, f"../workspace/{data_type}/{task_type}/")
    result_path = create_absolute_path(cwd, f"../results/{data_type}/{task_type}/")
    Path(result_path).mkdir(parents=True, exist_ok=True)

    # `os.listdir` returns entries in arbitrary order; sort them so that
    # "the first file" is the same on every run and machine.
    files = sorted(os.listdir(file_path))
    print(f"Number of files in {file_path}: {len(files)}")
    if not files:
        raise FileNotFoundError(f"No dataset files found in {file_path}")

    # Lets just run the benchmark for one file, the first in our list
    file = files[0]
    print(f"{file_path}/{file}")
    # NOTE: unpickling executes arbitrary code; only load trusted dataset files.
    with open(f"{file_path}/{file}", "rb") as f:
        data_dict = pickle.load(f)

    # The pickle is expected to hold a mapping with "X" (features) and "y"
    # (target); the target is attached as column "y" for `run_dataset`.
    X = data_dict["X"]
    X["y"] = data_dict["y"]

    score = run_dataset(X, workspace_path, models, task_type=task_type)
    if not score:
        return

    Benchmarks.print(score)
    Benchmarks.highlight(score)
    if save:
        with open(f"{result_path}/{file}-{'-'.join(models)}-{KWARGS_str}.pkl", "wb") as f:
            pickle.dump(score, f)



In [8]:
# Benchmark TVAE and CTGAN on the numerical regression data (results not saved).
run_synthcity(
    data_type="num",
    task_type="regression",
    models=["tvae", "ctgan"],
)

tutorials
Number of files in /home/rob/Documents/projects/RD_papers/synthcity-benchmarking/data/num/regression: 17
/home/rob/Documents/projects/RD_papers/synthcity-benchmarking/data/num/regression/296.pkl


KeyboardInterrupt: 