In [7]:
import optuna

from tab_forge.dataset import Dataset
from tab_forge.models import CTGANSynthesizer
from tab_forge.benchmark import Benchmark

from tab_forge.tuning import TuningStudy
from tab_forge.tuning import AutoTuningStudy
from tab_forge.tuning.sampler import TPESampler

In [8]:
# Column roles for the abalone CSV: one categorical column, seven numerical
# columns, and "Rings" as the regression target.
abalone_categorical = ["Sex"]
abalone_numerical = [
    "Length",
    "Diameter",
    "Height",
    "Whole weight",
    "Shucked weight",
    "Viscera weight",
    "Shell weight",
]

dataset = Dataset(
    data='abalone.csv',
    target="Rings",
    task_type="regression",
    categorical_features=abalone_categorical,
    numerical_features=abalone_numerical,
)

In [9]:
# Split the dataset with test_size=0.2 and a fixed random_state of 42.
split_options = {"test_size": 0.2, "random_state": 42}
train, test = dataset.train_test_split(**split_options)

### Tuner
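!!! Example with data leakage: every trial is scored against the held-out `test` split, so the selected hyperparameters are tuned on the test data.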

In [10]:
def objective(trial, train, test):
    """Score one CTGAN hyperparameter configuration for a tuning trial.

    Samples the two learning rates and the epoch count from ``trial``, fits a
    ``CTGANSynthesizer`` on ``train``, generates ``len(test)`` synthetic rows
    and returns the first metric reported by a ``Benchmark`` configured with
    the single entry ``('r2', {'model': 'xgboost'})``.

    NOTE: the score is computed against ``test``. When this function drives a
    hyperparameter search, the chosen configuration is therefore selected on
    the test split (data leakage).

    Args:
        trial: Trial object exposing ``suggest_float`` and ``suggest_int``.
        train: Data the synthesizer is fitted on.
        test: Data handed to ``Benchmark.fit`` alongside the synthetic sample;
            its length also fixes how many rows are generated.

    Returns:
        ``benchmark.fit(synth, test).metrics[0]`` -- presumably the R2 value
        of the configured benchmark entry (confirm against ``Benchmark``).
    """
    # Suggest calls stay in the order d_lr, g_lr, epochs (dict literals are
    # evaluated top to bottom), matching the original sampling sequence.
    sampled = {
        "discriminator_lr": trial.suggest_float("d_lr", 1e-5, 1e-2, log=True),
        "generator_lr": trial.suggest_float("g_lr", 1e-5, 1e-2, log=True),
        "epochs": trial.suggest_int("epochs", 100, 300, step=100),
    }
    synthesizer = CTGANSynthesizer(**sampled)
    evaluator = Benchmark([('r2', {'model': 'xgboost'})])

    synthesizer.fit(train)
    # Per the original author's note, structed_generate returns a Dataset.
    synthetic = synthesizer.structed_generate(len(test))

    return evaluator.fit(synthetic, test).metrics[0]

In [11]:
# Study wrapper for the hand-written objective; the direction is "maximize"
# because the objective's return value is treated as a score to increase.
tuning_sampler = TPESampler()
experiment_runner = TuningStudy(
    study_name="ctgan_abalone_tuning",
    sampler=tuning_sampler,
    direction="maximize",
)

In [12]:
def _score_trial(trial):
    """Call ``objective`` with the notebook's ``train`` and ``test`` splits."""
    return objective(trial, train, test)


experiment_runner.optimize(_score_trial, n_trials=5, verbose=True)

Trial 0: Value = 0.19852828979492188, Params = {'d_lr': 0.000558949614025373, 'g_lr': 0.00034589495312660685, 'epochs': 200}
Trial 1: Value = -0.44990694522857666, Params = {'d_lr': 3.263984877938301e-05, 'g_lr': 7.694624251444373e-05, 'epochs': 300}
Trial 2: Value = -0.12609124183654785, Params = {'d_lr': 0.00010922154358851852, 'g_lr': 4.396571427928549e-05, 'epochs': 200}
Trial 3: Value = -0.15395522117614746, Params = {'d_lr': 0.007107675483636306, 'g_lr': 2.192186506906563e-05, 'epochs': 300}
Trial 4: Value = 0.1265571117401123, Params = {'d_lr': 0.0004153483906838404, 'g_lr': 0.0003747158357621996, 'epochs': 100}


In [13]:
# Plot the optimization history of the trials recorded in the tuner's study.
# `optuna` comes from the notebook's import cell rather than a mid-notebook
# import, so all dependencies are declared in one place.
optuna.visualization.plot_optimization_history(experiment_runner.study)


In [14]:
# Report the best trial's parameters, then the best objective value.
tuned_study = experiment_runner.study
print(tuned_study.best_trial.params)
print(tuned_study.best_value)

{'d_lr': 0.000558949614025373, 'g_lr': 0.00034589495312660685, 'epochs': 200}
0.19852828979492188


### AutoTuner

In [4]:
def my_params(trial):
    """Sample the user-defined search space from ``trial``.

    Args:
        trial: Trial object exposing ``suggest_float`` and ``suggest_int``.

    Returns:
        dict with the keys ``'generator_lr'``, ``'discriminator_lr'`` and
        ``'epochs'`` (in that order) holding the suggested values.
    """
    # Both learning rates share the same log-scaled range.
    lr_low, lr_high = 1e-4, 2e-4

    suggested = {}
    for lr_name in ('generator_lr', 'discriminator_lr'):
        suggested[lr_name] = trial.suggest_float(lr_name, lr_low, lr_high, log=True)
    suggested['epochs'] = trial.suggest_int('epochs', 50, 100, step=10)
    return suggested

In [16]:
# Benchmark specification keyed by a display name: the 'r2' metric configured
# with {'model': 'linear'}.
benchmark_spec = {'r2_linear': ('r2', {'model': 'linear'})}
benchmark_instance = Benchmark(benchmark_spec)

In [8]:
# Auto-tuning study in 'manual' search-space mode, using my_params as the
# parameter provider. No benchmark or direction is passed here, so the
# library defaults apply.
manual_sampler = TPESampler()
study_manual = AutoTuningStudy(
    model_class=CTGANSynthesizer,
    get_params=my_params,
    cv=3,
    sampler=manual_sampler,
    search_space_mode='manual',
)

In [9]:
# Run five verbose trials on the training split; the call stays the cell's
# last expression so its return value is still echoed.
manual_run_options = {"n_trials": 5, "verbose": True}
study_manual.optimize(train, **manual_run_options)

Trial 0 with params: {'generator_lr': 0.00013175429846775977, 'discriminator_lr': 0.00017679885701725153, 'epochs': 70}
Scores per fold: {'rmse_xgboost': [3.479205346415945, 3.336864619548833, 3.406680438641545]}
Trial 0 finish with value: 3.4075834682021076
Trial 1 with params: {'generator_lr': 0.00010145824020920333, 'discriminator_lr': 0.00015724639522663283, 'epochs': 80}
Scores per fold: {'rmse_xgboost': [3.6576292908684644, 3.3383297349590144, 5.033852419496863]}
Trial 1 finish with value: 4.009937148441447
Trial 2 with params: {'generator_lr': 0.0001468583280551697, 'discriminator_lr': 0.0001799991942212986, 'epochs': 60}
Scores per fold: {'rmse_xgboost': [4.725016695830704, 3.8936259634183368, 3.817748412597998]}
Trial 2 finish with value: 4.14546369061568
Trial 3 with params: {'generator_lr': 0.00011687619314021476, 'discriminator_lr': 0.00014470265303078515, 'epochs': 100}
Scores per fold: {'rmse_xgboost': [3.5295292535561997, 3.7441064935520454, 3.579169774674001]}
Trial 3 f

<optuna.study.study.Study at 0x24e891286d0>

In [13]:
# Best parameters on the first line, best value on the second.
print(study_manual.best_params, study_manual.best_value, sep="\n")

{'generator_lr': 0.00013175429846775977, 'discriminator_lr': 0.00017679885701725153, 'epochs': 70}
3.4075834682021076


In [22]:
# Auto-tuning study in 'extended' search-space mode, scored with
# benchmark_instance and maximized.
extended_sampler = TPESampler()
study_extended = AutoTuningStudy(
    model_class=CTGANSynthesizer,
    get_params=my_params,
    cv=3,
    benchmark=benchmark_instance,
    sampler=extended_sampler,
    search_space_mode='extended',
    direction='maximize',
)

In [23]:
# Run five verbose trials on the training split; the call stays the cell's
# last expression so its return value is still echoed.
extended_run_options = {"n_trials": 5, "verbose": True}
study_extended.optimize(train, **extended_run_options)

Trial 0 with params: {'generator_lr': 0.00010198650499701783, 'discriminator_lr': 0.00016148722721115043, 'epochs': 70, 'generator_dim': [84, 84, 84, 84], 'discriminator_dim': [86, 86, 86, 86], 'batch_size': 950, 'discriminator_steps': 3, 'generator_decay': 0.001, 'discriminator_decay': 0.001, 'cuda': False}
Scores per fold: {'r2_linear': [0.058668812736747955, 0.005781725809409499, 0.04763517513646942]}
Trial 0 finish with value: 0.037361904560875625
Trial 1 with params: {'generator_lr': 0.00011022671825564804, 'discriminator_lr': 0.00010810564349755159, 'epochs': 50, 'generator_dim': [129], 'discriminator_dim': [150, 150, 150], 'batch_size': 480, 'discriminator_steps': 2, 'generator_decay': 0.001, 'discriminator_decay': 0.001, 'cuda': False}
Scores per fold: {'r2_linear': [-0.04254171656937644, -0.05239448680634751, -0.1952634643126403]}
Trial 1 finish with value: -0.09673322256278809
Trial 2 with params: {'generator_lr': 0.00013023359159551213, 'discriminator_lr': 0.0001397407634968

<optuna.study.study.Study at 0x24e8ea62310>

In [24]:
# Print the best parameters first, then the best value.
for attribute in ("best_params", "best_value"):
    print(getattr(study_extended, attribute))

{'generator_lr': 0.00010198650499701783, 'discriminator_lr': 0.00016148722721115043, 'epochs': 70, 'generator_layers': 4, 'gen_first_layer_size': 84, 'discriminator_layers': 4, 'dis_first_layer_size': 86, 'batch_size': 954, 'discriminator_steps': 3, 'generator_decay': 0.001, 'discriminator_decay': 0.001}
0.037361904560875625


In [25]:
# Plot the optimization history of the trials recorded in the extended
# auto-tuning study. The repeated `import optuna` is dropped: the module is
# already imported earlier in the notebook.
optuna.visualization.plot_optimization_history(study_extended.study)