In [6]:
import os, sys
import pandas as pd

from tab_gmb.dataset import Dataset
from tab_gmb.models import CTGANModel
from tab_gmb.tuner import TPETuner
from tab_gmb.experiment_runer import ExperimentRunner

In [7]:
# Read the Titanic table from the CSV that sits next to the notebook.
# NOTE: despite its name, `test_dir` holds a file path, not a directory.
test_dir = './titanic.csv'
test_data = pd.read_csv(test_dir)

# Column roles passed to Dataset. The recorded run warned that Cabin, Name,
# PassengerId and Ticket appear in neither list.
numeric_features = ["Age", "Fare", "SibSp", "Parch"]
categorical_features = ["Pclass", "Sex", "Embarked"]

# Wrap the frame as a classification dataset with 'Survived' as the target;
# verbose_init=True is what produced the validation summary in the cell output.
dataset = Dataset(
    data=test_data,
    name='titanic',
    task_type='classification',
    target_column='Survived',
    num_columns=numeric_features,
    cat_columns=categorical_features,
    verbose_init=True,
)

Предупреждение: Колонки не классифицированы как числовые или категориальные: {'Cabin', 'Name', 'PassengerId', 'Ticket'}
Датасет 'titanic' успешно валидирован:
  - Размер: (712, 12)
  - Тип задачи: classification
  - Целевая колонка: Survived
  - Числовые колонки (4): ['Age', 'Fare', 'SibSp', 'Parch']
  - Категориальные колонки (3): ['Pclass', 'Sex', 'Embarked']


In [8]:
# Hyperparameter search space handed to the experiment runner for CTGAN.
# Tuples are presumably (low, high, step) ranges and lists presumably discrete
# choices -- the logged trial values are consistent with that reading, but
# confirm against the tuner implementation.
ctgan_space = dict(
    # Optimiser learning rates.
    discriminator_lr=(4e-4, 2.1e-3, 5e-5),
    generator_lr=(5e-5, 5e-3, 5e-5),
    # Batch and embedding sizes.
    batch_size=[100, 500, 1000],
    embedding_dim=[32, 128],
    # Candidate layer-width layouts (3 or 4 generator layers, 2 or 3
    # discriminator layers).
    generator_dim=[[128] * 3, [128] * 4],
    discriminator_dim=[[256] * 2, [256] * 3],
    # Weight-decay settings.
    generator_decay=(1e-6, 6.4e-6, 1e-7),
    discriminator_decay=(1e-6, 8e-6, 1e-6),
    log_frequency=[False, True],
    # Column transformation options; only one categorical option is listed.
    transformation_num_type=['CDF', 'PLE_CDF'],
    transformation_cat_type=['OHE'],
)

In [9]:
# CTGAN wrapper built with its default constructor arguments; the
# hyperparameters to try are supplied separately through `ctgan_space`.
model = CTGANModel()

In [10]:
# Tuner configured for 50 trials.
tuner = TPETuner(n_trials=50)

# Signed weight per metric. The "Score" values in the recorded log equal the
# weighted sum of the per-metric values printed beneath them.
# NOTE(review): the reported best score is the lowest one logged, so the tuner
# presumably minimises this sum -- confirm against TPETuner.
metric_weights = {'c2st': 1.0, 'ml_efficacy': -1.0, 'pair': -1.0, 'shape': -1.0}

# The metric list is the weight keys in the same order:
# ['c2st', 'ml_efficacy', 'pair', 'shape'].
runner = ExperimentRunner(
    dataset=dataset,
    model=model,
    tuner=tuner,
    hyperparameter_space=ctgan_space,
    metrics=list(metric_weights),
    metric_weights=metric_weights,
    n_folds=3,
)

# Runs the search; each trial's parameters, score and metric values are
# printed to the cell output.
results = runner.run_experiment()

--------------------------------------------------    
{'batch_size': 500, 'discriminator_decay': 2e-06, 'discriminator_dim': (256, 256, 256), 'discriminator_lr': 0.00115, 'embedding_dim': 32, 'generator_decay': 6.4e-06, 'generator_dim': (128, 128, 128), 'generator_lr': 0.00055, 'log_frequency': True, 'transformation_cat_type': 'OHE', 'transformation_num_type': 'CDF'}
Score:                                                
-1.3739183827092487                                   
{'c2st': 0.9061032863849765, 'ml_efficacy': 0.5691756606415376, 'pair': 0.86524093097866, 'shape': 0.8456050774740277}
--------------------------------------------------    
--------------------------------------------------                               
{'batch_size': 500, 'discriminator_decay': 1e-06, 'discriminator_dim': (256, 256, 256), 'discriminator_lr': 0.0013000000000000002, 'embedding_dim': 32, 'generator_decay': 1.2e-06, 'generator_dim': (128, 128, 128, 128), 'generator_lr': 5e-05, 'log_frequency': Fals

In [11]:
# Report the winning configuration, its aggregate score, and the per-metric
# values stored under 'best_com_score' (printed without a label).
best_params = results['best_hyperparameters']
print('Лучшие параметры:', best_params)

best_score = results['best_score']
print('Лучший score:', best_score)

best_metric_values = results['best_com_score']
print(best_metric_values)

Лучшие параметры: {'batch_size': 100, 'discriminator_decay': 4.9999999999999996e-06, 'discriminator_dim': (256, 256), 'discriminator_lr': 0.00075, 'embedding_dim': 128, 'generator_decay': 3.1e-06, 'generator_dim': (128, 128, 128), 'generator_lr': 0.0017000000000000001, 'log_frequency': True, 'transformation_cat_type': 'OHE', 'transformation_num_type': 'CDF'}
Лучший score: -1.8364264585832584
{'c2st': 0.8524713093375066, 'ml_efficacy': 0.9005660948447923, 'pair': 0.8845442509341203, 'shape': 0.9037874221418525}
