In [1]:
from tab_forge.dataset import Dataset
from tab_forge.benchmark import Benchmark
import pandas as pd

In [2]:
# Register the abalone CSV as a regression problem: "Rings" is the target,
# "Sex" is the only categorical column and the remaining columns are numeric.
dataset = Dataset(
    data="abalone.csv",
    target="Rings",
    task_type="regression",
    categorical_features=["Sex"],
    numerical_features=[
        "Length",
        "Diameter",
        "Height",
        "Whole weight",
        "Shucked weight",
        "Viscera weight",
        "Shell weight",
    ],
)

In [3]:
# Preview the first rows of the data held by the Dataset.
dataset.get_registered_data().head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [4]:
# Split off 20% of the data as a test set.
# random_state is fixed, presumably so the split is reproducible between runs.
train, test = dataset.train_test_split(test_size=0.2, random_state=42)

In [5]:
# Print the shapes of the train and test partitions side by side.
print(*(part.get_registered_data().shape for part in (train, test)))

(3341, 9) (836, 9)


List: define the benchmark as a list of metrics (results are indexed by position)

In [6]:
# List form: each entry is a (metric, options dict) pair.
# The results are reported in the same order as the entries below.
benchmark = Benchmark([
    ('r2', {'model': 'xgboost'}),
    ('r2', {'model': 'linear'}),
    ('rmse', {'model': 'linear'}),
    ('rmse', {'model': 'xgboost'})
])

In [7]:
# Run every configured metric on the train/test pair and keep the result object.
res = benchmark.fit(train, test)
# Bare last expression so the notebook displays the results.
res

Benchmark Results: [0.46000832319259644, 0.5481628137889263, 2.2116130871218362, 2.4177516168031405]

In [8]:
# With a list-based benchmark, results are accessed by position:
# index 1 is the second entry, i.e. 'r2' with the linear model.
res.metrics[1]

0.5481628137889263

Dict: define the benchmark as a dict of named metrics (results are accessed by name)

In [9]:
def my_func_1(synth, real):
    """Toy custom metric: ignores both inputs and always returns 0.5."""
    constant_score = 0.5
    return constant_score

def my_func_2(synth, real, num):
    """Toy custom metric that takes an extra argument and returns it unchanged.

    Shows that a custom metric can accept parameters beyond the two data
    arguments; the extra value is supplied through the metric's options dict.

    Args:
        synth: First data argument (unused).
        real: Second data argument (unused). Previously misspelled ``rea``;
            renamed to match ``my_func_1`` so keyword calls with ``real=`` work.
        num: Value to return as the metric score.

    Returns:
        ``num``, unchanged.
    """
    return num

In [10]:
# Dict form: each key is the name a result is reported under,
# each value is a (metric, options dict) pair.
benchmark = Benchmark({
    # Built-in metrics, referenced by their string identifier.
    'r2_xgboost': ('r2', {'model': 'xgboost'}),
    'r2_linear': ('r2', {'model': 'linear'}),
    'rmse_linear': ('rmse', {'model': 'linear'}),
    'lf_metric': ('frob_corr', {}),
    'js_metric': ('js_mean', {}),
    'mi_matrix_metric_50': ('frob_mi', {'n_bins': 50}),
    'mi_matrix_metric_25': ('frob_mi', {'n_bins': 25}),
    # User-defined callables; the options dict supplies any extra
    # arguments the function takes (here `num`).
    'my_metric_1': (my_func_1, {}),
    'my_metric_2': (my_func_2, {'num': 1.0}),
})

In [11]:
# Run the dict-based benchmark on the same train/test pair.
res = benchmark.fit(train, test)
# Bare last expression so the notebook displays the named results.
res

Benchmark Results:
  r2_xgboost               : 0.460008
  r2_linear                : 0.548163
  rmse_linear              : 2.211613
  lf_metric                : 0.286597
  js_metric                : 0.073897
  mi_matrix_metric_50      : 2.383868
  mi_matrix_metric_25      : 0.978846
  my_metric_1              : 0.500000
  my_metric_2              : 1.000000

In [12]:
# With a dict-based benchmark, results are accessed by the key given to Benchmark.
res.metrics['r2_xgboost']

0.46000832319259644

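Pipeline: train a CTGAN synthesizer, generate synthetic data, and benchmark it against the real training data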

In [1]:
from tab_forge.models import CTGANSynthesizer
from tab_forge.dataset import Dataset
from tab_forge.benchmark import Benchmark



In [2]:
# Register the abalone CSV as a regression problem: "Rings" is the target,
# "Sex" is the only categorical column and the remaining columns are numeric.
dataset = Dataset(
    data="abalone.csv",
    target="Rings",
    task_type="regression",
    categorical_features=["Sex"],
    numerical_features=[
        "Length",
        "Diameter",
        "Height",
        "Whole weight",
        "Shucked weight",
        "Viscera weight",
        "Shell weight",
    ],
)

In [3]:
# Split off 20% of the data as a test set.
# random_state is fixed, presumably so the split is reproducible between runs.
train, test = dataset.train_test_split(test_size=0.2, random_state=42)

In [4]:
# Create a CTGAN synthesizer without overriding any constructor arguments.
# NOTE(review): no seed is passed here, so fitted models and generated samples
# may differ between runs — confirm whether the API exposes a seed parameter.
model = CTGANSynthesizer()

In [5]:
# Inspect the hyperparameters the synthesizer is currently configured with.
model.get_hyperparameters()

{'discriminator_lr': 0.0002,
 'generator_lr': 0.0002,
 'batch_size': 500,
 'embedding_dim': 128,
 'generator_dim': [256, 256],
 'discriminator_dim': [256, 256],
 'generator_decay': 1e-06,
 'discriminator_decay': 1e-06,
 'discriminator_steps': 1,
 'log_frequency': True,
 'pac': 10,
 'epochs': 300,
 'verbose': False}

In [6]:
# Fit the synthesizer on the training split only; the test split is not passed to the model.
model.fit(train)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Generate 10 synthetic rows and display them for a quick visual check.
model.generate(10)

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.435,0.362,0.146,0.9706,0.3266,0.1735,0.3757,8
1,I,0.508,0.335,0.146,0.2351,0.3281,0.1534,0.156,9
2,F,0.692,0.508,0.198,0.9558,0.9949,0.3119,0.4453,8
3,I,0.36,0.365,0.094,0.207,0.1365,0.0005,0.0287,5
4,M,0.628,0.361,0.201,0.9729,0.3836,0.2956,0.3064,15
5,I,0.548,0.296,0.059,0.2548,0.0177,0.0063,0.0524,8
6,M,0.592,0.346,0.148,0.5813,0.5019,0.118,0.2322,13
7,M,0.682,0.528,0.199,1.3689,0.649,0.1312,0.4186,12
8,M,0.515,0.381,0.197,0.7544,0.6404,0.1375,0.277,10
9,F,0.641,0.502,0.213,1.5167,0.3943,0.2688,0.3433,8


In [8]:
# Generate as many synthetic rows as the test split contains.
# NOTE(review): presumably `structed_generate` returns a Dataset-like object
# (the result is passed straight to `benchmark.fit`), whereas `generate`
# displays as a plain table — confirm against the tab_forge API.
synth = model.structed_generate(len(test))

In [9]:
# Dict form: each key is the name a result is reported under,
# each value is a (metric, options dict) pair.
benchmark = Benchmark({
    # R^2 and RMSE, each evaluated with an xgboost and a linear model.
    'r2_xgboost': ('r2', {'model': 'xgboost'}),
    'r2_linear': ('r2', {'model': 'linear'}),
    'rmse_xgboost': ('rmse', {'model': 'xgboost'}),
    'rmse_linear': ('rmse', {'model': 'linear'}),
})

In [10]:
# Run the benchmark with the synthetic data in place of the training split,
# evaluated against the same test split.
benchmark.fit(synth, test)

Benchmark Results:
  r2_xgboost               : 0.261762
  r2_linear                : 0.331421
  rmse_xgboost             : 2.826939
  rmse_linear              : 2.690262

In [11]:
# Baseline: the same benchmark with the real training split, for comparison
# with the synthetic-data scores.
benchmark.fit(train, test)

Benchmark Results:
  r2_xgboost               : 0.460008
  r2_linear                : 0.548163
  rmse_xgboost             : 2.417752
  rmse_linear              : 2.211613