# Install requirements

In [1]:
%%capture
!pip install sdv

# Train models and generate sample

In [2]:
import os
import time
import traceback
from pathlib import Path
from typing import Dict

from ydata.metadata import Metadata
from ydata.dataset import Dataset

import sdv

from common.sdv import fabric_to_sdv_metadata
from common.config import *
from common.utils import update_json_file, ndd, ndd_to_dict, load_dataframe, get_model_class


def get_sample(model, nrows):
    """Ask ``model`` for ``nrows`` synthetic rows.

    Args:
        model: Any object exposing ``sample(nrows)``.
        nrows: Number of rows requested from the model.

    Returns:
        Whatever ``model.sample`` produced, except that a ``Dataset`` result
        is converted through its ``to_pandas()`` method first.
    """
    generated = model.sample(nrows)
    # Only Dataset results need converting; anything else is returned untouched.
    return generated.to_pandas() if isinstance(generated, Dataset) else generated

def resolve_init_params(init_params: Dict, metadata: Metadata):
    """Return model constructor kwargs with metadata placeholders resolved.

    The ``'metadata'`` and ``'table_metadata'`` entries are replaced by
    ``fabric_to_sdv_metadata(metadata)`` when their configured value is the
    string ``'sdv'``. Every other entry is passed through unchanged.

    Args:
        init_params: Constructor keyword arguments as configured for a model.
            ``None`` is treated like an empty mapping.
        metadata: Metadata object handed to ``fabric_to_sdv_metadata``.

    Returns:
        A new dict. ``init_params`` itself is never modified, so a
        configuration dict that is reused across calls keeps its ``'sdv'``
        placeholder and is resolved against the metadata of each call.
    """
    # Work on a shallow copy: only top-level entries are ever replaced.
    resolved = dict(init_params or {})
    for key in ('metadata', 'table_metadata'):
        if resolved.get(key) == 'sdv':
            resolved[key] = fabric_to_sdv_metadata(metadata)
    return resolved

def resolve_fit_params(fit_params: Dict, metadata: Metadata):
    """Return ``fit`` kwargs with the metadata placeholder resolved.

    A ``'metadata'`` entry whose configured value is the string ``'fabric'``
    is replaced by ``metadata``. Every other entry is passed through
    unchanged.

    Args:
        fit_params: Keyword arguments for ``model.fit`` as configured for a
            model. ``None`` is treated like an empty mapping.
        metadata: Object substituted for the ``'fabric'`` placeholder.

    Returns:
        A new dict. ``fit_params`` itself is never modified, so a
        configuration dict that is reused across calls keeps its ``'fabric'``
        placeholder and is resolved against the metadata of each call.
    """
    # Work on a shallow copy: only the top-level 'metadata' entry is replaced.
    resolved = dict(fit_params or {})
    if resolved.get('metadata') == 'fabric':
        resolved['metadata'] = metadata
    return resolved


def train_model(name: str, dataset, metadata: Metadata):
    """Instantiate and fit the model registered under ``name``.

    The model class comes from ``get_model_class(name)`` and its settings from
    ``get_models_config()[name]``:

    * ``init_params`` / ``fit_params`` are passed through
      ``resolve_init_params`` / ``resolve_fit_params`` and then used as keyword
      arguments for the constructor and for ``fit``.
    * ``dataset`` is given to ``fit`` unchanged when the configured
      ``'dataset'`` value is ``'pandas'``; otherwise it is wrapped as
      ``Dataset(dataset)``.

    Args:
        name: Key of the model in the models configuration.
        dataset: Training data handed to the model (possibly wrapped, see
            above).
        metadata: Metadata used to resolve placeholder parameters.

    Returns:
        A ``(model, seconds)`` tuple, where ``seconds`` is the elapsed time
        covering class lookup, construction, dataset wrapping and fitting.

    Raises:
        KeyError: If ``name`` is not present in the models configuration.
    """
    model_info = get_models_config()[name]
    # Hand the resolvers private copies so that placeholder substitution can
    # never leak back into the configuration entry and be reused for the next
    # dataset.
    init_kwargs = dict(model_info.get('init_params') or {})
    fit_kwargs = dict(model_info.get('fit_params') or {})

    # perf_counter is the clock intended for measuring durations; unlike
    # time.time() it is unaffected by system clock adjustments.
    start = time.perf_counter()
    model_cls = get_model_class(name)
    model = model_cls(**resolve_init_params(init_kwargs, metadata))
    X = dataset if model_info.get('dataset') == 'pandas' else Dataset(dataset)
    model.fit(X, **resolve_fit_params(fit_kwargs, metadata))
    timer = time.perf_counter() - start
    return model, timer

def load_model(name: str, path: str):
    """Restore a previously saved model of the kind registered as ``name``.

    Args:
        name: Key of the model in the models configuration.
        path: Location of the saved model, forwarded to the class' ``load``.

    Returns:
        The value returned by ``get_model_class(name).load(path)``.

    Raises:
        KeyError: If ``name`` is not present in the models configuration.
    """
    # The configuration entry itself is not used; the lookup is kept only
    # because it raises KeyError for names missing from the configuration.
    _ = get_models_config()[name]
    # NOTE(review): callers pass ``*.pkl`` paths; if ``load`` unpickles the
    # file, only load models from trusted locations - confirm.
    return get_model_class(name).load(path)

def train_and_sample_models():
    """Train (or reload) every enabled model on every dataset and sample from it.

    For each dataset in ``get_datsets_config()`` the ``train`` split and its
    pickled metadata are loaded, then for each model in ``get_models_config()``
    that is not disabled through ``'enabled': False``:

    1. The model is loaded from ``MODELS_PATH`` when a saved file exists;
       otherwise it is trained with ``train_model`` and saved there.
    2. Unless the sample CSV already exists in ``SAMPLES_PATH``, a sample with
       as many rows as the dataset's ``holdout`` split is generated and
       written as CSV.
    3. The timings collected so far are passed to ``update_json_file``
       together with the configured ``output_files -> timers`` path.

    A failure for one model/dataset pair is reported (with traceback) and the
    loop continues with the next model, so one broken synthesizer does not
    abort the whole benchmark.

    Returns:
        A plain dict ``{dataset: {model: {'training': s, 'sample': s}}}`` with
        only the steps that were actually executed in this run (loaded models
        and pre-existing samples contribute no timing).
    """
    datasets_config = get_datsets_config()
    models_config = get_models_config()
    analysis_config = get_analysis_config()

    timers = ndd()
    for dataset_name in datasets_config:
        print(f'# Dataset: {dataset_name} ')
        dataset = load_dataframe(dataset_name, split='train')
        # NOTE(review): the metadata file is a ``.pkl``; if ``Metadata.load``
        # unpickles it, only trusted files should be placed in DATASET_PATH.
        metadata = Metadata.load(str(Path(DATASET_PATH) / f'{dataset_name}_train.metadata.pkl'))
        # Number of rows to sample. It only depends on the dataset, so the
        # holdout split is loaded at most once per dataset, and only if some
        # model actually needs a new sample.
        holdout_rows = None
        for model_name, model_info in models_config.items():
            if not model_info.get('enabled', True):
                continue
            try:
                # Training
                print(f' # Model: {model_name} ')
                model_path = Path(MODELS_PATH) / f'{model_name}_{dataset_name}.pkl'
                if os.path.isfile(model_path):
                    print("  -> Load model...")
                    model = load_model(model_name, model_path)
                else:
                    print("  -> Train model...")
                    model, timer = train_model(model_name, dataset, metadata)
                    timers[dataset_name][model_name]['training'] = timer
                    model.save(model_path)
                    print(f'   Training time: {timer}')

                # Sampling
                sample_path = Path(SAMPLES_PATH) / f'{model_name}_{dataset_name}_sample.csv'
                if os.path.isfile(sample_path):
                    print("  -> Skip as sample exists...")
                else:
                    print("  -> Generate sample...")
                    if holdout_rows is None:
                        holdout_rows = load_dataframe(dataset_name, split='holdout').shape[0]
                    # perf_counter is the clock intended for measuring durations.
                    start = time.perf_counter()
                    sample = get_sample(model, holdout_rows)
                    timer = time.perf_counter() - start
                    timers[dataset_name][model_name]['sample'] = timer
                    sample.to_csv(sample_path, index=False)
                    print(f'   Sampling time: {timer}')
                # Persist after every completed model so that timings survive
                # an interrupted run.
                path = analysis_config['output_files']['timers']
                update_json_file(path, ndd_to_dict(timers))
            except Exception:
                # Deliberately best-effort: report which pair failed and why,
                # then move on to the next model.
                print(f'Could not train and sample the synthesizer '
                      f'{model_name!r} on dataset {dataset_name!r}')
                traceback.print_exc()
    return ndd_to_dict(timers)

In [3]:
# Run the train-and-sample step unless the STEP_DISABLED environment variable
# is set; any value, even an empty string, disables the step.
if os.environ.get('STEP_DISABLED') is None:
    # Bound at notebook scope so later cells can use the collected timings.
    timers = train_and_sample_models()

# Dataset: sdv.adult 
 # Model: fabric.regular 
  -> Train model...
INFO: 2022-12-02 15:27:19,249 [SYNTHESIZER] - Number columns considered for synth: 15
INFO: 2022-12-02 15:31:06,806 [SYNTHESIZER] - Starting the synthetic data modeling process over 1x1 blocks.
INFO: 2022-12-02 15:31:06,815 [SYNTHESIZER] - Preprocess segment
INFO: 2022-12-02 15:31:06,819 [SYNTHESIZER] - Synthesizer init.
INFO: 2022-12-02 15:31:06,820 [SYNTHESIZER] - Processing the data prior fitting the synthesizer.
   Training time: 231.98067355155945
  -> Generate sample...
INFO: 2022-12-02 15:31:10,750 [SYNTHESIZER] - Start generating model samples.
   Sampling time: 2.8316285610198975
sdv.adult {} {'fabric.regular': {'training': 231.98067355155945, 'sample': 2.8316285610198975}}
 # Model: sdv.tabular 
  -> Train model...
INFO: 2022-12-02 15:31:13,841 Fitting table None metadata
INFO: 2022-12-02 15:31:13,846 Anonymizing table None
INFO: 2022-12-02 15:31:13,847 Fitting constraints for table None
INFO: 2022-12-02 15:3

In [4]:
from common.config import get_analysis_config
from common.utils import update_json_file

# Write the collected timings to the configured timers file. The guard matches
# the one on the cell that assigns `timers`, so `timers` is only read when that
# cell was allowed to run (it must have been executed first in this kernel).
if os.environ.get('STEP_DISABLED') is None:
    analysis_config = get_analysis_config()

    # Destination passed to update_json_file, taken from the analysis config.
    path = analysis_config['output_files']['timers']
    update_json_file(path, timers)

training {'training': 231.98067355155945, 'sample': 2.8316285610198975} 231.98067355155945
sample {'training': 231.98067355155945, 'sample': 2.8316285610198975} 2.8316285610198975
training {'training': 0.7253715991973877, 'sample': 0.4416840076446533} 0.7253715991973877
sample {'training': 0.7253715991973877, 'sample': 0.4416840076446533} 0.4416840076446533
