# Training a sequential synthesizer

### Import required packages

In [1]:
import os 
import json
import pickle as pkl
import pandas as pd

from ydata.synthesizers.timeseries import TimeSeriesSynthesizer
from ydata.metadata import Metadata
from ydata.dataset import Dataset
from ydata.utils.data_types import DataType

To keep the pipeline running smoothly, we encourage you to always create validation mechanisms.

In [2]:
def import_validations(dataset: pd.DataFrame, dataset_sttr: dict, column: dict):
    """Validate the elements imported from the previous pipeline step.

    Placeholder implementation: no checks are performed and none of the
    arguments are inspected, so every input is reported as valid.

    Args:
        dataset: The dataset to validate (currently unused).
        dataset_sttr: Dataset attributes to validate (currently unused).
        column: Column definitions to validate (currently unused).

    Returns:
        Always ``True``.
    """
    # No validation rules are implemented yet; the result is unconditional.
    validation_passed = True
    return validation_passed

### Getting the required elements
As a first step, we have to read and initialize the outputs produced by the previous step of the pipeline.

In [3]:
# Load the dataset attributes written by the previous pipeline step.
# JSON is UTF-8 by specification, so the encoding is pinned rather than
# left to the platform default.
with open('outputs/dataset_attrs.json', 'r', encoding='utf-8') as attrs_file:
    dataset_attr = dict(json.load(attrs_file))

# Load the column definitions written by the previous pipeline step.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# when it was produced by a trusted step of this same pipeline.
with open('outputs/data_columns.pkl', 'rb') as columns_file:
    columns = pkl.load(columns_file)

# The keys of the column definitions are used as the CSV column names.
column_names = list(columns.keys())

# Read the dataset and wrap it in a ydata Dataset.
# NOTE(review): passing `names=` makes pandas treat every row as data
# (header=None behaviour) — confirm real_data.csv has no header row.
real_data = Dataset(pd.read_csv('outputs/real_data.csv', names=column_names))

In [4]:
# Init the metadata to be provided to the synthesizer
metadata = Metadata()
# Calling the Metadata instance with the dataset and its attributes
# (presumably this computes the metadata from the data — confirm against
# the ydata Metadata API).
metadata(real_data, dataset_attrs=dataset_attr)

# Override the datatype of the 'date' column to NUMERICAL.
# NOTE(review): presumably the dates are stored as numbers in the CSV and
# should be modelled as such — confirm this override is intended.
metadata.columns['date'].datatype = DataType.NUMERICAL

[########################################] | 100% Completed |  2.3s


## Fitting a sequential data synthesizer

#### Init & fit of the synthesizer

In [5]:
# Create the time-series synthesizer and fit it on the real dataset,
# passing along the metadata initialised in the previous cell.
synth = TimeSeriesSynthesizer()
synth.fit(real_data, metadata=metadata)

INFO: 2022-02-13 22:28:17,147 [SYNTHESIZER] - Initializing Time Series SYNTHESIZER.
INFO: 2022-02-13 22:28:17,389 [SYNTHESIZER] - Starting the synthetic data generation process.


<ydata.synthesizers.timeseries.model.TimeSeriesSynthesizer at 0x7f4a891c8d10>

#### Saving the fitted synthesizer

In [6]:
# Print the MODEL_PATH environment variable (None when it is not set).
# NOTE(review): the value is only printed; the save path below is
# hard-coded — confirm whether MODEL_PATH was meant to be used instead.
print(os.getenv('MODEL_PATH'))
# Persist the fitted synthesizer to the pipeline outputs folder.
synth.save('outputs/model.pkl')

None
INFO: 2022-02-13 22:28:57,248 [SYNTHESIZER] - Saving SYNTHESIZER state.
