# Synthetic data generation

#### Required imports

In [5]:
import pandas as pd

from ydata.labs import DataSources

from ydata.dataset import Dataset
from ydata.metadata import Metadata

## Read your data

In [11]:
### Load the source file into a pandas DataFrame

df = pd.read_csv(filepath_or_buffer='{insert-file-path}')

# Wrap the DataFrame in a ydata Dataset, the object that the Metadata
# and synthesizer cells below take as input
dataset = Dataset(df)

In [None]:
### Alternative: use a DataSource created in the UI instead of reading the CSV above
#dataset = DataSources.get(uid='{insert-datasource-uid}')

## Calculate your dataset Metadata

In [12]:
# Compute the dataset metadata. The resulting object is printed in the next
# cell and passed to the synthesizer's fit() call further down.
metadata = Metadata(dataset)

[########################################] | 100% Completed | 220.31 ms
[########################################] | 100% Completed | 669.62 ms
[########################################] | 100% Completed | 207.49 ms
[########################################] | 100% Completed | 1.99 s


In [13]:
# Show the metadata summary: dataset type, column count, duplicate rows and per-column types
print(metadata)

Metadata Summary

Dataset type: TABULAR
Dataset attributes:
Number of columns: 12
Duplicate rows: 148
Target column:

Column detail:
         Column    Data type Variable type
0           age    numerical           int
1        gender  categorical           int
2        height    numerical           int
3        weight    numerical         float
4         ap_hi    numerical           int
5         ap_lo    numerical           int
6   cholesterol  categorical           int
7          gluc  categorical           int
8         smoke  categorical           int
9          alco  categorical           int
10       active  categorical           int
11       cardio  categorical           int

0     skewness                     [height, weight, ap_hi]
1  correlation  [gender|height, ap_hi|ap_lo, ap_hi|cardio]
2    imbalance    [cholesterol, gluc, smoke, alco, active]



## Train & Generate synthetic data samples

In [15]:
from ydata.synthesizers.regular import RegularSynthesizer

# Create a synthesizer (constructed with no arguments) and fit it on the
# dataset, passing the metadata computed above.
synth = RegularSynthesizer()
synth.fit(dataset, metadata=metadata)

INFO: 2023-04-19 10:56:22,219 [SYNTHESIZER] - Number columns considered for synth: 12
INFO: 2023-04-19 10:56:22,525 [SYNTHESIZER] - Starting the synthetic data modeling process over 2x1 blocks.
INFO: 2023-04-19 10:56:22,526 [SYNTHESIZER] - Generating pipeline for segment (-0.001, 27468.0]
INFO: 2023-04-19 10:56:22,538 [SYNTHESIZER] - Preprocess segment
INFO: 2023-04-19 10:56:22,543 [SYNTHESIZER] - Synthesizer init.
INFO: 2023-04-19 10:56:22,544 [SYNTHESIZER] - Processing the data prior fitting the synthesizer.
INFO: 2023-04-19 10:56:31,182 [SYNTHESIZER] - Generating pipeline for segment (27468.0, 54936.0]
INFO: 2023-04-19 10:56:31,192 [SYNTHESIZER] - Preprocess segment
INFO: 2023-04-19 10:56:31,203 [SYNTHESIZER] - Synthesizer init.
INFO: 2023-04-19 10:56:31,204 [SYNTHESIZER] - Processing the data prior fitting the synthesizer.


<ydata.synthesizers.regular.model.RegularSynthesizer at 0x7f052e68e9e0>

In [17]:
## Generate as many synthetic samples as required. n_samples is the number of rows to generate;
## here it is len(dataset), i.e. the same size as the original dataset.
synth_sample = synth.sample(n_samples=len(dataset))

INFO: 2023-04-19 10:57:27,747 [SYNTHESIZER] - Start generating model samples.
INFO: 2023-04-19 10:57:27,748 [SYNTHESIZER] - Sample segment (-0.001, 27468.0]
INFO: 2023-04-19 10:57:31,566 [SYNTHESIZER] - Sample segment (27468.0, 54936.0]


In [19]:
# Convert the synthetic sample from its distributed representation to a pandas
# DataFrame so it can be saved locally.
# The isinstance guard keeps this cell safe to re-run: after the first run
# synth_sample is already a DataFrame, which has no to_pandas() method.
if not isinstance(synth_sample, pd.DataFrame):
    synth_sample = synth_sample.to_pandas()

# index=False keeps the pandas row index out of the file, so the CSV holds only
# the data columns and can be read back with a plain pd.read_csv().
synth_sample.to_csv('synthetic_sample.csv', index=False)


## Calculate quality report

In [None]:
# NOTE(review): SyntheticDataProfile and ReportType are not imported in any cell
# shown in this notebook; import them before running this cell.
# NOTE(review): {insert-holdout-dataset} and {insert-training-dataset} are unquoted
# placeholders and must be replaced with real dataset objects, otherwise Python
# evaluates them as expressions over undefined names.
sdf = SyntheticDataProfile(report_type=ReportType.TABULAR)

# Map each column name to the data type recorded in the metadata
data_types = {
    column_name: column.datatype
    for column_name, column in metadata.columns.items()
}

sdf.generate_report(
    real={insert-holdout-dataset},
    synth=synth_sample,
    target="{insert-target-col-name}",
    data_types=data_types,
    training_data={insert-training-dataset},
    metadata=metadata,
    pdf=True,
)