# Synthetic text generation & Profile

## Required imports & data loading

In [5]:
# YData Labs entry point used to look up registered data sources
from ydata.labs import DataSources

# Fetch the DataSource by UID. '{datasource-id}' looks like a template
# placeholder -- replace it with the UID of a real DataSource before running.
datasource = DataSources.get(uid='{datasource-id}')

# Pull out the dataset and the metadata exposed by the DataSource
# (per the original note, the metadata backs the profile overview in the labs).
dataset, metadata = datasource.dataset, datasource.metadata
print(metadata)

## Train & Generate synthetic data samples

In [15]:
from ydata.synthesizers.regular import TextSynthesizer

# Build the synthesizer with the 'open-ai' model option.
synth = TextSynthesizer(model='open-ai')
# Fit on the dataset and metadata loaded in the first cell. The call is the
# cell's last expression, so the notebook displays whatever fit() returns.
synth.fit(dataset, metadata=metadata)

INFO: 2023-04-19 10:56:22,219 [SYNTHESIZER] - Number columns considered for synth: 12
INFO: 2023-04-19 10:56:22,525 [SYNTHESIZER] - Starting the synthetic data modeling process over 2x1 blocks.
INFO: 2023-04-19 10:56:22,526 [SYNTHESIZER] - Generating pipeline for segment (-0.001, 27468.0]
INFO: 2023-04-19 10:56:22,538 [SYNTHESIZER] - Preprocess segment
INFO: 2023-04-19 10:56:22,543 [SYNTHESIZER] - Synthesizer init.
INFO: 2023-04-19 10:56:22,544 [SYNTHESIZER] - Processing the data prior fitting the synthesizer.
INFO: 2023-04-19 10:56:31,182 [SYNTHESIZER] - Generating pipeline for segment (27468.0, 54936.0]
INFO: 2023-04-19 10:56:31,192 [SYNTHESIZER] - Preprocess segment
INFO: 2023-04-19 10:56:31,203 [SYNTHESIZER] - Synthesizer init.
INFO: 2023-04-19 10:56:31,204 [SYNTHESIZER] - Processing the data prior fitting the synthesizer.


<ydata.synthesizers.regular.model.RegularSynthesizer at 0x7f052e68e9e0>

In [17]:
# Request as many synthetic rows as len(dataset) reports for the original data.
# n_samples is the number of rows to generate, so any other count can be used.
n_rows = len(dataset)
synth_sample = synth.sample(n_samples=n_rows)

INFO: 2023-04-19 10:57:27,747 [SYNTHESIZER] - Start generating model samples.
INFO: 2023-04-19 10:57:27,748 [SYNTHESIZER] - Sample segment (-0.001, 27468.0]
INFO: 2023-04-19 10:57:31,566 [SYNTHESIZER] - Sample segment (27468.0, 54936.0]


## Profile data

In [2]:
from ydata.profiling import ProfileReport

# Profile the synthetic sample and render the report to an HTML string;
# report_html is embedded in the pipeline outputs cell further down.
report = ProfileReport(synth_sample)
report_html = report.to_html()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

### Pipeline outputs

In [None]:
# Pipeline outputs: describe two inline artifacts -- a CSV table holding the
# synthetic sample and the HTML profile report -- and write them to the
# UI metadata JSON file.
import json

profile_pipeline_output = {
    'outputs' : [
        {
            'type': 'table',
            'storage': 'inline',
            'format': 'csv',
            'header': list(synth_sample.columns),
            # index=False keeps every CSV row the same width as 'header'.
            # Emitting the index would prepend an extra, unlabeled field and
            # shift all values one column to the right of their header.
            'source': synth_sample.to_csv(header=False, index=False),
        },
        {
            'type': 'web-app',
            'storage': 'inline',
            'source': report_html,
        },
    ]
}

# NOTE(review): presumably the file name the pipeline UI looks for -- confirm
# against the target pipeline runtime.
with open('mlpipeline-ui-metadata.json', 'w') as metadata_file:
    json.dump(profile_pipeline_output, metadata_file)