# 6. Augment underrepresented class with synthetic data

### Import the needed packages

In [3]:
import os
from pathlib import Path
import pandas as pd
import numpy as np

import dask.dataframe as dd

from ydata.profiling import ProfileReport

from ydata.synthesizers.regular import RegularSynthesizer

from functions.saving_functions import read_file, save_file

### Getting the running settings

In [4]:
# Run settings are read from environment variables; each one falls back to
# the default given below when the variable is not set.


def parse_class_value(raw_value):
    """Return ``raw_value`` as an ``int`` when it is integer-like.

    The class value to augment may be a numeric label (e.g. ``"1"``) or a
    textual one (e.g. ``"Yes"``). Integer-like values are converted to
    ``int``; anything else is returned unchanged.

    Parameters
    ----------
    raw_value : str or int
        Value read from the environment (or the in-code default).

    Returns
    -------
    int or str
        ``int(raw_value)`` when the conversion succeeds, else ``raw_value``.
    """
    try:
        return int(raw_value)
    except (TypeError, ValueError):
        # Non-numeric label: keep it as-is instead of failing the whole cell.
        return raw_value


# Any non-zero integer enables the augmentation step.
augment = int(os.environ.get('AUGMENT', 1))
# Name of the column the synthesizer is conditioned on.
augment_class = os.environ.get('AUGMENT_CLASS', 'Churn')
# Size passed to the synthesizer's sample() call.
sample_size = int(os.environ.get('SAMPLE_SIZE', 1000))
# Class value to augment. Parsed exactly once, through the tolerant helper, so
# a textual label no longer raises ValueError before the fallback can apply.
augment_class_val = parse_class_value(os.environ.get('AUGMENT_CLASS_VAL', 1))

## Read the dataset & Metadata

In [5]:
# Load the training-set artifacts through the project helper `read_file`
# (imported from functions.saving_functions).
# NOTE(review): the "Pipeline Outputs" cell writes back to these same two file
# names when `augment` is truthy, so these inputs are overwritten in place.
metadata = read_file('metadata_train.pkl')
dataset = read_file('train_set.pkl')

In [6]:
# Importing YData's packages
from ydata.labs import DataSources
# Reading the Dataset from the DataSource
# NOTE(review): '{datasource-id}' looks like an unfilled template placeholder —
# confirm it is replaced with a real DataSource uid before running.
datasource = DataSources.get(uid='{datasource-id}')

# NOTE(review): the two assignments below rebind `dataset` and `metadata`,
# replacing the objects loaded from the .pkl files in the previous cell —
# confirm which of the two sources is the intended input for this step.
dataset = datasource.dataset
# Getting the calculated Metadata to get the profile overview information in the labs
metadata = datasource.metadata

### The data synthesis

#### Training the synthesizer

In [13]:
from ydata.dataset import Dataset
from ydata.metadata import Metadata

# Augmentation step: runs only when the `augment` setting is truthy.
# NOTE(review): `augment_class_val` is read in the settings cell but is not
# used here, and sample() is called without any condition — confirm whether
# sampling was meant to be restricted to that class value.
if augment:
    # Fit a synthesizer on the current dataset, conditioned on the target column.
    synth = RegularSynthesizer()
    synth.fit(dataset, metadata=metadata, condition_on=[augment_class])

    # Generate the synthetic sample and convert it to a Dask dataframe.
    synth_sample = synth.sample(sample_size)
    synth_dataset = synth_sample.to_dask()

    # Append the synthetic sample to the original data, then rebuild both the
    # Dataset and its Metadata from the combined frame.
    augmented_frames = [dataset.to_dask(), synth_dataset]
    dataset = Dataset(dd.concat(augmented_frames))
    metadata = Metadata(dataset)

[########################################] | 100% Completed | 324.59 ms
[########################################] | 100% Completed | 1.69 s
INFO: 2022-12-04 02:50:23,154 [SYNTHESIZER] - Number columns considered for synth: 23
INFO: 2022-12-04 02:50:39,199 [SYNTHESIZER] - Starting the synthetic data modeling process over 1x1 blocks.
INFO: 2022-12-04 02:50:39,205 [SYNTHESIZER] - Preprocess segment
INFO: 2022-12-04 02:50:39,209 [SYNTHESIZER] - Synthesizer init.
INFO: 2022-12-04 02:50:39,210 [SYNTHESIZER] - Processing the data prior fitting the synthesizer.
INFO: 2022-12-04 02:50:42,195 [SYNTHESIZER] - Start generating model samples.


## Pipeline Outputs

In [14]:
# Persist the pipeline outputs, only when augmentation is enabled.
# At this point `dataset` is the original data concatenated with the synthetic
# sample (not the sample alone) and `metadata` was recomputed from it.
# The dataset is written with the project helper `save_file`; the metadata is
# written with its own `save` method.
# NOTE(review): both targets use the same file names this notebook reads as
# inputs, so the originals are replaced — confirm that re-running this step on
# its own output is acceptable.
if augment:
    save_file(dataset, 'train_set.pkl')
    metadata.save('metadata_train.pkl')

In [8]:
# Record this step's settings in the shared pipeline-parameters file:
# load it, set the two entries, and write it back under the same name.
pipeline_parameters = read_file('pipeline_parameters.pkl')
for param_name, param_value in (('augment', augment), ('sample_size', sample_size)):
    pipeline_parameters[param_name] = param_value
save_file(pipeline_parameters, 'pipeline_parameters.pkl')