# YData Augmentation 

Data Augmentation through YData's Synthesizer. 

## 0 - Imports

In [42]:
%%capture
!pip install imblearn

In [43]:
%%capture 
import json 
from ydata.connectors import LocalConnector
from ydata.connectors.filetype import FileType
from ydata.synthesizers.regular import RegularSynthesizer 
from ydata.dataset import Dataset

from pandas import DataFrame, concat 
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.ensemble import AdaBoostClassifier

from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE, SVMSMOTE

## 1 - Load Data

In [44]:
# Create the YData local connector (reused by later cells) and read the
# preprocessed dataset into a pandas DataFrame.
connector = LocalConnector()

preprocessed_dataset = connector.read_file('preprocessed_data.csv')
preprocessed_data = preprocessed_dataset.to_pandas()

## 2 - YData Augmentation

### 2.1 - Split Data

In [45]:
# Reuse the test set stored in 'test_data.csv' (the original note says it is the same
# test set used in "the first one" -- presumably the first experiment; TODO confirm).
# Its 'index' column becomes the frame's index so it can be matched against the
# index of the preprocessed data.
test_data = connector.read_file('test_data.csv').to_pandas().set_index('index')

# Every preprocessed row whose index label is not in the test set is a training row.
in_test_set = preprocessed_data.index.isin(test_data.index)
train_data = preprocessed_data[~in_test_set]

### 2.2 - Augment with YData Synthesizer

In [46]:
%%capture 
synth = RegularSynthesizer()
synth.fit(Dataset(preprocessed_data))

#generated_data = synth.sample(20000).to_pandas()

INFO: 2022-04-04 14:21:39,405 [SYNTHESIZER] - Calculating metadata
INFO: 2022-04-04 14:21:40,917 [SYNTHESIZER] - Number columns considered for synth: 11
INFO: 2022-04-04 14:21:41,026 [SYNTHESIZER] - Synthesizer init.
INFO: 2022-04-04 14:21:41,028 [SYNTHESIZER] - Processing the data prior fitting the synthesizer.


In [55]:
# Read the previously generated synthetic rows, reusing the notebook's existing
# `connector` instead of instantiating a second LocalConnector.
generated_data = connector.read_file('generated_data.csv').to_pandas()

In [54]:
# Augmented training set: the synthetic rows followed by the real training rows.
train_data_augmented = concat(objs=[generated_data, train_data])

### 2.3 - Train and Predict 

In [48]:
# Fit an AdaBoost classifier on the augmented training set and predict the
# 'customer_lifetime_value' target for the test rows.
classifier = AdaBoostClassifier(random_state=30)

train_features = train_data_augmented.drop(columns='customer_lifetime_value')
train_target = train_data_augmented['customer_lifetime_value']
classifier.fit(train_features, train_target)

test_features = test_data.drop(columns='customer_lifetime_value')
y_preds = classifier.predict(test_features)

### 2.4 - Metrics

In [49]:
# Score the predictions against the test-set target and collect the scores in a
# one-row table. The frame is built directly from a record because
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
y_true = test_data.customer_lifetime_value
results = DataFrame(
    [{'technique': 'YData',
      'f1_score': f1_score(y_true, y_preds),
      'accuracy': accuracy_score(y_true, y_preds),
      'precision': precision_score(y_true, y_preds),
      'recall': recall_score(y_true, y_preds)}],
    columns=['technique', 'f1_score', 'accuracy', 'precision', 'recall'],
)
results

Unnamed: 0,technique,f1_score,accuracy,precision,recall
0,YData,0.87037,0.953333,0.94,0.810345


## 3 - Artifacts 

In [50]:
# Create the metrics artifact. The table with the metrics will be shown in the
# "Run Output" section of the "Runs".
# Each entry pairs the metric name written to the artifact with the column of
# `results` that holds its value (taken from the first row).
metric_columns = [
    ('F1_Score', 'f1_score'),
    ('Accuracy_Score', 'accuracy'),
    ('Precision_Score', 'precision'),
    ('Recall_Score', 'recall'),
]

metrics = {
    'metrics': [
        {
            'name': metric_name,
            'numberValue': results[column].values[0],
            'format': 'PERCENTAGE',
        }
        for metric_name, column in metric_columns
    ]
}

# Write the artifact as JSON to 'mlpipeline-metrics.json' in the working directory.
with open("mlpipeline-metrics.json", 'w') as metrics_file:
    json.dump(metrics, metrics_file)