# Data Quality Check 

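Baseline comparison of traditional oversampling techniques (SMOTE and its variants, plus ADASYN): each one resamples the training data, an AdaBoost classifier is trained on the result, and it is scored on a shared test set.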

## 0 - Imports

In [1]:
%%capture
!pip install imblearn

In [2]:
import json
from ydata.connectors import LocalConnector
from ydata.connectors.filetype import FileType

from pandas import DataFrame
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.ensemble import AdaBoostClassifier

from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE, SVMSMOTE

  from distributed.utils import LoopRunner, format_bytes


## 1 - Load Data

In [3]:
# Create the YData local connector, then read the preprocessed dataset from
# disk and convert it to a pandas DataFrame.
connector = LocalConnector()
preprocessed_data = (
    connector
    .read_file('preprocessed_data.csv')
    .to_pandas()
)

## 2 - Test traditional augmentations

### 2.1 - Split Data

In [4]:
# Reuse the previously saved test set instead of drawing a new split, and
# index it by its 'index' column.
test_data = (
    connector
    .read_file('test_data.csv')
    .to_pandas()
    .set_index('index')
)

# Every preprocessed row whose index label is not in the test set is used for
# training.
in_test_set = preprocessed_data.index.isin(test_data.index)
train_data = preprocessed_data.loc[~in_test_set]

### 2.2 - Augment and Train 

Resample the training data with each technique (SMOTE and its variants, plus ADASYN), then train a classifier on the resampled data and predict on the test set.

In [15]:
%%capture 
upsampling_techniques = [SMOTE(sampling_strategy={0: 17000, 1: 4000}, random_state = 3), SVMSMOTE(sampling_strategy={0: 17000, 1: 4000}, random_state = 3),
                        BorderlineSMOTE(sampling_strategy={0: 17000, 1: 4000}, random_state = 3), ADASYN(sampling_strategy={0: 17000, 1: 4000}, random_state = 3)]

results = DataFrame(columns=['technique','f1_score','accuracy','precision','recall'])
for samp_tech in upsampling_techniques:
    smote_x, smote_y = samp_tech.fit_resample(train_data.drop('customer_lifetime_value', axis=1),
                                              train_data.customer_lifetime_value)

    classifier = AdaBoostClassifier(random_state=30)
    classifier.fit(smote_x, smote_y)
    y_preds = classifier.predict(test_data.drop('customer_lifetime_value', axis=1))

    results = results.append({'technique': samp_tech.__class__.__name__, 
                    'f1_score': f1_score(test_data.customer_lifetime_value, y_preds),
                    'accuracy': accuracy_score(test_data.customer_lifetime_value, y_preds),
                    'precision': precision_score(test_data.customer_lifetime_value, y_preds),
                    'recall': recall_score(test_data.customer_lifetime_value, y_preds)}, ignore_index=True)

### 2.3 - Metrics 

In [6]:
# Display the per-technique metrics table filled in by the training loop.
results

Unnamed: 0,technique,f1_score,accuracy,precision,recall
0,SMOTE,0.785185,0.903333,0.688312,0.913793
1,SVMSMOTE,0.765957,0.89,0.650602,0.931034
2,BorderlineSMOTE,0.773723,0.896667,0.670886,0.913793
3,ADASYN,0.791367,0.903333,0.679012,0.948276


## 3 - Artifacts 

In [14]:
# Build the metrics artifact. Per the pipeline convention, the resulting table
# is shown in the "Run Output" section of the "Runs".
# Each entry pairs the displayed metric name with the `results` column whose
# mean across all techniques is reported.
metric_columns = [
    ('F1_Score', 'f1_score'),
    ('Accuracy_Score', 'accuracy'),
    ('Precision_Score', 'precision'),
    ('Recall_Score', 'recall'),
]

metrics = {
    'metrics': [
        {
            'name': metric_name,
            'numberValue': results[column].mean(),
            'format': 'PERCENTAGE',
        }
        for metric_name, column in metric_columns
    ]
}

# Write the artifact to the JSON file in the working directory.
with open("mlpipeline-metrics.json", 'w') as f:
    json.dump(metrics, f)
