# Data Quality Check 

Data quality assessment through YData's Quality package. 

## 0 - Imports

In [3]:
import json
from ydata.connectors import LocalConnector
from ydata.connectors.filetype import FileType

from pandas import DataFrame
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.ensemble import AdaBoostClassifier

  from distributed.utils import LoopRunner, format_bytes


## 1 - Load Data

In [19]:
# Create YData's local connector; the same instance is used further down to write results.
connector = LocalConnector()

# Read the preprocessed CSV through the connector, then convert it to a pandas object.
preprocessed_dataset = connector.read_file('preprocessed_data.csv')
preprocessed_data = preprocessed_dataset.to_pandas()

## 2 - Classification

### 2.1 - Split Data

In [20]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train_data, test_data = train_test_split(
    preprocessed_data,
    test_size=0.2,
    random_state=5,
)

### 2.2 - Train and Predict

In [21]:
# Column holding the label the classifier is fitted against.
target_column = 'customer_lifetime_value'

# Separate the feature columns from the label column.
train_features = train_data.drop(target_column, axis=1)
train_labels = train_data[target_column]
test_features = test_data.drop(target_column, axis=1)

# Fit AdaBoost with a fixed seed, then predict labels for the held-out rows.
classifier = AdaBoostClassifier(random_state=30)
classifier.fit(train_features, train_labels)
y_pred = classifier.predict(test_features)

### 2.3 - Metrics 

In [22]:
# Ground-truth labels of the held-out rows, shared by every metric below.
y_true = test_data.customer_lifetime_value

# One-row summary; the dict's insertion order fixes the column order of the table.
scores = {
    'technique': 'no_augmentation',
    'f1_score': f1_score(y_true, y_pred),
    'accuracy': accuracy_score(y_true, y_pred),
    'precision': precision_score(y_true, y_pred),
    'recall': recall_score(y_true, y_pred),
}
results = DataFrame(scores, index=[0])
results

Unnamed: 0,technique,f1_score,accuracy,precision,recall
0,no_augmentation,0.857143,0.943333,0.836066,0.87931


## 3 - Store Data 

In [23]:
# Persist the held-out rows; reset_index() moves the current index into a regular column first.
held_out_rows = test_data.reset_index()
connector.write_file(held_out_rows, 'test_data.csv')

## 4 - Artifacts

In [1]:
# Create the metrics artifact. The table with these metrics will be shown in the
# "Run Output" section of the "Runs".

# (display name, `results` column) pairs, in the order they are written to the file.
metric_columns = [
    ('F1_Score', 'f1_score'),
    ('Accuracy_Score', 'accuracy'),
    ('Precision_Score', 'precision'),
    ('Recall_Score', 'recall'),
]

# `results` has a single row, so `.values[0]` picks out the scalar score.
metrics = {
    'metrics': [
        {
            'name': display_name,
            'numberValue': results[column].values[0],
            'format': 'PERCENTAGE',
        }
        for display_name, column in metric_columns
    ]
}

# Serialize the metrics payload to the JSON artifact file.
with open("mlpipeline-metrics.json", 'w') as metrics_file:
    json.dump(metrics, metrics_file)
