# PreProcess Data 

Data preprocessing (column cleanup, label creation, categorical encoding and feature selection), using YData connectors to read and write the data.

## 0 - Imports

In [3]:
from pandas import cut, DataFrame
from numpy import array 
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SelectKBest
from ydata.connectors import LocalConnector
from ydata.connectors.filetype import FileType
from ydata.utils.formats import read_json

  from distributed.utils import LoopRunner, format_bytes


## 1 - Load Data

In [4]:
# Create the connector and read 'data.csv', converting the result
# to pandas with to_pandas().
connector = LocalConnector()
data = (
    connector
    .read_file('data.csv')
    .to_pandas()
)

## 2 - Problem Setup and Preprocessing

### 2.1 - Columns

In [5]:
# Normalise column names: lower-case them and replace spaces with underscores.
x = data.columns.str.lower().str.replace(' ', '_')
data.columns = x
# Remove the two columns that are not used in the rest of the notebook.
data.drop(columns=['customer', 'effective_to_date'], inplace=True)

### 2.2 - Creation of Label

In [6]:
# Binarise customer lifetime value into two bins:
#   label 0 -> [0, 80th percentile]
#   label 1 -> (80th percentile, max]
clv = data.customer_lifetime_value
bin_edges = [0, clv.quantile(0.80), clv.max()]
label_cus = cut(clv, bins=bin_edges, labels=[0, 1], include_lowest=True)

# Overwrite the original column with the labels.
data.customer_lifetime_value = array(label_cus)

### 2.3 - Categorical Encoding

In [13]:
# Ordinal-encode every column of `data` (the label column included) and
# wrap the encoded array back into a DataFrame with the same column names.
encoder = OrdinalEncoder()
encoded_data = DataFrame(
    encoder.fit_transform(data),
    columns=data.columns,
)

### 2.4 - Feature Selection

In [14]:
# Split the encoded table into candidate features and the label column.
features = encoded_data.drop('customer_lifetime_value', axis=1)
target = encoded_data.customer_lifetime_value

# Fit SelectKBest with its default settings to obtain a score per feature.
feat_select = SelectKBest()
temp = feat_select.fit_transform(features, target)

# Map each feature name to its score, then order the pairs by ascending score.
score_by_feature = dict(zip(features.columns, feat_select.scores_))
scores = sorted(score_by_feature.items(), key=lambda item: item[1])

# Keep the ten highest-scoring features, plus the label column.
columns_selected = [name for name, _ in scores[-10:]]
columns_selected.append('customer_lifetime_value')
preprocessed_data = encoded_data[columns_selected]

## 3 - Create Artifacts 

In [16]:
# Build the table visualization metadata: this is what Kubeflow needs in
# order to display some rows of the dataset.
import json

# Inline CSV table: the header is passed separately, so the CSV body is
# rendered without a header row and without the index.
table_output = {
    'type': 'table',
    'storage': 'inline',
    'format': 'csv',
    'header': list(preprocessed_data.columns),
    'source': preprocessed_data.to_csv(header=False, index=False),
}
metadata = {'outputs': [table_output]}

with open("mlpipeline-ui-metadata.json", 'w') as metadata_file:
    metadata_file.write(json.dumps(metadata))

## 4 - Store Data

In [17]:
# Write the preprocessed table to 'preprocessed_data.csv' through the connector.
connector.write_file(preprocessed_data, 'preprocessed_data.csv')