# 3. Data preprocessing

Feature engineering can be really helpful for data science projects, as it allows the extraction/selection of the most relevant and important variables for a use case. Optimizing feature selection for your downstream application is important to achieve the expected results from AI-based solutions.

For this customer churn use case, there is a lot of potential in the location-based columns and in the available target information.

### Import needed packages

In [2]:
import os
from pathlib import Path

from functions.saving_functions import save_file, read_file

from pandas_profiling import ProfileReport

from ydata.dataset import Dataset
from ydata.metadata import Metadata

from sklearn.cluster import DBSCAN
import joblib

#### Environment variables

In [3]:
def env_flag(name, default="True"):
    """Read the environment variable `name` and interpret it as a boolean.

    `bool()` on a string is True for every non-empty value, so a plain
    `bool(os.environ.get(...))` would treat "False" or "0" as enabled.
    Here the value is compared (case-insensitively, ignoring surrounding
    whitespace) against a small set of "off" spellings; anything else is True.

    Parameters
    ----------
    name : str
        Name of the environment variable to read.
    default : str
        Value used when the variable is not set.

    Returns
    -------
    bool
        False for "", "0", "false", "f", "no", "n" or "off"; True otherwise.
    """
    raw_value = os.environ.get(name, default)
    return raw_value.strip().lower() not in {"", "0", "false", "f", "no", "n", "off"}


# Feature-engineering switch; enabled unless FEATURE_ENG is set to an "off" value.
feat_eng = env_flag('FEATURE_ENG')

## Read the dataset & load the metadata

In [4]:
# Load the dataset and its metadata from local pickle files
# (presumably artifacts written by an earlier pipeline step — TODO confirm).
# NOTE(review): `.pkl` files should only be loaded from trusted sources.
dataset = read_file('dataset.pkl')
metadata = Metadata.load('metadata.pkl')

In [5]:
# Preview the first rows of the loaded dataset.
dataset.head()

Unnamed: 0,CustomerID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,...,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn,CLTV,Churn Reason
0,3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,Male,...,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1,3239,Competitor made better offer
1,9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.30742,Female,...,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1,2701,Moved
2,9305-CDSKC,1,United States,California,Los Angeles,90006,"34.048013, -118.293953",34.048013,-118.293953,Female,...,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes,1,5372,Moved
3,7892-POOKP,1,United States,California,Los Angeles,90010,"34.062125, -118.315709",34.062125,-118.315709,Female,...,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes,1,5003,Moved
4,0280-XJGEX,1,United States,California,Los Angeles,90015,"34.039224, -118.266293",34.039224,-118.266293,Male,...,Yes,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,Yes,1,5340,Competitor had better devices


In [6]:
# Print the metadata summary (dataset type, duplicate rows, per-column data/variable types).
print(metadata)

[1mMetadata Summary 
 
[0m[1mDataset type: [0mTABULAR
[1mDataset attributes: [0m
[1mNumber of columns: [0m32
[1mDuplicate rows: [0m12
[1mTarget column: [0m

[1mColumn detail: [0m
               Column    Data type Variable type
0          CustomerID           id        string
1               Count    numerical           int
2             Country  categorical        string
3               State  categorical        string
4                City  categorical        string
5            Zip Code    numerical           int
6            Lat Long  categorical        string
7            Latitude    numerical         float
8           Longitude    numerical         float
9              Gender  categorical        string
10     Senior Citizen  categorical        string
11            Partner  categorical        string
12         Dependents  categorical        string
13             Tenure    numerical           int
14      Phone Service  categorical        string
15     Multiple Lines  

## Preprocess & Feature Engineering

In [7]:
# Remove any ID columns, as they are expected to be unique throughout the whole dataset
# and therefore carry no information for modelling.
dataset = dataset.drop_columns(metadata.id_vars)

In [8]:
# Clean the dataset based on Metadata: drop every column flagged with a 'constant' warning.
dataset_cols = list(dataset.columns)

# Always bind const_cols (an empty list when there are no 'constant' warnings),
# so the selection below never fails with a NameError and never reuses a stale
# value from a previous run of this cell.
const_cols = [warning.column for warning in metadata.warnings['constant']]

# Filter in the dataset's own column order; a set difference would yield an
# arbitrary order that can change from one kernel session to the next.
dataset = dataset.select_columns([col for col in dataset_cols if col not in const_cols])

In [9]:
# For each 'cardinality' warning, express its reported value as a fraction
# of the dataset length.
len_dataset = len(dataset)
cardinality_cols = [
    card_warning.details['value'] / len_dataset
    for card_warning in metadata.warnings['cardinality']
]

In [10]:
# 'Lat Long' is a calculated field (latitude and longitude joined in one string),
# which means it can be removed.

# Look at how customers are distributed across cities: number of non-null
# 'Zip Code' entries per city, largest cities first.
cities_cardinality = (
    dataset.to_dask()
    .groupby('City')
    .count()
    .sort_values(by='Zip Code', ascending=False)[['Zip Code']]
    .compute()
)

## Explore the city impact with compare profiling

In [11]:
# Split the city names in two groups: cities with at most 10 rows and cities with more than 10.
is_small_city = cities_cardinality['Zip Code'] <= 10
ten_cust = cities_cardinality[is_small_city].reset_index()['City'].tolist()
plus10_cust = cities_cardinality[~is_small_city].reset_index()['City'].tolist()

In [12]:
# Materialise one pandas DataFrame per city group.
# Use the public `to_dask()` accessor (as the other cells do) instead of reaching
# into the private `_data` attribute.
# NOTE(review): assumes `to_dask()` exposes the same frame as `_data` — confirm against the ydata SDK.
dataset_ddf = dataset.to_dask()
small_cities = dataset_ddf[dataset_ddf['City'].isin(ten_cust)].compute()
bigger_cities = dataset_ddf[dataset_ddf['City'].isin(plus10_cust)].compute()

In [13]:
# Build one profile per city group.
smallcities_r = ProfileReport(small_cities, title='Cities with less customers')
biggercities_r = ProfileReport(bigger_cities, title='Cities with +customers')

# Compare the bigger-cities profile against the smaller-cities one.
compare_report = biggercities_r.compare(smallcities_r)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

#### Dropping columns that are not relevant for the use case

In [15]:
# Coordinate columns to drop, as they seem to hold no information for this use case.
# Only the coordinates are listed here; 'City' and 'Zip Code' are not part of this list.
location_var = ['Latitude', 'Longitude', 'Lat Long']

In [16]:
# Keep every column except the location ones, preserving the dataset's column order
# (a set difference would produce an arbitrary order that can vary between runs).
flrt_dataset = dataset.select_columns([col for col in dataset.columns if col not in location_var])

In [17]:
# List the distinct cities present in the filtered dataset.
# NOTE(review): the previous comment mentioned creating a "Los Angeles or not" column;
# no such column is created in this cell.
flrt_dataset.to_dask()['City'].unique().compute()

0           Los Angeles
1         Beverly Hills
2       Huntington Park
3               Lynwood
4        Marina Del Rey
             ...       
1124            Milford
1125            Calpine
1126           Standish
1127           Tulelake
1128     Olympic Valley
Name: City, Length: 1129, dtype: string

In [18]:
# Preview the first rows of the filtered dataset.
flrt_dataset.head()

Unnamed: 0,City,Gender,Senior Citizen,Partner,Dependents,Tenure,Phone Service,Multiple Lines,Internet Service,Online Security,...,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn,CLTV,Churn Reason
0,Los Angeles,Male,No,No,No,2,Yes,No,DSL,Yes,...,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1,3239,Competitor made better offer
1,Los Angeles,Female,No,No,Yes,2,Yes,No,Fiber optic,No,...,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1,2701,Moved
2,Los Angeles,Female,No,No,Yes,8,Yes,Yes,Fiber optic,No,...,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes,1,5372,Moved
3,Los Angeles,Female,No,Yes,Yes,28,Yes,Yes,Fiber optic,No,...,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes,1,5003,Moved
4,Los Angeles,Male,No,No,Yes,49,Yes,Yes,Fiber optic,No,...,Yes,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,Yes,1,5340,Competitor had better devices


In [19]:
# Get the Dask DataFrame view of the filtered dataset for the transformations that follow.
dd_data = flrt_dataset.to_dask()

In [20]:
# Per-city averages of the monthly and total charges, with 'City' as a regular column.
# NOTE(review): 'Montly' is misspelled in the output column name; it is kept as-is
# because later steps may reference the column by this exact name.
city_means = dd_data.groupby(['City'])[['Monthly Charges', 'Total Charges']].mean()
dd_city = city_means.reset_index().rename(
    columns={
        'Monthly Charges': 'Avg Montly charges per city',
        'Total Charges': 'Avg Total charges per city',
    }
)

In [21]:
# Attach the per-city averages to every customer row by joining on 'City'.
# The inner join and the ('_x', '_y') suffixes are the merge defaults, so they
# are not spelled out in full here.
dd_data = dd_data.merge(dd_city, how='inner', on='City')

In [22]:
# 'City' is no longer needed once the per-city averages are merged in;
# 'Churn Label' appears to duplicate the numeric 'Churn' column (Yes/1 in the preview) — TODO confirm.
dd_data = dd_data.drop(columns=['City', 'Churn Label'])

### Dropping churn reason as it leaks information

In [23]:
# 'Churn Reason' leaks the target, so it is removed from the features.
dd_data = dd_data.drop(columns=['Churn Reason'])

## Missing values processing

In [24]:
# Drop every row that still contains a missing value in any column.
dd_data = dd_data.dropna()

In [25]:
# Wrap the preprocessed Dask DataFrame in a ydata Dataset and build its Metadata.
preprocessed_data = Dataset(dd_data)
preprocessed_meta = Metadata(preprocessed_data)

[                                        ] | 0% Completed | 102.91 ms

  warn("Datasets other than Timeseries don't make use of dataset_attrs")


[########################################] | 100% Completed | 316.01 ms
[########################################] | 100% Completed | 1.30 sms


## Save as new datasource

In [26]:
# Add the connector details here: replace '{connector-id}' with the ID of the
# connector that should receive the preprocessed data.
from ydata.labs.connectors import Connectors
from ydata.labs import GoogleCloudStorageDataSource, DataType, FileType

# Fetch the connector by its ID.
conn=Connectors.get('{connector-id}')

In [None]:
# Write the preprocessed dataset through the connector.
# Replace "{add-file-path}" with the destination path before running.
conn.write_file(data=preprocessed_data, 
                path="{add-file-path}")

In [None]:
# Register the written file as a new tabular CSV data source.
# "{add-file-path}" is a placeholder; presumably it should match the path used
# in the write_file call — TODO confirm.
datasource = GoogleCloudStorageDataSource(name='Customer churn - Preprocessed',
                                          connector=conn,
                                          data_type=DataType.TABULAR,
                                          path="{add-file-path}",
                                          file_type=FileType.CSV,
                                         )
datasource.create()

## Create pipeline outputs

### Saving the artifacts

In [31]:
# Pipeline parameters: expose the ID of the newly created data source.
# Requires the data source creation cell to have been run first.
parameters = {'preprocessed_id': datasource.id}

In [32]:
# Persist the metadata of the preprocessed dataset.
# The return value of save() is intentionally not assigned to `metadata`:
# rebinding that name would clobber the Metadata object loaded at the top of
# the notebook and break any earlier cell that is re-run afterwards.
preprocessed_meta.save('proc_metadata.pkl')

# Saving pipeline parameters file
save_file(parameters, file_path='pipeline_parameters.pkl')

In [None]:
import json

# Embed the comparison report as an inline web-app output and write it to
# 'mlpipeline-ui-metadata.json'.
report_html = compare_report.to_html()
web_app_output = {
    'type': 'web-app',
    'storage': 'inline',
    'source': report_html,
}
profile_pipeline_output = {'outputs': [web_app_output]}

with open('mlpipeline-ui-metadata.json', 'w') as metadata_file:
    json.dump(profile_pipeline_output, metadata_file)