# Exploratory Data Analysis Part 2: Feature Engineering

In [176]:
# Libraries importing
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

In [177]:
# Read training data
train_data = pd.read_csv('./train_data.csv', index_col=0)

  train_data = pd.read_csv('./train_data.csv', index_col=0)


In [178]:
# Take a look on data inside the training dataset
train_data.head(10)

Unnamed: 0_level_0,PAYMENT_DAY,APPLICATION_SUBMISSION_TYPE,POSTAL_ADDRESS_TYPE,SEX,MARITAL_STATUS,QUANT_DEPENDANTS,STATE_OF_BIRTH,CITY_OF_BIRTH,NACIONALITY,RESIDENCIAL_STATE,...,FLAG_PROFESSIONAL_PHONE,PROFESSIONAL_PHONE_AREA_CODE,MONTHS_IN_THE_JOB,PROFESSION_CODE,OCCUPATION_TYPE,PRODUCT,AGE,RESIDENCIAL_ZIP_3,PROFESSIONAL_ZIP_3,TARGET_LABEL_BAD=1
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5,Web,1,F,6,1,RN,Assu,1,RN,...,N,,0,9.0,4.0,1,32,595,595,1
2,15,Carga,1,F,2,0,RJ,rio de janeiro,1,RJ,...,N,,0,11.0,4.0,1,34,230,230,1
3,5,Web,1,F,2,0,RN,GARANHUNS,1,RN,...,N,,0,11.0,,1,27,591,591,0
4,20,Web,1,F,2,0,PE,CABO,1,PE,...,N,,0,,,1,61,545,545,0
5,10,Web,1,M,2,0,RJ,RIO DE JANEIRO,1,RJ,...,N,,0,9.0,5.0,1,48,235,235,1
6,10,0,1,M,2,0,MG,CARMO DO RIO CLARO,1,MG,...,N,,0,9.0,2.0,2,40,371,371,1
7,15,Carga,1,F,2,2,BA,salvador,1,BA,...,N,,0,11.0,4.0,1,40,413,413,1
8,25,Web,1,F,1,0,MG,ATALEIA,1,SP,...,Y,5.0,0,11.0,2.0,1,28,686,686,0
9,15,0,1,F,1,0,SP,LENCOIS PAULISTA,1,SP,...,N,,0,0.0,2.0,2,31,172,172,0
10,5,0,1,F,1,0,RS,SANTA MARIA,1,RS,...,Y,54.0,0,9.0,2.0,1,41,914,914,0


Before imputing missing values, It is necessary to contrast the kind of values we have in each variable and the expected values given by the file 'PAKDD2010_VariablesList.XLS'

In [179]:
# Build a dictionary with expected values for categorical variables
expected_values = {
    'PAYMENT_DAY': [1, 5, 10, 15, 20, 25],
    'APPLICATION_SUBMISSION_TYPE': ['Web', 'Carga'],
    'QUANT_ADDITIONAL_CARDS': [1, 2],
    'POSTAL_ADDRESS_TYPE': [1, 2],
    'SEX': ['M', 'F'],
    'MARITAL_STATUS': [1, 2, 3, 4, 5, 6, 7],
    'EDUCATION_LEVEL': [1, 2, 3, 4, 5],
    'NACIONALITY': [0, 1, 2],
    'FLAG_RESIDENTIAL_PHONE': ['Y', 'N'],
    'RESIDENCE_TYPE': [1, 2, 3, 4 ,5],
    'FLAG_MOBILE_PHONE': ['Y', 'N'],
    'FLAG_EMAIL': [0, 1],
    'FLAG_VISA': [0, 1],
    'FLAG_MASTERCARD': [0, 1],
    'FLAG_DINERS': [0, 1],
    'FLAG_AMERICAN_EXPRESS': [0, 1],
    'FLAG_OTHER_CARDS': [0, 1],
    'QUANT_BANKING_ACCOUNTS': [0, 1, 2],
    'QUANT_SPECIAL_BANKING_ACCOUNTS': [0, 1, 2],
    'COMPANY': ['Y', 'N'],
    'FLAG_PROFESSIONAL_PHONE': ['Y', 'N'],
    'OCCUPATION_TYPE': [1, 2, 3, 4, 5],
    'EDUCATION_LEVEL_2': [1, 2, 3, 4, 5],
    'FLAG_HOME_ADDRESS_DOCUMENT': [0, 1],
    'FLAG_RG': [0, 1],
    'FLAG_CPF': [0, 1],
    'FLAG_INCOME_PROOF': [0, 1],
    'FLAG_ACSP_RECORD': ['Y', 'N'],
    'TARGET_LABEL_BAD=1': [0, 1]
}

In [180]:
for col in train_data.columns:
    if col in expected_values:
        # Define a categorical data type
        cat_dtype = pd.CategoricalDtype(categories=expected_values[col],
                                        ordered=False)
        train_data[col] = train_data[col].astype(dtype=cat_dtype)

In [181]:
# Get the numerical variables
num_vars = train_data._get_numeric_data().columns
# Get the categorical variables
cat_vars = list(set(train_data.columns) - set(num_vars))

In [182]:
num_vars

Index(['QUANT_DEPENDANTS', 'MONTHS_IN_RESIDENCE', 'PERSONAL_MONTHLY_INCOME',
       'OTHER_INCOMES', 'PERSONAL_ASSETS_VALUE', 'QUANT_CARS',
       'MONTHS_IN_THE_JOB', 'PROFESSION_CODE', 'PRODUCT', 'AGE'],
      dtype='object')

In [183]:
cat_vars

['PROFESSIONAL_STATE',
 'RESIDENCIAL_CITY',
 'RESIDENCIAL_PHONE_AREA_CODE',
 'RESIDENCIAL_STATE',
 'STATE_OF_BIRTH',
 'PROFESSIONAL_PHONE_AREA_CODE',
 'NACIONALITY',
 'PAYMENT_DAY',
 'POSTAL_ADDRESS_TYPE',
 'FLAG_DINERS',
 'FLAG_AMERICAN_EXPRESS',
 'FLAG_PROFESSIONAL_PHONE',
 'FLAG_OTHER_CARDS',
 'FLAG_MASTERCARD',
 'FLAG_EMAIL',
 'RESIDENCIAL_ZIP_3',
 'CITY_OF_BIRTH',
 'PROFESSIONAL_ZIP_3',
 'RESIDENCE_TYPE',
 'COMPANY',
 'QUANT_SPECIAL_BANKING_ACCOUNTS',
 'APPLICATION_SUBMISSION_TYPE',
 'RESIDENCIAL_BOROUGH',
 'FLAG_VISA',
 'OCCUPATION_TYPE',
 'TARGET_LABEL_BAD=1',
 'SEX',
 'FLAG_RESIDENCIAL_PHONE',
 'MARITAL_STATUS',
 'QUANT_BANKING_ACCOUNTS']

Dataframe method '_get_numeric_data' returns variables whose data type is numeric but if we take a look on those columns, we can conclude that they are some variables that are categorical but expressed by numbers, so we have to find another way to split the data between numerical and categorical variables.

The new criteria will be the number of unique values that are present in the data for each column, so those variables that have less than 20 different values will be considered as categorical because 20 different values inside a dataset with 50000 samples is a small value so each of those 20 numbers will be repeated (in average) a lot of times.

In [184]:
'QUANT_DEPENDANTS' not in expected_values.keys()

True

In [185]:
'QUANT_DEPENDANTS' in num_vars

True

In [186]:
if train_data['QUANT_DEPENDANTS'].nunique() <= 20 and 'QUANT_DEPENDANTS' not in expected_values.keys():
    print('Ok')


Ok


In [187]:
# Let's cast numerical variables with less than 20 unique values to string type
for col in num_vars:
    if train_data[col].nunique() <= 20 and col in expected_values.keys():
        # First cast to int
        train_data[col] = pd.to_numeric(train_data[col],
                                        errors='coerce',     # Errors to NaN
                                        downcast='integer')
        # Create categorical data type for its unique values
        cat_dtype = pd.CategoricalDtype(
            categories=train_data[col].dropna().unique())
        # Cast to categorical data type
        train_data[col] = train_data[col].astype(dtype=cat_dtype)

Let's get the new numerical and categorical variables

In [188]:
# Get the numerical variables
num_vars = train_data._get_numeric_data().columns
# Get the categorical variables
cat_vars = list(set(train_data.columns) - set(num_vars))

In [189]:
num_vars

Index(['QUANT_DEPENDANTS', 'MONTHS_IN_RESIDENCE', 'PERSONAL_MONTHLY_INCOME',
       'OTHER_INCOMES', 'PERSONAL_ASSETS_VALUE', 'QUANT_CARS',
       'MONTHS_IN_THE_JOB', 'PROFESSION_CODE', 'PRODUCT', 'AGE'],
      dtype='object')

In [190]:
cat_vars

['PROFESSIONAL_STATE',
 'RESIDENCIAL_CITY',
 'RESIDENCIAL_PHONE_AREA_CODE',
 'RESIDENCIAL_STATE',
 'STATE_OF_BIRTH',
 'PROFESSIONAL_PHONE_AREA_CODE',
 'NACIONALITY',
 'PAYMENT_DAY',
 'POSTAL_ADDRESS_TYPE',
 'FLAG_DINERS',
 'FLAG_AMERICAN_EXPRESS',
 'FLAG_PROFESSIONAL_PHONE',
 'FLAG_OTHER_CARDS',
 'FLAG_MASTERCARD',
 'FLAG_EMAIL',
 'RESIDENCIAL_ZIP_3',
 'CITY_OF_BIRTH',
 'PROFESSIONAL_ZIP_3',
 'RESIDENCE_TYPE',
 'COMPANY',
 'QUANT_SPECIAL_BANKING_ACCOUNTS',
 'APPLICATION_SUBMISSION_TYPE',
 'RESIDENCIAL_BOROUGH',
 'FLAG_VISA',
 'OCCUPATION_TYPE',
 'TARGET_LABEL_BAD=1',
 'SEX',
 'FLAG_RESIDENCIAL_PHONE',
 'MARITAL_STATUS',
 'QUANT_BANKING_ACCOUNTS']

Now, it will be good to take a look on how many different values do the categorical variables have, in order to know how many variables will appear when we apply One Hot Encoding to those variables with a high number of different values.

In [191]:
train_data[cat_vars].nunique()

PROFESSIONAL_STATE                   28
RESIDENCIAL_CITY                   3529
RESIDENCIAL_PHONE_AREA_CODE         102
RESIDENCIAL_STATE                    27
STATE_OF_BIRTH                       29
PROFESSIONAL_PHONE_AREA_CODE         87
NACIONALITY                           3
PAYMENT_DAY                           6
POSTAL_ADDRESS_TYPE                   2
FLAG_DINERS                           2
FLAG_AMERICAN_EXPRESS                 2
FLAG_PROFESSIONAL_PHONE               2
FLAG_OTHER_CARDS                      2
FLAG_MASTERCARD                       2
FLAG_EMAIL                            2
RESIDENCIAL_ZIP_3                  1481
CITY_OF_BIRTH                      9910
PROFESSIONAL_ZIP_3                 1481
RESIDENCE_TYPE                        5
COMPANY                               2
QUANT_SPECIAL_BANKING_ACCOUNTS        3
APPLICATION_SUBMISSION_TYPE           2
RESIDENCIAL_BOROUGH               14511
FLAG_VISA                             2
OCCUPATION_TYPE                       5


As they are variables with a high number of different values, we will drop them to keep a dataset with a shape we can handle after One Hot Encoding.

In [192]:
cols_to_drop = []
# List columns to drop
for col in cat_vars:
    if train_data[col].nunique() > 80:
        cols_to_drop.append(col)

# Drop columns
train_data.drop(columns=cols_to_drop, inplace=True)

In [193]:
# Take a look on the remaining dataframe
train_data.head(10)

Unnamed: 0_level_0,PAYMENT_DAY,APPLICATION_SUBMISSION_TYPE,POSTAL_ADDRESS_TYPE,SEX,MARITAL_STATUS,QUANT_DEPENDANTS,STATE_OF_BIRTH,NACIONALITY,RESIDENCIAL_STATE,FLAG_RESIDENCIAL_PHONE,...,QUANT_CARS,COMPANY,PROFESSIONAL_STATE,FLAG_PROFESSIONAL_PHONE,MONTHS_IN_THE_JOB,PROFESSION_CODE,OCCUPATION_TYPE,PRODUCT,AGE,TARGET_LABEL_BAD=1
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5,Web,1,F,6,1,RN,1,RN,Y,...,0,N,,N,0,9.0,4.0,1,32,1
2,15,Carga,1,F,2,0,RJ,1,RJ,Y,...,0,Y,,N,0,11.0,4.0,1,34,1
3,5,Web,1,F,2,0,RN,1,RN,Y,...,0,N,,N,0,11.0,,1,27,0
4,20,Web,1,F,2,0,PE,1,PE,N,...,0,N,,N,0,,,1,61,0
5,10,Web,1,M,2,0,RJ,1,RJ,Y,...,0,N,,N,0,9.0,5.0,1,48,1
6,10,,1,M,2,0,MG,1,MG,Y,...,1,Y,MG,N,0,9.0,2.0,2,40,1
7,15,Carga,1,F,2,2,BA,1,BA,Y,...,0,N,,N,0,11.0,4.0,1,40,1
8,25,Web,1,F,1,0,MG,1,SP,N,...,0,Y,SP,Y,0,11.0,2.0,1,28,0
9,15,,1,F,1,0,SP,1,SP,Y,...,1,Y,,N,0,0.0,2.0,2,31,0
10,5,,1,F,1,0,RS,1,RS,Y,...,1,Y,RS,Y,0,9.0,2.0,1,41,0


As we can see, there are son samples that have a string like ' ' and this is just a missing value, so let's replace them with NaN so that pandas and numpy can recognize them as missing values.

In [200]:
# Count empty strings present in each column
train_data.isin(values=[' ']).sum()

PAYMENT_DAY                           0
APPLICATION_SUBMISSION_TYPE           0
POSTAL_ADDRESS_TYPE                   0
SEX                                   0
MARITAL_STATUS                        0
QUANT_DEPENDANTS                      0
STATE_OF_BIRTH                     2064
NACIONALITY                           0
RESIDENCIAL_STATE                     0
FLAG_RESIDENCIAL_PHONE                0
RESIDENCE_TYPE                        0
MONTHS_IN_RESIDENCE                   0
FLAG_EMAIL                            0
PERSONAL_MONTHLY_INCOME               0
OTHER_INCOMES                         0
FLAG_VISA                             0
FLAG_MASTERCARD                       0
FLAG_DINERS                           0
FLAG_AMERICAN_EXPRESS                 0
FLAG_OTHER_CARDS                      0
QUANT_BANKING_ACCOUNTS                0
QUANT_SPECIAL_BANKING_ACCOUNTS        0
PERSONAL_ASSETS_VALUE                 0
QUANT_CARS                            0
COMPANY                               0


In [201]:
# Replace empty strings
train_data.replace(to_replace=['', ' '], value=np.nan, inplace=True)

Finally, we'll save the dataset into a csv file.

In [202]:
train_data.to_csv('./preprocessed_data.csv')