# Exploratory Data Analysis Part 2: Feature Engineering

In [2]:
import os

# Libraries importing
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from lightgbm import LGBMClassifier
import xgboost as xgb

from utils import model_evaluation


In [3]:
# GLOBAL CONSTANTS
# Seed passed as `random_state` to both train_test_split calls below,
# so the train/validation/test partitions are the same on every run.
SEED = 1

In [4]:
# Read training data
# os.listdir() returns entries in arbitrary (filesystem-dependent) order, so
# the names are sorted before taking the last one; otherwise a different
# folder could be picked on another machine or run. Only directories are
# considered, so a stray file inside ./data can never be selected.
# NOTE(review): "last in lexicographic order" presumably means the latest
# dataset version -- confirm against the folder naming scheme.
data_root = './data'
data_subdirs = sorted(
    entry for entry in os.listdir(data_root)
    if os.path.isdir(os.path.join(data_root, entry))
)
if not data_subdirs:
    raise FileNotFoundError(
        f'No dataset folder found inside {data_root!r}'
    )
data_dir = data_subdirs[-1]
# The first CSV column is used as the DataFrame index
data = pd.read_csv(os.path.join(data_root,
                                data_dir,
                                'preprocessed_data.csv'), index_col=0)

  data = pd.read_csv(os.path.join('./data',


In [5]:
# Preview the first 10 rows of the loaded dataset
data.head(10)

Unnamed: 0_level_0,PAYMENT_DAY,APPLICATION_SUBMISSION_TYPE,POSTAL_ADDRESS_TYPE,SEX,MARITAL_STATUS,QUANT_DEPENDANTS,STATE_OF_BIRTH,NACIONALITY,RESIDENCIAL_STATE,FLAG_RESIDENCIAL_PHONE,...,PROFESSIONAL_STATE,FLAG_PROFESSIONAL_PHONE,PROFESSIONAL_PHONE_AREA_CODE,MONTHS_IN_THE_JOB,PROFESSION_CODE,OCCUPATION_TYPE,PRODUCT,AGE,RESIDENCIAL_ZIP_3,TARGET_LABEL_BAD=1
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5,Web,1,F,6.0,1,RN,1,RN,Y,...,NO_JOB,N,NO_DATA,0,9.0,4.0,1,32,595,1
2,15,Carga,1,F,2.0,0,RJ,1,RJ,Y,...,NO_JOB,N,NO_DATA,0,11.0,4.0,1,34,230,1
3,5,Web,1,F,2.0,0,RN,1,RN,Y,...,NO_JOB,N,NO_DATA,0,11.0,,1,27,591,0
4,20,Web,1,F,2.0,0,PE,1,PE,N,...,NO_JOB,N,NO_DATA,0,,,1,61,545,0
5,10,Web,1,M,2.0,0,RJ,1,RJ,Y,...,NO_JOB,N,NO_DATA,0,9.0,5.0,1,48,235,1
6,10,,1,M,2.0,0,MG,1,MG,Y,...,MG,N,NO_DATA,0,9.0,2.0,2,40,371,1
7,15,Carga,1,F,2.0,2,BA,1,BA,Y,...,NO_JOB,N,NO_DATA,0,11.0,4.0,1,40,413,1
8,25,Web,1,F,1.0,0,MG,1,SP,N,...,SP,Y,5,0,11.0,2.0,1,28,686,0
9,15,,1,F,1.0,0,SP,1,SP,Y,...,NO_JOB,N,NO_DATA,0,0.0,2.0,2,31,172,0
10,5,,1,F,1.0,0,RS,1,RS,Y,...,RS,Y,54,0,9.0,2.0,1,41,914,0


In [6]:
# Read the numerical and categorical features txt files
def _read_feature_names(path):
    """Return the feature names stored one per line in the text file `path`.

    The trailing newline of each line is dropped. Blank (empty or
    whitespace-only) lines are skipped so that, for example, a trailing empty
    line in the file does not end up as an empty feature name.

    Parameters
    ----------
    path : str
        Path of the text file to read.

    Returns
    -------
    list of str
        Feature names in the order they appear in the file.
    """
    with open(path, 'r') as f:
        names = [line.rstrip('\n') for line in f]
    return [name for name in names if name.strip()]


num_features = _read_feature_names('./numerical_features_names.txt')
cat_features = _read_feature_names('./categorical_features_names.txt')

In [7]:
# Convert dtypes to the right ones
# First use suggested dtypes from pandas core
data = data.convert_dtypes()
# Remove Target variable from features.
# The list is rebuilt instead of calling list.remove() so that re-running this
# cell does not raise ValueError once the target has already been dropped.
cat_features = [col for col in cat_features if col != 'TARGET_LABEL_BAD=1']
# Then use the lists generated before
for col in cat_features:
    # Go through the string dtype first, then make the column categorical
    data[col] = pd.Categorical(data[col].astype('string'))

# convert_dtypes() switches columns to dtypes that represent missing values
# as pd.NA. Cast the columns that are still integer-typed (the categorical
# ones were converted above) to float32 so their missing values become np.nan.
cols_to_float = data.select_dtypes(include='int').columns
data[cols_to_float] = data[cols_to_float].astype(dtype='float32')

## Dataset Split

In [8]:
# Separate the label column from the feature columns
target_col = 'TARGET_LABEL_BAD=1'
X = data.drop(columns=[target_col])
y = data[target_col]

# Hold out 10% of the rows as the test set
X_train_0, X_test, y_train_0, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=SEED,
)

# Hold out 10% of the remaining rows as the validation set
X_train, X_val, y_train, y_val = train_test_split(
    X_train_0,
    y_train_0,
    test_size=0.1,
    random_state=SEED,
)

In [9]:
## Build processing pipeline
# Create individual transformers
# Numerical features: fill missing values with the median, then standardize
num_transformer = Pipeline(
        steps=[
            ('imputer' , SimpleImputer(strategy='median')),
            ('scaler'  , StandardScaler())
        ]
)

# The OneHotEncoder `sparse` argument was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4. Try the new name first and fall back to
# the old one, so the notebook runs on both older and current releases.
encoder_kwargs = dict(drop='if_binary', handle_unknown='ignore')
try:
    cat_encoder = OneHotEncoder(sparse_output=False, **encoder_kwargs)
except TypeError:
    # scikit-learn < 1.2 does not know `sparse_output`
    cat_encoder = OneHotEncoder(sparse=False, **encoder_kwargs)

# Categorical features: fill missing values (pd.NA) with the most frequent
# category, then one-hot encode into a dense array
cat_transformer = Pipeline(
        steps=[
            ('imputer2', SimpleImputer(missing_values=pd.NA,
                                      strategy='most_frequent')),
            ('encoder' , cat_encoder)
        ]
)

# Ensemble all the transformers
# Each transformer is applied only to the columns listed for it
preprocessor = ColumnTransformer(
        transformers=[
            ('num' , num_transformer , num_features),
            ('cat' , cat_transformer , cat_features)
        ]
)

In [10]:
# Fit the ColumnTransformer 'preprocessor' on the training split only ...
X_train_pre = preprocessor.fit_transform(X_train)
# ... then apply the already-fitted transformer to the full (train + validation)
# set, the validation split and the test split, in that order
X_train_0_pre, X_val_pre, X_test_pre = (
    preprocessor.transform(split) for split in (X_train_0, X_val, X_test)
)



In [None]:
# Save all datasets
# Map each output file name to the array written to it; the files are written
# in this order, as comma-separated text, next to the input data.
output_dir = os.path.join('./data', data_dir)
arrays_by_file = {
    'X_train.csv': X_train_pre,
    'X_val.csv': X_val_pre,
    'X_test.csv': X_test_pre,
    'y_train.csv': y_train,
    'y_val.csv': y_val,
    'y_test.csv': y_test,
}
for file_name, array in arrays_by_file.items():
    np.savetxt(os.path.join(output_dir, file_name), array, delimiter=',')

In [15]:
# Save the output feature names reported by the preprocessor to a CSV file
# stored alongside the datasets
features_out_path = os.path.join('./data', data_dir, 'features_names_out.csv')
features_out = pd.Series(preprocessor.get_feature_names_out())
features_out.to_csv(features_out_path)