In [47]:
# !pip uninstall numpy --yes
# !pip install numpy==1.23.1

In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import joblib

In [49]:
app_train = pd.read_csv("home-credit-default-risk/application_train.csv")

In [50]:
app_train.shape

(307511, 122)

In [51]:
app_train.dtypes

SK_ID_CURR                      int64
TARGET                          int64
NAME_CONTRACT_TYPE             object
CODE_GENDER                    object
FLAG_OWN_CAR                   object
                               ...   
AMT_REQ_CREDIT_BUREAU_DAY     float64
AMT_REQ_CREDIT_BUREAU_WEEK    float64
AMT_REQ_CREDIT_BUREAU_MON     float64
AMT_REQ_CREDIT_BUREAU_QRT     float64
AMT_REQ_CREDIT_BUREAU_YEAR    float64
Length: 122, dtype: object

### 1. Checking Duplicate Data

In [52]:
app_train.duplicated().sum()

0

In [53]:
# app_train.drop_duplicates()

### 2. Handling Missing Values

In [54]:
# Per-column missing-value summary, ordered from the fewest to the most nulls.
null = (
    app_train.isna()
    .sum()
    .reset_index()
    .rename(columns={"index": "column_name", 0: "null_counts"})
    .sort_values(by=["null_counts"])
    .set_index("column_name")
)
# Share of rows (in percent, rounded to 2 decimals) that are missing per column.
null["null_percentage"] = null["null_counts"].div(app_train.shape[0]).mul(100).round(2)
null

Unnamed: 0_level_0,null_counts,null_percentage
column_name,Unnamed: 1_level_1,Unnamed: 2_level_1
SK_ID_CURR,0,0.00
HOUR_APPR_PROCESS_START,0,0.00
REG_REGION_NOT_WORK_REGION,0,0.00
LIVE_REGION_NOT_WORK_REGION,0,0.00
REG_CITY_NOT_LIVE_CITY,0,0.00
...,...,...
NONLIVINGAPARTMENTS_MEDI,213514,69.43
NONLIVINGAPARTMENTS_MODE,213514,69.43
COMMONAREA_MODE,214865,69.87
COMMONAREA_AVG,214865,69.87


In [55]:
# Delete columns that have more than MAX_NULL_PERCENTAGE percent missing values.
MAX_NULL_PERCENTAGE = 20

# Names of the columns whose missing share exceeds the threshold.
del_column = null.index[null["null_percentage"] > MAX_NULL_PERCENTAGE].tolist()

# Reassign instead of mutating in place, and tolerate columns that are already
# gone, so that re-running this cell is a no-op rather than a KeyError.
app_train = app_train.drop(columns=del_column, errors="ignore")

In [56]:
app_train.shape

(307511, 72)

In [57]:
# Missing-value summary restricted to the columns that were kept (<= 20% missing).
null2 = null.drop(index=del_column)
null2.shape[0]

72

In [58]:
null2

Unnamed: 0_level_0,null_counts,null_percentage
column_name,Unnamed: 1_level_1,Unnamed: 2_level_1
SK_ID_CURR,0,0.00
HOUR_APPR_PROCESS_START,0,0.00
REG_REGION_NOT_WORK_REGION,0,0.00
LIVE_REGION_NOT_WORK_REGION,0,0.00
REG_CITY_NOT_LIVE_CITY,0,0.00
...,...,...
AMT_REQ_CREDIT_BUREAU_MON,41519,13.50
AMT_REQ_CREDIT_BUREAU_WEEK,41519,13.50
AMT_REQ_CREDIT_BUREAU_YEAR,41519,13.50
AMT_REQ_CREDIT_BUREAU_QRT,41519,13.50


In [59]:
app_train.dtypes

SK_ID_CURR                      int64
TARGET                          int64
NAME_CONTRACT_TYPE             object
CODE_GENDER                    object
FLAG_OWN_CAR                   object
                               ...   
AMT_REQ_CREDIT_BUREAU_DAY     float64
AMT_REQ_CREDIT_BUREAU_WEEK    float64
AMT_REQ_CREDIT_BUREAU_MON     float64
AMT_REQ_CREDIT_BUREAU_QRT     float64
AMT_REQ_CREDIT_BUREAU_YEAR    float64
Length: 72, dtype: object

#### 2a. Change The Dtype

In [60]:
# Change the dtype of TARGET
# Casting the 0/1 label to object makes select_dtypes(include = "object") pick it
# up as a categorical column and keeps it out of the numerical column list.
app_train["TARGET"] = app_train["TARGET"].astype(object)

In [61]:
# Split the column names by dtype: numeric vs. object (categorical).
list_num = app_train.select_dtypes(include="number").columns.tolist()
list_cat = app_train.select_dtypes(include="object").columns.tolist()

len(list_num), len(list_cat)

(60, 12)

In [62]:
# Columns stored as numbers that are to be treated as categorical: flags,
# region ratings and region/city mismatch indicators. They are cast to object
# in the next cell.
num_to_cat = ['FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 
              'FLAG_EMAIL', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 
              'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 
              'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY',
              'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 
              'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 
              'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11','FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 
              'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16','FLAG_DOCUMENT_17', 
              'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21']

In [63]:
# Re-type every column listed in num_to_cat as object (categorical).
app_train = app_train.astype(dict.fromkeys(num_to_cat, object))

In [64]:
# Rebuild both column lists now that the flag/rating columns are object dtype.
list_num = app_train.select_dtypes(include="number").columns.tolist()
list_cat = app_train.select_dtypes(include="object").columns.tolist()

len(list_num), len(list_cat)

(26, 46)

In [65]:
app_train.dtypes

SK_ID_CURR                      int64
TARGET                         object
NAME_CONTRACT_TYPE             object
CODE_GENDER                    object
FLAG_OWN_CAR                   object
                               ...   
AMT_REQ_CREDIT_BUREAU_DAY     float64
AMT_REQ_CREDIT_BUREAU_WEEK    float64
AMT_REQ_CREDIT_BUREAU_MON     float64
AMT_REQ_CREDIT_BUREAU_QRT     float64
AMT_REQ_CREDIT_BUREAU_YEAR    float64
Length: 72, dtype: object

In [66]:
# Missing in numerical data
app_train[list_num].isna().sum()

SK_ID_CURR                        0
CNT_CHILDREN                      0
AMT_INCOME_TOTAL                  0
AMT_CREDIT                        0
AMT_ANNUITY                      12
AMT_GOODS_PRICE                 278
REGION_POPULATION_RELATIVE        0
DAYS_BIRTH                        0
DAYS_EMPLOYED                     0
DAYS_REGISTRATION                 0
DAYS_ID_PUBLISH                   0
CNT_FAM_MEMBERS                   2
HOUR_APPR_PROCESS_START           0
EXT_SOURCE_2                    660
EXT_SOURCE_3                  60965
OBS_30_CNT_SOCIAL_CIRCLE       1021
DEF_30_CNT_SOCIAL_CIRCLE       1021
OBS_60_CNT_SOCIAL_CIRCLE       1021
DEF_60_CNT_SOCIAL_CIRCLE       1021
DAYS_LAST_PHONE_CHANGE            1
AMT_REQ_CREDIT_BUREAU_HOUR    41519
AMT_REQ_CREDIT_BUREAU_DAY     41519
AMT_REQ_CREDIT_BUREAU_WEEK    41519
AMT_REQ_CREDIT_BUREAU_MON     41519
AMT_REQ_CREDIT_BUREAU_QRT     41519
AMT_REQ_CREDIT_BUREAU_YEAR    41519
dtype: int64

In [67]:
# save
joblib.dump(list_num, "list_num.pkl")

['list_num.pkl']

In [68]:
# Column means of the numerical features (NaNs are skipped by mean()).
num_mean = app_train[list_num].mean()
# Persist the means as CSV.
# NOTE(review): presumably so the same values can impute unseen data - confirm.
num_mean.to_csv("num_mean.csv")

In [69]:
# Impute missing values in numerical data using the column means.
# Reuse `num_mean` (the Series written to num_mean.csv) instead of recomputing
# the means, so the values applied here are exactly the ones that were saved.
app_train[list_num] = app_train[list_num].fillna(num_mean)

# Sanity check: every numerical column should now report 0 missing values.
app_train[list_num].isna().sum()

SK_ID_CURR                    0
CNT_CHILDREN                  0
AMT_INCOME_TOTAL              0
AMT_CREDIT                    0
AMT_ANNUITY                   0
AMT_GOODS_PRICE               0
REGION_POPULATION_RELATIVE    0
DAYS_BIRTH                    0
DAYS_EMPLOYED                 0
DAYS_REGISTRATION             0
DAYS_ID_PUBLISH               0
CNT_FAM_MEMBERS               0
HOUR_APPR_PROCESS_START       0
EXT_SOURCE_2                  0
EXT_SOURCE_3                  0
OBS_30_CNT_SOCIAL_CIRCLE      0
DEF_30_CNT_SOCIAL_CIRCLE      0
OBS_60_CNT_SOCIAL_CIRCLE      0
DEF_60_CNT_SOCIAL_CIRCLE      0
DAYS_LAST_PHONE_CHANGE        0
AMT_REQ_CREDIT_BUREAU_HOUR    0
AMT_REQ_CREDIT_BUREAU_DAY     0
AMT_REQ_CREDIT_BUREAU_WEEK    0
AMT_REQ_CREDIT_BUREAU_MON     0
AMT_REQ_CREDIT_BUREAU_QRT     0
AMT_REQ_CREDIT_BUREAU_YEAR    0
dtype: int64

In [70]:
# Missing in categorical data
app_train[list_cat].isna().sum()

TARGET                            0
NAME_CONTRACT_TYPE                0
CODE_GENDER                       0
FLAG_OWN_CAR                      0
FLAG_OWN_REALTY                   0
NAME_TYPE_SUITE                1292
NAME_INCOME_TYPE                  0
NAME_EDUCATION_TYPE               0
NAME_FAMILY_STATUS                0
NAME_HOUSING_TYPE                 0
FLAG_MOBIL                        0
FLAG_EMP_PHONE                    0
FLAG_WORK_PHONE                   0
FLAG_CONT_MOBILE                  0
FLAG_PHONE                        0
FLAG_EMAIL                        0
REGION_RATING_CLIENT              0
REGION_RATING_CLIENT_W_CITY       0
WEEKDAY_APPR_PROCESS_START        0
REG_REGION_NOT_LIVE_REGION        0
REG_REGION_NOT_WORK_REGION        0
LIVE_REGION_NOT_WORK_REGION       0
REG_CITY_NOT_LIVE_CITY            0
REG_CITY_NOT_WORK_CITY            0
LIVE_CITY_NOT_WORK_CITY           0
ORGANIZATION_TYPE                 0
FLAG_DOCUMENT_2                   0
FLAG_DOCUMENT_3             

In [71]:
# Most frequent value of every categorical column, keyed by column name.
# TARGET is left out of the mapping.
dict_cat_mode = {
    col: app_train[col].mode()[0]
    for col in list_cat
    if col != "TARGET"
}

In [72]:
# save
joblib.dump(dict_cat_mode, "dict_cat_mode.pkl")

['dict_cat_mode.pkl']

In [73]:
# Impute missing values in categorical data using the mode.
# Only NAME_TYPE_SUITE is filled here. Its fill value is taken from
# dict_cat_mode so that it matches what was saved to dict_cat_mode.pkl.
app_train["NAME_TYPE_SUITE"] = app_train["NAME_TYPE_SUITE"].fillna(dict_cat_mode["NAME_TYPE_SUITE"])

# Sanity check: every categorical column should now report 0 missing values.
app_train[list_cat].isna().sum()

TARGET                         0
NAME_CONTRACT_TYPE             0
CODE_GENDER                    0
FLAG_OWN_CAR                   0
FLAG_OWN_REALTY                0
NAME_TYPE_SUITE                0
NAME_INCOME_TYPE               0
NAME_EDUCATION_TYPE            0
NAME_FAMILY_STATUS             0
NAME_HOUSING_TYPE              0
FLAG_MOBIL                     0
FLAG_EMP_PHONE                 0
FLAG_WORK_PHONE                0
FLAG_CONT_MOBILE               0
FLAG_PHONE                     0
FLAG_EMAIL                     0
REGION_RATING_CLIENT           0
REGION_RATING_CLIENT_W_CITY    0
WEEKDAY_APPR_PROCESS_START     0
REG_REGION_NOT_LIVE_REGION     0
REG_REGION_NOT_WORK_REGION     0
LIVE_REGION_NOT_WORK_REGION    0
REG_CITY_NOT_LIVE_CITY         0
REG_CITY_NOT_WORK_CITY         0
LIVE_CITY_NOT_WORK_CITY        0
ORGANIZATION_TYPE              0
FLAG_DOCUMENT_2                0
FLAG_DOCUMENT_3                0
FLAG_DOCUMENT_4                0
FLAG_DOCUMENT_5                0
FLAG_DOCUM

#### 2b. Exploring Numerical and Categorical Data

In [74]:
# CATEGORICAL DATA

In [75]:
app_train[list_cat].describe().T

Unnamed: 0,count,unique,top,freq
TARGET,307511,2,0,282686
NAME_CONTRACT_TYPE,307511,2,Cash loans,278232
CODE_GENDER,307511,3,F,202448
FLAG_OWN_CAR,307511,2,N,202924
FLAG_OWN_REALTY,307511,2,Y,213312
NAME_TYPE_SUITE,307511,7,Unaccompanied,249818
NAME_INCOME_TYPE,307511,8,Working,158774
NAME_EDUCATION_TYPE,307511,5,Secondary / secondary special,218391
NAME_FAMILY_STATUS,307511,6,Married,196432
NAME_HOUSING_TYPE,307511,6,House / apartment,272868


In [76]:
for i in list_cat:
    print(i)
    print(app_train[i].value_counts())
    print("\n----------")

TARGET
0    282686
1     24825
Name: TARGET, dtype: int64

----------
NAME_CONTRACT_TYPE
Cash loans         278232
Revolving loans     29279
Name: NAME_CONTRACT_TYPE, dtype: int64

----------
CODE_GENDER
F      202448
M      105059
XNA         4
Name: CODE_GENDER, dtype: int64

----------
FLAG_OWN_CAR
N    202924
Y    104587
Name: FLAG_OWN_CAR, dtype: int64

----------
FLAG_OWN_REALTY
Y    213312
N     94199
Name: FLAG_OWN_REALTY, dtype: int64

----------
NAME_TYPE_SUITE
Unaccompanied      249818
Family              40149
Spouse, partner     11370
Children             3267
Other_B              1770
Other_A               866
Group of people       271
Name: NAME_TYPE_SUITE, dtype: int64

----------
NAME_INCOME_TYPE
Working                 158774
Commercial associate     71617
Pensioner                55362
State servant            21703
Unemployed                  22
Student                     18
Businessman                 10
Maternity leave              5
Name: NAME_INCOME_TYPE, dtype

In [77]:
((app_train.isna().sum()) > 0).sum()

0

In [78]:
# "XNA" in CODE_GENDER replace by NAN
# "Unknown" in NAME_FAMILY_STATUS replace by NAN
# "XNA" in ORGANIZATION_TYPE replace by "Other"

# Assign the result back to the column instead of calling
# `app_train[col].replace(..., inplace = True)`: that form mutates a Series
# obtained by indexing the frame, which raises a chained-assignment warning in
# recent pandas and leaves `app_train` unchanged under Copy-on-Write.
app_train["CODE_GENDER"] = app_train["CODE_GENDER"].replace("XNA", np.nan)
app_train["NAME_FAMILY_STATUS"] = app_train["NAME_FAMILY_STATUS"].replace("Unknown", np.nan)
app_train["ORGANIZATION_TYPE"] = app_train["ORGANIZATION_TYPE"].replace("XNA", "Other")

In [79]:
((app_train.isna().sum()) > 0).sum()

2

In [80]:
# Fill the NaNs that replaced the "XNA" / "Unknown" placeholders with each
# column's most frequent value.
for col in ("CODE_GENDER", "NAME_FAMILY_STATUS"):
    app_train[col] = app_train[col].fillna(app_train[col].mode()[0])

app_train[list_cat].isna().sum()

TARGET                         0
NAME_CONTRACT_TYPE             0
CODE_GENDER                    0
FLAG_OWN_CAR                   0
FLAG_OWN_REALTY                0
NAME_TYPE_SUITE                0
NAME_INCOME_TYPE               0
NAME_EDUCATION_TYPE            0
NAME_FAMILY_STATUS             0
NAME_HOUSING_TYPE              0
FLAG_MOBIL                     0
FLAG_EMP_PHONE                 0
FLAG_WORK_PHONE                0
FLAG_CONT_MOBILE               0
FLAG_PHONE                     0
FLAG_EMAIL                     0
REGION_RATING_CLIENT           0
REGION_RATING_CLIENT_W_CITY    0
WEEKDAY_APPR_PROCESS_START     0
REG_REGION_NOT_LIVE_REGION     0
REG_REGION_NOT_WORK_REGION     0
LIVE_REGION_NOT_WORK_REGION    0
REG_CITY_NOT_LIVE_CITY         0
REG_CITY_NOT_WORK_CITY         0
LIVE_CITY_NOT_WORK_CITY        0
ORGANIZATION_TYPE              0
FLAG_DOCUMENT_2                0
FLAG_DOCUMENT_3                0
FLAG_DOCUMENT_4                0
FLAG_DOCUMENT_5                0
FLAG_DOCUM

In [81]:
app_train[list_cat].describe().T

Unnamed: 0,count,unique,top,freq
TARGET,307511,2,0,282686
NAME_CONTRACT_TYPE,307511,2,Cash loans,278232
CODE_GENDER,307511,2,F,202452
FLAG_OWN_CAR,307511,2,N,202924
FLAG_OWN_REALTY,307511,2,Y,213312
NAME_TYPE_SUITE,307511,7,Unaccompanied,249818
NAME_INCOME_TYPE,307511,8,Working,158774
NAME_EDUCATION_TYPE,307511,5,Secondary / secondary special,218391
NAME_FAMILY_STATUS,307511,5,Married,196434
NAME_HOUSING_TYPE,307511,6,House / apartment,272868


In [82]:
# NUMERICAL DATA

In [83]:
app_train[list_num].describe().round(2).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
SK_ID_CURR,307511.0,278180.52,102790.18,100002.0,189145.5,278202.0,367142.5,456255.0
CNT_CHILDREN,307511.0,0.42,0.72,0.0,0.0,0.0,1.0,19.0
AMT_INCOME_TOTAL,307511.0,168797.92,237123.15,25650.0,112500.0,147150.0,202500.0,117000000.0
AMT_CREDIT,307511.0,599026.0,402490.78,45000.0,270000.0,513531.0,808650.0,4050000.0
AMT_ANNUITY,307511.0,27108.57,14493.45,1615.5,16524.0,24903.0,34596.0,258025.5
AMT_GOODS_PRICE,307511.0,538396.21,369279.43,40500.0,238500.0,450000.0,679500.0,4050000.0
REGION_POPULATION_RELATIVE,307511.0,0.02,0.01,0.0,0.01,0.02,0.03,0.07
DAYS_BIRTH,307511.0,-16037.0,4363.99,-25229.0,-19682.0,-15750.0,-12413.0,-7489.0
DAYS_EMPLOYED,307511.0,63815.05,141275.77,-17912.0,-2760.0,-1213.0,-289.0,365243.0
DAYS_REGISTRATION,307511.0,-4986.12,3522.89,-24672.0,-7479.5,-4504.0,-2010.0,0.0


In [84]:
app_train.shape

(307511, 72)

In [85]:
app_train.columns.duplicated().sum()

0

### 3. Feature Selection

In [86]:
# CORRELATION
# Numerical & Categorical -> ANOVA
# Categorical & Categorical -> Chi Square

In [87]:
from scipy.stats import f_oneway, chi2_contingency

#### 3a. ANOVA

In [88]:
# H0: the two variables are NOT correlated.
# Reject H0 when p-value < 0.05 (rejecting means the variables are correlated).

# One list of CNT_CHILDREN values per TARGET class, passed to f_oneway as groups.
children_by_target = app_train.groupby("TARGET")["CNT_CHILDREN"]
category_group = children_by_target.apply(list)
anova_result = f_oneway(*category_group)
print("P-value of Anova:", anova_result.pvalue)

P-value of Anova: 1.9224915500910093e-26


In [89]:
# ANOVA p-value of every numerical column against TARGET, keyed by column name.
dict_anova = {
    col: f_oneway(*app_train.groupby("TARGET")[col].apply(list)).pvalue
    for col in list_num
}

In [90]:
dict_anova

{'SK_ID_CURR': 0.24231496893115526,
 'CNT_CHILDREN': 1.9224915500910093e-26,
 'AMT_INCOME_TOTAL': 0.02723796087829524,
 'AMT_CREDIT': 1.1474602724260586e-63,
 'AMT_ANNUITY': 1.1815936484710046e-12,
 'AMT_GOODS_PRICE': 4.1013948936918764e-107,
 'REGION_POPULATION_RELATIVE': 9.582701564553064e-95,
 'DAYS_BIRTH': 0.0,
 'DAYS_EMPLOYED': 3.6311730827265075e-137,
 'DAYS_REGISTRATION': 6.0365534541554926e-120,
 'DAYS_ID_PUBLISH': 2.527523814198052e-179,
 'CNT_FAM_MEMBERS': 2.4485092616990623e-07,
 'HOUR_APPR_PROCESS_START': 5.826823528542842e-41,
 'EXT_SOURCE_2': 0.0,
 'EXT_SOURCE_3': 0.0,
 'OBS_30_CNT_SOCIAL_CIRCLE': 4.2083174080918836e-07,
 'DEF_30_CNT_SOCIAL_CIRCLE': 1.9282236658471134e-71,
 'OBS_60_CNT_SOCIAL_CIRCLE': 5.758343188747271e-07,
 'DEF_60_CNT_SOCIAL_CIRCLE': 2.600710458865681e-67,
 'DAYS_LAST_PHONE_CHANGE': 3.1837199188792205e-206,
 'AMT_REQ_CREDIT_BUREAU_HOUR': 0.6382568757674145,
 'AMT_REQ_CREDIT_BUREAU_DAY': 0.17177647564740878,
 'AMT_REQ_CREDIT_BUREAU_WEEK': 0.6906113957174

#### 3b. Chi Square

In [91]:
# H0: the two variables are NOT correlated.
# Reject H0 when p-value < 0.05 (rejecting means the variables are correlated).

# Contingency table of CODE_GENDER (rows) against TARGET (columns).
crosstab_result = pd.crosstab(app_train["CODE_GENDER"], app_train["TARGET"])
chisquare_result = chi2_contingency(crosstab_result)
# Element 1 of the result is the p-value.
print("P-value of Chi Square:", chisquare_result[1])

P-value of Chi Square: 4.183493188620687e-202


In [92]:
# Chi-square p-value of every categorical column against TARGET.
# list_cat contains TARGET itself, so it is tested against itself as well.
dict_chisquare = {
    col: chi2_contingency(pd.crosstab(app_train[col], app_train["TARGET"]))[1]
    for col in list_cat
}

In [93]:
dict_chisquare

{'TARGET': 0.0,
 'NAME_CONTRACT_TYPE': 1.0235150721172847e-65,
 'CODE_GENDER': 4.183493188620687e-202,
 'FLAG_OWN_CAR': 9.330994431109667e-34,
 'FLAG_OWN_REALTY': 0.0006681470317545887,
 'NAME_TYPE_SUITE': 1.669680344640335e-05,
 'NAME_INCOME_TYPE': 1.9281456056861122e-266,
 'NAME_EDUCATION_TYPE': 2.4476812052198174e-219,
 'NAME_FAMILY_STATUS': 6.983958115483933e-108,
 'NAME_HOUSING_TYPE': 1.0990890032617707e-88,
 'FLAG_MOBIL': 0.12378615154489829,
 'FLAG_EMP_PHONE': 2.5306059279614537e-143,
 'FLAG_WORK_PHONE': 2.6758000919452704e-56,
 'FLAG_CONT_MOBILE': 0.8976989816319643,
 'FLAG_PHONE': 9.489418049556951e-40,
 'FLAG_EMAIL': 0.3366632895181666,
 'REGION_RATING_CLIENT': 1.8283164955910817e-232,
 'REGION_RATING_CLIENT_W_CITY': 5.05571529094165e-249,
 'WEEKDAY_APPR_PROCESS_START': 0.01744736931389504,
 'REG_REGION_NOT_LIVE_REGION': 0.0021769580022904804,
 'REG_REGION_NOT_WORK_REGION': 0.0001258375420219184,
 'LIVE_REGION_NOT_WORK_REGION': 0.12192447948152679,
 'REG_CITY_NOT_LIVE_CITY': 

In [94]:
len(dict_anova), len(dict_chisquare)

(26, 46)

In [95]:
# Merge the ANOVA and chi-square p-values into one lookup keyed by column name.
pvalue_all = {**dict_anova, **dict_chisquare}
len(pvalue_all)

72

In [96]:
# Keep only the variables whose p-value against TARGET is at most 0.05.
correlated_var = [var for var, pvalue in pvalue_all.items() if pvalue <= 0.05]

# Carry the row identifier along explicitly, whatever its own p-value is.
correlated_var.append("SK_ID_CURR")

In [97]:
len(correlated_var)

55

In [98]:
correlated_var

['CNT_CHILDREN',
 'AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'REGION_POPULATION_RELATIVE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_REGISTRATION',
 'DAYS_ID_PUBLISH',
 'CNT_FAM_MEMBERS',
 'HOUR_APPR_PROCESS_START',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'OBS_30_CNT_SOCIAL_CIRCLE',
 'DEF_30_CNT_SOCIAL_CIRCLE',
 'OBS_60_CNT_SOCIAL_CIRCLE',
 'DEF_60_CNT_SOCIAL_CIRCLE',
 'DAYS_LAST_PHONE_CHANGE',
 'AMT_REQ_CREDIT_BUREAU_MON',
 'AMT_REQ_CREDIT_BUREAU_YEAR',
 'TARGET',
 'NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'FLAG_EMP_PHONE',
 'FLAG_WORK_PHONE',
 'FLAG_PHONE',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'WEEKDAY_APPR_PROCESS_START',
 'REG_REGION_NOT_LIVE_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'REG_CITY_NOT_LIVE_CITY',
 'REG_CITY_NOT_WORK_CITY',
 'LIVE_CITY_NOT_WORK_CITY',
 'ORGANIZATION_TYPE',
 'FLAG_DOCUMEN

In [99]:
# Restrict the frame to the selected variables, in the order of correlated_var.
app_train = app_train.loc[:, correlated_var]
app_train.shape

(307511, 55)

### 4. Handling Categorical Data

In [100]:
app_train_ok = app_train.copy()

In [101]:
app_train_ok.to_csv("data train with correlated var.csv")

In [102]:
# Inspect the distinct values of every categorical (object-dtype) column.
list_cat_ok = app_train_ok.select_dtypes(include="object").columns.tolist()
for col in list_cat_ok:
    print(col, app_train_ok[col].unique(), "\n----------", sep="\n")

# TARGET is removed from the list of columns to encode.
# All of the categorical variables are nominal -> one-hot encode them.
list_cat_ok = [col for col in list_cat_ok if col != "TARGET"]

TARGET
[1 0]

----------
NAME_CONTRACT_TYPE
['Cash loans' 'Revolving loans']

----------
CODE_GENDER
['M' 'F']

----------
FLAG_OWN_CAR
['N' 'Y']

----------
FLAG_OWN_REALTY
['Y' 'N']

----------
NAME_TYPE_SUITE
['Unaccompanied' 'Family' 'Spouse, partner' 'Children' 'Other_A' 'Other_B'
 'Group of people']

----------
NAME_INCOME_TYPE
['Working' 'State servant' 'Commercial associate' 'Pensioner' 'Unemployed'
 'Student' 'Businessman' 'Maternity leave']

----------
NAME_EDUCATION_TYPE
['Secondary / secondary special' 'Higher education' 'Incomplete higher'
 'Lower secondary' 'Academic degree']

----------
NAME_FAMILY_STATUS
['Single / not married' 'Married' 'Civil marriage' 'Widow' 'Separated']

----------
NAME_HOUSING_TYPE
['House / apartment' 'Rented apartment' 'With parents'
 'Municipal apartment' 'Office apartment' 'Co-op apartment']

----------
FLAG_EMP_PHONE
[1 0]

----------
FLAG_WORK_PHONE
[0 1]

----------
FLAG_PHONE
[1 0]

----------
REGION_RATING_CLIENT
[2 1 3]

----------
REGIO

In [103]:
# Use OneHotEncoder
from sklearn.preprocessing import OneHotEncoder

In [104]:
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output`, and later
# releases removed the old name. Try the current keyword first and fall back
# for older installations; either way the encoder returns a dense array.
try:
    ohe = OneHotEncoder(sparse_output = False, handle_unknown = "ignore")
except TypeError:
    ohe = OneHotEncoder(sparse = False, handle_unknown = "ignore")

# Fit the encoder and encode the nominal columns in one pass.
encoded_nominal = ohe.fit_transform(app_train_ok[list_cat_ok])

# Converting to a dataframe. Reuse the source index so the column-wise concat
# below stays aligned row for row even if app_train_ok does not carry a default
# RangeIndex (for example after rows have been filtered out).
encoded_nominal_df = pd.DataFrame(encoded_nominal,
                                  columns = ohe.get_feature_names_out(),
                                  index = app_train_ok.index)

# Combine with the original data: encoded columns first, then the remaining
# columns, with the raw categorical columns removed.
app_train_ok = pd.concat(objs = [encoded_nominal_df, app_train_ok.drop(columns = list_cat_ok)], axis = 1)



In [105]:
# Save the "ohe" object to use in test data
joblib.dump(ohe, "one_hot_encoder.pkl")

['one_hot_encoder.pkl']

In [59]:
app_train_ok.head(2)

Unnamed: 0,NAME_CONTRACT_TYPE_Cash loans,NAME_CONTRACT_TYPE_Revolving loans,CODE_GENDER_F,CODE_GENDER_M,FLAG_OWN_CAR_N,FLAG_OWN_CAR_Y,FLAG_OWN_REALTY_N,FLAG_OWN_REALTY_Y,NAME_TYPE_SUITE_Children,NAME_TYPE_SUITE_Family,...,EXT_SOURCE_3,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_YEAR,TARGET,SK_ID_CURR
0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.139376,2.0,2.0,2.0,2.0,-1134.0,0.0,1.0,1,100002
1,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.510853,1.0,0.0,1.0,0.0,-828.0,0.0,0.0,0,100003


In [60]:
# Remove the SK_ID_CURR identifier column before modelling.
app_train_ok = app_train_ok.drop(columns="SK_ID_CURR")
app_train_ok.head(2)

Unnamed: 0,NAME_CONTRACT_TYPE_Cash loans,NAME_CONTRACT_TYPE_Revolving loans,CODE_GENDER_F,CODE_GENDER_M,FLAG_OWN_CAR_N,FLAG_OWN_CAR_Y,FLAG_OWN_REALTY_N,FLAG_OWN_REALTY_Y,NAME_TYPE_SUITE_Children,NAME_TYPE_SUITE_Family,...,EXT_SOURCE_2,EXT_SOURCE_3,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_YEAR,TARGET
0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.262949,0.139376,2.0,2.0,2.0,2.0,-1134.0,0.0,1.0,1
1,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.622246,0.510853,1.0,0.0,1.0,0.0,-828.0,0.0,0.0,0


### 5. Handling Imbalanced Data

In [68]:
# Percentage of each class
app_train_ok["TARGET"].value_counts() / len(app_train_ok) * 100

0    91.927118
1     8.072882
Name: TARGET, dtype: float64

In [69]:
# !pip install -U imbalanced-learn
# !pip install imblearn

In [70]:
# Separate the label (cast to int) from the feature matrix.
y = app_train_ok["TARGET"].astype("int")
X = app_train_ok.drop(columns="TARGET")

In [71]:
len(X.columns)

168

In [72]:
y.value_counts()

0    282686
1     24825
Name: TARGET, dtype: int64

In [73]:
# Oversampling Minority Class using Synthetic Minority Oversampling Technique (SMOTE)
# NOTE(review): the resampling is applied to the whole of X / y before any
# train/validation split, and the cross-validation cells in section 7 run on
# the resampled data. Validation folds therefore contain synthetic rows derived
# from training-fold rows, so the reported scores are likely optimistic.
# Resampling inside each fold (e.g. via an imblearn Pipeline) would avoid this.
# NOTE(review): X includes one-hot encoded columns; plain SMOTE presumably
# treats them as continuous features - consider SMOTENC; confirm.
from imblearn.over_sampling import SMOTE
# random_state is fixed so that the generated samples are reproducible.
oversample = SMOTE(random_state = 42)
X_smote, y_smote = oversample.fit_resample(X, y)

In [74]:
# !pip install -U threadpoolctl

# import sklearn
# print(sklearn.show_versions())

In [75]:
y_smote.value_counts()

1    282686
0    282686
Name: TARGET, dtype: int64

### 6. Feature Scaling

In [76]:
X_smote.shape

(565372, 168)

In [77]:
X_smote.dtypes.unique()

array([dtype('float64'), dtype('int64')], dtype=object)

In [78]:
encoded_nominal.shape[1], len(X_smote.columns)

(147, 168)

In [79]:
168-147 # Numerical variables

21

In [80]:
# Numerical columns that survived feature selection, in X_smote column order.
list_num_ok = [col for col in X_smote.columns if col in list_num]

In [81]:
len(list_num_ok)

21

In [82]:
# list_encoded = list(X_smote.select_dtypes(include = "uint8").columns)
# len(list_encoded)

In [83]:
list_non_encoded = list_num_ok
list_non_encoded

['CNT_CHILDREN',
 'AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'REGION_POPULATION_RELATIVE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_REGISTRATION',
 'DAYS_ID_PUBLISH',
 'CNT_FAM_MEMBERS',
 'HOUR_APPR_PROCESS_START',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'OBS_30_CNT_SOCIAL_CIRCLE',
 'DEF_30_CNT_SOCIAL_CIRCLE',
 'OBS_60_CNT_SOCIAL_CIRCLE',
 'DEF_60_CNT_SOCIAL_CIRCLE',
 'DAYS_LAST_PHONE_CHANGE',
 'AMT_REQ_CREDIT_BUREAU_MON',
 'AMT_REQ_CREDIT_BUREAU_YEAR']

In [84]:
X_smote_stand = X_smote.copy()

In [85]:
# Use Standardization
from sklearn.preprocessing import StandardScaler

In [93]:
# Apply standardization on numerical features
import os

import joblib

# Directory that receives one pickled scaler per numerical column. Create it up
# front, because joblib.dump does not create missing parent directories.
SCALER_DIR = "pickle_standardization/"
os.makedirs(SCALER_DIR, exist_ok = True)

# Maps each column name to the path of its pickled scaler.
dict_standardization = {}

for i in list_non_encoded:
    # Fit on the unscaled column from X_smote, not from X_smote_stand. This cell
    # overwrites X_smote_stand, so fitting on it would make a re-run fit on
    # already-standardized values and overwrite the saved scalers with
    # near-identity ones.
    scale = StandardScaler()
    scale.fit(X_smote[[i]])

    # save the scaler object per variable
    scaler_path = SCALER_DIR + i + ".pkl"
    joblib.dump(scale, scaler_path)
    dict_standardization[i] = scaler_path

    # write the standardized values into the working copy
    X_smote_stand[i] = scale.transform(X_smote[[i]])

In [109]:
# save
joblib.dump(dict_standardization, "dict_standardization.pkl")

['dict_standardization.pkl']

In [94]:
dict_standardization

{'CNT_CHILDREN': 'pickle_standardization/CNT_CHILDREN.pkl',
 'AMT_INCOME_TOTAL': 'pickle_standardization/AMT_INCOME_TOTAL.pkl',
 'AMT_CREDIT': 'pickle_standardization/AMT_CREDIT.pkl',
 'AMT_ANNUITY': 'pickle_standardization/AMT_ANNUITY.pkl',
 'AMT_GOODS_PRICE': 'pickle_standardization/AMT_GOODS_PRICE.pkl',
 'REGION_POPULATION_RELATIVE': 'pickle_standardization/REGION_POPULATION_RELATIVE.pkl',
 'DAYS_BIRTH': 'pickle_standardization/DAYS_BIRTH.pkl',
 'DAYS_EMPLOYED': 'pickle_standardization/DAYS_EMPLOYED.pkl',
 'DAYS_REGISTRATION': 'pickle_standardization/DAYS_REGISTRATION.pkl',
 'DAYS_ID_PUBLISH': 'pickle_standardization/DAYS_ID_PUBLISH.pkl',
 'CNT_FAM_MEMBERS': 'pickle_standardization/CNT_FAM_MEMBERS.pkl',
 'HOUR_APPR_PROCESS_START': 'pickle_standardization/HOUR_APPR_PROCESS_START.pkl',
 'EXT_SOURCE_2': 'pickle_standardization/EXT_SOURCE_2.pkl',
 'EXT_SOURCE_3': 'pickle_standardization/EXT_SOURCE_3.pkl',
 'OBS_30_CNT_SOCIAL_CIRCLE': 'pickle_standardization/OBS_30_CNT_SOCIAL_CIRCLE.pkl'

In [99]:
X_smote_stand.head(2)

Unnamed: 0,NAME_CONTRACT_TYPE_Cash loans,NAME_CONTRACT_TYPE_Revolving loans,CODE_GENDER_F,CODE_GENDER_M,FLAG_OWN_CAR_N,FLAG_OWN_CAR_Y,FLAG_OWN_REALTY_N,FLAG_OWN_REALTY_Y,NAME_TYPE_SUITE_Children,NAME_TYPE_SUITE_Family,...,HOUR_APPR_PROCESS_START,EXT_SOURCE_2,EXT_SOURCE_3,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_YEAR
0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,-0.57175,-1.069298,-1.911051,0.248677,4.204997,0.258946,5.281482,-0.324379,-0.345601,-0.587652
1,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,-0.24263,0.811721,0.247623,-0.206084,-0.379798,-0.199899,-0.329679,0.086181,-0.345601,-1.20824


In [100]:
dict_standardization

{'CNT_CHILDREN': StandardScaler(),
 'AMT_INCOME_TOTAL': StandardScaler(),
 'AMT_CREDIT': StandardScaler(),
 'AMT_ANNUITY': StandardScaler(),
 'AMT_GOODS_PRICE': StandardScaler(),
 'REGION_POPULATION_RELATIVE': StandardScaler(),
 'DAYS_BIRTH': StandardScaler(),
 'DAYS_EMPLOYED': StandardScaler(),
 'DAYS_REGISTRATION': StandardScaler(),
 'DAYS_ID_PUBLISH': StandardScaler(),
 'CNT_FAM_MEMBERS': StandardScaler(),
 'HOUR_APPR_PROCESS_START': StandardScaler(),
 'EXT_SOURCE_2': StandardScaler(),
 'EXT_SOURCE_3': StandardScaler(),
 'OBS_30_CNT_SOCIAL_CIRCLE': StandardScaler(),
 'DEF_30_CNT_SOCIAL_CIRCLE': StandardScaler(),
 'OBS_60_CNT_SOCIAL_CIRCLE': StandardScaler(),
 'DEF_60_CNT_SOCIAL_CIRCLE': StandardScaler(),
 'DAYS_LAST_PHONE_CHANGE': StandardScaler(),
 'AMT_REQ_CREDIT_BUREAU_MON': StandardScaler(),
 'AMT_REQ_CREDIT_BUREAU_YEAR': StandardScaler()}

In [101]:
# We will use "X_smote_stand" and "y_smote"
len(X_smote_stand), len(y_smote)

(565372, 565372)

In [102]:
# Attach the resampled label to the standardized features so that both can be
# saved as a single table. The rows are matched on the shared index.
data_smote = X_smote_stand.join(y_smote)
data_smote.head(2)

Unnamed: 0,NAME_CONTRACT_TYPE_Cash loans,NAME_CONTRACT_TYPE_Revolving loans,CODE_GENDER_F,CODE_GENDER_M,FLAG_OWN_CAR_N,FLAG_OWN_CAR_Y,FLAG_OWN_REALTY_N,FLAG_OWN_REALTY_Y,NAME_TYPE_SUITE_Children,NAME_TYPE_SUITE_Family,...,EXT_SOURCE_2,EXT_SOURCE_3,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_YEAR,TARGET
0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,-1.069298,-1.911051,0.248677,4.204997,0.258946,5.281482,-0.324379,-0.345601,-0.587652,1
1,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.811721,0.247623,-0.206084,-0.379798,-0.199899,-0.329679,0.086181,-0.345601,-1.20824,0


In [103]:
data_smote.shape

(565372, 169)

In [104]:
data_smote.to_csv("data final_smote standardization.csv")

### 7. Modelling and Evaluation

#### 7a. Logistic Regression

In [105]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [None]:
# classifier = LogisticRegression(random_state = 42)
# classifier.fit(X_smote_stand, y_smote)

In [None]:
# NOTE(review): X_smote_stand / y_smote were oversampled and standardized on
# the full data set before this cross-validation, so each validation fold
# contains SMOTE-generated rows and was scaled with statistics that include it.
# The F1 scores are therefore likely optimistic.
logistic_classifier = LogisticRegression(random_state=42)

# 5-fold cross-validated F1 score of the (still unfitted) classifier.
f1_logistic = cross_val_score(logistic_classifier, X_smote_stand, y_smote, scoring="f1", cv=5)

In [None]:
f1_logistic

In [None]:
f1_logistic.mean()

In [None]:
# Because the cross-validated score is good, fit the classifier on the full
# resampled, standardized data so that the fitted model can be saved (pickled).
logistic_classifier.fit(X_smote_stand, y_smote)

In [None]:
import pickle

# Serialize the fitted logistic-regression model to disk.
with open("model_logistic_regression_ok.pkl", mode="wb") as model_file:
    pickle.dump(logistic_classifier, model_file)

#### 7b. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [None]:
# NOTE(review): X_smote_stand / y_smote were oversampled and standardized on
# the full data set before this cross-validation, so each validation fold
# contains SMOTE-generated rows and was scaled with statistics that include it.
# The F1 scores are therefore likely optimistic.
forest_classifier = RandomForestClassifier(random_state=42)

# 5-fold cross-validated F1 score of the (still unfitted) classifier.
f1_forest = cross_val_score(forest_classifier, X_smote_stand, y_smote, scoring="f1", cv=5)

In [None]:
f1_forest

In [None]:
f1_forest.mean()

In [None]:
# Because the cross-validated score is good, fit the classifier on the full
# resampled, standardized data so that the fitted model can be saved (pickled).
forest_classifier.fit(X_smote_stand, y_smote)

In [None]:
import pickle

# Serialize the fitted random-forest model to disk.
with open("model_random_forest_ok.pkl", mode="wb") as model_file:
    pickle.dump(forest_classifier, model_file)