In [None]:
pip install datasist

In [4]:
# Importing libraries and packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from datasist.structdata import detect_outliers
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    BaggingClassifier,
    ExtraTreesClassifier,
    RandomForestClassifier,
    StackingClassifier,
    HistGradientBoostingClassifier
)
from xgboost import XGBClassifier
from sklearn import metrics 

In [11]:
#Input Data
df_origin = pd.read_csv('/application_train.csv')
df = df_origin.copy()
df.shape

(17474, 122)

## EDA

In [12]:
# Check statistical value
## Numerical columns
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
SK_ID_CURR,17474.0,110187.879993,5880.732247,100002.0,105109.25,110166.5,115271.75,120375.0
TARGET,17474.0,0.078288,0.268632,0.0,0.00,0.0,0.00,1.0
CNT_CHILDREN,17474.0,0.420282,0.721725,0.0,0.00,0.0,1.00,8.0
AMT_INCOME_TOTAL,17474.0,174585.102273,889002.980556,25650.0,112500.00,146250.0,202500.00,117000000.0
AMT_CREDIT,17474.0,601472.550790,403978.013895,45000.0,270000.00,517500.0,813195.00,4050000.0
...,...,...,...,...,...,...,...,...
AMT_REQ_CREDIT_BUREAU_DAY,15133.0,0.008128,0.117804,0.0,0.00,0.0,0.00,5.0
AMT_REQ_CREDIT_BUREAU_WEEK,15133.0,0.033040,0.195691,0.0,0.00,0.0,0.00,5.0
AMT_REQ_CREDIT_BUREAU_MON,15133.0,0.272649,0.950988,0.0,0.00,0.0,0.00,24.0
AMT_REQ_CREDIT_BUREAU_QRT,15133.0,0.264389,0.614241,0.0,0.00,0.0,0.00,8.0


In [13]:
## Categorical columns
df.describe(exclude=np.number).T

Unnamed: 0,count,unique,top,freq
NAME_CONTRACT_TYPE,17474,2,Cash loans,15789
CODE_GENDER,17474,2,F,11487
FLAG_OWN_CAR,17474,2,N,11545
FLAG_OWN_REALTY,17474,2,Y,12158
NAME_TYPE_SUITE,17404,7,Unaccompanied,14143
NAME_INCOME_TYPE,17474,6,Working,9113
NAME_EDUCATION_TYPE,17474,5,Secondary / secondary special,12352
NAME_FAMILY_STATUS,17474,5,Married,11325
NAME_HOUSING_TYPE,17474,6,House / apartment,15471
OCCUPATION_TYPE,12026,18,Laborers,3100


## Data Cleaning

Identify issues
* Non useful column (unique identifier)
* Columns with 'object' datatypes
* Missing data
* Outliers


**(1) Removing unuseful column**

In [14]:
del df['SK_ID_CURR']

**(2) Converting columns**

In [15]:
# Check columns with 'object' datatypes
categorical_cols = list(df.select_dtypes(include='object').columns[1:])
categorical_cols

['CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'OCCUPATION_TYPE',
 'WEEKDAY_APPR_PROCESS_START',
 'ORGANIZATION_TYPE',
 'FONDKAPREMONT_MODE',
 'HOUSETYPE_MODE',
 'WALLSMATERIAL_MODE',
 'EMERGENCYSTATE_MODE']

In [16]:
# Convert columns to 'category' datatype
for col in categorical_cols:
  df[col] = df[col].astype('category')

In [17]:
df.select_dtypes(include='object').columns[1:]

Index([], dtype='object')

**(3) Handling missing values**

In [18]:
# Drop columns with 75% missing values (none)
miss_col = df.columns[ df.isnull().mean() > 0.75 ]
miss_col

Index([], dtype='object')

In [21]:
# Check numerical columns with missing values
numerical_cols = list(df.select_dtypes(include='float64').columns[1:])
is_null_cols = list(df.columns[df.isnull().sum() != 0])

null_num = [x for x in is_null_cols if x in numerical_cols]
null_cat = [x for x in is_null_cols if x in categorical_cols]

print(len(null_num))
print(len(null_cat))

79
6


Missing values will be imputed with mean for 79 numerical columns and with mode for 6 categorical columns.

In [32]:
c1 = df.select_dtypes(np.number).columns
df[c1] = df[c1].fillna(df[c1].mean())
c2 = df.select_dtypes(exclude=np.number).columns
df[c2] = df[c2].fillna(df[c2].mode())

In [33]:
df.columns[df.isnull().sum() != 0]

Index(['NAME_TYPE_SUITE', 'OCCUPATION_TYPE', 'FONDKAPREMONT_MODE',
       'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE'],
      dtype='object')

**(4) Handling outliers**

Outliers will be replaced by median of each feature.

In [None]:
for col in num_cols :
    outliers_indecies = detect_outliers(df,0,[col])
    median = df[col].median()
    df[col].iloc[outliers_indecies] = median

## Data Processing

Identify issues
* Categorical columns that have to be encoded
* Feature selection
* Feature scaling
* Imbalanced target column

**(1) Encoding categorical columns**

In [35]:
df_encoded = pd.get_dummies(df, drop_first = True)
df_encoded.shape

(17474, 225)

**(2) Performing feature selection**

In [36]:
# Using Pearson Correlation
plt.figure(figsize=(12,10))
cor = df_encoded.corr()
cor

Unnamed: 0,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,FONDKAPREMONT_MODE_reg oper spec account,HOUSETYPE_MODE_specific housing,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_Yes
TARGET,1.000000,0.034258,0.023713,-0.025355,0.013022,-0.027388,-0.027958,0.087875,-0.043355,0.047561,...,-0.006502,0.017456,-0.004641,-0.008757,-0.013484,0.002840,-0.030800,-0.016005,0.013224,0.002242
CNT_CHILDREN,0.034258,1.000000,0.009115,0.005647,0.031412,0.001926,-0.011815,0.334859,-0.241087,0.181346,...,-0.004663,-0.008213,0.009361,0.011106,-0.002755,0.005732,-0.029195,-0.020107,0.013833,0.011842
AMT_INCOME_TOTAL,0.023713,0.009115,1.000000,0.033951,0.041603,0.031890,0.002079,0.012408,-0.019761,0.002089,...,0.000790,-0.000710,0.000716,0.000345,0.006122,0.001260,0.007643,0.017163,-0.001713,-0.002132
AMT_CREDIT,-0.025355,0.005647,0.033951,1.000000,0.734697,0.929666,0.048976,-0.074243,-0.059786,-0.010873,...,0.007203,0.001543,0.015026,0.006322,0.031854,0.010405,0.042084,0.016694,-0.010050,-0.012125
AMT_ANNUITY,0.013022,0.031412,0.041603,0.734697,1.000000,0.710400,0.060767,0.002913,-0.113955,0.020936,...,0.010272,0.002842,0.006222,0.002920,0.036183,0.020006,0.053527,0.001269,-0.007515,-0.008963
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WALLSMATERIAL_MODE_Others,0.002840,0.005732,0.001260,0.010405,0.020006,0.010691,0.012738,-0.005788,-0.002400,-0.008873,...,0.010847,-0.004660,0.099679,-0.005983,-0.005431,1.000000,-0.037943,-0.036613,-0.009413,0.021813
WALLSMATERIAL_MODE_Panel,-0.030800,-0.029195,0.007643,0.042084,0.053527,0.039180,0.017300,-0.013697,-0.011485,-0.067392,...,0.118391,-0.016952,-0.018275,-0.043851,-0.039806,-0.037943,1.000000,-0.268339,-0.068985,-0.038997
"WALLSMATERIAL_MODE_Stone, brick",-0.016005,-0.020107,0.017163,0.016694,0.001269,0.014713,0.025647,-0.019691,0.006278,-0.019519,...,0.109640,0.094359,0.012209,-0.042314,-0.038410,-0.036613,-0.268339,1.000000,-0.066567,-0.014112
WALLSMATERIAL_MODE_Wooden,0.013224,0.013833,-0.001713,-0.010050,-0.007515,-0.011651,-0.052448,0.001352,-0.004064,-0.009154,...,0.002009,0.005324,0.152750,-0.010878,-0.009875,-0.009413,-0.068985,-0.066567,1.000000,0.284997


<Figure size 1200x1000 with 0 Axes>

In [58]:
# Correlation with output variable
cor_target = abs(cor["TARGET"])

# Selecting highly correlated features
relevant_features = cor_target[cor_target>0.04]
relevant_features

TARGET                                               1.000000
DAYS_BIRTH                                           0.087875
DAYS_EMPLOYED                                        0.043355
DAYS_REGISTRATION                                    0.047561
DAYS_ID_PUBLISH                                      0.052615
FLAG_EMP_PHONE                                       0.044393
REGION_RATING_CLIENT                                 0.064539
REGION_RATING_CLIENT_W_CITY                          0.065706
REG_CITY_NOT_WORK_CITY                               0.043013
EXT_SOURCE_2                                         0.148578
EXT_SOURCE_3                                         0.144825
DAYS_LAST_PHONE_CHANGE                               0.055043
FLAG_DOCUMENT_3                                      0.045508
CODE_GENDER_M                                        0.058039
NAME_INCOME_TYPE_Pensioner                           0.044279
NAME_INCOME_TYPE_Working                             0.051844
NAME_EDU

In [60]:
selected_cols = ['TARGET', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'FLAG_EMP_PHONE', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY',
                 'REG_CITY_NOT_WORK_CITY', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_LAST_PHONE_CHANGE', 'FLAG_DOCUMENT_3', 'CODE_GENDER_M', 'NAME_INCOME_TYPE_Pensioner', 'NAME_INCOME_TYPE_Working',
                 'NAME_EDUCATION_TYPE_Higher education', 'NAME_EDUCATION_TYPE_Secondary / secondary special', 'ORGANIZATION_TYPE_XNA']

In [61]:
df_new = df_encoded[selected_cols]
df_new

Unnamed: 0,TARGET,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_EMP_PHONE,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,REG_CITY_NOT_WORK_CITY,EXT_SOURCE_2,EXT_SOURCE_3,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_3,CODE_GENDER_M,NAME_INCOME_TYPE_Pensioner,NAME_INCOME_TYPE_Working,NAME_EDUCATION_TYPE_Higher education,NAME_EDUCATION_TYPE_Secondary / secondary special,ORGANIZATION_TYPE_XNA
0,1,-9461,-637,-3648.0,-2120,1,2,2,0,0.262949,0.139376,-1134.00000,1.000000,1,0,1,0,1,0
1,0,-16765,-1188,-1186.0,-291,1,1,1,0,0.622246,0.510811,-828.00000,1.000000,0,0,0,1,0,0
2,0,-19046,-225,-4260.0,-2531,1,2,2,0,0.555912,0.729567,-815.00000,0.000000,1,0,1,0,1,0
3,0,-19005,-3039,-9833.0,-2437,1,2,2,0,0.650442,0.510811,-617.00000,1.000000,0,0,1,0,1,0
4,0,-19932,-3038,-4311.0,-3458,1,2,2,1,0.322738,0.510811,-1106.00000,0.000000,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17469,0,-11582,-715,-1581.0,-3816,1,2,2,0,0.720997,0.517297,-1223.00000,1.000000,1,0,1,0,1,0
17470,0,-15594,-1672,-849.0,-4107,1,3,3,0,0.449491,0.380800,-380.00000,1.000000,1,0,1,0,1,0
17471,0,-12070,-1554,-2591.0,-4501,1,2,2,0,0.257455,0.396220,-8.00000,1.000000,1,0,1,0,1,0
17472,0,-12865,-375,-1757.0,-5516,1,2,2,0,0.595265,0.633032,0.00000,1.000000,0,0,1,0,1,0


**(3) Applying oversampling**

In [62]:
# Count data per class
df_new['TARGET'].value_counts(normalize=True)

0    0.921712
1    0.078288
Name: TARGET, dtype: float64

In [64]:
# Split data
X, y = df_new.drop("TARGET",axis=1).values , df_new["TARGET"]
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [67]:
from imblearn.over_sampling import SMOTE
rus = SMOTE(sampling_strategy='auto')
X_data_rus, y_data_rus = rus.fit_resample(X, y)

y_data_rus.value_counts(normalize=True)

1    0.5
0    0.5
Name: TARGET, dtype: float64

**(4) Performing feature scaling**

In [77]:
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_new.to_numpy())
df_scaled = pd.DataFrame(df_scaled, columns=selected_cols)

In [78]:
df_scaled

Unnamed: 0,TARGET,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_EMP_PHONE,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,REG_CITY_NOT_WORK_CITY,EXT_SOURCE_2,EXT_SOURCE_3,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_3,CODE_GENDER_M,NAME_INCOME_TYPE_Pensioner,NAME_INCOME_TYPE_Working,NAME_EDUCATION_TYPE_Higher education,NAME_EDUCATION_TYPE_Secondary / secondary special,ORGANIZATION_TYPE_XNA
0,3.431238,1.504980,-0.454405,0.366581,0.572880,0.466947,-0.097848,-0.059246,-0.552095,-1.329872,-2.288670,-0.221470,6.393711e-01,1.385156,-0.466675,0.957852,-0.572636,0.643949,-0.466856
1,-0.291440,-0.175860,-0.458314,1.072204,1.784513,0.466947,-2.065398,-2.046332,-0.552095,0.564584,-0.041171,0.152816,6.393711e-01,-0.721940,-0.466675,-1.044003,1.746309,-1.552919,-0.466856
2,-0.291440,-0.700777,-0.451481,0.191178,0.300611,0.466947,-0.097848,-0.059246,-0.552095,0.214828,1.282489,0.168717,-1.564126e+00,1.385156,-0.466675,0.957852,-0.572636,0.643949,-0.466856
3,-0.291440,-0.691342,-0.471448,-1.406074,0.362882,0.466947,-0.097848,-0.059246,-0.552095,0.713251,-0.041171,0.410902,6.393711e-01,-0.721940,-0.466675,0.957852,-0.572636,0.643949,-0.466856
4,-0.291440,-0.904669,-0.471441,0.176561,-0.313486,0.466947,-0.097848,-0.059246,1.811283,-1.014621,-0.041171,-0.187222,-1.564126e+00,1.385156,-0.466675,0.957852,-0.572636,0.643949,-0.466856
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17469,-0.291440,1.016883,-0.454958,0.958995,-0.550646,0.466947,-0.097848,-0.059246,-0.552095,1.085264,-0.001926,-0.330331,6.393711e-01,1.385156,-0.466675,0.957852,-0.572636,0.643949,-0.466856
17470,-0.291440,0.093618,-0.461749,1.168790,-0.743421,0.466947,1.869702,1.927840,-0.552095,-0.346295,-0.827849,0.700790,6.393711e-01,1.385156,-0.466675,0.957852,-0.572636,0.643949,-0.466856
17471,-0.291440,0.904582,-0.460911,0.669523,-1.004429,0.466947,-0.097848,-0.059246,-0.552095,-1.358836,-0.734546,1.155804,6.393711e-01,1.385156,-0.466675,0.957852,-0.572636,0.643949,-0.466856
17472,-0.291440,0.721631,-0.452545,0.908552,-1.676822,0.466947,-0.097848,-0.059246,-0.552095,0.422322,0.698369,1.165589,6.393711e-01,-0.721940,-0.466675,0.957852,-0.572636,0.643949,-0.466856


In [79]:
df_scaled.to_csv('preprocessedData.csv', index=False)

## Model Building

In [80]:
# Split data
X, y = df_scaled.drop("TARGET",axis=1).values , df_new["TARGET"]
X_train, X_test, y_train, y_test = train_test_split(X, y)

**(1) Logistic Regression**

In [82]:
logreg = LogisticRegression(solver='liblinear')
logreg.fit(X_train, y_train)

In [90]:
y_pred = logreg.predict(X_test) 

In [91]:
cnf_matrix = metrics.confusion_matrix(y_test, y_pred) 
cnf_matrix 

array([[4021,    0],
       [ 346,    2]])

In [92]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred)) 
print("Precision:",metrics.precision_score(y_test, y_pred)) 
print("Recall:",metrics.recall_score(y_test, y_pred)) 

Accuracy: 0.9208056763561455
Precision: 1.0
Recall: 0.005747126436781609


**(2) Bagging**

In [96]:
bagging = BaggingClassifier(n_jobs=-1)
bagging.fit(X_train, y_train)

In [97]:
y_pred2 = bagging.predict(X_test) 

In [98]:
cnf_matrix2 = metrics.confusion_matrix(y_test, y_pred2) 
cnf_matrix2

array([[4000,   21],
       [ 340,    8]])

In [99]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred2)) 
print("Precision:",metrics.precision_score(y_test, y_pred2)) 
print("Recall:",metrics.recall_score(y_test, y_pred2)) 

Accuracy: 0.9173723964293888
Precision: 0.27586206896551724
Recall: 0.022988505747126436


**(3) Random Forest**

In [100]:
randomForest = RandomForestClassifier(n_jobs=-1)
randomForest.fit(X_train, y_train)

In [101]:
y_pred3 = randomForest.predict(X_test) 

In [102]:
cnf_matrix3 = metrics.confusion_matrix(y_test, y_pred3) 
cnf_matrix3

array([[4019,    2],
       [ 346,    2]])

In [103]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred3)) 
print("Precision:",metrics.precision_score(y_test, y_pred3)) 
print("Recall:",metrics.recall_score(y_test, y_pred3)) 

Accuracy: 0.9203479056992446
Precision: 0.5
Recall: 0.005747126436781609


**(4) XGB**

In [104]:
XGB = XGBClassifier(n_jobs=-1)
XGB.fit(X_train, y_train)

In [105]:
y_pred4 = XGB.predict(X_test) 

In [106]:
cnf_matrix4 = metrics.confusion_matrix(y_test, y_pred4) 
cnf_matrix4

array([[3994,   27],
       [ 339,    9]])

In [107]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred4)) 
print("Precision:",metrics.precision_score(y_test, y_pred4)) 
print("Recall:",metrics.recall_score(y_test, y_pred4)) 

Accuracy: 0.9162279697871366
Precision: 0.25
Recall: 0.02586206896551724


## Data Insights

In [12]:
import pandas as pd
df_exp = pd.read_csv('/content/application_train.csv')
df_exp["TARGET"].value_counts()

0    5341
1     448
Name: TARGET, dtype: int64

In [15]:
import plotly.express as px
fig = px.box(df_exp, 
             x="DAYS_BIRTH",  
             color="TARGET", 
             title="TARGET Based on DAYS_BIRTH", 
            )
fig.show()

In [16]:
fig = px.box(df_exp, 
             x="NAME_EDUCATION_TYPE",  
             color="TARGET", 
             title="TARGET Based on DAYS_BIRTH", 
            )
fig.show()

In [27]:
df_exp["NAME_EDUCATION_TYPE"].value_counts()

Secondary / secondary special    4123
Higher education                 1400
Incomplete higher                 188
Lower secondary                    71
Academic degree                     7
Name: NAME_EDUCATION_TYPE, dtype: int64