## Importing Libraries

In [1]:
import pandas as pd
import numpy as np

## Importing Datasets

In [2]:
# Applicant attributes; the preview below shows one row per application ID.
application_record = pd.read_csv(
    'data/Credit Card Approval Prediction/application_record.csv'
)
# Monthly credit history; the preview below shows one row per (ID, MONTHS_BALANCE).
credit_record = pd.read_csv(
    'data/Credit Card Approval Prediction/credit_record.csv'
)

In [3]:
# Preview the raw application table (pandas truncates the middle rows).
application_record

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438552,6840104,M,N,Y,0,135000.0,Pensioner,Secondary / secondary special,Separated,House / apartment,-22717,365243,1,0,0,0,,1.0
438553,6840222,F,N,N,0,103500.0,Working,Secondary / secondary special,Single / not married,House / apartment,-15939,-3007,1,0,0,0,Laborers,1.0
438554,6841878,F,N,N,0,54000.0,Commercial associate,Higher education,Single / not married,With parents,-8169,-372,1,1,0,0,Sales staff,1.0
438555,6842765,F,N,Y,0,72000.0,Pensioner,Secondary / secondary special,Married,House / apartment,-21673,365243,1,0,0,0,,2.0


In [4]:
# Preview the raw credit history table (pandas truncates the middle rows).
credit_record

Unnamed: 0,ID,MONTHS_BALANCE,STATUS
0,5001711,0,X
1,5001711,-1,0
2,5001711,-2,0
3,5001711,-3,0
4,5001712,0,C
...,...,...,...
1048570,5150487,-25,C
1048571,5150487,-26,C
1048572,5150487,-27,C
1048573,5150487,-28,C


## Creating The Target Column

In [5]:
# Map the STATUS codes 'X' and 'C' to 0: both are treated as good standing.
# Only STATUS carries these codes, so the replacement is applied to that
# column and assigned back rather than run in place over the whole frame.
# NOTE(review): confirm against the dataset's data dictionary that a raw
# STATUS of 0 also denotes a client in good standing.
credit_record['STATUS'] = credit_record['STATUS'].replace(['X', 'C'], 0)

In [6]:
# Convert STATUS to a numeric dtype so it can be compared with integers below.
credit_record['STATUS'] = pd.to_numeric(credit_record['STATUS'])

In [7]:
# Collect the ID of every record whose STATUS is non-zero, i.e. a late month.
# The list holds one entry per late record, in row order, so an ID appears
# once for each of its late months.
# A boolean mask does this in a single vectorised pass instead of a Python
# loop with a per-row label lookup over the whole frame.
late_record_mask = credit_record['STATUS'] != 0
drop_ls = credit_record.loc[late_record_mask, 'ID'].tolist()

In [8]:
# Number of late records collected (per record, not per unique client).
len(drop_ls)

14194

In [9]:
# Set STATUS = 1 on every record of a client who has at least one late month.
# `isin` hashes the IDs once instead of scanning the `drop_ls` list for each
# row, and `.loc` writes to the frame directly. The previous chained
# assignment (`credit_record.STATUS[i] = 1`) may write to a temporary object
# and does not update the frame when pandas Copy-on-Write is enabled.
late_client_mask = credit_record['ID'].isin(drop_ls)
credit_record.loc[late_client_mask, 'STATUS'] = 1

In [10]:
# Class balance of the binary target, counted per monthly record (not per client).
credit_record.STATUS.value_counts()

0    904764
1    143811
Name: STATUS, dtype: int64

In [11]:
# Remove fully identical rows; the original index labels are kept (no reset).
credit_record = credit_record.drop_duplicates()
credit_record

Unnamed: 0,ID,MONTHS_BALANCE,STATUS
0,5001711,0,0
1,5001711,-1,0
2,5001711,-2,0
3,5001711,-3,0
4,5001712,0,0
...,...,...,...
1048570,5150487,-25,0
1048571,5150487,-26,0
1048572,5150487,-27,0
1048573,5150487,-28,0


## Merging Datasets

In [12]:
# Row counts of the ID column in each frame (rows, not distinct IDs).
print(
    f'No. of IDs in application_record = {len(application_record.ID)} '
    f'No. of IDs in credit_record = {len(credit_record.ID)}'
)

No. of IDs in application_record = 438557 No. of IDs in credit_record = 1048575


In [13]:
# Join applicant attributes with their credit history.
#   on=['ID']   -> the key column shared by both frames
#   how='inner' -> keep only IDs that are present in both frames
# credit_record has one row per (ID, MONTHS_BALANCE), so every applicant is
# repeated once for each month of history in the merged frame.
# NOTE(review): rows belonging to the same applicant can therefore land on
# both sides of a later train/test split. Consider collapsing credit_record
# to one row per ID before merging; confirm the intended modelling unit.
dataset = application_record.merge(credit_record, on=['ID'], how='inner')

In [14]:
# ID was only needed as the join key; remove it from the modelling frame.
dataset = dataset.drop(columns=['ID'])

In [15]:
# Count rows that are exact repeats of an earlier row (across all columns).
dataset.duplicated().sum()

412393

In [16]:
# Keep the first occurrence of each distinct row; index labels are not reset.
dataset = dataset.drop_duplicates()

In [17]:
# Preview the merged, de-duplicated frame (pandas truncates the middle rows).
dataset

Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,MONTHS_BALANCE,STATUS
0,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0,0,1
1,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0,-1,1
2,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0,-2,1
3,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0,-3,1
4,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0,-4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
777710,M,N,Y,0,112500.0,Working,Secondary / secondary special,Single / not married,Rented apartment,-9188,-1193,1,0,0,0,Laborers,1.0,-9,1
777711,M,N,Y,0,112500.0,Working,Secondary / secondary special,Single / not married,Rented apartment,-9188,-1193,1,0,0,0,Laborers,1.0,-10,1
777712,M,N,Y,0,112500.0,Working,Secondary / secondary special,Single / not married,Rented apartment,-9188,-1193,1,0,0,0,Laborers,1.0,-11,1
777713,M,N,Y,0,112500.0,Working,Secondary / secondary special,Single / not married,Rented apartment,-9188,-1193,1,0,0,0,Laborers,1.0,-12,1


## Getting Data's Information and Description

In [18]:
# Column dtypes and non-null counts.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 365322 entries, 0 to 777714
Data columns (total 19 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   CODE_GENDER          365322 non-null  object 
 1   FLAG_OWN_CAR         365322 non-null  object 
 2   FLAG_OWN_REALTY      365322 non-null  object 
 3   CNT_CHILDREN         365322 non-null  int64  
 4   AMT_INCOME_TOTAL     365322 non-null  float64
 5   NAME_INCOME_TYPE     365322 non-null  object 
 6   NAME_EDUCATION_TYPE  365322 non-null  object 
 7   NAME_FAMILY_STATUS   365322 non-null  object 
 8   NAME_HOUSING_TYPE    365322 non-null  object 
 9   DAYS_BIRTH           365322 non-null  int64  
 10  DAYS_EMPLOYED        365322 non-null  int64  
 11  FLAG_MOBIL           365322 non-null  int64  
 12  FLAG_WORK_PHONE      365322 non-null  int64  
 13  FLAG_PHONE           365322 non-null  int64  
 14  FLAG_EMAIL           365322 non-null  int64  
 15  OCCUPATION_TYPE  

In [19]:
# Summary statistics of the numeric columns.
# NOTE(review): DAYS_EMPLOYED reaches 365243 in the output below, which looks
# like a placeholder rather than a real duration — confirm before modelling.
dataset.describe()

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,MONTHS_BALANCE,STATUS
count,365322.0,365322.0,365322.0,365322.0,365322.0,365322.0,365322.0,365322.0,365322.0,365322.0,365322.0
mean,0.425742,184898.2,-16161.482656,60776.306365,1.0,0.221878,0.294214,0.089595,2.19825,-21.69531,0.203226
std,0.76854,101731.6,4144.182785,139028.719425,0.0,0.41551,0.455689,0.285601,0.92849,15.016078,0.4024
min,0.0,27000.0,-25152.0,-15713.0,1.0,0.0,0.0,0.0,1.0,-60.0,0.0
25%,0.0,117000.0,-19614.0,-3208.0,1.0,0.0,0.0,0.0,2.0,-33.0,0.0
50%,0.0,157500.0,-15849.0,-1566.0,1.0,0.0,0.0,0.0,2.0,-20.0,0.0
75%,1.0,225000.0,-12676.0,-378.0,1.0,0.0,1.0,0.0,3.0,-9.0,0.0
max,19.0,1575000.0,-7489.0,365243.0,1.0,1.0,1.0,1.0,20.0,0.0,1.0


In [20]:
# Missing-value count per column.
dataset.isna().sum()

CODE_GENDER                 0
FLAG_OWN_CAR                0
FLAG_OWN_REALTY             0
CNT_CHILDREN                0
AMT_INCOME_TOTAL            0
NAME_INCOME_TYPE            0
NAME_EDUCATION_TYPE         0
NAME_FAMILY_STATUS          0
NAME_HOUSING_TYPE           0
DAYS_BIRTH                  0
DAYS_EMPLOYED               0
FLAG_MOBIL                  0
FLAG_WORK_PHONE             0
FLAG_PHONE                  0
FLAG_EMAIL                  0
OCCUPATION_TYPE        113130
CNT_FAM_MEMBERS             0
MONTHS_BALANCE              0
STATUS                      0
dtype: int64

In [21]:
# Total number of missing cells across the whole frame.
dataset.isna().sum().sum()

113130

### Displaying Column "OCCUPATION_TYPE" with NULL Values

In [22]:
# Inspect the only column that contains missing values.
dataset.OCCUPATION_TYPE

0              NaN
1              NaN
2              NaN
3              NaN
4              NaN
            ...   
777710    Laborers
777711    Laborers
777712    Laborers
777713    Laborers
777714    Laborers
Name: OCCUPATION_TYPE, Length: 365322, dtype: object

In [23]:
# Frequency of each occupation; value_counts() leaves NaN out by default.
dataset.OCCUPATION_TYPE.value_counts()

Laborers                 62839
Core staff               34175
Sales staff              33786
Managers                 31066
Drivers                  23349
High skill tech staff    14459
Medicine staff           11937
Accountants              11926
Security staff            6851
Cooking staff             6663
Cleaning staff            5201
Private service staff     2989
Low-skill Laborers        2000
Secretaries               1523
Waiters/barmen staff      1272
HR staff                   973
IT staff                   617
Realty agents              566
Name: OCCUPATION_TYPE, dtype: int64

### Replacing Null Values in 'OCCUPATION_TYPE' with 'Other'

In [24]:
# Fill missing occupations with the placeholder category 'Other'.
# The result is assigned back to the column: calling an in-place method on
# `dataset.OCCUPATION_TYPE` may act on a temporary Series, and under pandas
# Copy-on-Write it leaves the frame unchanged.
dataset['OCCUPATION_TYPE'] = dataset['OCCUPATION_TYPE'].fillna('Other')

In [25]:
# Re-check the frequencies to confirm the missing values now appear as 'Other'.
dataset.OCCUPATION_TYPE.value_counts()

Other                    113130
Laborers                  62839
Core staff                34175
Sales staff               33786
Managers                  31066
Drivers                   23349
High skill tech staff     14459
Medicine staff            11937
Accountants               11926
Security staff             6851
Cooking staff              6663
Cleaning staff             5201
Private service staff      2989
Low-skill Laborers         2000
Secretaries                1523
Waiters/barmen staff       1272
HR staff                    973
IT staff                    617
Realty agents               566
Name: OCCUPATION_TYPE, dtype: int64

## Label Encoding

In [26]:
# LabelEncoder maps each distinct value of a column to an integer code.
from sklearn.preprocessing import LabelEncoder
# Single encoder instance used by the encoding loop in the next cell.
le = LabelEncoder()

In [27]:
# Integer-encode every object (string) column.
# Each column gets its own fitted encoder, stored in `label_encoders` under
# the column name, so the code -> label mapping can be recovered later with
# `label_encoders[col].inverse_transform(...)`. Refitting one shared encoder
# per column would keep only the last column's mapping.
# `le` is still bound to the encoder of the last encoded column afterwards.
label_encoders = {}
for col in dataset.columns:
    if dataset[col].dtype == 'object':
        le = LabelEncoder()
        dataset[col] = le.fit_transform(dataset[col])
        label_encoders[col] = le

In [28]:
# Drop any rows that are exact duplicates after encoding.
dataset = dataset.drop_duplicates()

## Splitting the dataset

In [87]:
# STATUS is the last column of `dataset` and is the target; every other
# column is a feature. Selecting by name makes that explicit.
y = dataset['STATUS']
X = dataset.drop(columns='STATUS')

In [97]:
# StratifiedKFold is imported but not used in the cells shown in this notebook.
from sklearn.model_selection import KFold,StratifiedKFold
# 8 shuffled folds with a fixed seed, so the split is reproducible.
kfold = KFold(n_splits=8, shuffle=True, random_state=0)

In [98]:
# Only the last of the 8 folds is used: it is a single hold-out split of
# roughly 7/8 train and 1/8 test, not a cross-validation.
# NOTE(review): to cross-validate, fit and score inside a loop over all
# folds (or use sklearn's cross_validate) instead of keeping one split.
fold_indices = list(kfold.split(X, y))
train_index, test_index = fold_indices[-1]

X_train = X.iloc[train_index]
X_test = X.iloc[test_index]
y_train = y.iloc[train_index]
y_test = y.iloc[test_index]

## Building Model

In [108]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with class weights set to 'balanced' and explicit limits on
# depth, split size and leaf size; random_state fixes the result.
tree_params = dict(
    class_weight='balanced',
    max_depth=48,
    splitter='best',
    random_state=42,
    min_samples_split=48,
    min_samples_leaf=15,
)
classifier = DecisionTreeClassifier(**tree_params)
classifier.fit(X_train, y_train)

DecisionTreeClassifier(class_weight='balanced', max_depth=48,
                       min_samples_leaf=15, min_samples_split=48,
                       random_state=42)

In [109]:
# Predictions on the held-out fold.
y_pred = classifier.predict(X_test)

In [110]:
# Predictions on the training rows, for comparison with the held-out scores.
y_pred_train= classifier.predict(X_train)

## Calculating Scores

### Test Results

In [111]:
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

# Confusion matrix layout: rows are true classes, columns are predicted classes.
confusion_mat = confusion_matrix(y_test, y_pred)

# Specificity = TN / (TN + FP), taken from the first row (true class 0).
true_neg, false_pos = confusion_mat[0]
specificity_test = true_neg / (true_neg + false_pos)

acc = accuracy_score(y_test, y_pred)
pre_score = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [112]:
# Report the held-out metrics, one per line.
print(
    f'Accuracy Score = {acc}\n'
    f' Confusion Matrix = {confusion_mat}\n'
    f' Precision Score = {pre_score}\n'
    f' Recall Score = {recall}\n'
    f' F1 Score = {f1}\n'
    f' Specificity Test = {specificity_test}'
)

Accuracy Score = 0.8524471696047301
 Confusion Matrix = [[29815  6485]
 [  253  9112]]
 Precision Score = 0.5842149131243187
 Recall Score = 0.9729845168179392
 F1 Score = 0.7300697059530485
 Specificity Test = 0.8213498622589531


### Train Results

In [113]:
# Same metrics as for the held-out fold, computed on the training rows.
confusion_mat_train = confusion_matrix(y_train, y_pred_train)

# Specificity = TN / (TN + FP), taken from the first row (true class 0).
true_neg_train, false_pos_train = confusion_mat_train[0]
specificity_train = true_neg_train / (true_neg_train + false_pos_train)

acc_train = accuracy_score(y_train, y_pred_train)
pre_score_train = precision_score(y_train, y_pred_train)
recall_train = recall_score(y_train, y_pred_train)
f1_train = f1_score(y_train, y_pred_train)

In [114]:
# Report the training metrics, one per line.
print(
    f'Accuracy Score = {acc_train}\n'
    f' Confusion Matrix = {confusion_mat_train}\n'
    f' Precision Score = {pre_score_train}\n'
    f' Recall Score = {recall_train}\n'
    f' F1 Score = {f1_train}\n'
    f' Specificity Test = {specificity_train}'
)

Accuracy Score = 0.8615203170898807
 Confusion Matrix = [[211329  43450]
 [   816  64062]]
 Precision Score = 0.5958590668948582
 Recall Score = 0.9874225469342458
 F1 Score = 0.7432217646035153
 Specificity Test = 0.8294600418401831
