In [1]:
%matplotlib inline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from imblearn.pipeline import Pipeline 
from imblearn.over_sampling import SMOTE
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer

from sklearn.tree import DecisionTreeClassifier

import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
app = (
    pd.read_csv("application_record.csv")
    .drop_duplicates('ID', keep='last') 
    .drop('OCCUPATION_TYPE', axis=1)
)  # 438 510 rows

In [4]:
record = (
    pd.read_csv("credit_record.csv")
    .drop('MONTHS_BALANCE', axis=1)
    .replace({'C': 0, 'X' : 0})
    .assign(STATUS=lambda x: pd.to_numeric(x['STATUS']),
            RESULT = lambda x:1 if (x['STATUS'].astype(int) >= 2).any() else 0)
    .drop_duplicates('ID', keep='last')
)  # 45985 rows

In [5]:
df = app.join(record.set_index('ID'), on='ID', how='inner')  # 36457 rows

In [10]:
numeric_features = ['AMT_INCOME_TOTAL', 'DAYS_BIRTH', 'CNT_CHILDREN', 'DAYS_EMPLOYED', 'CNT_FAM_MEMBERS']
categorical_features = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS',
                       'NAME_HOUSING_TYPE']

In [11]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [12]:
transformer = ColumnTransformer(
    transformers = [
        ('numeric_data_preprocessing', numeric_transformer, numeric_features),
        ('categorical_data_preprocessing', categorical_transformer, categorical_features),
    ]
)

In [13]:
final_pipeline = Pipeline(
    steps=[
        ('transformer', transformer),
        ('rf_estimator', DecisionTreeClassifier())
    ])

In [14]:
# Split the data into train and test
X = df.drop('RESULT', axis=1)
y = df['RESULT']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

In [15]:
final_pipeline.fit(X_train, y_train)

Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('numeric_data_preprocessing',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['AMT_INCOME_TOTAL',
                                                   'DAYS_BIRTH', 'CNT_CHILDREN',
                                                   'DAYS_EMPLOYED',
                                                   'CNT_FAM_MEMBERS']),
                                                 ('categorical_data_preprocessing',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_val

In [16]:
y_pred = final_pipeline.predict(X_test)

In [17]:
print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("F1 Score: ", f1_score(y_test, y_pred, average='weighted'))
print("Precision Score: ", precision_score(y_test, y_pred, average='weighted'))
print("Recall Score: ", recall_score(y_test, y_pred, average='weighted'))

Accuracy Score:  1.0
F1 Score:  1.0
Precision Score:  1.0
Recall Score:  1.0
