In [1]:
# Load the Kedro IPython extension. NOTE(review): `catalog` is used further down
# without being defined in this notebook — presumably this extension injects it; confirm.
%load_ext kedro.ipython

In [46]:
import logging

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Display options: no fixed output width and no cap on the number of columns shown.
pd.options.display.width = None
pd.options.display.max_columns = None

# Notebook-scoped logger, set to emit everything from DEBUG upwards.
_logger = logging.getLogger(__name__)
_logger.setLevel(logging.DEBUG)

In [3]:
# Load the feature-engineered train and test datasets by their catalog names.
# NOTE(review): `catalog` is not defined in this notebook — presumably provided by
# the kedro.ipython extension loaded in the first cell; confirm.
train_df = catalog.load("feature_engineered_train_dataset")
test_df = catalog.load("feature_engineered_test_dataset")

### Settings & Parameters

In [7]:
# Categorical feature columns that are each tested against LOAN_STATUS below.
CATEGORICAL_VARS = [
    'GRADE',
    'SUB_GRADE',
    'HOME_OWNERSHIP',
    'VERIFICATION_STATUS',
    'LOAN_TITLE',
    'INITIAL_LIST_STATUS',
    'APPLICATION_TYPE',
    'IS_CONSOLIDATION',
]

## Statistical Test
---
### Chi-Squared Test of Independence

In [5]:
from scipy.stats import chi2_contingency

In [55]:
# Chi-squared test of independence between each categorical feature and LOAN_STATUS,
# with Cramér's V (sqrt(chi2 / (n * (min(rows, cols) - 1)))) as the effect size.
#
# NOTE: chi2_contingency applies Yates' continuity correction by default when the
# table has one degree of freedom (2x2), so for binary features both the p-value and
# the V reported here are derived from the corrected statistic.
chi2_records = []
for var in CATEGORICAL_VARS:
    contingency_table = pd.crosstab(train_df[var], train_df['LOAN_STATUS'])
    chi2, p_value, dof, expected = chi2_contingency(contingency_table)
    n = contingency_table.to_numpy().sum()
    min_dim = min(contingency_table.shape) - 1
    # With a single observed level on either axis min_dim is 0 and V is undefined;
    # report NaN explicitly rather than dividing by zero.
    cramers_v = np.sqrt(chi2 / (n * min_dim)) if min_dim > 0 else np.nan
    chi2_records.append({'FEATURE': var, 'P_VALUE': p_value, 'CRAMERS_V': cramers_v})

# One row per feature, most significant first. Passing `columns` keeps the frame
# well-formed (and sortable) even when CATEGORICAL_VARS is empty.
chi2_results = (
    pd.DataFrame(chi2_records, columns=['FEATURE', 'P_VALUE', 'CRAMERS_V'])
    .sort_values(by='P_VALUE', ascending=True)
)

In [58]:
# Shade each metric so the darker cell is the more notable one: the reversed map for
# P_VALUE (small values stand out) and the forward map for CRAMERS_V (large values
# stand out). A single reversed gradient across both columns would paint the weakest
# associations darkest. Both scales stay pinned to [0, 1].
(
    chi2_results.style
    .background_gradient(subset=['P_VALUE'], cmap='BuGn_r', vmin=0, vmax=1)
    .background_gradient(subset=['CRAMERS_V'], cmap='BuGn', vmin=0, vmax=1)
)

Unnamed: 0,FEATURE,P_VALUE,CRAMERS_V
2,HOME_OWNERSHIP,0.000319,0.015449
5,INITIAL_LIST_STATUS,0.000504,0.013393
4,LOAN_TITLE,0.003142,0.030678
0,GRADE,0.029361,0.014418
7,IS_CONSOLIDATION,0.029854,0.008362
1,SUB_GRADE,0.505268,0.022193
3,VERIFICATION_STATUS,0.718691,0.003129
6,APPLICATION_TYPE,1.0,0.0


In [25]:
# Share of each LOAN_STATUS value within every HOME_OWNERSHIP level (normalised per
# row), shaded on one colour scale across the whole table.
(
    pd.crosstab(
        index=train_df['HOME_OWNERSHIP'],
        columns=train_df['LOAN_STATUS'],
        normalize='index',
    )
    .style
    .background_gradient(cmap='BuGn', axis=None)
)

LOAN_STATUS,0,1
HOME_OWNERSHIP,Unnamed: 1_level_1,Unnamed: 2_level_1
MORTGAGE,0.911337,0.088663
OWN,0.898449,0.101551
RENT,0.904306,0.095694


In [26]:
# Share of each LOAN_STATUS value within every INITIAL_LIST_STATUS level (normalised
# per row), shaded on one colour scale across the whole table.
(
    pd.crosstab(
        index=train_df['INITIAL_LIST_STATUS'],
        columns=train_df['LOAN_STATUS'],
        normalize='index',
    )
    .style
    .background_gradient(cmap='BuGn', axis=None)
)

LOAN_STATUS,0,1
INITIAL_LIST_STATUS,Unnamed: 1_level_1,Unnamed: 2_level_1
FORWARDED,0.903286,0.096714
WAITING,0.911099,0.088901


In [27]:
# Share of each LOAN_STATUS value within every LOAN_TITLE level (normalised per row),
# shaded on one colour scale across the whole table.
(
    pd.crosstab(
        index=train_df['LOAN_TITLE'],
        columns=train_df['LOAN_STATUS'],
        normalize='index',
    )
    .style
    .background_gradient(cmap='BuGn', axis=None)
)

LOAN_STATUS,0,1
LOAN_TITLE,Unnamed: 1_level_1,Unnamed: 2_level_1
BILL CONSOLIDATION,0.933333,0.066667
BILLS,0.6875,0.3125
BUSINESS,0.923497,0.076503
CAR FINANCING,0.868056,0.131944
CONSOLIDATION,0.911765,0.088235
CREDIT CARD,0.888889,0.111111
CREDIT CARD CONSOLIDATION,0.885714,0.114286
CREDIT CARD DEBT,0.888889,0.111111
CREDIT CARD LOAN,0.733333,0.266667
CREDIT CARD PAYDOWN,1.0,0.0


In [28]:
# Share of each LOAN_STATUS value within every GRADE level (normalised per row),
# shaded on one colour scale across the whole table.
(
    pd.crosstab(
        index=train_df['GRADE'],
        columns=train_df['LOAN_STATUS'],
        normalize='index',
    )
    .style
    .background_gradient(cmap='BuGn', axis=None)
)

LOAN_STATUS,0,1
GRADE,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.908752,0.091248
B,0.912763,0.087237
C,0.906104,0.093896
D,0.90362,0.09638
E,0.904127,0.095873
F,0.89626,0.10374
G,0.893651,0.106349


In [29]:
# Share of each LOAN_STATUS value within every SUB_GRADE level (normalised per row),
# shaded on one colour scale across the whole table.
(
    pd.crosstab(
        index=train_df['SUB_GRADE'],
        columns=train_df['LOAN_STATUS'],
        normalize='index',
    )
    .style
    .background_gradient(cmap='BuGn', axis=None)
)

LOAN_STATUS,0,1
SUB_GRADE,Unnamed: 1_level_1,Unnamed: 2_level_1
A1,0.908358,0.091642
A2,0.902014,0.097986
A3,0.905638,0.094362
A4,0.908569,0.091431
A5,0.905932,0.094068
B1,0.900137,0.099863
B2,0.91108,0.08892
B3,0.917479,0.082521
B4,0.908113,0.091887
B5,0.904343,0.095657


In [30]:
# Share of each LOAN_STATUS value within every IS_CONSOLIDATION level (normalised per
# row), shaded on one colour scale across the whole table.
(
    pd.crosstab(
        index=train_df['IS_CONSOLIDATION'],
        columns=train_df['LOAN_STATUS'],
        normalize='index',
    )
    .style
    .background_gradient(cmap='BuGn', axis=None)
)

LOAN_STATUS,0,1
IS_CONSOLIDATION,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.905359,0.094641
1,0.91028,0.08972


In [32]:
# Share of each LOAN_STATUS value within every VERIFICATION_STATUS level (normalised
# per row), shaded on one colour scale across the whole table.
(
    pd.crosstab(
        index=train_df['VERIFICATION_STATUS'],
        columns=train_df['LOAN_STATUS'],
        normalize='index',
    )
    .style
    .background_gradient(cmap='BuGn', axis=None)
)

LOAN_STATUS,0,1
VERIFICATION_STATUS,Unnamed: 1_level_1,Unnamed: 2_level_1
Not Verified,0.907884,0.092116
Source Verified,0.906617,0.093383
Verified,0.908729,0.091271


In [33]:
# Share of each LOAN_STATUS value within every APPLICATION_TYPE level (normalised per
# row), shaded on one colour scale across the whole table.
(
    pd.crosstab(
        index=train_df['APPLICATION_TYPE'],
        columns=train_df['LOAN_STATUS'],
        normalize='index',
    )
    .style
    .background_gradient(cmap='BuGn', axis=None)
)

LOAN_STATUS,0,1
APPLICATION_TYPE,Unnamed: 1_level_1,Unnamed: 2_level_1
INDIVIDUAL,0.907484,0.092516
JOINT,0.910569,0.089431
