In [68]:
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import numpy as np
import statsmodels.api as sm
from scipy.stats import chi2_contingency

In [3]:
#import dataset
hmeq_df=pd.read_csv("C:/hmeq.csv")
hmeq_df.head()

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,REASON,JOB,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
0,1,1100,25860.0,39025.0,HomeImp,Other,10.5,0.0,0.0,94.366667,1.0,9.0,
1,1,1300,70053.0,68400.0,HomeImp,Other,7.0,0.0,2.0,121.833333,0.0,14.0,
2,1,1500,13500.0,16700.0,HomeImp,Other,4.0,0.0,0.0,149.466667,1.0,10.0,
3,1,1500,,,,,,,,,,,
4,0,1700,97800.0,112000.0,HomeImp,Office,3.0,0.0,0.0,93.333333,0.0,14.0,


In [4]:
np.random.seed(12345)
n_size = 1000

#sample for FICO
sam_df= hmeq_df.sample(n=n_size, replace = False)

print(len(sam_df))

1000


In [7]:
sam_df.head()

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,REASON,JOB,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
1603,0,11600,89205.0,128462.0,DebtCon,Office,10.0,0.0,0.0,128.123598,0.0,21.0,37.523325
5920,0,68100,193298.0,288525.0,DebtCon,Self,10.0,0.0,0.0,228.287295,2.0,25.0,30.395869
1452,1,11000,77517.0,86214.0,HomeImp,ProfExe,2.0,2.0,0.0,352.379138,0.0,28.0,28.030214
604,0,7700,32243.0,36369.0,HomeImp,ProfExe,12.0,0.0,0.0,180.415787,0.0,13.0,21.720818
2172,1,13500,70000.0,93500.0,DebtCon,Sales,25.0,0.0,0.0,250.633333,0.0,46.0,


In [13]:
results = (
    hmeq_df["BAD"]
    .value_counts()
    .sort_index()
    .rename("Frequency")
    .to_frame()
)
results["Percent"] = (results["Frequency"]/len(hmeq_df)*100).round(2)
results                      

Unnamed: 0,Frequency,Percent
0,4771,80.05
1,1189,19.95


In [14]:
# Descritptive Statistics
results = (
    sam_df["BAD"]
    .value_counts()
    .sort_index()
    .rename("Frequency")
    .to_frame()
)
results["Percent"] = (results["Frequency"]/len(sam_df)*100).round(2)
results                      

Unnamed: 0,Frequency,Percent
0,815,81.5
1,185,18.5


In [17]:
# Descriptive Statistics
hmeq_df["LOAN"].describe().round(2)

count     5960.00
mean     18607.97
std      11207.48
min       1100.00
25%      11100.00
50%      16300.00
75%      23300.00
max      89900.00
Name: LOAN, dtype: float64

In [19]:
# What is the sum of the missing values?
hmeq_df.isnull().sum()

BAD           0
LOAN          0
MORTDUE     518
VALUE       112
REASON      252
JOB         279
YOJ         515
DEROG       708
DELINQ      580
CLAGE       308
NINQ        510
CLNO        222
DEBTINC    1267
dtype: int64

In [32]:
# Dropping some columns
mySample_cnt = sam_df.drop(sam_df.columns[[0,4,5]], axis =1)

# replace NaN with column mean for numeric columns only
mySample_cnt = mySample_cnt.apply(
    lambda x: x.fillna(x.mean()) if x.dtype.kind in "biufc" else x
)

In [34]:
mySample_cnt.isna().sum()

LOAN       0
MORTDUE    0
VALUE      0
YOJ        0
DEROG      0
DELINQ     0
CLAGE      0
NINQ       0
CLNO       0
DEBTINC    0
dtype: int64

In [99]:
mySample_cnt_mean = mySample_cnt.mean()
mySample_cnt_sd = mySample_cnt.std()

In [100]:
zscores = (mySample_cnt - mySample_cnt.mean())/ mySample_cnt.std(ddof=1)
zscores

Unnamed: 0,LOAN,MORTDUE,VALUE,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
1603,-0.584538,0.347507,0.449617,0.117663,-0.322535,-0.451824,-0.661902,-0.743905,-0.106671,0.539921
5920,4.307612,2.748932,3.183773,0.117663,-0.322535,-0.451824,0.510569,0.440350,0.281443,-0.491403
1452,-0.636490,0.077865,-0.272053,-0.954242,2.260720,-0.451824,1.963132,-0.743905,0.572529,-0.833707
604,-0.922227,-0.966606,-1.123493,0.385640,-0.322535,-0.451824,-0.049793,-0.743905,-0.882900,-1.746660
2172,-0.420024,-0.095552,-0.147595,2.127486,-0.322535,-0.451824,0.772142,-0.743905,2.319043,0.000000
...,...,...,...,...,...,...,...,...,...,...
2385,-0.342096,0.074381,0.481526,0.653616,-0.322535,0.505612,1.176764,-0.151778,-0.397757,-0.813241
1367,-0.671125,-0.356220,-0.583743,0.117663,-0.322535,-0.451824,-0.797584,-0.151778,-0.300728,-0.325819
4143,0.281329,-0.344270,-0.068831,1.189569,-0.322535,-0.451824,5.175044,-0.151778,0.087386,-1.815989
4627,0.515113,-0.537897,-0.579780,-0.284301,-0.322535,-0.451824,-1.328435,-0.151778,-0.300728,0.391299


In [43]:
max_abs = zscores.abs().max(axis=1)
max_abs

1603    0.743905
5920    4.307612
1452    2.260720
604     1.746660
2172    2.319043
          ...   
2385    1.176764
1367    0.797584
4143    5.175044
4627    1.328435
5807    2.740392
Length: 1000, dtype: float64

In [47]:
filtered_example =  zscores[max_abs < 3]
filtered_example

Unnamed: 0,LOAN,MORTDUE,VALUE,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
1603,-0.584538,0.347507,0.449617,0.117663,-0.322535,-0.451824,-0.661902,-0.743905,-0.106671,0.539921
1452,-0.636490,0.077865,-0.272053,-0.954242,2.260720,-0.451824,1.963132,-0.743905,0.572529,-0.833707
604,-0.922227,-0.966606,-1.123493,0.385640,-0.322535,-0.451824,-0.049793,-0.743905,-0.882900,-1.746660
2172,-0.420024,-0.095552,-0.147595,2.127486,-0.322535,-0.451824,0.772142,-0.743905,2.319043,0.000000
4897,0.636335,-0.958070,-0.667478,-1.222218,-0.322535,-0.451824,0.395831,-0.743905,-0.009643,1.073389
...,...,...,...,...,...,...,...,...,...,...
2052,-0.463317,2.372937,1.671611,-0.820254,-0.322535,1.463048,0.130678,-0.151778,2.707157,0.000000
2385,-0.342096,0.074381,0.481526,0.653616,-0.322535,0.505612,1.176764,-0.151778,-0.397757,-0.813241
1367,-0.671125,-0.356220,-0.583743,0.117663,-0.322535,-0.451824,-0.797584,-0.151778,-0.300728,-0.325819
4627,0.515113,-0.537897,-0.579780,-0.284301,-0.322535,-0.451824,-1.328435,-0.151778,-0.300728,0.391299


In [96]:
print(filtered_example.shape[0],"\n",zscores.shape[0])

866 
 1000


In [72]:
residence = pd.DataFrame({
    "default": ["good"]*6 + ["bad"]*6,
    "resstatus": ["owner", "rentunf", "rentfurn", "withpar", "other", "noanswer"]*2,
    "count": [6000, 1600, 350, 950, 90, 10,
              300, 400, 140, 100, 50, 10]
})    

In [71]:
#For coarse 1
coarse1 = pd.DataFrame({
    "default": ["good"]*3 + ["bad"]*3,
    "resstatus": ["owner", "withpar", "other"]*2,
    "count": [6000, 950, 1050, 300, 100, 600]
})


In [74]:
#For coarse 2
coarse2 = pd.DataFrame({
    "default":["good"]*3 +["bad"]*3,
    "resstatus": ["owner","withpar", "other"]*2,
    "count" :[6000,950,2050,300,100,600]
})

In [76]:
coarse1_tbl = coarse1.pivot_table(
    index="default",
    columns ="resstatus",
    values = "count",
    aggfunc= "sum"
)
coarse1_tbl

resstatus,other,owner,withpar
default,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bad,600,300,100
good,1050,6000,950


In [77]:
coarse2_tbl = coarse2.pivot_table(
    index="default",
    columns ="resstatus",
    values = "count",
    aggfunc= "sum"
)
coarse2_tbl

resstatus,other,owner,withpar
default,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bad,600,300,100
good,2050,6000,950


In [78]:
coarse1_row_prop = coarse1_tbl.div(coarse1_tbl.sum(axis=1), axis =0)
coarse2_row_prop = coarse2_tbl.div(coarse2_tbl.sum(axis=1), axis =0)
print(coarse1_row_prop)
print(coarse2_row_prop)

resstatus    other  owner  withpar
default                           
bad        0.60000   0.30  0.10000
good       0.13125   0.75  0.11875
resstatus     other     owner   withpar
default                                
bad        0.600000  0.300000  0.100000
good       0.227778  0.666667  0.105556


In [80]:
coarse1_col_prop = coarse1_tbl.div(coarse1_tbl.sum(axis=0), axis =1)
coarse2_col_prop = coarse2_tbl.div(coarse2_tbl.sum(axis=0), axis =1)
print(coarse1_col_prop)
print(coarse2_col_prop)

resstatus     other     owner   withpar
default                                
bad        0.363636  0.047619  0.095238
good       0.636364  0.952381  0.904762
resstatus     other     owner   withpar
default                                
bad        0.226415  0.047619  0.095238
good       0.773585  0.952381  0.904762


In [83]:
#Chi-square tests 

chi2_1,p_1, dof_1, expected_1 = chi2_contingency(coarse1_tbl)
print("Coarse chi-square:", chi2_1)
print("p-value:", p_1)

Coarse chi-square: 1325.1623376623374
p-value: 1.756525923308038e-288


In [87]:

chi2_2,p_2, dof_1, expected_2 = chi2_contingency(coarse2_tbl)
print("Coarse chi-square:", chi2_2)
print("p-value:", p_2)

Coarse chi-square: 662.8731157033042
p-value: 1.1453331317966107e-144


Both coarsenings show a strong and statistically significant association between residence and default.
However , Coarse 1 retains substantially more discriminatory information, as evidenced by a higher chi-squre statistic.