In [1]:
## Importing necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
## Importing our train dataset

df = pd.read_csv('train_rf.csv',index_col=0)
df.head()

Unnamed: 0,loan_amnt,int_rate,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,...,home_ownership_NONE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,verification_status_Source Verified,verification_status_Verified,initial_list_status_w,grade,sub_grade,emp_length
182829,17325,12.12,40000.0,31.41,0,0,10,0,32236,77.7,...,0,0,0,0,0,1,1,1,7,10.0
743542,35000,23.99,160000.0,12.48,0,2,15,1,14600,42.2,...,0,0,0,0,1,0,0,5,26,2.0
200388,10000,12.12,60000.0,7.58,2,0,8,0,39414,49.8,...,0,0,0,1,0,1,0,1,7,5.0
34655,10000,13.22,35000.0,8.09,0,0,7,0,8907,61.9,...,0,0,0,1,0,0,0,2,11,3.0
83062,35000,23.4,100000.0,31.69,2,0,10,0,31528,82.7,...,0,0,0,1,0,1,0,4,24,10.0


In [3]:
df.dtypes

loan_amnt                                int64
int_rate                               float64
annual_inc                             float64
dti                                    float64
delinq_2yrs                              int64
inq_last_6mths                           int64
open_acc                                 int64
pub_rec                                  int64
revol_bal                                int64
revol_util                             float64
total_acc                                int64
tot_coll_amt                           float64
tot_cur_bal                            float64
total_rev_hi_lim                       float64
default_ind                              int64
term_ 60 months                          int64
home_ownership_MORTGAGE                  int64
home_ownership_NONE                      int64
home_ownership_OTHER                     int64
home_ownership_OWN                       int64
home_ownership_RENT                      int64
verification_

In [82]:
# Changing the datatypes of categorical variables from 'int' to 'object'

data_types_dict = {'default_ind': str,'term_ 60 months':str,'home_ownership_MORTGAGE':str,'home_ownership_NONE':str,'home_ownership_OTHER':str,
                   'home_ownership_OWN':str,'home_ownership_RENT':str,'verification_status_Source Verified':str,'verification_status_Verified':str,
                   'initial_list_status_w':str,'grade':str,'sub_grade':str,'emp_length':str}
  
df = df.astype(data_types_dict)
  
df.dtypes

loan_amnt                                int64
int_rate                               float64
annual_inc                             float64
dti                                    float64
delinq_2yrs                              int64
inq_last_6mths                           int64
open_acc                                 int64
pub_rec                                  int64
revol_bal                                int64
revol_util                             float64
total_acc                                int64
tot_coll_amt                           float64
tot_cur_bal                            float64
total_rev_hi_lim                       float64
default_ind                             object
term_ 60 months                         object
home_ownership_MORTGAGE                 object
home_ownership_NONE                     object
home_ownership_OTHER                    object
home_ownership_OWN                      object
home_ownership_RENT                     object
verification_

In [83]:
## List of Categorical Features

cat_df = df.select_dtypes(include= 'object')
cat_df.columns

Index(['default_ind', 'term_ 60 months', 'home_ownership_MORTGAGE',
       'home_ownership_NONE', 'home_ownership_OTHER', 'home_ownership_OWN',
       'home_ownership_RENT', 'verification_status_Source Verified',
       'verification_status_Verified', 'initial_list_status_w', 'grade',
       'sub_grade', 'emp_length'],
      dtype='object')

In [84]:
## List of Continuous Features

num_df = df.select_dtypes(exclude= 'object')
num_df.columns

Index(['loan_amnt', 'int_rate', 'annual_inc', 'dti', 'delinq_2yrs',
       'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util',
       'total_acc', 'tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim'],
      dtype='object')

## Chi-square test (Categorical Features Vs Target)

In [85]:
from scipy.stats import chi2_contingency

In [86]:
cat_df.shape

(558814, 13)

In [87]:
target= cat_df.default_ind

In [88]:
cat_df1= cat_df.iloc[:,1:]
cat_df1.columns

Index(['term_ 60 months', 'home_ownership_MORTGAGE', 'home_ownership_NONE',
       'home_ownership_OTHER', 'home_ownership_OWN', 'home_ownership_RENT',
       'verification_status_Source Verified', 'verification_status_Verified',
       'initial_list_status_w', 'grade', 'sub_grade', 'emp_length'],
      dtype='object')

In [89]:
# Hypothesis for Chi-square test of Independence:

# H0: The two variables are independent
# H1: The two variables are dependent

signi_feat= []

for col in cat_df1.columns:
    table = pd.crosstab(target, cat_df1[col])
    observed_value = table.values   
    test_stat, p, dof, expected_value = chi2_contingency(observed = observed_value, correction = False)
    if p < 0.05:
        signi_feat.append(col)
        
print(signi_feat)

['term_ 60 months', 'home_ownership_MORTGAGE', 'home_ownership_NONE', 'home_ownership_OTHER', 'home_ownership_OWN', 'home_ownership_RENT', 'verification_status_Source Verified', 'verification_status_Verified', 'initial_list_status_w', 'grade', 'sub_grade', 'emp_length']


## Mann-Whitney U Test (Continuous Features Vs Target)

In [90]:
df['default_ind'].value_counts()

0    528457
1     30357
Name: default_ind, dtype: int64

In [92]:
num_df.columns

Index(['loan_amnt', 'int_rate', 'annual_inc', 'dti', 'delinq_2yrs',
       'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util',
       'total_acc', 'tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim'],
      dtype='object')

In [93]:
df2= pd.concat([df['default_ind'],num_df],axis=1)

In [96]:
non_def= df2[df2['default_ind']=='0']['loan_amnt']
defaulter= df2[df2['default_ind']=='1']['loan_amnt']

In [98]:
# Normality Test

for col in df2.iloc[:,1:]:
    non_def= df2[df2['default_ind']=='0'][col]
    defaulter= df2[df2['default_ind']=='1'][col]
    print(col,'_def:',stats.kstest(non_def, 'norm').pvalue)
    print(col,'_non-def:',stats.kstest(defaulter, 'norm').pvalue)

loan_amnt _def: 0.0
loan_amnt _non-def: 0.0
int_rate _def: 0.0
int_rate _non-def: 0.0
annual_inc _def: 0.0
annual_inc _non-def: 0.0
dti _def: 0.0
dti _non-def: 0.0
delinq_2yrs _def: 0.0
delinq_2yrs _non-def: 0.0
inq_last_6mths _def: 0.0
inq_last_6mths _non-def: 0.0
open_acc _def: 0.0
open_acc _non-def: 0.0
pub_rec _def: 0.0
pub_rec _non-def: 0.0
revol_bal _def: 0.0
revol_bal _non-def: 0.0
revol_util _def: 0.0
revol_util _non-def: 0.0
total_acc _def: 0.0
total_acc _non-def: 0.0
tot_coll_amt _def: 0.0
tot_coll_amt _non-def: 0.0
tot_cur_bal _def: 0.0
tot_cur_bal _non-def: 0.0
total_rev_hi_lim _def: 0.0
total_rev_hi_lim _non-def: 0.0


As the data is not normally distributed, we will go for non-parametric test.

In [99]:
from scipy.stats import mannwhitneyu

In [100]:
# Hypothesis for Chi-square test of Independence:

# H0: Sample distributions are equal.
# H1: Sample distributions are not equal

signi_feat= []

for col in df2.iloc[:,1:]:
    non_def= df2[df2['default_ind']=='0'][col]
    defaulter= df2[df2['default_ind']=='1'][col]
    stat, p = mannwhitneyu(non_def, defaulter)
    
    if p < 0.5:
        signi_feat.append(col)
        
print(signi_feat)

['loan_amnt', 'int_rate', 'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim']


In [101]:
# Printing the p values

for col in df2.iloc[:,1:]:
    non_def= df2[df2['default_ind']=='0'][col]
    defaulter= df2[df2['default_ind']=='1'][col]
    stat, p = mannwhitneyu(non_def, defaulter)
    
    print(col,':',p)
        

loan_amnt : 0.02969753132893765
int_rate : 0.0
annual_inc : 0.0
dti : 5.119484275184611e-31
delinq_2yrs : 3.6357055241299304e-16
inq_last_6mths : 0.0
open_acc : 1.9804769406637932e-35
pub_rec : 1.9504681956872042e-53
revol_bal : 2.7207775533952853e-32
revol_util : 2.156529989015617e-251
total_acc : 2.744209453791259e-50
tot_coll_amt : 3.737382912358296e-92
tot_cur_bal : 1.756576442863898e-121
total_rev_hi_lim : 4.131015550169742e-161


#### 

# List of Significant Features

### Continuous Features:

['loan_amnt', 'int_rate', 'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim']

### Categorical Features:

['term_ 60 months', 'home_ownership_MORTGAGE', 'home_ownership_NONE', 'home_ownership_OTHER', 'home_ownership_OWN', 'home_ownership_RENT', 'verification_status_Source Verified', 'verification_status_Verified', 'initial_list_status_w', 'grade', 'sub_grade', 'emp_length']