In [1]:
#Categorical Feature Selection via Chi-Square
#Null Hypothesis (H0): There is no relationship between the variables
#Alternative Hypothesis (H1): There is a relationship between variables

import pandas as pd
import numpy as np
# Display every column when a DataFrame is shown (no column truncation).
pd.options.display.max_columns = None

# Load the loan dataset.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where the CSV exists at exactly this location.
df = pd.read_csv(filepath_or_buffer=r'D:\Learn\Statistics\Practical\Chi\loan_data_set.csv')

def initial_analysis(df):
    """Print a quick structural overview of a DataFrame.

    Writes four reports to stdout, in this order: the ``info()`` summary
    (dtypes and non-null counts), the first rows, the ``describe()``
    statistics, and the per-column count of missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to inspect. It is not modified.

    Returns
    -------
    None
    """
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would emit an extra, misleading "None" line.
    df.info()
    print(df.head())
    print(df.describe())
    print(df.isnull().sum())

In [2]:
initial_analysis(df)    
# The initial analysis shows that several columns contain null values and that there is an uninformative identifier column (Loan_ID).

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB
None
    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0 

In [3]:
# The initial analysis showed null values and an uninformative identifier column.
# Drop the identifier (Loan_ID); it carries no information about the outcome.
df1 = df.drop(columns='Loan_ID')
df1.shape  # 12 columns remain

(614, 12)

In [4]:
# To illustrate the Chi-Square test on more features, treat two numeric
# columns as categorical by casting them to the object dtype.

for column_name in ('Loan_Amount_Term', 'Credit_History'):
    df1[column_name] = df1[column_name].astype('O')

In [6]:
# At this point every column except the financial ones has the object dtype.

# The goal here is to demonstrate the Chi-Square test, not imputation,
# so rows containing any null value are simply dropped.

df2 = df1.dropna(axis=0, how='any')
df2.shape

(480, 12)

In [7]:
# Collect the names of all non-numeric columns, excluding the target (Loan_Status).

categorical_column = (
    df2.select_dtypes(exclude='number')
       .drop(columns='Loan_Status')
       .columns
)

categorical_column.shape  # number of categorical feature columns


(8,)

In [8]:
# The Chi-Square test works on a cross-tabulation (contingency table):
# rows are the Gender classes, columns are the Loan_Status classes.

pd.crosstab(index=df2['Gender'], columns=df2['Loan_Status'])

Loan_Status,N,Y
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,32,54
Male,116,278


In [9]:
#Now, let’s try to use the Chi-Square test of independence to test the relationship between these 2 features

from scipy.stats import chi2_contingency

In [10]:
# Build the Gender x Loan_Status contingency table and run the Chi-Square
# test of independence on it. For a 2x2 table chi2_contingency applies
# Yates' continuity correction by default.
gender_vs_status = pd.crosstab(df2['Gender'], df2['Loan_Status'])
chi_relation = chi2_contingency(gender_vs_status)

# The first two entries of the result are the test statistic and the p-value.
chi_stat, p_value = chi_relation[0], chi_relation[1]
print("chi_statictics {}, p_value {}".format(chi_stat, p_value))

chi_statictics 1.6495637942018448, p_value 0.1990183114281211


In [None]:
# If we choose a significance level of 0.05, the p-value of the test (about 0.199) is greater than 0.05,
# so we fail to reject the Null Hypothesis. This means the Chi-Square test of independence
# found no statistically significant evidence of a relationship between the Gender and Loan Status features.

# Now we will apply this test for all feature

In [11]:
# Run the Chi-Square test of independence between the target (Loan_Status)
# and every categorical feature, recording the decision at the 0.05 level.
chi2_check = []
for feature in categorical_column:
    contingency = pd.crosstab(df2['Loan_Status'], df2[feature])
    p_value = chi2_contingency(contingency)[1]  # index 1 is the p-value
    verdict = 'Reject Null Hypothesis' if p_value < 0.05 else 'Fail to Reject Null Hypothesis'
    chi2_check.append(verdict)

# One row per feature: its name and the test decision.
res = pd.DataFrame({'Column': list(categorical_column), 'Hypothesis': chi2_check})
print(res)

             Column                      Hypothesis
0            Gender  Fail to Reject Null Hypothesis
1           Married          Reject Null Hypothesis
2        Dependents  Fail to Reject Null Hypothesis
3         Education  Fail to Reject Null Hypothesis
4     Self_Employed  Fail to Reject Null Hypothesis
5  Loan_Amount_Term          Reject Null Hypothesis
6    Credit_History          Reject Null Hypothesis
7     Property_Area          Reject Null Hypothesis


In [None]:
# The chi-square test only tells us which features are associated with the target feature;
# to find which classes within a feature drive that association, we have to apply Post Hoc Testing.


Post Hoc Testing


In [12]:
# NOTE(review): this cell repeats the overall per-feature Chi-Square test run
# earlier in the notebook; no per-class (post hoc) comparison is performed here.

# p-value of the Loan_Status-vs-feature test for each categorical feature.
p_values = [
    chi2_contingency(pd.crosstab(df2['Loan_Status'], df2[feature]))[1]
    for feature in categorical_column
]

# Decision at the 0.05 significance level, in the same order as the features.
chi2_check = [
    'Reject Null Hypothesis' if p < 0.05 else 'Fail to Reject Null Hypothesis'
    for p in p_values
]

res = pd.DataFrame(
    list(zip(categorical_column, chi2_check)),
    columns=['Column', 'Hypothesis'],
)
print(res)

             Column                      Hypothesis
0            Gender  Fail to Reject Null Hypothesis
1           Married          Reject Null Hypothesis
2        Dependents  Fail to Reject Null Hypothesis
3         Education  Fail to Reject Null Hypothesis
4     Self_Employed  Fail to Reject Null Hypothesis
5  Loan_Amount_Term          Reject Null Hypothesis
6    Credit_History          Reject Null Hypothesis
7     Property_Area          Reject Null Hypothesis
