## Mutual information

In [1]:
import numpy as np
import pandas as pd
import math
from scipy import stats

### Loading Data

In [2]:
df_lp = pd.read_csv('LoanPrediction_train.csv')
df_lp.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
# convert categorical variables to numericals
df_lp['Married'] =df_lp['Married'].astype('category').cat.codes
df_lp['Gender'] =df_lp['Gender'].astype('category').cat.codes
df_lp['Loan_ID'] =df_lp['Loan_ID'].astype('category').cat.codes
df_lp['Dependents'] =df_lp['Dependents'].astype('category').cat.codes
df_lp['Education'] =df_lp['Education'].astype('category').cat.codes
df_lp['Self_Employed'] =df_lp['Self_Employed'].astype('category').cat.codes
df_lp['Property_Area'] =df_lp['Property_Area'].astype('category').cat.codes
df_lp['Loan_Status'] =df_lp['Loan_Status'].astype('category').cat.codes

#handle missing data
df_lp['Gender'].fillna(df_lp['Gender'].mode()[0], inplace=True)
df_lp['Married'].fillna(df_lp['Married'].mode()[0] , inplace=True )
df_lp['Dependents'].fillna(df_lp['Dependents'].mode()[0], inplace=True)
df_lp['LoanAmount'].fillna(df_lp['LoanAmount'].mode()[0], inplace=True)
df_lp['Loan_Amount_Term'].fillna(df_lp['Loan_Amount_Term'].mode()[0], inplace=True)
df_lp['Credit_History'].fillna(df_lp['Credit_History'].mode()[0], inplace=True)

df_lp.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,1,0,0,0,0,5849,0.0,120.0,360.0,1.0,2,1
1,1,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,0
2,2,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2,1
3,3,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2,1
4,4,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2,1


In [4]:
def mutual_info(dataset, index_x , index_y):
    N = dataset.shape[0]
    x_unique=dataset[index_x].unique()
    y_unique=dataset[index_y].unique()
    xy = dataset[[index_x , index_y]].values
    
    result = 0
    for value_x in x_unique:
        x_count = len ([ i for i in dataset[index_x] if i == value_x])
        p_x = np.round( x_count / N , 6)
        for value_y in y_unique:
           
            p_y = np.round(len ([ i for i in dataset[index_y] if i == value_y]) / N    , 6)
            p_xy = np.round(len ([ item for item in xy if item[0] == value_x and item[1] == value_y]) / x_count , 6)
            
            if( p_xy > 0 and p_x > 0 and  p_y > 0 ):
                result +=  (p_xy * p_x) * math.log(  (p_xy * p_x) / (p_x * p_y) )
   
    

    return np.round(result,4)

In [5]:
for feat in df_lp.columns:
    if feat != "Loan_Status":
        print(mutual_info(df_lp, feat, "Loan_Status"))

0.6214
0.0004
0.0058
0.003
0.0036
0.0001
0.5173
0.2897
0.2004
0.0124
0.1424
0.0102
