In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from matplotlib.ticker import MaxNLocator

from scipy.stats import levene
import scipy.stats as stats
from scipy.stats import chi2_contingency
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

pd.set_option('display.max_columns', None)

In [2]:
data = pd.read_csv("Bank Leads Cleaned v2 20240119.csv")

## Scale numeric features and run hypothesis tests

In [3]:
approved = data[data['Approved']==1]
not_approved = data[data['Approved']!=1]
print("Approved: ", approved.shape)
print("Not Approved: ", not_approved.shape)

Approved:  (1015, 25)
Not Approved:  (67782, 25)


In [4]:
"""
Scaling: Box-Cox, Standardization
Testing: Levene's Test of Variance, Mann Whitney U Test of Distribution
"""

def scale_test(data1, data2):
    # Instantiate PowerTransformer and run
    pt = PowerTransformer(method='box-cox')

    pt_scaled_a = pt.fit_transform((data1.values + 1).reshape(-1, 1))
    pt_scaled_na = pt.fit_transform((data2.values + 1).reshape(-1, 1))

    ## Levene's test of variance
    lev_a = pd.Series(map(lambda x: x, pt_scaled_a))
    lev_na = pd.Series(map(lambda x: x, pt_scaled_na))
    print("***Box-Cox Scaled***\nLevene's test of variance...................................")
    if levene(lev_a, lev_na)[1] < 0.05:
        print('Reject the null hypothesis of equal variance between groups.')
        print(f'P-value is {levene(lev_a, lev_na)[1]}.')
    else:
        print('Fail to reject the null hypothesis of equal variance between groups.')
        print(f'P-value is {levene(lev_a, lev_na)[1]}.')

    ## Mann Whitney U test of distribution
    print("\nMann Whitney U test of distribution..........................")
    if mannwhitneyu(pt_scaled_a, pt_scaled_na, alternative='two-sided')[1] < 0.05:
        print("Reject the null hypothesis that Monthly Income are similar.")
        print(f"P-value is {mannwhitneyu(pt_scaled_a, pt_scaled_na, alternative='two-sided')[1]}")
    else:
        print("Fail to reject the null hypothesis that Monthly Income are similar.")
        print(f"P-value is {mannwhitneyu(pt_scaled_a, pt_scaled_na, alternative='two-sided')[1]}")

    # Instantiate StandardScaler and run
    ss = StandardScaler()

    ss_scaled_a = ss.fit_transform((data1.values + 1).reshape(-1, 1))
    ss_scaled_na = ss.fit_transform((data2.values + 1).reshape(-1, 1))

    # Levene's test of variance
    lev_a_ss = pd.Series(map(lambda x: x, ss_scaled_a))
    lev_na_ss = pd.Series(map(lambda x: x, ss_scaled_na))                              
    print("\n\n***Standardized***\nLevene's test of variance...................................")
    if levene(lev_a_ss, lev_na_ss)[1] < 0.05:
        print('Reject the null hypothesis of equal variance between groups.')
        print(f'P-value is {levene(lev_a_ss, lev_na_ss)[1]}.')
    else:
        print('Fail to reject the null hypothesis of equal variance between groups.')
        print(f'P-value is {levene(lev_a_ss, lev_na_ss)[1]}.')

    # Mann Whitney U test of distribution
    print("\nMann Whitney U test of distribution..........................")
    if mannwhitneyu(pt_scaled_a, pt_scaled_na, alternative='two-sided')[1] < 0.05:
        print("Reject the null hypothesis that Monthly Income are similar.")
        print(f"P-value is {mannwhitneyu(pt_scaled_a, pt_scaled_na, alternative='two-sided')[1]}")
    else:
        print("Fail to reject the null hypothesis that Monthly Income are similar.")
        print(f"P-value is {mannwhitneyu(pt_scaled_a, pt_scaled_na, alternative='two-sided')[1]}")

### Scale and test Monthly_Income

In [5]:
scale_test(approved['Monthly_Income'], not_approved['Monthly_Income'])

***Box-Cox Scaled***
Levene's test of variance...................................
Reject the null hypothesis of equal variance between groups.
P-value is [8.97713067e-06].

Mann Whitney U test of distribution..........................


NameError: name 'mannwhitneyu' is not defined

### Scale and test Age

In [None]:
scale_test(approved['Age'], not_approved['Age'])

## Scale features and run Welch's unequal variance t-test

In [6]:
"""
Scaling: Box-Cox, Standardization 
Testing: Levene's Test of Variance, Welch's T-Test of Unequal Variance
"""


def scale_test_welch(data1, data2):
    # Instantiate PowerTransformer and run
    pt = PowerTransformer(method='box-cox')

    pt_scaled_a = pt.fit_transform((data1.values + 1).reshape(-1, 1))
    pt_scaled_na = pt.fit_transform((data2.values + 1).reshape(-1, 1))

    ## Levene's test of variance
    lev_a = pd.Series(map(lambda x: x, pt_scaled_a))
    lev_na = pd.Series(map(lambda x: x, pt_scaled_na))
    print("***Box-Cox Scaled***\nLevene's test of variance...................................")
    if levene(lev_a, lev_na)[1] < 0.05:
        print('Reject the null hypothesis of equal variance between groups.')
        print(f'P-value is {levene(lev_a, lev_na)[1]}.')
    else:
        print('Fail to reject the null hypothesis of equal variance between groups.')
        print(f'P-value is {levene(lev_a, lev_na)[1]}.')

    ## Welch's T-test of distribution
    print("\nWelch's T-test of Unequal Variance..............................................")
    if stats.ttest_ind(pt_scaled_a, pt_scaled_na, equal_var=False)[1] < 0.05:
        print("Reject the null hypothesis that means are similar.")
        print(f"P-value is {stats.ttest_ind(pt_scaled_a, pt_scaled_na, equal_var=False)[1]}")
    else:
        print("Fail to reject the null hypothesis that means are similar.")
        print(f"P-value is {stats.ttest_ind(pt_scaled_a, pt_scaled_na, equal_var=False)[1]}")

    # Instantiate StandardScaler and run
    ss = StandardScaler()

    ss_scaled_a = ss.fit_transform((data1.values + 1).reshape(-1, 1))
    ss_scaled_na = ss.fit_transform((data2.values + 1).reshape(-1, 1))

    # Levene's test of variance
    lev_a_ss = pd.Series(map(lambda x: x, ss_scaled_a))
    lev_na_ss = pd.Series(map(lambda x: x, ss_scaled_na))                            
    print("\n\n***Standardized***\nLevene's test of variance...................................")
    if levene(lev_a_ss, lev_na_ss)[1] < 0.05:
        print('Reject the null hypothesis of equal variance between groups.')
        print(f'P-value is {levene(lev_a_ss, lev_na_ss)[1]}.')
    else:
        print('Fail to reject the null hypothesis of equal variance between groups.')
        print(f'P-value is {levene(lev_a_ss, lev_na_ss)[1]}.')

    # Welch's T-test of distribution
    print("\nWelch's T-test of Unequal Variance................................................")
    if stats.ttest_ind(pt_scaled_a, pt_scaled_na, equal_var=False)[1] < 0.05:
        print("Reject the null hypothesis that means are similar.")
        print(f"P-value is {stats.ttest_ind(pt_scaled_a, pt_scaled_na, equal_var=False)[1]}")
    else:
        print("Fail to reject the null hypothesis that means are similar.")
        print(f"P-value is {stats.ttest_ind(pt_scaled_a, pt_scaled_na, equal_var=False)[1]}")

In [None]:
scale_test_welch(approved['Monthly_Income'], not_approved['Monthly_Income'])

In [None]:
scale_test_welch(approved['Age'], not_approved['Age'])

In [None]:
pt = PowerTransformer(method='box-cox')

pt_scaled_a = pt.fit_transform((approved['Monthly_Income'].values + 1).reshape(-1, 1))
pt_scaled_na = pt.fit_transform((not_approved['Monthly_Income'].values + 1).reshape(-1, 1))

plt.figure(figsize=(15, 8))
plt.hist(x=pt_scaled_a)
plt.show()

In [None]:
plt.figure(figsize=(15, 8))
plt.hist(x=pt_scaled_na)
plt.show()

In [12]:
"""
Scaling: Box-Cox, Standardization, Robust Scaler
testing: Levene's Test of Variance, Welch's T-test of Distribution 
"""

def robust_scale_test_welch(data1, data2):
    # Instantiate PowerTransformer and run
    pt = PowerTransformer(method='box-cox')

    pt_scaled_a = pt.fit_transform((data1.values + 1).reshape(-1, 1))
    pt_scaled_na = pt.fit_transform((data2.values + 1).reshape(-1, 1))

    ## Levene's test of variance
    lev_a = pd.Series(map(lambda x: x, pt_scaled_a))
    lev_na = pd.Series(map(lambda x: x, pt_scaled_na))
    print("***Box-Cox Scaled***\nLevene's test of variance...................................")
    if levene(lev_a, lev_na)[1] < 0.05:
        print('Reject the null hypothesis of equal variance between groups.')
        print(f'P-value is {levene(lev_a, lev_na)[1]}.')
    else:
        print('Fail to reject the null hypothesis of equal variance between groups.')
        print(f'P-value is {levene(lev_a, lev_na)[1]}.')

    ## Welch's T-test of distribution
    print("\nWelch's T-test of Unequal Variance..............................................")
    if stats.ttest_ind(pt_scaled_a, pt_scaled_na, equal_var=False)[1] < 0.05:
        print("Reject the null hypothesis that means are similar.")
        print(f"P-value is {stats.ttest_ind(pt_scaled_a, pt_scaled_na, equal_var=False)[1]}")
    else:
        print("Fail to reject the null hypothesis that means are similar.")
        print(f"P-value is {stats.ttest_ind(pt_scaled_a, pt_scaled_na, equal_var=False)[1]}")

    # Instantiate StandardScaler and run
    ss = StandardScaler()

    ss_scaled_a = ss.fit_transform((data1.values + 1).reshape(-1, 1))
    ss_scaled_na = ss.fit_transform((data2.values + 1).reshape(-1, 1))

    # Levene's test of variance
    lev_a_ss = pd.Series(map(lambda x: x, ss_scaled_a))
    lev_na_ss = pd.Series(map(lambda x: x, ss_scaled_na))                            
    print("\n\n***Standardized***\nLevene's test of variance...................................")
    if levene(lev_a_ss, lev_na_ss)[1] < 0.05:
        print('Reject the null hypothesis of equal variance between groups.')
        print(f'P-value is {levene(lev_a_ss, lev_na_ss)[1]}.')
    else:
        print('Fail to reject the null hypothesis of equal variance between groups.')
        print(f'P-value is {levene(lev_a_ss, lev_na_ss)[1]}.')

    # Welch's T-test of distribution
    print("\nWelch's T-test of Unequal Variance................................................")
    if stats.ttest_ind(pt_scaled_a, pt_scaled_na, equal_var=False)[1] < 0.05:
        print("Reject the null hypothesis that means are similar.")
        print(f"P-value is {stats.ttest_ind(ss_scaled_a, ss_scaled_na, equal_var=False)[1]}")
    else:
        print("Fail to reject the null hypothesis that means are similar.")
        print(f"P-value is {stats.ttest_ind(ss_scaled_a, ss_scaled_na, equal_var=False)[1]}")

    # Instantiate RobustScaler and run
    rs = RobustScaler()

    rs_scaled_a = rs.fit_transform((data1.values + 1).reshape(-1, 1))
    rs_scaled_na = rs.fit_transform((data2.values + 1).reshape(-1, 1))

    # Levene's test of variance
    lev_a_rs = pd.Series(map(lambda x: x, rs_scaled_a))
    lev_na_rs = pd.Series(map(lambda x: x, rs_scaled_na))                            
    print("\n\n***Robust Scaled***\nLevene's test of variance...................................")
    if levene(lev_a_rs, lev_na_rs)[1] < 0.05:
        print('Reject the null hypothesis of equal variance between groups.')
        print(f'P-value is {levene(lev_a_rs, lev_na_rs)[1]}.')
    else:
        print('Fail to reject the null hypothesis of equal variance between groups.')
        print(f'P-value is {levene(lev_a_rs, lev_na_rs)[1]}.')

    # Welch's T-test of distribution
    print("\nWelch's T-test of Unequal Variance................................................")
    if stats.ttest_ind(rs_scaled_a, rs_scaled_na, equal_var=False)[1] < 0.05:
        print("Reject the null hypothesis that means are similar.")
        print(f"P-value is {stats.ttest_ind(rs_scaled_a, rs_scaled_na, equal_var=False)[1]}")
    else:
        print("Fail to reject the null hypothesis that means are similar.")
        print(f"P-value is {stats.ttest_ind(rs_scaled_a, rs_scaled_na, equal_var=False)[1]}")

In [13]:
robust_scale_test_welch(approved['Monthly_Income'], not_approved['Monthly_Income'])

***Box-Cox Scaled***
Levene's test of variance...................................
Reject the null hypothesis of equal variance between groups.
P-value is [8.97713067e-06].

Welch's T-test of Unequal Variance..............................................
Fail to reject the null hypothesis that means are similar.
P-value is [1.]


***Standardized***
Levene's test of variance...................................
Reject the null hypothesis of equal variance between groups.
P-value is [0.02733187].

Welch's T-test of Unequal Variance................................................
Fail to reject the null hypothesis that means are similar.
P-value is [1.]


***Robust Scaled***
Levene's test of variance...................................
Fail to reject the null hypothesis of equal variance between groups.
P-value is [0.21547021].

Welch's T-test of Unequal Variance................................................
Fail to reject the null hypothesis that means are similar.
P-value is [0.05599469]


In [14]:
robust_scale_test_welch(approved['Age'], not_approved['Age'])

***Box-Cox Scaled***
Levene's test of variance...................................
Fail to reject the null hypothesis of equal variance between groups.
P-value is [0.69973729].

Welch's T-test of Unequal Variance..............................................
Fail to reject the null hypothesis that means are similar.
P-value is [1.]


***Standardized***
Levene's test of variance...................................
Fail to reject the null hypothesis of equal variance between groups.
P-value is [0.34586098].

Welch's T-test of Unequal Variance................................................
Fail to reject the null hypothesis that means are similar.
P-value is [1.]


***Robust Scaled***
Levene's test of variance...................................
Reject the null hypothesis of equal variance between groups.
P-value is [0.01939138].

Welch's T-test of Unequal Variance................................................
Reject the null hypothesis that means are similar.
P-value is [0.00212042]


In [16]:
approved.describe()

Unnamed: 0,Employer_Category2,Monthly_Income,Existing_EMI,Loan_Amount,Loan_Period,Interest_Rate,EMI,Var1,Approved,Age,DOB_Year,Lead_Creation_Month,Lead_Creation_Day,Lead_Creation_Day_of_Week
count,1015.0,1015.0,1015.0,1015.0,1015.0,1015.0,1015.0,1015.0,1015.0,1015.0,1015.0,1015.0,1015.0,1015.0
mean,3.6,6027.568966,883.968878,45945.812808,3.904433,17.459769,1361.368848,7.479803,1.0,40.841379,1987.19803,8.034483,15.675862,2.613793
std,0.937705,5518.303677,1680.852862,33566.443568,1.07221,2.683051,895.022609,3.440583,0.0,6.468667,16.53301,0.801744,8.839036,1.898979
min,1.0,1500.0,0.0,5000.0,1.0,11.99,156.349242,0.0,1.0,29.0,1969.0,7.0,1.0,0.0
25%,4.0,3500.0,0.0,30000.0,3.0,14.85,886.989519,7.0,1.0,36.0,1981.0,7.0,8.0,1.0
50%,4.0,4800.0,0.0,30000.0,4.0,19.21357,1010.248826,10.0,1.0,40.0,1985.0,8.0,15.0,2.0
75%,4.0,7000.0,1160.6,52500.0,5.0,19.21357,1724.0,10.0,1.0,43.0,1988.0,9.0,23.0,4.0
max,4.0,80000.0,30000.0,250000.0,5.0,35.5,6002.586626,10.0,1.0,73.0,2068.0,9.0,31.0,6.0


In [17]:
robust_scale_test_welch(approved['EMI'], not_approved['EMI'])

***Box-Cox Scaled***
Levene's test of variance...................................
Reject the null hypothesis of equal variance between groups.
P-value is [2.77616804e-11].

Welch's T-test of Unequal Variance..............................................
Fail to reject the null hypothesis that means are similar.
P-value is [1.]


***Standardized***
Levene's test of variance...................................
Reject the null hypothesis of equal variance between groups.
P-value is [1.77354453e-11].

Welch's T-test of Unequal Variance................................................
Fail to reject the null hypothesis that means are similar.
P-value is [1.]


***Robust Scaled***
Levene's test of variance...................................
Reject the null hypothesis of equal variance between groups.
P-value is [2.94668768e-21].

Welch's T-test of Unequal Variance................................................
Reject the null hypothesis that means are similar.
P-value is [1.25463289e-35]


In [18]:
robust_scale_test_welch(approved['Existing_EMI'], not_approved['Existing_EMI'])

***Box-Cox Scaled***
Levene's test of variance...................................
Reject the null hypothesis of equal variance between groups.
P-value is [1.89094933e-19].

Welch's T-test of Unequal Variance..............................................
Fail to reject the null hypothesis that means are similar.
P-value is [1.]


***Standardized***
Levene's test of variance...................................
Reject the null hypothesis of equal variance between groups.
P-value is [7.10501402e-07].

Welch's T-test of Unequal Variance................................................
Fail to reject the null hypothesis that means are similar.
P-value is [1.]


***Robust Scaled***
Levene's test of variance...................................
Reject the null hypothesis of equal variance between groups.
P-value is [0.00394097].

Welch's T-test of Unequal Variance................................................
Reject the null hypothesis that means are similar.
P-value is [1.30337836e-07]
