In [1]:
# import generic packages
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score
import matplotlib.pyplot as plt
import fairlens as fl

In [None]:
# Import ydata modules
from ydata.synthesizers.regular.model import BaseModel, RegularSynthesizer
from ydata.metadata import Metadata
from ydata.dataset.dataset import Dataset

In [None]:
# Load the loans data (available at kaggle https://www.kaggle.com/code/ajaymanwani/loan-approval-prediction/data)
# NOTE(review): the path points at a local Downloads folder — adjust it to wherever the CSV lives
loans_csv = '~/Downloads/loans.csv'
data = pd.read_csv(loans_csv)
# Every missing value, in every column, is replaced by 0 (text columns included)
data = data.fillna(0)
data.shape

In [None]:
# Count the rows per Married value to check that the data is unbalanced for Married status
sns.countplot(data=data, x='Married', palette='Set2')

In [None]:
# Convert categorical (object-typed) columns to integer codes, in place on `data`.
# LabelEncoder is already imported in the first cell, so it is not re-imported here.
# Each fitted encoder is kept in `label_encoders` so the meaning of a code can be looked up later,
# e.g. label_encoders['Married'].classes_[code] gives the original value for that code.
label_encoders = {}
for column in data.columns:
    if data[column].dtype != 'O':
        continue  # numeric columns are left untouched
    encoder = LabelEncoder()
    # astype(str) makes mixed-type columns (e.g. strings plus the 0 used for missing values) sortable
    data[column] = encoder.fit_transform(data[column].astype(str)).astype('int')
    label_encoders[column] = encoder

In [None]:
# Train a synthesizer on the loans data
original = Dataset(data)  # the DataFrame is wrapped in a ydata Dataset before fitting
# NOTE(review): this Metadata object is created without arguments and is never passed to fit() —
# confirm whether fit() should receive it or whether the line can go
metadata = Metadata()
synthetizer = RegularSynthesizer()
synthetizer.fit(original)

In [None]:
# Generate new data: request synthetic rows from the fitted synthesizer,
# then convert the result with to_pandas() for use with pandas operations below
n_synthetic_rows = 900
synth = synthetizer.sample(n_samples=n_synthetic_rows)
s = synth.to_pandas()

In [None]:
# Create a balanced dataset: the original rows followed by the synthetic rows with Married == 1.
# NOTE(review): presumably code 1 is the under-represented Married category — verify against the encoding
synthetic_minority = s[s.Married == 1]
# ignore_index=True relabels the combined rows 0..n-1. Without it the synthetic rows keep their own
# labels, which collide with the labels of the original rows, so any later label-based selection
# (such as removing test rows by index) would hit synthetic rows as well.
# The original rows come first, so with the default index produced by read_csv they keep their labels.
data_balanced = pd.concat([data, synthetic_minority], axis=0, ignore_index=True)
data_balanced.shape

In [None]:
# check that new data is balanced
sns.countplot(data=data_balanced, x='Married', palette='Set2')

In [None]:
# Create training and test data.
# xtrainf / ytrainf collect two training sets that are evaluated on the same hold-out set:
#   index 0 -> split of the original data, index 1 -> split of the balanced (augmented) data
xtrainf = []
ytrainf = []

# Baseline split on the original data; xtest / ytest are the shared hold-out set
xtrain, xtest, ytrain, ytest = train_test_split(
    data.drop(columns='Loan_Status'), data['Loan_Status'], test_size=.2, random_state=100)
xtrainf.append(xtrain)
ytrainf.append(ytrain)

# Give every row of the balanced data a unique positional label before splitting. data_balanced is
# the original rows followed by synthetic rows; if the synthetic rows still carry their own labels
# they collide with original labels, and a label-based lookup would both drop unrelated synthetic
# rows and return repeated rows.
balanced = data_balanced.reset_index(drop=True)
xtrain, _, ytrain, _ = train_test_split(
    balanced.drop(columns='Loan_Status'), balanced['Loan_Status'], test_size=.2, random_state=100)

# Positions of the hold-out rows inside `data`, which are also their labels in `balanced`
# because the original rows come first there
test_positions = data.index.get_indexer(xtest.index)
# Keep only training rows that are not part of the hold-out set, so no test row leaks into training
not_in_test = ~xtrain.index.isin(test_positions)
xtrainf.append(xtrain.loc[not_in_test])
ytrainf.append(ytrain.loc[not_in_test])

In [None]:
# Check the accuracy of models trained on both data sets (i=0: original, i=1: augmented).
# For each model this prints the overall accuracy on the hold-out set, the accuracy within each
# Married group, and the relative change in overall accuracy between the two training sets.
# The models are seeded so that repeated runs of the notebook give the same numbers.
models = [RandomForestClassifier(random_state=100),
          DecisionTreeClassifier(random_state=100),
          AdaBoostClassifier(random_state=100)]
married = [1,2]
improvement = {}

for m in models:
    improvement[m] = {}
    raw_accuracy = {}  # unrounded accuracies, used for the relative-improvement figure
    for i in range(0,2):
        m.fit(xtrainf[i],ytrainf[i])
        p = m.predict(xtest)

        # accuracy_score expects (y_true, y_pred)
        raw_accuracy[i] = accuracy_score(ytest, p)
        improvement[m][i] = np.round(raw_accuracy[i],2)
        print('Overall Accuracy %f'% (improvement[m][i]))

        for g in married:
            in_group = xtest.Married==g
            if not in_group.any():
                # predict() cannot be called on zero rows; report and move on
                print('model %s  has no test rows with married_status %s' % (m, g))
                continue
            p = m.predict(xtest[in_group])
            print('model %s  accuracy, married_status %s is_augmented %i =  %f'% (m, g , i , 
                        np.round(accuracy_score(ytest[in_group], p),2)))
    # Use the unrounded accuracies: rounding to 2 decimals first distorts a ratio of close values
    if raw_accuracy[0] > 0:
        print ('Relative improvement ', np.round(100*(raw_accuracy[1]/raw_accuracy[0] - 1),2))
    else:
        print ('Relative improvement ', 'undefined (baseline accuracy is 0)')