In [1]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import warnings
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler
import joblib
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.ensemble import VotingClassifier
from sklearn.calibration import CalibratedClassifierCV


In [2]:
# Load the bank-loan dataset, discard every row that has a missing value and
# remove the Loan_ID column, which is dropped before any modelling happens.
df = (
    pd.read_csv('https://raw.githubusercontent.com/satssehgal/MLAPLI/master/bankloan.csv')
    .dropna()
    .drop('Loan_ID', axis=1)
)

# The original cell evaluated `df.isna().any()` mid-cell, where the result is
# silently discarded. Enforce the invariant instead so a regression is loud.
assert not df.isna().any().any(), "unexpected missing values after dropna()"

# Scale LoanAmount by 1000 and store it as an integer
# (presumably the raw column is expressed in thousands - TODO confirm).
df['LoanAmount'] = (df['LoanAmount'] * 1000).astype(int)

# Show the class distribution of the target column.
Counter(df['Loan_Status'])

Counter({'N': 148, 'Y': 332})

In [3]:
# Separate the predictors from the target column.
x_pre = df.drop(columns=['Loan_Status'])
y_pre = df['Loan_Status']

# Encode the target as 1 (Y) / 0 (N) and one-hot encode the string-valued
# predictor columns; numeric columns are passed through unchanged.
dm_y = y_pre.map({'Y': 1, 'N': 0})
dm_x = pd.get_dummies(x_pre)
dm_x

Unnamed: 0,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Male,Married_No,Married_Yes,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
1,1.0,4583,1508.0,128000,360.0,1.0,0,1,0,1,1,0,1,0,1,0,0
2,0.0,3000,0.0,66000,360.0,1.0,0,1,0,1,1,0,0,1,0,0,1
3,0.0,2583,2358.0,120000,360.0,1.0,0,1,0,1,0,1,1,0,0,0,1
4,0.0,6000,0.0,141000,360.0,1.0,0,1,1,0,1,0,1,0,0,0,1
5,2.0,5417,4196.0,267000,360.0,1.0,0,1,0,1,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,0.0,2900,0.0,71000,360.0,1.0,1,0,1,0,1,0,1,0,1,0,0
610,3.0,4106,0.0,40000,180.0,1.0,0,1,0,1,1,0,1,0,1,0,0
611,1.0,8072,240.0,253000,360.0,1.0,0,1,0,1,1,0,1,0,0,0,1
612,2.0,7583,0.0,187000,360.0,1.0,0,1,0,1,1,0,1,0,0,0,1


In [4]:
# Oversample the minority class with SMOTE so both classes have equal counts.
# `fit_resample` replaces `fit_sample`, which was removed from imbalanced-learn;
# the fixed seed makes the synthetic rows reproducible across runs.
smote = SMOTE(sampling_strategy='minority', random_state=1234)
x1, y = smote.fit_resample(dm_x, dm_y)

# Rescale every feature to the [0, 1] range.
# NOTE(review): both the resampler and the scaler are fit on the full data set,
# i.e. before the train/test split performed later in the notebook, so held-out
# rows influence the synthetic samples and the scaling bounds. Consider fitting
# them on the training partition only.
sc = MinMaxScaler()
x = sc.fit_transform(x1)

In [5]:
# Tally the labels returned by the resampling step to check the class balance.
Counter(y)

Counter({0: 332, 1: 332})

In [6]:
# Hold out 25% of the rows for testing; the fixed seed keeps the partition
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=1234,
)

In [7]:
# Random forest with 100 trees. The forest is randomised (bootstrap samples and
# feature sub-sampling), so a fixed seed is supplied to make the fitted model
# reproducible on a fresh kernel.
rf = RandomForestClassifier(n_estimators=100, random_state=1234)
rf = rf.fit(X_train, y_train)

In [8]:
# Extremely-randomised trees with 100 estimators. Split selection is random,
# so a fixed seed is supplied to make the fitted model reproducible on a
# fresh kernel.
et = ExtraTreesClassifier(n_estimators=100, random_state=1234)
et = et.fit(X_train, y_train)

In [9]:
# Multinomial naive Bayes with default settings, fitted on the training split.
# (The features were min-max scaled earlier, so they are non-negative as this
# estimator requires.)
mnb = MultinomialNB().fit(X_train, y_train)

In [10]:
# Logistic regression; the iteration cap is raised to 1000 to give the solver
# room to converge.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

In [11]:
# Linear-kernel support vector classifier. `probability=True` enables
# predict_proba via an internal cross-validated calibration that shuffles the
# data, so a fixed seed is supplied to keep the probability estimates
# reproducible.
svc = svm.SVC(kernel='linear', probability=True, random_state=1234)
svc = svc.fit(X_train, y_train)

In [12]:
# Soft-voting ensemble that averages the predicted class probabilities of the
# naive Bayes, logistic regression, random forest and extra-trees models.
# The SVC trained above is not among the listed estimators.
evc = VotingClassifier(
    estimators=[('mnb', mnb), ('lr', lr), ('rf', rf), ('et', et)],
    voting='soft',
)
evc = evc.fit(X_train, y_train)

# The recorded output of this cell is an (n, 2) array of class probabilities,
# which an assignment cannot produce; restore the display expression.
# NOTE(review): presumably the probabilities were computed on X_test - confirm.
evc.predict_proba(X_test)

array([[0.17427648, 0.82572352],
       [0.18276364, 0.81723636],
       [0.32287838, 0.67712162],
       [0.76389665, 0.23610335],
       [0.530895  , 0.469105  ],
       [0.21718181, 0.78281819],
       [0.25516575, 0.74483425],
       [0.90069597, 0.09930403],
       [0.58169472, 0.41830528],
       [0.30989895, 0.69010105],
       [0.15750196, 0.84249804],
       [0.7491731 , 0.2508269 ],
       [0.2489506 , 0.7510494 ],
       [0.68432698, 0.31567302],
       [0.12017056, 0.87982944],
       [0.88948923, 0.11051077],
       [0.85432255, 0.14567745],
       [0.26351032, 0.73648968],
       [0.37240049, 0.62759951],
       [0.40515355, 0.59484645],
       [0.89153447, 0.10846553],
       [0.81276749, 0.18723251],
       [0.88412002, 0.11587998],
       [0.83389118, 0.16610882],
       [0.27302287, 0.72697713],
       [0.29518755, 0.70481245],
       [0.86000921, 0.13999079],
       [0.2470557 , 0.7529443 ],
       [0.20972784, 0.79027216],
       [0.29555433, 0.70444567],
       [0.

In [13]:
# Persist every fitted estimator as a compressed joblib file, in a fixed order.
# NOTE(review): the fitted MinMaxScaler `sc` is not persisted in this cell;
# verify it is saved elsewhere if these models are to score raw inputs.
artifacts = [
    (rf, "./random_forest.joblib"),
    (et, "./extra_trees.joblib"),
    (mnb, "./naive_bayes.joblib"),
    (lr, "./logistic_regression.joblib"),
    (svc, "./support_vector_machine.joblib"),
    (evc, "./voting_classifier.joblib"),
]
for estimator, target_path in artifacts:
    written = joblib.dump(estimator, target_path, compress=True)

# Display the file list returned by the final dump, as the original cell did.
written

['./voting_classifier.joblib']

In [14]:
# Take the first row of the per-column modes (the most frequent value of each
# column of `df`, target column included) and expose it as a plain dict.
mode_row = df.mode().iloc[0]
train_mode = dict(mode_row)
print(train_mode)

{'Gender': 'Male', 'Married': 'Yes', 'Dependents': 0.0, 'Education': 'Graduate', 'Self_Employed': 'No', 'ApplicantIncome': 2500.0, 'CoapplicantIncome': 0.0, 'LoanAmount': 100000, 'Loan_Amount_Term': 360.0, 'Credit_History': 1.0, 'Property_Area': 'Semiurban', 'Loan_Status': 'Y'}
