In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from scipy import stats

import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import XGBClassifier

In [2]:
df = pd.read_csv('./loan_dataset_1.csv')
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001015,Male,Yes,0,Graduate,No,5720,0.0,110.0,360.0,1.0,Urban,Y
1,LP001022,Male,Yes,1,Graduate,No,3076,1500.0,126.0,360.0,1.0,Urban,N
2,LP001031,Male,Yes,2,Graduate,No,5000,1800.0,208.0,360.0,1.0,Urban,Y
3,LP001035,Male,Yes,2,Graduate,No,2340,2546.0,100.0,360.0,,Urban,Y
4,LP001051,Male,No,0,Not Graduate,No,3276,0.0,78.0,360.0,1.0,Urban,Y


In [3]:
total_null = df.isnull().sum().sort_values(ascending=False)
total_null.head(10)

Credit_History      79
Self_Employed       55
LoanAmount          27
Dependents          25
Gender              24
Loan_Amount_Term    20
Married              3
Loan_ID              0
Education            0
ApplicantIncome      0
dtype: int64

In [4]:
df['Gender'] = df['Gender'].fillna(df['Gender'].dropna().mode().values[0])
df['Married'] = df['Married'].fillna(df['Married'].dropna().mode().values[0])
df['Dependents'] = df['Dependents'].fillna(df['Dependents'].dropna().mode().values[0])
df['Self_Employed'] = df['Self_Employed'].fillna(df['Self_Employed'].dropna().mode().values[0])
df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].dropna().mean())
                                           

df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].dropna().mode().values[0])
df['Credit_History'] = df['Credit_History'].fillna(df['Credit_History'].dropna().mode().values[0])


In [5]:
z = np.abs(stats.zscore(df['ApplicantIncome']))
print(z)


# Position of the outlier
print(np.where(z > 3))

0      0.094903
1      0.369593
2      0.031586
3      0.498892
4      0.334457
         ...   
976    0.400512
977    0.188643
978    0.508100
979    0.422193
980    0.104844
Name: ApplicantIncome, Length: 981, dtype: float64
(array([ 81, 143, 272, 279, 493, 522, 538, 550, 552, 700, 776, 810],
      dtype=int64),)


In [6]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001015,Male,Yes,0,Graduate,No,5720,0.0,110.0,360.0,1.0,Urban,Y
1,LP001022,Male,Yes,1,Graduate,No,3076,1500.0,126.0,360.0,1.0,Urban,N
2,LP001031,Male,Yes,2,Graduate,No,5000,1800.0,208.0,360.0,1.0,Urban,Y
3,LP001035,Male,Yes,2,Graduate,No,2340,2546.0,100.0,360.0,1.0,Urban,Y
4,LP001051,Male,No,0,Not Graduate,No,3276,0.0,78.0,360.0,1.0,Urban,Y


In [7]:
z = np.abs(stats.zscore(df['LoanAmount']))
print(z)


# Position of the outlier
arr = np.where(z > 3)
print(arr)


0      0.426052
1      0.216378
2      0.858203
3      0.557098
4      0.845400
         ...   
976    0.937133
977    1.343377
978    1.447912
979    0.583005
980    0.124645
Name: LoanAmount, Length: 981, dtype: float64
(array([ 91, 143, 194, 284, 497, 522, 538, 544, 645, 675, 692, 700, 736,
       739, 799, 854, 873, 890, 892, 928, 971], dtype=int64),)


In [8]:
z = np.abs(stats.zscore(df['CoapplicantIncome']))
print(z)


# Position of the outlier
arr2 = np.where(z > 3)
print(arr2)


0      0.589506
1      0.037505
2      0.072895
3      0.347423
4      0.589506
         ...   
976    0.589506
977    0.589506
978    0.501186
979    0.589506
980    0.589506
Name: CoapplicantIncome, Length: 981, dtype: float64
(array([ 25, 230, 237, 284, 351, 376, 544, 769, 784, 948, 967], dtype=int64),)


In [9]:
df.drop([df.index[81],df.index[143],df.index[272],df.index[279],df.index[493],df.index[522],df.index[538],df.index[550],df.index[552],df.index[700],df.index[776],df.index[810]],inplace = True)

In [10]:
for i in range(0,len(arr)):
    df.drop([df.index[i]])

In [11]:
for i in range(0,len(arr2)):
    df.drop([df.index[i]])

In [12]:
cols = ['ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term','ApplicantIncome','Loan_ID']
df = df.drop(columns=cols,axis=1)
df.head()


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Credit_History,Property_Area,Loan_Status
0,Male,Yes,0,Graduate,No,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,1.0,Urban,N
2,Male,Yes,2,Graduate,No,1.0,Urban,Y
3,Male,Yes,2,Graduate,No,1.0,Urban,Y
4,Male,No,0,Not Graduate,No,1.0,Urban,Y


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 969 entries, 0 to 980
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Gender          969 non-null    object 
 1   Married         969 non-null    object 
 2   Dependents      969 non-null    object 
 3   Education       969 non-null    object 
 4   Self_Employed   969 non-null    object 
 5   Credit_History  969 non-null    float64
 6   Property_Area   969 non-null    object 
 7   Loan_Status     969 non-null    object 
dtypes: float64(1), object(7)
memory usage: 100.4+ KB


In [14]:
from sklearn.preprocessing import LabelEncoder
cols = ['Gender','Married','Education','Self_Employed','Property_Area','Loan_Status','Dependents']
#cols = ['Dependents']
le = LabelEncoder()
for col in cols:
    df[col] = le.fit_transform(df[col])
    

In [15]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Credit_History,Property_Area,Loan_Status
0,1,1,0,0,0,1.0,2,1
1,1,1,1,0,0,1.0,2,0
2,1,1,2,0,0,1.0,2,1
3,1,1,2,0,0,1.0,2,1
4,1,0,0,1,0,1.0,2,1


# creating train and testing data

In [16]:
X = df.drop(columns=['Loan_Status'],axis=1)
y = df['Loan_Status']

In [17]:
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)

# model training

In [18]:
from sklearn.model_selection import cross_val_score
def classify(model,x,y):
    x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=42)
    model.fit(x_train,y_train)
    print("Accuracy is", model.score(x_test,y_test)*100)
    #cross validation- it is used for better validation for model
    score = cross_val_score(model,x,y,cv=5)
    print("Cross validation is",np.mean(score)*100)

In [19]:
#logistic regression
model = LogisticRegression()
classify(model,X,y)

Accuracy is 71.19341563786008
Cross validation is 68.73083702793654


In [20]:
#decision tree
model = DecisionTreeClassifier()
classify(model,X,y)

Accuracy is 67.90123456790124
Cross validation is 63.57032209817851


In [21]:
#svm
model = SVC()
classify(model,X,y)

Accuracy is 71.19341563786008
Cross validation is 68.93702259494685


In [22]:
model = XGBClassifier()
classify(model,X,y)

Accuracy is 70.37037037037037
Cross validation is 66.77047166283853


In [23]:
model = RandomForestClassifier()
classify(model,X,y)

Accuracy is 69.54732510288066
Cross validation is 66.04721969980235


# Hyperparameter tuning


In [24]:
from sklearn.svm import LinearSVC
lin_clf = LinearSVC(random_state=42)
lin_clf.fit(x_train, y_train)
from sklearn.metrics import accuracy_score
y_pred = lin_clf.predict(x_train)
accuracy_score(y_train, y_pred)*100

67.99410029498524

In [25]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train.astype(np.float32))
x_test_scaled = scaler.transform(x_test.astype(np.float32))
lin_clf = LinearSVC(random_state=42)
lin_clf.fit(x_train_scaled, y_train)
y_pred = lin_clf.predict(x_train_scaled)
accuracy_score(y_train, y_pred)*100

67.99410029498524

In [26]:
from sklearn.svm import SVC
svm_clf = SVC(gamma="scale")
svm_clf.fit(x_train_scaled[:10000], y_train[:10000]) # We use an SVC with an RBF kernel
y_pred = svm_clf.predict(x_train_scaled)
accuracy_score(y_train, y_pred)*100

68.43657817109144

In [27]:
from sklearn.svm import SVC
svm_clf = SVC(gamma="scale")
svm_clf.fit(x_test_scaled[:10000], y_test[:10000]) # We use an SVC with an RBF kernel
y_pred = svm_clf.predict(x_test_scaled)
accuracy_score(y_test, y_pred)*100

74.22680412371135

In [28]:
lin_clf = LinearSVC(random_state=42)

In [29]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform
param_distributions = {"gamma": reciprocal(0.001, 0.1), "C": uniform(1, 10)}
#Adding all values of hyperparameters in a list from which the values of hyperparameter will randomly inserted as hyperparameter