In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [2]:
loan =pd.read_csv("loan.csv")

In [3]:
display(loan.head())
display(loan.info())

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,1.0,Urban,Y


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(3), int64(1), object(8)
memory usage: 57.7+ KB


None

### Data Exploration & Preprocessing

In [4]:
loan = loan.drop("Loan_ID",axis=1)

## Standard scaling of the continuous columns only

for i in ["ApplicantIncome","CoapplicantIncome","LoanAmount"]:
    loan[i] = (loan[i] - loan[i].mean())/loan[i].std()

### Outlier Detection using z-score

OutlierRows = loan[(loan["LoanAmount"]>3) |(loan["LoanAmount"]<-3) |
                   (loan["ApplicantIncome"]>3)|(loan["ApplicantIncome"]<-3)|
                   (loan["CoapplicantIncome"]>3)|(loan["CoapplicantIncome"]<-3)]

print("% of Outlier rows in the dataset is " + str(len(OutlierRows)/len(loan)*100)+"\n")
loan_OutlierFree = loan.drop(OutlierRows.index,axis=0)

### Missing Value Detection & treatment

MV = loan_OutlierFree.isna().sum()
print("% of Missing values by columns are")
print(MV[MV>0]/len(loan_OutlierFree)*100)

for i in MV[MV>0].index:
    
    if (i == "LoanAmount"):
        loan_OutlierFree[i].fillna(loan_OutlierFree[i].median(),inplace=True)
    else:
        loan_OutlierFree[i].fillna(loan_OutlierFree[i].mode()[0],inplace=True)

loan_noOutlierNoMV = loan_OutlierFree

#Converting all binary categorical columns into label encoded columns

loan_noOutlierNoMV["Loan_Status"].replace(['Y','N'],[1,0],inplace = True)
loan_noOutlierNoMV["Gender"].replace(['Male','Female'],[1,0],inplace = True)
loan_noOutlierNoMV["Married"].replace(['Yes','No'],[1,0],inplace = True)
loan_noOutlierNoMV["Education"].replace(['Graduate','Not Graduate'],[1,0],inplace = True)
loan_noOutlierNoMV["Self_Employed"].replace(['Yes','No'],[1,0],inplace = True)

#Converting all >2 class categorical columns into one-hot encoded columns

loan_new = pd.get_dummies(loan_noOutlierNoMV)

loan_new.info()
loan_new.head()

% of Outlier rows in the dataset is 3.908794788273615

% of Missing values by columns are
Gender            1.864407
Married           0.508475
Dependents        2.542373
Self_Employed     4.915254
LoanAmount        3.728814
Credit_History    7.966102
dtype: float64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 590 entries, 0 to 613
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Gender                   590 non-null    int64  
 1   Married                  590 non-null    int64  
 2   Education                590 non-null    int64  
 3   Self_Employed            590 non-null    int64  
 4   ApplicantIncome          590 non-null    float64
 5   CoapplicantIncome        590 non-null    float64
 6   LoanAmount               590 non-null    float64
 7   Credit_History           590 non-null    float64
 8   Loan_Status              590 non-null    int64  
 9   Dependents_0             590 

Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Credit_History,Loan_Status,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,1,0,1,0,0.072931,-0.554036,-0.250179,1.0,1,1,0,0,0,0,0,1
1,1,1,1,0,-0.134302,-0.0387,-0.215127,1.0,0,0,1,0,0,1,0,0
2,1,1,1,1,-0.393427,-0.554036,-0.939534,1.0,1,1,0,0,0,0,0,1
3,1,1,0,0,-0.461686,0.251774,-0.308599,1.0,1,1,0,0,0,0,0,1
4,1,0,1,0,0.097649,-0.554036,-0.063236,1.0,1,1,0,0,0,0,0,1


### Data Splitting

In [6]:
X = loan_new.drop("Loan_Status",axis=1)
y = loan_new["Loan_Status"]

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=424)
len(X_train),len(X_test)

(442, 148)

### Apply SVM Algorithm

In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

#Import svm model
from sklearn import svm

kernel_values =['linear', 'poly', 'rbf', 'sigmoid']

#Create a svm Classifier
svm_clf = svm.SVC(kernel=kernel_values[0], C = 1, gamma =0.1) # Linear Kernel

#Train the model using the training sets
svm_clf = svm_clf.fit(X_train, y_train)

#Predict the response for test dataset
y_pred = svm_clf.predict(X_test)

print("Model Performance metrics are as below :-\n")
print("Accuracy is "+str(accuracy_score(y_test,y_pred)))
print("Precision is "+str(precision_score(y_test,y_pred)))
print("Recall is "+str(recall_score(y_test,y_pred)))
print("F1-Score is "+str(f1_score(y_test,y_pred)))

Model Performance metrics are as below :-

Accuracy is 0.7702702702702703
Precision is 0.7596899224806202
Recall is 0.9702970297029703
F1-Score is 0.8521739130434782


#### Find best Kernel for the problem

In [10]:
k_values = np.arange(4,len(X_train),1)

# C_values = [0.001,0.01,0.1,1,10,100,1000] # C of 1 is a good starting point || C > 0
C_values = [0.01, 0.1, 1, 10]
# gamma_values = [0.1,0.5,0.9] || Gamma > 0 || starting value : gamma = 1/len(X) = 1/no. of rows in the dataset
gamma_values= [0.1, 1/len(X)]
kernel_values =['linear', 'poly', 'rbf', 'sigmoid']

PerfData = pd.DataFrame(columns=['Kernel Type','C','Gamma','Precision','Recall','Accuracy','F1-Score'])

for k in kernel_values:
    for c in C_values:
        for g in gamma_values:
            
            svm_clf = svm.SVC(kernel=k, C=c, gamma = g)
            svm_clf = svm_clf.fit(X_train, y_train)
            y_pred = svm_clf.predict(X_test)
            
            row = [[k, c, g, precision_score(y_test,y_pred), recall_score(y_test,y_pred), accuracy_score(y_test,y_pred),f1_score(y_test,y_pred)]]
        
        PerfData = PerfData.append( pd.DataFrame(row, columns=['Kernel Type','C','Gamma','Precision','Recall','Accuracy','F1-Score']), ignore_index=True)

# display(PerfData.head(10))
display(PerfData[PerfData['F1-Score'] == max(PerfData['F1-Score'])])

Unnamed: 0,Kernel Type,C,Gamma,Precision,Recall,Accuracy,F1-Score
1,linear,0.1,0.001695,0.75969,0.970297,0.77027,0.852174
2,linear,1.0,0.001695,0.75969,0.970297,0.77027,0.852174
3,linear,10.0,0.001695,0.75969,0.970297,0.77027,0.852174
11,rbf,10.0,0.001695,0.75969,0.970297,0.77027,0.852174
15,sigmoid,10.0,0.001695,0.75969,0.970297,0.77027,0.852174
