In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import XGBClassifier

In [3]:
# Read the loan dataset from the notebook's working directory and preview it.
csv_path = './loan_dataset_1.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001015,Male,Yes,0,Graduate,No,5720,0.0,110.0,360.0,1.0,Urban,Y
1,LP001022,Male,Yes,1,Graduate,No,3076,1500.0,126.0,360.0,1.0,Urban,N
2,LP001031,Male,Yes,2,Graduate,No,5000,1800.0,208.0,360.0,1.0,Urban,Y
3,LP001035,Male,Yes,2,Graduate,No,2340,2546.0,100.0,360.0,,Urban,Y
4,LP001051,Male,No,0,Not Graduate,No,3276,0.0,78.0,360.0,1.0,Urban,Y


In [4]:
# Missing-value count per column, largest first; show the ten worst columns.
null_counts = df.isnull().sum()
total_null = null_counts.sort_values(ascending=False)
total_null.head(10)

Credit_History       79
Self_Employed        55
LoanAmount           27
Dependents           25
Gender               24
Loan_Amount_Term     20
Married               3
Loan_Status           0
Property_Area         0
CoapplicantIncome     0
dtype: int64

In [5]:
# Columns whose gaps are filled with the most frequent observed value.
mode_filled = [
    'Gender',
    'Married',
    'Dependents',
    'Self_Employed',
    'Loan_Amount_Term',
    'Credit_History',
]
for name in mode_filled:
    # Series.mode() ignores NaN by default and returns its result sorted,
    # so the first entry is the value the fill should use.
    most_common = df[name].mode().iloc[0]
    df[name] = df[name].fillna(most_common)

# LoanAmount is filled with the mean of its observed (non-null) values.
observed_loan_amount = df['LoanAmount'].dropna()
df['LoanAmount'] = df['LoanAmount'].fillna(observed_loan_amount.mean())


In [6]:
# Drop the row identifier and the income / loan-amount / loan-term columns.
# Each label is listed exactly once, and `columns=` already selects the
# column axis, so no separate `axis` argument is needed.
cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Loan_ID']
df = df.drop(columns=cols)
df.head()


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Credit_History,Property_Area,Loan_Status
0,Male,Yes,0,Graduate,No,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,1.0,Urban,N
2,Male,Yes,2,Graduate,No,1.0,Urban,Y
3,Male,Yes,2,Graduate,No,1.0,Urban,Y
4,Male,No,0,Not Graduate,No,1.0,Urban,Y


In [7]:
# Print the frame summary (non-null count and dtype per column) after the
# imputation and column-drop cells above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 981 entries, 0 to 980
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Gender          981 non-null    object 
 1   Married         981 non-null    object 
 2   Dependents      981 non-null    object 
 3   Education       981 non-null    object 
 4   Self_Employed   981 non-null    object 
 5   Credit_History  981 non-null    float64
 6   Property_Area   981 non-null    object 
 7   Loan_Status     981 non-null    object 
dtypes: float64(1), object(7)
memory usage: 61.4+ KB


In [8]:
# Integer-encode the listed columns (features and the Loan_Status target).
# Each column gets its own fitted LabelEncoder, stored in `label_encoders`,
# so the codes can be mapped back to the original labels later with
# label_encoders[col].inverse_transform(...).
# After the loop `le` is the encoder fitted on the last column in `cols`.
cols = ['Gender','Married','Education','Self_Employed','Property_Area','Loan_Status','Dependents']
label_encoders = {}
for col in cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
    

In [9]:
# Preview the first rows to inspect the integer codes written by the
# label-encoding cell above.
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Credit_History,Property_Area,Loan_Status
0,1,1,0,0,0,1.0,2,1
1,1,1,1,0,0,1.0,2,0
2,1,1,2,0,0,1.0,2,1
3,1,1,2,0,0,1.0,2,1
4,1,0,0,1,0,1.0,2,1


# Creating training and test data

In [10]:
# Separate the target column from the feature matrix.
target_column = 'Loan_Status'
y = df[target_column]
X = df.drop(columns=[target_column])

In [11]:
# Hold out 30% of the rows as the notebook-level test set (fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# model training

In [12]:
def classify(model, x, y, test_size=0.25, random_state=42, cv=5):
    """Fit ``model`` on a hold-out split of ``x``/``y`` and print two scores.

    Parameters
    ----------
    model
        Estimator exposing ``fit`` and ``score``. It is fitted in place on the
        training part of the split and stays fitted on that subset afterwards.
    x, y
        Features and target. They are handed to ``train_test_split`` and
        ``cross_val_score`` as given.
    test_size
        Hold-out fraction forwarded to ``train_test_split`` (default 0.25).
    random_state
        Seed forwarded to ``train_test_split`` (default 42).
    cv
        Number of folds forwarded to ``cross_val_score`` (default 5).

    Returns
    -------
    None
        The hold-out accuracy and the mean cross-validation score are printed
        as percentages.
    """
    # Split the data that was passed in, not the notebook-level `X`.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )
    model.fit(x_train, y_train)
    print("Accuracy is", model.score(x_test, y_test)*100)
    # Cross-validation gives a less split-dependent estimate than one hold-out.
    # cross_val_score fits clones of the estimator, so `model` keeps the
    # hold-out fit made above.
    score = cross_val_score(model, x, y, cv=cv)
    print("Cross validation is", np.mean(score)*100)

In [15]:
# Logistic regression
model = LogisticRegression()
classify(model, X, y)

# classify() leaves `model` fitted on its own internal split, which uses the
# same seed but a different test fraction than the notebook-level split, so
# some x_test rows were part of that training data. Refit on x_train so the
# report below is measured only on rows the model has not seen.
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))

Accuracy is 65.85365853658537
Cross validation is 68.80762457267171
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        97
           1       0.67      1.00      0.80       198

    accuracy                           0.67       295
   macro avg       0.34      0.50      0.40       295
weighted avg       0.45      0.67      0.54       295



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
# SVM
model = SVC()
classify(model, X, y)

# classify() leaves `model` fitted on its own internal split, which uses the
# same seed but a different test fraction than the notebook-level split, so
# some x_test rows were part of that training data. Refit on x_train so the
# report below is measured only on rows the model has not seen.
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))

Accuracy is 65.85365853658537
Cross validation is 69.0106702579509
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        97
           1       0.67      1.00      0.80       198

    accuracy                           0.67       295
   macro avg       0.34      0.50      0.40       295
weighted avg       0.45      0.67      0.54       295



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
