In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
df = pd.read_csv('Dataset 1.csv')

In [3]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [5]:
df.drop('Loan_ID', axis = 1, inplace = True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 57.7+ KB


In [7]:
df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [8]:
df.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

## Feature Engineering

In [9]:
df.dropna(inplace = True) # no reliable method to replace null vals for any cols

In [10]:
df['Gender'].value_counts()

Male      394
Female     86
Name: Gender, dtype: int64

In [11]:
df['Married'].value_counts()

Yes    311
No     169
Name: Married, dtype: int64

In [12]:
df['Education'].value_counts()

Graduate        383
Not Graduate     97
Name: Education, dtype: int64

In [13]:
df['Self_Employed'].value_counts()

No     414
Yes     66
Name: Self_Employed, dtype: int64

In [14]:
df['Property_Area'].value_counts()

Semiurban    191
Urban        150
Rural        139
Name: Property_Area, dtype: int64

In [15]:
df['Loan_Status'].value_counts()

Y    332
N    148
Name: Loan_Status, dtype: int64

In [16]:
df['Gender'] = pd.get_dummies(df['Gender'], drop_first = True)
df['Married'] = pd.get_dummies(df['Married'], drop_first = True)
df['Education'] = pd.get_dummies(df['Education'], drop_first = True)
df['Self_Employed'] = pd.get_dummies(df['Self_Employed'], drop_first = True)
df[['Urban', 'Rural']] = pd.get_dummies(df['Property_Area'], drop_first = True)
df.drop('Property_Area', axis = 1, inplace = True)
df['Loan_Status'] = pd.get_dummies(df['Loan_Status'], drop_first = True)

In [17]:
df['Dependents'].value_counts()

0     274
2      85
1      80
3+     41
Name: Dependents, dtype: int64

In [18]:
df['Dependents'] = df['Dependents'].apply(lambda x: 3 if x == '3+' else int(x))

In [19]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Urban,Rural
1,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,0,0
2,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,1,0,1
3,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,1,0,1
4,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,1,0,1
5,1,1,2,0,1,5417,4196.0,267.0,360.0,1.0,1,0,1


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 480 entries, 1 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             480 non-null    uint8  
 1   Married            480 non-null    uint8  
 2   Dependents         480 non-null    int64  
 3   Education          480 non-null    uint8  
 4   Self_Employed      480 non-null    uint8  
 5   ApplicantIncome    480 non-null    int64  
 6   CoapplicantIncome  480 non-null    float64
 7   LoanAmount         480 non-null    float64
 8   Loan_Amount_Term   480 non-null    float64
 9   Credit_History     480 non-null    float64
 10  Loan_Status        480 non-null    uint8  
 11  Urban              480 non-null    uint8  
 12  Rural              480 non-null    uint8  
dtypes: float64(4), int64(2), uint8(7)
memory usage: 29.5 KB


## Train Test Split

In [21]:
df.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Loan_Status', 'Urban', 'Rural'],
      dtype='object')

In [22]:
X = df[['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History',
       'Urban', 'Rural']]
y = df['Loan_Status']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [24]:
X_test

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Urban,Rural
14,1,1,2,0,0,1299,1086.0,17.0,120.0,1.0,0,1
101,1,0,0,0,0,4843,3806.0,151.0,360.0,1.0,1,0
80,0,0,0,0,0,3846,0.0,111.0,360.0,1.0,1,0
426,0,0,1,1,0,4606,0.0,81.0,360.0,1.0,0,0
356,1,1,2,0,0,8333,3167.0,165.0,360.0,1.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
191,1,0,0,0,0,12000,0.0,164.0,360.0,1.0,1,0
537,1,0,2,0,0,3617,0.0,107.0,360.0,1.0,1,0
417,1,1,2,0,1,1600,20000.0,239.0,360.0,1.0,0,1
55,1,1,2,0,0,2708,1167.0,97.0,360.0,1.0,1,0


## Logistic Regression

In [25]:
model = LogisticRegression(max_iter = 1000)

In [26]:
model.fit(X_train, y_train)

LogisticRegression(max_iter=1000)

In [29]:
pred = model.predict(X_test)

In [30]:
print(confusion_matrix(pred, y_test))

[[17  5]
 [29 93]]


In [31]:
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

           0       0.37      0.77      0.50        22
           1       0.95      0.76      0.85       122

    accuracy                           0.76       144
   macro avg       0.66      0.77      0.67       144
weighted avg       0.86      0.76      0.79       144



## Random Forest Classifier

In [26]:
model = RandomForestClassifier(n_estimators = 100)

In [27]:
model.fit(X_train, y_train)

RandomForestClassifier()

In [28]:
pred = model.predict(X_test)

In [29]:
print(confusion_matrix(pred, y_test))

[[20 10]
 [24 90]]


In [30]:
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

           0       0.45      0.67      0.54        30
           1       0.90      0.79      0.84       114

    accuracy                           0.76       144
   macro avg       0.68      0.73      0.69       144
weighted avg       0.81      0.76      0.78       144

