In [107]:
#IMPORTING THE LIBRARIES
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

#IMPORTING AND READING DATASET
# Path is relative to the notebook's working directory.
DATA_PATH = 'loan_prediction.csv'
df = pd.read_csv(DATA_PATH)

In [108]:
#PRINTING FIRST FIVE ROWS
# head() defaults to five rows; the count is spelled out here for clarity.
first_rows = df.head(n=5)
print(first_rows)

    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N  
2             1.0   

In [109]:
#DROPPING THE LOAN_ID COLUMN
# Remove the Loan_ID column before any further processing.
df = df.drop(columns=['Loan_ID'])

In [110]:
#CHECKING FOR NULL VALUES
# Number of missing entries per column (isna is the same check as isnull).
df.isna().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [111]:
#DESCRIPTIVE SUMMARY OF DATASET
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns. The count row also shows how many non-null values each column has.
df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [112]:
#DEALING WITH NULL VALUES IN CATEGORICAL COLUMNS
# Replace missing values in each categorical column with that column's most
# frequent value (mode).
# The filled Series is assigned back to the frame instead of calling
# fillna(..., inplace=True) on df[col]: the in-place form operates on a
# selected column (chained assignment), which pandas 2.x warns about and
# which does not update df when Copy-on-Write is enabled.
for col in ['Gender', 'Married', 'Dependents', 'Self_Employed']:
    df[col] = df[col].fillna(df[col].mode()[0])

In [113]:
#DEALING WITH NULL VALUES IN NUMERICAL COLUMNS
# LoanAmount is filled with its median; Loan_Amount_Term and Credit_History
# are filled with their most frequent value (mode).
# Results are assigned back to the frame rather than using
# fillna(..., inplace=True) on a selected column, which is chained assignment:
# pandas 2.x warns about it and it does not update df under Copy-on-Write.
df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].median())
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mode()[0])
df['Credit_History'] = df['Credit_History'].fillna(df['Credit_History'].mode()[0])

## **VISUALISING RELATIONSHIPS IN DATASET**

In [114]:
import plotly.express as px
#VISUALISING LOAN APPROVAL STATUS IN APPLICANTS
# Count how many applications fall under each Loan_Status value.
loan_status_count = df['Loan_Status'].value_counts()
# The counts must be passed as `values`. Without them each label is counted
# once, so every slice gets the same size regardless of the real distribution.
fig_loan_status = px.pie(names = loan_status_count.index,
                         values = loan_status_count.values,
                         title = 'Loan Approval Status')
fig_loan_status.show()

In [115]:
#VISUALISING GENDER DISTRIBUTION
# Bar chart of the number of applicants per Gender value.
gender_count = df['Gender'].value_counts()
fig_gender = px.bar(
    data_frame=gender_count,
    x=gender_count.index,
    y=gender_count.values,
    title='Gender Distribution',
)
fig_gender.show()

In [116]:
#VISUALISING MARITAL STATUSES
# Bar chart of the number of applicants per Married value.
married_count = df['Married'].value_counts()
fig_married = px.bar(married_count,
                    x=married_count.index,
                    y=married_count.values,
                    title = 'Marital Status Distribution')
# Show the figure built in this cell (previously fig_gender was shown here,
# so the marital-status chart was never displayed).
fig_married.show()

In [117]:
#VISUALISING EDUCATIONAL QUALIFICATION
# Bar chart of the number of applicants per Education value.
education_count = df['Education'].value_counts()
fig_education = px.bar(
    data_frame=education_count,
    x=education_count.index,
    y=education_count.values,
    title='Education Distribution',
)
fig_education.show()

In [118]:
#VISUALISING EMPLOYMENT STATUS
# Bar chart of the number of applicants per Self_Employed value.
self_employed_count = df['Self_Employed'].value_counts()
fig_self_employed = px.bar(
    data_frame=self_employed_count,
    x=self_employed_count.index,
    y=self_employed_count.values,
    title='Self-Employment Distribution',
)
fig_self_employed.show()

In [119]:
#VISUALISING APPLICANTS INCOME
# Histogram of the ApplicantIncome column.
fig_applicant_income = px.histogram(
    data_frame=df,
    x='ApplicantIncome',
    title='Applicant Income Distribution',
)
fig_applicant_income.show()

In [120]:
#VISUALISING LOAN STATUS VS APPLICANT INCOME
# One box per Loan_Status value, coloured by the same column.
fig_income = px.box(
    data_frame=df,
    x='Loan_Status',
    y='ApplicantIncome',
    color="Loan_Status",
    title='Loan_Status vs ApplicantIncome',
)
fig_income.show()

In [121]:
#DEALING WITH OUTLIERS IN APPLICANT INCOME
# Keep only rows whose ApplicantIncome lies within 1.5 * IQR of the quartiles.
Q1 = df['ApplicantIncome'].quantile(0.25)
Q3 = df['ApplicantIncome'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# between() is inclusive on both ends by default.
within_bounds = df['ApplicantIncome'].between(lower_bound, upper_bound)
df = df[within_bounds]

In [122]:
#VISUALISING LOAN STATUS VS CO-APPLICANT INCOME
# One box per Loan_Status value, coloured by the same column.
fig_coapplicant_income = px.box(
    data_frame=df,
    x='Loan_Status',
    y='CoapplicantIncome',
    color="Loan_Status",
    title='Loan_Status vs CoapplicantIncome',
)
fig_coapplicant_income.show()

In [123]:
#DEALING WITH OUTLIERS IN CO-APPLICANT INCOME
# Keep only rows whose CoapplicantIncome lies within 1.5 * IQR of the quartiles.
Q1 = df['CoapplicantIncome'].quantile(0.25)
Q3 = df['CoapplicantIncome'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# between() is inclusive on both ends by default.
within_bounds = df['CoapplicantIncome'].between(lower_bound, upper_bound)
df = df[within_bounds]

In [124]:
#VISUALISING LOAN STATUS VS LOAN AMOUNT
# One box per Loan_Status value, coloured by the same column.
fig_loan_amount = px.box(
    data_frame=df,
    x='Loan_Status',
    y='LoanAmount',
    color="Loan_Status",
    title='Loan_Status vs LoanAmount',
)
fig_loan_amount.show()

In [125]:
#VISUALISING LOAN STATUS VS CREDIT HISTORY
# Grouped histogram: one bar per Loan_Status value for each Credit_History value.
# The title now uses the full column name (it was truncated to 'Credit_His'),
# matching the other 'Loan_Status vs <column>' chart titles.
fig_credit_history = px.histogram(df, x='Credit_History', color='Loan_Status',
                                  barmode='group',
                                  title='Loan_Status vs Credit_History')
fig_credit_history.show()

In [126]:
#VISUALISING LOAN STATUS VS PROPERTY AREA
# Grouped histogram: one bar per Loan_Status value for each Property_Area value.
fig_property_area = px.histogram(
    data_frame=df,
    x='Property_Area',
    color='Loan_Status',
    barmode='group',
    title='Loan_Status vs Property_Area',
)
fig_property_area.show()

In [127]:
#DEFINING THE FEATURES AND DEPENDENT VARIABLE
# One-hot encode the categorical columns; every other column is left as is.
cat_cols = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Property_Area',
]
df = pd.get_dummies(data=df, columns=cat_cols)

# Target is Loan_Status; the features are all remaining columns.
y = df['Loan_Status']
X = df.drop(columns=['Loan_Status'])

In [128]:
#SPLITTING THE DATASET INTO TRAINING AND TEST SETS
# A fixed random_state makes the 80/20 split (and every metric computed from
# it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Work on explicit copies so the scaled values written below never target a
# slice of X (this also avoids a possible SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

sc = StandardScaler()
numerical_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount','Loan_Amount_Term','Credit_History']
# Fit the scaler on the training rows only, then apply the same
# transformation to the test rows.
X_train[numerical_cols] = sc.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = sc.transform(X_test[numerical_cols])

In [129]:
#TRAINING THE RANDOM FOREST MODEL
from sklearn.ensemble import RandomForestClassifier
# A fixed random_state makes the forest's bootstrap sampling and feature
# selection repeatable, so re-running the cell yields the same model.
classifier = RandomForestClassifier(n_estimators = 100, random_state = 42)
classifier.fit(X_train, y_train)

In [130]:
# Predict a Loan_Status label for every row of the held-out test features.
y_pred = classifier.predict(X_test)

In [131]:
# X_test is already a DataFrame, so take an explicit copy of it. This
# guarantees that adding the prediction column below cannot alter X_test
# itself, which must keep exactly the feature columns the model was trained on.
X_test_df = X_test.copy()

# Add the predicted values to X_test_df
X_test_df['Loan_Status_Predicted'] = y_pred
print(X_test_df.head())

     ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
225        -0.449269          -0.942609    0.814931          0.264066   
39         -0.182608           0.228048   -0.358312          0.264066   
540         0.277891           1.054105    0.228309         -2.438288   
193        -0.123707          -0.942609   -1.023150          0.264066   
448        -0.401078           0.257525    0.423850          0.264066   

     Credit_History  Gender_Female  Gender_Male  Married_No  Married_Yes  \
225        0.409878              0            1           0            1   
39         0.409878              0            1           1            0   
540        0.409878              1            0           0            1   
193        0.409878              0            1           1            0   
448       -2.439750              0            1           0            1   

     Dependents_0  ...  Dependents_2  Dependents_3+  Education_Graduate  \
225             1  ...       

In [132]:
#CALCULATING ACCURACY OF THE MODEL
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix: rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[14 11]
 [ 5 80]]


0.8545454545454545