In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [2]:
from google.colab import files
uploaded = files.upload()


Saving LoanApprovalPrediction.csv to LoanApprovalPrediction.csv


In [3]:
df = pd.read_csv('LoanApprovalPrediction.csv')
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0.0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1.0,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0.0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0.0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0.0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
df.tail()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
593,LP002978,Female,No,0.0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
594,LP002979,Male,Yes,3.0,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
595,LP002983,Male,Yes,1.0,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
596,LP002984,Male,Yes,2.0,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y
597,LP002990,Female,No,0.0,Graduate,Yes,4583,0.0,133.0,360.0,0.0,Semiurban,N


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 598 entries, 0 to 597
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            598 non-null    object 
 1   Gender             598 non-null    object 
 2   Married            598 non-null    object 
 3   Dependents         586 non-null    float64
 4   Education          598 non-null    object 
 5   Self_Employed      598 non-null    object 
 6   ApplicantIncome    598 non-null    int64  
 7   CoapplicantIncome  598 non-null    float64
 8   LoanAmount         577 non-null    float64
 9   Loan_Amount_Term   584 non-null    float64
 10  Credit_History     549 non-null    float64
 11  Property_Area      598 non-null    object 
 12  Loan_Status        598 non-null    object 
dtypes: float64(5), int64(1), object(7)
memory usage: 60.9+ KB


In [6]:
df.describe()

Unnamed: 0,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,586.0,598.0,598.0,577.0,584.0,549.0
mean,0.755973,5292.252508,1631.499866,144.968804,341.917808,0.843352
std,1.007751,5807.265364,2953.315785,82.704182,65.205994,0.3638
min,0.0,150.0,0.0,9.0,12.0,0.0
25%,0.0,2877.5,0.0,100.0,360.0,1.0
50%,0.0,3806.0,1211.5,127.0,360.0,1.0
75%,1.75,5746.0,2324.0,167.0,360.0,1.0
max,3.0,81000.0,41667.0,650.0,480.0,1.0


In [7]:
df.isnull().sum()

Unnamed: 0,0
Loan_ID,0
Gender,0
Married,0
Dependents,12
Education,0
Self_Employed,0
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,21
Loan_Amount_Term,14


In [8]:
#  HANDLE MISSING VALUES

# 1) Credit_History : filling with mode
df['Credit_History'] = df['Credit_History'].fillna(df['Credit_History'].mode()[0])

# 2) Dependents : filling with mode
df['Dependents'] = df['Dependents'].fillna(df['Dependents'].mode()[0])

# 3) LoanAmount : filling with median
df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].median())

# 4) Loan_Amount_Term : filling with a hardcoded median of 360.0 (based on initial df.describe() output)
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(360.0)

In [9]:
df.isnull().sum()

Unnamed: 0,0
Loan_ID,0
Gender,0
Married,0
Dependents,0
Education,0
Self_Employed,0
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,0
Loan_Amount_Term,0


In [10]:
# Combine ApplicantIncome and CoapplicantIncome
df['TotalIncome'] = df['ApplicantIncome'] + df['CoapplicantIncome']

# Apply log transformation to reduce skewness
df['LoanAmount_log'] = np.log1p(df['LoanAmount'])
df['TotalIncome_log'] = np.log1p(df['TotalIncome'])

In [11]:
# Drop columns not needed for prediction
df.drop(['Loan_ID', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'TotalIncome'], axis=1, inplace=True, errors='ignore')

In [12]:
# Label Encoding for binary variables
le = LabelEncoder()
binary_cols = ['Gender', 'Married', 'Education', 'Self_Employed', 'Loan_Status']
for col in binary_cols:
    df[col] = le.fit_transform(df[col])

# One-hot encoding for multi-category columns
df = pd.get_dummies(df, columns=['Dependents', 'Property_Area'], drop_first=True)

In [13]:
# X = features, y = target
X = df.drop('Loan_Status', axis=1)
y = df['Loan_Status']
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [14]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=200),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "KNN": KNeighborsClassifier(n_neighbors=5)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print("======================================================")
    print(f"Model: {name}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))


Model: Logistic Regression
Accuracy: 0.8166666666666667
Confusion Matrix:
 [[14 21]
 [ 1 84]]
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.40      0.56        35
           1       0.80      0.99      0.88        85

    accuracy                           0.82       120
   macro avg       0.87      0.69      0.72       120
weighted avg       0.84      0.82      0.79       120

Model: Decision Tree
Accuracy: 0.6833333333333333
Confusion Matrix:
 [[14 21]
 [17 68]]
Classification Report:
               precision    recall  f1-score   support

           0       0.45      0.40      0.42        35
           1       0.76      0.80      0.78        85

    accuracy                           0.68       120
   macro avg       0.61      0.60      0.60       120
weighted avg       0.67      0.68      0.68       120

Model: Random Forest
Accuracy: 0.8083333333333333
Confusion Matrix:
 [[17 18]
 [ 5 80]]
Classification Report:
     

In [15]:
results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results[name] = accuracy_score(y_test, y_pred)

print("\nModel Comparison (Accuracy):")
print(results)



Model Comparison (Accuracy):
{'Logistic Regression': 0.8166666666666667, 'Decision Tree': 0.6833333333333333, 'Random Forest': 0.8, 'KNN': 0.7666666666666667}


In [16]:
# Train Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
# Make predictions
y_pred = model.predict(X_test)


In [17]:
# Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Confusion Matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Classification Report
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8
Confusion Matrix:
 [[16 19]
 [ 5 80]]
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.46      0.57        35
           1       0.81      0.94      0.87        85

    accuracy                           0.80       120
   macro avg       0.78      0.70      0.72       120
weighted avg       0.79      0.80      0.78       120



In [18]:
# Example: New customer's data
new_data = {
    'Gender': 1,               # 1 = Male, 0 = Female
    'Married': 1,              # 1 = Yes, 0 = No
    'Education': 1,            # 1 = Graduate, 0 = Not Graduate
    'Self_Employed': 0,        # 1 = Yes, 0 = No
    'Loan_Amount_Term': 360,
    'Credit_History': 1,
    'LoanAmount_log': 5.5,
    'TotalIncome_log': 8.2,
    # Corrected Dependents dummy variables based on training data column names
    'Dependents_1.0': 0,        # Assuming new customer has 3 dependents, so these are 0
    'Dependents_2.0': 0,
    'Dependents_3.0': 1,        # Set to 1 if new customer has 3 dependents
    'Property_Area_Semiurban': 0,
    'Property_Area_Urban': 1
}

# Convert dict â†’ DataFrame
import pandas as pd
new_df = pd.DataFrame([new_data])

# Predict with your trained model (Example: Random Forest)
prediction = model.predict(new_df)[0]

# Convert numeric output to labels
if prediction == 1:
    print("Loan Status: APPROVED")
else:
    print("Loan Status: NOT APPROVED")

Loan Status: NOT APPROVED


In [19]:
df['Predicted_Status'] = model.predict(X)

df['Predicted_Status'] = df['Predicted_Status'].map({1: 'Approved', 0: 'Not Approved'})
print(df[['Predicted_Status']].head())


  Predicted_Status
0         Approved
1     Not Approved
2         Approved
3         Approved
4         Approved
