In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
file_path = 'loan_approval_dataset.csv'
data = pd.read_csv(file_path)

# Display the first 5 rows
data.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   loan_id                    4269 non-null   int64 
 1    no_of_dependents          4269 non-null   int64 
 2    education                 4269 non-null   object
 3    self_employed             4269 non-null   object
 4    income_annum              4269 non-null   int64 
 5    loan_amount               4269 non-null   int64 
 6    loan_term                 4269 non-null   int64 
 7    cibil_score               4269 non-null   int64 
 8    residential_assets_value  4269 non-null   int64 
 9    commercial_assets_value   4269 non-null   int64 
 10   luxury_assets_value       4269 non-null   int64 
 11   bank_asset_value          4269 non-null   int64 
 12   loan_status               4269 non-null   object
dtypes: int64(10), object(3)
memory usage: 433.7+ KB


In [4]:
data.isnull().sum()

loan_id                      0
 no_of_dependents            0
 education                   0
 self_employed               0
 income_annum                0
 loan_amount                 0
 loan_term                   0
 cibil_score                 0
 residential_assets_value    0
 commercial_assets_value     0
 luxury_assets_value         0
 bank_asset_value            0
 loan_status                 0
dtype: int64

In [5]:
data.describe()

Unnamed: 0,loan_id,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
count,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0
mean,2135.0,2.498712,5059124.0,15133450.0,10.900445,599.936051,7472617.0,4973155.0,15126310.0,4976692.0
std,1232.498479,1.69591,2806840.0,9043363.0,5.709187,172.430401,6503637.0,4388966.0,9103754.0,3250185.0
min,1.0,0.0,200000.0,300000.0,2.0,300.0,-100000.0,0.0,300000.0,0.0
25%,1068.0,1.0,2700000.0,7700000.0,6.0,453.0,2200000.0,1300000.0,7500000.0,2300000.0
50%,2135.0,3.0,5100000.0,14500000.0,10.0,600.0,5600000.0,3700000.0,14600000.0,4600000.0
75%,3202.0,4.0,7500000.0,21500000.0,16.0,748.0,11300000.0,7600000.0,21700000.0,7100000.0
max,4269.0,5.0,9900000.0,39500000.0,20.0,900.0,29100000.0,19400000.0,39200000.0,14700000.0


In [6]:
# Identify categorical and numerical columns
categorical_cols = data.select_dtypes(include=['object']).columns
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns

print("Categorical Columns:", categorical_cols)
print("Numerical Columns:", numerical_cols)


Categorical Columns: Index([' education', ' self_employed', ' loan_status'], dtype='object')
Numerical Columns: Index(['loan_id', ' no_of_dependents', ' income_annum', ' loan_amount',
       ' loan_term', ' cibil_score', ' residential_assets_value',
       ' commercial_assets_value', ' luxury_assets_value',
       ' bank_asset_value'],
      dtype='object')


In [7]:
# Fill missing numerical values with mean
data['no_of_dependents'].fillna(data['no_of_dependents'].mean(), inplace=True)
data['income_annum'].fillna(data['income_annum'].mean(), inplace=True)
data['loan_amount'].fillna(data['loan_amount'].mean(), inplace=True)
data['loan_term'].fillna(data['loan_term'].mean(), inplace=True)
data['cibil_score'].fillna(data['cibil_score'].mean(), inplace=True)
data['residential_assets_value'].fillna(data['residential_assets_value'].mean(), inplace=True)
data['commercial_assets_value'].fillna(data['commercial_assets_value'].mean(), inplace=True)
data['luxury_assets_value'].fillna(data['luxury_assets_value'].mean(), inplace=True)
data['bank_asset_value'].fillna(data['bank_asset_value'].mean(), inplace=True)

KeyError: 'no_of_dependents'

In [8]:
print(data.columns)


Index(['loan_id', ' no_of_dependents', ' education', ' self_employed',
       ' income_annum', ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')


In [9]:
# Remove whitespace and convert to lowercase
data.columns = data.columns.str.strip().str.lower()
print(data.columns)


Index(['loan_id', 'no_of_dependents', 'education', 'self_employed',
       'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
       'residential_assets_value', 'commercial_assets_value',
       'luxury_assets_value', 'bank_asset_value', 'loan_status'],
      dtype='object')


In [10]:
data.rename(columns={'no_of_dependents': 'dependents'}, inplace=True)


In [11]:
# Fill missing numerical values with mean
data['dependents'].fillna(data['dependents'].mean(), inplace=True)
data['income_annum'].fillna(data['income_annum'].mean(), inplace=True)
data['loan_amount'].fillna(data['loan_amount'].mean(), inplace=True)
data['loan_term'].fillna(data['loan_term'].mean(), inplace=True)
data['cibil_score'].fillna(data['cibil_score'].mean(), inplace=True)
data['residential_assets_value'].fillna(data['residential_assets_value'].mean(), inplace=True)
data['commercial_assets_value'].fillna(data['commercial_assets_value'].mean(), inplace=True)
data['luxury_assets_value'].fillna(data['luxury_assets_value'].mean(), inplace=True)
data['bank_asset_value'].fillna(data['bank_asset_value'].mean(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['dependents'].fillna(data['dependents'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['income_annum'].fillna(data['income_annum'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the interm

In [12]:
# Fix chained assignment using direct assignment
data['dependents'] = data['dependents'].fillna(data['dependents'].mean())
data['income_annum'] = data['income_annum'].fillna(data['income_annum'].mean())
data['loan_amount'] = data['loan_amount'].fillna(data['loan_amount'].mean())
data['loan_term'] = data['loan_term'].fillna(data['loan_term'].mean())
data['cibil_score'] = data['cibil_score'].fillna(data['cibil_score'].mean())
data['residential_assets_value'] = data['residential_assets_value'].fillna(data['residential_assets_value'].mean())
data['commercial_assets_value'] = data['commercial_assets_value'].fillna(data['commercial_assets_value'].mean())
data['luxury_assets_value'] = data['luxury_assets_value'].fillna(data['luxury_assets_value'].mean())
data['bank_asset_value'] = data['bank_asset_value'].fillna(data['bank_asset_value'].mean())


In [13]:
# Fill missing categorical values with mode
data['education'].fillna(data['education'].mode()[0], inplace=True)
data['self_employed'].fillna(data['self_employed'].mode()[0], inplace=True)
data['loan_status'].fillna(data['loan_status'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['education'].fillna(data['education'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['self_employed'].fillna(data['self_employed'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the 

In [14]:
# Fix chained assignment using direct assignment
data['education'] = data['education'].fillna(data['education'].mode()[0])
data['self_employed'] = data['self_employed'].fillna(data['self_employed'].mode()[0])
data['loan_status'] = data['loan_status'].fillna(data['loan_status'].mode()[0])


In [15]:
# Drop rows with missing target values
data.dropna(subset=['loan_status'], inplace=True)


In [16]:
# Remove duplicate rows
data.drop_duplicates(inplace=True)


In [17]:
# Convert categorical data to lowercase
data['education'] = data['education'].str.lower()
data['self_employed'] = data['self_employed'].str.lower()
data['loan_status'] = data['loan_status'].str.lower()


In [18]:
# Final check for missing values
data.isnull().sum()


loan_id                     0
dependents                  0
education                   0
self_employed               0
income_annum                0
loan_amount                 0
loan_term                   0
cibil_score                 0
residential_assets_value    0
commercial_assets_value     0
luxury_assets_value         0
bank_asset_value            0
loan_status                 0
dtype: int64

In [19]:
from sklearn.preprocessing import LabelEncoder

# Create a LabelEncoder object
encoder = LabelEncoder()

# Encode categorical variables
data['education'] = encoder.fit_transform(data['education'])
data['self_employed'] = encoder.fit_transform(data['self_employed'])
data['loan_status'] = encoder.fit_transform(data['loan_status'])


In [20]:
data[['education', 'self_employed', 'loan_status']].head()


Unnamed: 0,education,self_employed,loan_status
0,0,0,0
1,1,1,1
2,0,0,1
3,0,0,1
4,1,1,1


In [21]:
from sklearn.preprocessing import MinMaxScaler

# Create a scaler object
scaler = MinMaxScaler()

# Define the columns to scale
numerical_cols = [
    'dependents', 'income_annum', 'loan_amount', 'loan_term', 
    'cibil_score', 'residential_assets_value', 'commercial_assets_value', 
    'luxury_assets_value', 'bank_asset_value'
]

# Apply scaling
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])


In [22]:
data[numerical_cols].head()


Unnamed: 0,dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
0,0.4,0.969072,0.755102,0.555556,0.796667,0.085616,0.907216,0.575835,0.544218
1,0.0,0.402062,0.303571,0.333333,0.195,0.09589,0.113402,0.218509,0.22449
2,0.6,0.917526,0.75,1.0,0.343333,0.246575,0.231959,0.848329,0.870748
3,0.6,0.824742,0.77551,0.333333,0.278333,0.626712,0.170103,0.59126,0.537415
4,1.0,0.989691,0.609694,1.0,0.136667,0.428082,0.42268,0.748072,0.340136


In [23]:
from sklearn.model_selection import train_test_split

X = data.drop('loan_status', axis=1) 
y = data['loan_status']               

# Split into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display shapes of the splits
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((3415, 12), (854, 12), (3415,), (854,))

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"✅ Model Accuracy: {accuracy:.2f}")
print("\nConfusion Matrix:")
print(confusion)
print("\nClassification Report:")
print(report)


✅ Model Accuracy: 0.98

Confusion Matrix:
[[527   9]
 [ 12 306]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       536
           1       0.97      0.96      0.97       318

    accuracy                           0.98       854
   macro avg       0.97      0.97      0.97       854
weighted avg       0.98      0.98      0.98       854

