In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read the raw loan applications from the CSV in the working directory.
data = pd.read_csv("loan_data.csv")
# Preview the first ten rows to confirm the columns were parsed as expected.
data.head(n=10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
6,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
7,LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,N
8,LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,Y
9,LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,N


In [3]:
# Remove the Loan_ID column (row identifiers such as "LP001002") before modelling.
# Reassigning instead of mutating in place, together with errors='ignore',
# keeps this cell idempotent: re-running it no longer raises KeyError because
# the column has already been dropped.
data = data.drop(columns='Loan_ID', errors='ignore')
data

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
609,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [4]:
# Count missing values per column before imputation.
data.isna().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [6]:
# Impute missing values column by column:
#   - LoanAmount        -> column mean
#   - Loan_Amount_Term  -> column median
#   - remaining columns -> most frequent value (first mode)
# NOTE(review): these statistics are computed on the full dataset, i.e. before
# the train/test split performed later, so test rows influence the fill values.
mode_filled_columns = ['Credit_History', 'Gender', 'Married', 'Dependents', 'Self_Employed']
fill_values = {
    'LoanAmount': data['LoanAmount'].mean(),
    'Loan_Amount_Term': data['Loan_Amount_Term'].median(),
    **{name: data[name].mode()[0] for name in mode_filled_columns},
}
data = data.fillna(fill_values)

In [7]:
# Re-check missing values per column after imputation; every count should be 0.
data.isna().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [8]:
# Dependents holds the label '3+' alongside plain counts; map it to 3 so the
# whole column can be cast to integers.
dependents_as_int = data['Dependents'].replace({'3+': 3}).astype(int)

# One-hot encode the remaining categorical columns. drop_first=True drops the
# first level of each column, so two-level columns (including the Loan_Status
# target) collapse to a single indicator column.
data = pd.get_dummies(data.assign(Dependents=dependents_as_int), drop_first=True)

In [9]:
# Preview the first five rows of the encoded frame.
data.head(n=5)

Unnamed: 0,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_Yes,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_Y
0,0,5849,0.0,146.412162,360.0,1.0,True,False,False,False,False,True,True
1,1,4583,1508.0,128.0,360.0,1.0,True,True,False,False,False,False,False
2,0,3000,0.0,66.0,360.0,1.0,True,True,False,True,False,True,True
3,0,2583,2358.0,120.0,360.0,1.0,True,True,True,False,False,True,True
4,0,6000,0.0,141.0,360.0,1.0,True,False,False,False,False,True,True


In [10]:
# Continuous columns; these are the ones standardised before training.
# The last three names are created by the feature-engineering step below.
numerical_features = [
    'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term',
    'DTI', 'Income_per_Family_Member', 'Loan_to_Income_Ratio'
]
# Indicator / binary columns that are passed to the model unscaled.
encoded_categorical_features = [
    'Gender_Male', 'Married_Yes', 'Education_Not Graduate', 
    'Self_Employed_Yes', 'Property_Area_Semiurban', 
    'Property_Area_Urban', 'Credit_History'
]
# NOTE(review): 'Coapplicant_Income_Contribution' (created below) and
# 'Dependents' appear in neither list, so they never reach the model.
# Confirm whether that is intentional.

# FEATURE ENGINEERING
# Shared intermediates used by several ratios below.
total_income = data['ApplicantIncome'] + data['CoapplicantIncome']
# Presumably LoanAmount is recorded in thousands, hence the x1000 -- TODO confirm units.
loan_amount_scaled = data['LoanAmount'] * 1000

data = data.assign(
    # "DTI": scaled loan amount over combined applicant + coapplicant income.
    # Despite the name, this uses the full loan amount, not periodic payments.
    DTI=loan_amount_scaled / total_income,
    # Applicant income split across the applicant plus each dependent.
    Income_per_Family_Member=data['ApplicantIncome'] / (data['Dependents'] + 1),
    # Scaled loan amount relative to the applicant's own income only.
    Loan_to_Income_Ratio=loan_amount_scaled / data['ApplicantIncome'],
    # Share of the combined income that comes from the coapplicant.
    Coapplicant_Income_Contribution=data['CoapplicantIncome'] / total_income,
)

In [11]:
# Preview the first five rows, now including the engineered ratio columns.
data.head(n=5)

Unnamed: 0,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_Yes,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_Y,DTI,Income_per_Family_Member,Loan_to_Income_Ratio,Coapplicant_Income_Contribution
0,0,5849,0.0,146.412162,360.0,1.0,True,False,False,False,False,True,True,25.031999,5849.0,25.031999,0.0
1,1,4583,1508.0,128.0,360.0,1.0,True,True,False,False,False,False,False,21.014612,2291.5,27.929304,0.247578
2,0,3000,0.0,66.0,360.0,1.0,True,True,False,True,False,True,True,22.0,3000.0,22.0,0.0
3,0,2583,2358.0,120.0,360.0,1.0,True,True,True,False,False,True,True,24.286582,2583.0,46.457607,0.477231
4,0,6000,0.0,141.0,360.0,1.0,True,False,False,False,False,True,True,23.5,6000.0,23.5,0.0


In [12]:
# Define features and target

# Final feature set used for training: the numeric columns followed by the
# encoded categorical / binary columns.
final_features = numerical_features + encoded_categorical_features
# .copy() gives X its own data, so it is not a derived slice of `data`.
X = data[final_features].copy()
y = data['Loan_Status_Y']

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# train_test_split returns row subsets of X. Take explicit copies so the column
# assignments below write to independent frames and do not trigger pandas'
# SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training rows only, then apply the fitted statistics to
# the test rows so no test information leaks into the scaling.
scaler = StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

In [13]:
# Display the training features; the numerical columns have been standardised by the scaler.
X_train

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,DTI,Income_per_Family_Member,Loan_to_Income_Ratio,Gender_Male,Married_Yes,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban,Credit_History
83,0.083915,0.180667,1.349992,0.287611,0.931245,0.559215,0.094422,True,True,False,False,True,False,1.0
90,-0.429338,0.385430,-0.204748,0.287611,-0.162212,-0.246828,0.096426,True,True,False,False,True,False,1.0
227,0.126095,0.005831,0.711852,0.287611,0.293736,-0.478589,-0.082026,True,True,False,False,True,False,1.0
482,-0.576970,0.464185,-0.239556,0.287611,0.072822,-0.478677,0.383027,True,True,False,False,True,False,1.0
464,-0.225522,-0.528127,-0.587632,0.287611,-0.032101,0.073258,-0.250285,True,False,False,False,True,False,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,-0.612064,0.062535,-0.599234,0.287611,0.230422,-0.865005,0.220775,True,True,True,True,True,False,1.0
106,0.997882,-0.173415,0.885890,0.287611,-0.657954,-0.022220,-0.314012,True,True,False,False,False,True,1.0
270,-0.382264,-0.528127,-1.376604,0.287611,-1.629480,-0.172901,-0.488339,False,False,False,False,False,True,1.0
435,0.766733,-0.528127,-0.025928,-1.506760,-1.035066,1.631553,-0.399755,False,True,False,False,True,False,1.0


In [14]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


def _fit_and_report(accuracy_label, estimator):
    """Fit `estimator` on the training split, print test-set metrics, and return the predictions.

    `accuracy_label` is printed in front of the accuracy score. The confusion
    matrix and classification report are computed on X_test / y_test.
    """
    estimator.fit(X_train, y_train)
    predicted = estimator.predict(X_test)
    print(accuracy_label, accuracy_score(y_test, predicted))
    print("Confusion Matrix:\n", confusion_matrix(y_test, predicted))
    print("Classification Report:\n", classification_report(y_test, predicted))
    return predicted


# Random forest with fixed, hand-picked hyperparameters (no search is run here).
rf_model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
y_pred_rf = _fit_and_report("Random Forest Accuracy:", rf_model)

# Gradient boosting, likewise with fixed hyperparameters.
gb_model = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42)
y_pred_gb = _fit_and_report("Gradient Boosting Accuracy:", gb_model)


Random Forest Accuracy: 0.8048780487804879
Confusion Matrix:
 [[22 21]
 [ 3 77]]
Classification Report:
               precision    recall  f1-score   support

       False       0.88      0.51      0.65        43
        True       0.79      0.96      0.87        80

    accuracy                           0.80       123
   macro avg       0.83      0.74      0.76       123
weighted avg       0.82      0.80      0.79       123

Gradient Boosting Accuracy: 0.7967479674796748
Confusion Matrix:
 [[23 20]
 [ 5 75]]
Classification Report:
               precision    recall  f1-score   support

       False       0.82      0.53      0.65        43
        True       0.79      0.94      0.86        80

    accuracy                           0.80       123
   macro avg       0.81      0.74      0.75       123
weighted avg       0.80      0.80      0.78       123



In [15]:
import os

import joblib

# joblib.dump does not create missing parent directories, so on a fresh
# checkout the writes below would fail with FileNotFoundError. Create the
# output folder first; exist_ok=True makes this a no-op on re-runs.
os.makedirs("newM", exist_ok=True)

# Persist the random forest, which scored slightly higher test accuracy than
# gradient boosting in the evaluation above.
joblib.dump(rf_model, "newM/loan_eligibility_model.pkl")
# Persist the fitted scaler as well: at inference time it must be applied to
# the same `numerical_features` columns, in the same order, before predicting.
joblib.dump(scaler, "newM/numerical_features_scaler.pkl")


['newM/numerical_features_scaler.pkl']