1. Data Loading

In [1]:
import pandas as pd

In [2]:
# Read the loan-prediction training CSV (path is relative to the working
# directory) and preview its first five rows
df = pd.read_csv(filepath_or_buffer="Customer_Loan_Prediction_Train.csv")
df.head(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


2. Data Preprocessing

In [3]:
# Drop the 'Loan_ID' column, as it is not useful for prediction.
# The frame is reassigned instead of mutated in place, and errors="ignore"
# makes the drop a no-op when the column is already gone, so this cell can be
# re-run without raising KeyError.
df = df.drop(columns=["Loan_ID"], errors="ignore")
# Summarise dtypes and non-null counts of the remaining columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 57.7+ KB


In [4]:
# Count the NaN entries per column to see which features need imputation
df.isna().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [5]:
# Columns containing missing values; each is later filled with its mode
l1 = [
    'Gender',
    'Married',
    'Dependents',
    'Self_Employed',
    'Credit_History',
    'Loan_Amount_Term',
    'LoanAmount',
]

In [6]:
# Impute every column listed in `l1` with its most frequent value (the mode).
# The filled column is assigned back to the frame: calling
# fillna(inplace=True) on df[col] is chained assignment, which pandas
# deprecates and which leaves `df` unmodified once Copy-on-Write is active.
# NOTE(review): 'LoanAmount' looks continuous, so the median may be a better
# fill value than the mode -- confirm before changing, as it alters results.
for col in l1:
    df[col] = df[col].fillna(df[col].mode()[0])
# Verify that no missing values remain
df.isnull().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [7]:
# Numeric columns that are skipped when listing distinct values
l = [
    "ApplicantIncome",
    "LoanAmount",
    "CoapplicantIncome",
]

In [8]:
# Show the distinct values of every column except those listed in `l`
for column in df.columns:
    if column in l:
        continue
    print(column, "-", df[column].unique())

Gender - ['Male' 'Female']
Married - ['No' 'Yes']
Dependents - ['0' '1' '2' '3+']
Education - ['Graduate' 'Not Graduate']
Self_Employed - ['No' 'Yes']
Loan_Amount_Term - [360. 120. 240. 180.  60. 300. 480.  36.  84.  12.]
Credit_History - [1. 0.]
Property_Area - ['Urban' 'Rural' 'Semiurban']
Loan_Status - ['Y' 'N']


3. Categorical Data Encoding using LabelEncoder

In [9]:
# LabelEncoder to encode categorical features
from sklearn.preprocessing import LabelEncoder

In [10]:
# Initialize the label encoder.
# A single encoder instance is created here and re-fitted for each categorical
# column below, so it only retains the classes of the last column it was
# fitted on.
label_encoder = LabelEncoder()

In [11]:
# Convert each categorical feature to integer codes. The shared encoder is
# re-fitted on every column in turn, ending with 'Dependents'.
for col in ('Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area', 'Dependents'):
    df[col] = label_encoder.fit_transform(df[col])

In [12]:
# Encode the target 'Loan_Status': 1 for approved ('Y'), 0 for denied ('N').
# The result is assigned back instead of calling replace(inplace=True) on
# df["Loan_Status"]: that chained in-place call is deprecated and does not
# modify `df` under pandas Copy-on-Write. astype(int) guarantees a numeric
# target even if replace() leaves the column with object dtype, and the whole
# statement stays safe to re-run on an already-encoded column.
df["Loan_Status"] = df["Loan_Status"].replace({'Y': 1, 'N': 0}).astype(int)

4. Model Training and Evaluation with Multiple Metrics

In [13]:
# Target vector: the encoded loan status; feature matrix: every other column
y = df['Loan_Status']
X = df.drop(columns='Loan_Status')

In [14]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.20,
)

In [15]:
from sklearn.linear_model import LogisticRegression

In [16]:
# Build a logistic-regression classifier with a generous iteration cap
# (presumably so the solver can converge on the unscaled features) and fit
# it on the training split
Lr = LogisticRegression(max_iter=10_000)
Lr.fit(X=X_train, y=y_train)

In [17]:
# Evaluate model accuracy on the held-out test split.
# The label names the model that was actually trained (LogisticRegression);
# it previously read "Linear Regression".
lr_score = Lr.score(X_test, y_test)
print(f"Accuracy of Logistic Regression: {(lr_score*100):.2f}%")

Accuracy of Linear Regression: 78.86%


In [18]:
from sklearn.naive_bayes import BernoulliNB

In [19]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix, make_scorer, f1_score

In [20]:
# Fit a Bernoulli Naive Bayes classifier (default settings) on the training
# split
berno = BernoulliNB()
berno.fit(X=X_train, y=y_train)

In [21]:
# Class predictions of the baseline Naive Bayes model for the test split
y_pred = berno.predict(X=X_test)

In [22]:
# Score the baseline predictions against the true test labels
precision = precision_score(y_true=y_test, y_pred=y_pred)
recall = recall_score(y_true=y_test, y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [23]:
# Report the metrics, one per line
print(
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"Confusion Matrix:\n{conf_matrix}",
    sep="\n",
)

Precision: 0.7596
Recall: 0.9875
Confusion Matrix:
[[18 25]
 [ 1 79]]


5. Hyperparameter Tuning (GridSearchCV)

In [24]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [25]:
# Candidate values searched for each BernoulliNB hyperparameter
param_grid = dict(
    alpha=[0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
    binarize=[0.0, 0.1, 0.5, 1.0, 2.0],
)

In [26]:
# 5-fold stratified splitter (shuffled, fixed seed) so that every fold keeps
# the class proportions of the target
cv = StratifiedKFold(random_state=42, shuffle=True, n_splits=5)

In [27]:
# Metrics recorded for every candidate during the grid search
scoring = dict(
    accuracy='accuracy',
    f1_score=make_scorer(f1_score),
    roc_auc='roc_auc',
)

In [28]:
# Configure the exhaustive search over `param_grid`; all metrics in `scoring`
# are tracked and the final model is refit using the best ROC-AUC candidate
grid_search = GridSearchCV(
    berno,
    param_grid,
    cv=cv,
    scoring=scoring,
    refit='roc_auc',
    verbose=1,
)

In [29]:
# Run the search using only the training split
grid_search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 35 candidates, totalling 175 fits


In [30]:
# Display best parameters and best score.
# The grid search was configured with refit='roc_auc', so best_score_ is the
# mean cross-validated ROC-AUC of the winning candidate, shown as a percentage.
print("Best Parameters :", grid_search.best_params_)
print(f"Best ROC-AUC Score: {(grid_search.best_score_*100):.2f}%")

Best Parameters : {'alpha': 10.0, 'binarize': 0.0}
Best ROC-AUC Score: 76.31%


In [31]:
# Rebuild BernoulliNB from the winning hyperparameters (alpha and binarize,
# the two keys of the search grid) and fit it on the training split
berno_optimized = BernoulliNB(**grid_search.best_params_)
berno_optimized.fit(X=X_train, y=y_train)

In [32]:
# Class predictions of the tuned Naive Bayes model for the test split
y_pred_opti = berno_optimized.predict(X=X_test)

In [33]:
# Score the tuned model's predictions against the true test labels
precision = precision_score(y_true=y_test, y_pred=y_pred_opti)
recall = recall_score(y_true=y_test, y_pred=y_pred_opti)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred_opti)

In [34]:
# Report the tuned model's metrics, one per line
print(
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"Confusion Matrix:\n{conf_matrix}",
    sep="\n",
)

Precision: 0.7596
Recall: 0.9875
Confusion Matrix:
[[18 25]
 [ 1 79]]


6. Advanced Cross-Validation Techniques

In [35]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score

In [36]:
# Shuffled, seeded 5-fold stratified splitter used for the final evaluation
skf = StratifiedKFold(random_state=42, shuffle=True, n_splits=5)

In [37]:
# Score the tuned model with stratified 5-fold cross-validation on the full
# dataset (X, y), once per metric
cross_val_accuracy, cross_val_f1, cross_val_roc_auc = (
    cross_val_score(berno_optimized, X, y, cv=skf, scoring=metric)
    for metric in ('accuracy', 'f1', 'roc_auc')
)

In [38]:
# Report the mean of each cross-validated metric as a percentage
print(
    f"Cross-Validation Accuracy: {(cross_val_accuracy.mean()*100):.2f}%",
    f"Cross-Validation F1-Score: {(cross_val_f1.mean()*100):.2f}%",
    f"Cross-Validation ROC-AUC: {(cross_val_roc_auc.mean()*100):.2f}%",
    sep="\n",
)

Cross-Validation Accuracy: 80.79%
Cross-Validation F1-Score: 87.53%
Cross-Validation ROC-AUC: 75.08%


7. Predicting on the Test Dataset

In [39]:
# Read the unlabeled test CSV (path is relative to the working directory)
# and preview its first five rows
test = pd.read_csv(filepath_or_buffer="Customer_Loan_Prediction_Test.csv")
test.head(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [40]:
# Handle missing data: work on a copy of the test set without the 'Loan_ID'
# column, then fill each column listed in `l1` with its most frequent value.
test1 = test.drop("Loan_ID", axis=1)
# Assign the filled column back: fillna(inplace=True) on test1[col] is chained
# assignment, which pandas deprecates and which leaves `test1` unmodified once
# Copy-on-Write is active.
# NOTE(review): the fill values are the test set's own modes, not the modes
# used for the training data -- confirm this is intended.
for col in l1:
    test1[col] = test1[col].fillna(test1[col].mode()[0])

In [41]:
# LabelEncoder to encode categorical features
from sklearn.preprocessing import LabelEncoder

In [42]:
# Initialize the label encoder.
# This creates a fresh, unfitted encoder that replaces the one used on the
# training data; it is re-fitted on each test column below.
label_encoder = LabelEncoder()

In [43]:
# Convert each categorical feature of the test set to integer codes, fitting
# the encoder on the test column itself.
# NOTE(review): the codes only match the training encoding if each test
# column contains the same set of categories as in training -- confirm.
for col in ('Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area', 'Dependents'):
    test1[col] = label_encoder.fit_transform(test1[col])

In [44]:
# Predict loan approval status for the test set with the tuned model
Pred = berno_optimized.predict(test1)
# Convert the numeric predictions back to the original labels (1 -> 'Y',
# 0 -> 'N') and assign the column in a single step. Calling
# replace(inplace=True) on test["Loan_Status"] is chained assignment, which
# pandas deprecates and which leaves `test` unchanged under Copy-on-Write.
test["Loan_Status"] = pd.Series(Pred, index=test.index).replace({1: 'Y', 0: 'N'})

In [45]:
# Write the test set, including the predicted 'Loan_Status' column, to a new
# CSV without the row index
test.to_csv(path_or_buf="Customer_Loan_Prediction_Test_Final.csv", index=False)