In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# Load the loan-approval dataset into `data`.
# The CSV location can be overridden with the LOAN_DATA_PATH environment
# variable so the notebook is not tied to one machine; when the variable is
# unset, the original local path is used.
import os

DATA_PATH = os.environ.get(
    "LOAN_DATA_PATH",
    "C:\\Users\\suman\\Downloads\\loan_approval_data.csv",
)
data = pd.read_csv(DATA_PATH)

In [3]:
# Inspect the raw dataset: schema, per-column missing counts and a preview.
data.info()

# Only the last expression of a cell is rendered automatically, so the
# missing-value counts are printed explicitly instead of being discarded.
print(data.isnull().sum())

# Applicant_ID is removed before modelling (presumably a row identifier -
# TODO confirm). errors="ignore" keeps this cell re-runnable once the column
# is already gone.
data = data.drop(columns=["Applicant_ID"], errors="ignore")
data.info()

# Last expression of the cell: rendered as the preview table.
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Applicant_ID        950 non-null    float64
 1   Applicant_Income    950 non-null    float64
 2   Coapplicant_Income  950 non-null    float64
 3   Employment_Status   950 non-null    object 
 4   Age                 950 non-null    float64
 5   Marital_Status      950 non-null    object 
 6   Dependents          950 non-null    float64
 7   Credit_Score        950 non-null    float64
 8   Existing_Loans      950 non-null    float64
 9   DTI_Ratio           950 non-null    float64
 10  Savings             950 non-null    float64
 11  Collateral_Value    950 non-null    float64
 12  Loan_Amount         950 non-null    float64
 13  Loan_Term           950 non-null    float64
 14  Loan_Purpose        950 non-null    object 
 15  Property_Area       950 non-null    object 
 16  Educati

In [4]:
# Handle missing values.
#   - numeric columns: filled with the column mean
#   - categorical (object) columns: filled with the most frequent category
from sklearn.impute import SimpleImputer

# Rows without a target label are removed first. Imputing Loan_Approved would
# invent labels, which corrupts both training and evaluation.
data = data.dropna(subset=["Loan_Approved"]).reset_index(drop=True)

category_cols = data.select_dtypes(include=["object"]).columns
numeric_cols = data.select_dtypes(include=["number"]).columns

# NOTE(review): both imputers are fitted on the full dataset, i.e. before the
# train/test split done later in the notebook, so test rows contribute to the
# fill values. Consider fitting them on the training split only.
num_imp = SimpleImputer(strategy="mean")
data[numeric_cols] = num_imp.fit_transform(data[numeric_cols])

cat_imp = SimpleImputer(strategy="most_frequent")
data[category_cols] = cat_imp.fit_transform(data[category_cols])



In [None]:
# Check for outliers: one box plot per feature, split by loan-approval status,
# laid out on a 2x2 grid (filled row by row).
fig, axes = plt.subplots(2, 2)

boxplot_features = ["Applicant_Income", "Credit_Score", "DTI_Ratio", "Savings"]
for panel, feature in zip(axes.flat, boxplot_features):
    sns.boxplot(data=data, x="Loan_Approved", y=feature, ax=panel)

plt.tight_layout()

In [None]:
# Relationship between Credit_Score and loan-approval status:
# 20-bin histogram with one bar per approval class drawn side by side.
credit_hist_options = dict(
    x="Credit_Score",
    hue="Loan_Approved",
    bins=20,
    multiple="dodge",
)
sns.histplot(data, **credit_hist_options)

In [None]:
# Relationship between DTI_Ratio and loan-approval status:
# 20-bin histogram with one bar per approval class drawn side by side.
dti_hist_options = dict(
    x="DTI_Ratio",
    hue="Loan_Approved",
    bins=20,
    multiple="dodge",
)
sns.histplot(data, **dti_hist_options)

In [None]:
# Relationship between Applicant_Income and loan-approval status:
# 20-bin histogram with one bar per approval class drawn side by side.
income_hist_options = dict(
    x="Applicant_Income",
    hue="Loan_Approved",
    bins=20,
    multiple="dodge",
)
sns.histplot(data, **income_hist_options)

In [None]:
# Encode the categorical columns.
#   - Education_Level and the target Loan_Approved become integer codes.
#   - The remaining categorical columns are one-hot encoded so that no
#     artificial ordering is introduced between their categories.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# One encoder per column: re-fitting a single LabelEncoder would overwrite its
# classes_, so the Education_Level mapping could no longer be inspected or
# inverted afterwards.
educationEncoder = LabelEncoder()
data["Education_Level"] = educationEncoder.fit_transform(data["Education_Level"])
# NOTE(review): LabelEncoder numbers classes in sorted order, which is not
# necessarily the intended priority order of the education levels - confirm
# against educationEncoder.classes_, or switch to an explicit ordered mapping.

# `lebelEncoder` ends up fitted on the target, exactly as before.
lebelEncoder = LabelEncoder()
data["Loan_Approved"] = lebelEncoder.fit_transform(data["Loan_Approved"])

# drop="first" removes one indicator per feature to avoid perfectly collinear
# columns; sparse_output=False returns a dense array that fits a DataFrame.
oneHotEncoder = OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore")
column = ["Employment_Status", "Marital_Status", "Loan_Purpose", "Property_Area", "Gender", "Employer_Category"]
encodedColums = oneHotEncoder.fit_transform(data[column])
dataFrame = pd.DataFrame(
    encodedColums,
    columns=oneHotEncoder.get_feature_names_out(column),
    index=data.index,  # keep rows aligned with `data` for the concat below
)
data = pd.concat([data.drop(columns=column), dataFrame], axis=1)

# Pandas-only alternative for the one-hot step:
#   pd.get_dummies(data, columns=column, drop_first=True, dtype=float)


In [None]:
# Correlation heatmap over all numeric columns (annotated to two decimals).
number_columns = data.select_dtypes(include="number")
corr_matrix = number_columns.corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(20, 10))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=heatmap_ax)
heatmap_ax.set_title("Corelation Matrix")
heatmap_fig.tight_layout()

In [None]:
# Correlation of every numeric column with the target, largest value first.
target_corr = number_columns.corr()["Loan_Approved"]
target_corr.sort_values(ascending=False)

In [None]:
# Separate the target vector (y) from the feature matrix (X).
target_column = "Loan_Approved"
y = data[target_column]
X = data.drop(columns=[target_column])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Standardize the features. The scaler learns its statistics from the training
# split only and the same transform is then applied to both splits.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train_scale = scaler.transform(X_train)
X_test_scale = scaler.transform(X_test)

In [None]:
# Baseline classifier: logistic regression on the standardized features.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

log_model = LogisticRegression().fit(X_train_scale, y_train)
y_pred = log_model.predict(X_test_scale)

# Evaluation on the held-out split: each metric is computed and printed in turn.
print("Logistic Regression Model")
for label, metric in (
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
    ("F1 score: ", f1_score),
    ("Accuracy: ", accuracy_score),
    ("CM: ", confusion_matrix),
):
    print(label, metric(y_test, y_pred))

In [None]:
# Baseline classifier: Gaussian Naive Bayes on the standardized features.
from sklearn.naive_bayes import GaussianNB

nb_model = GaussianNB().fit(X_train_scale, y_train)
y_pred = nb_model.predict(X_test_scale)

# Evaluation on the held-out split: each metric is computed and printed in turn.
nb_metrics = [
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
    ("F1 score: ", f1_score),
    ("Accuracy: ", accuracy_score),
    ("CM: ", confusion_matrix),
]
print("Naive Bayes Model")
for label, metric in nb_metrics:
    print(label, metric(y_test, y_pred))

In [None]:
# Feature engineering on the columns the author found most influential for the
# model:
#   - DTI_Ratio and Credit_Score are replaced by their squares
#   - Applicant_Income_log is added as log1p(Applicant_Income)
# These transforms modify `data` in place. The guard makes the cell safe to
# re-run: without it a second execution would square the already-squared
# columns again. The derived column serves as the "already applied" marker.
if "Applicant_Income_log" not in data.columns:
    data["DTI_Ratio"] = data["DTI_Ratio"] ** 2
    data["Credit_Score"] = data["Credit_Score"] ** 2
    data["Applicant_Income_log"] = np.log1p(data["Applicant_Income"])

X = data.drop(columns=["Loan_Approved"])
y = data["Loan_Approved"]

# Train/test split (same proportion and seed as the baseline split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling: statistics come from the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Preview the first five rows of the engineered training features.
X_train.head(n=5)

In [None]:
# Logistic regression retrained on the engineered, standardized features.

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# max_iter is raised to 1000 iterations for this fit.
log_model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)
y_pred = log_model.predict(X_test_scaled)

# Evaluation on the held-out split: each metric is computed and printed in turn.
print("Logistic Regression Model")
for label, metric in (
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
    ("F1 score: ", f1_score),
    ("Accuracy: ", accuracy_score),
    ("CM: ", confusion_matrix),
):
    print(label, metric(y_test, y_pred))

In [None]:
# Gaussian Naive Bayes retrained on the engineered, standardized features.

from sklearn.naive_bayes import GaussianNB

nb_model = GaussianNB().fit(X_train_scaled, y_train)
y_pred = nb_model.predict(X_test_scaled)

# Evaluation on the held-out split: each metric is computed and printed in turn.
nb_metrics = [
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
    ("F1 score: ", f1_score),
    ("Accuracy: ", accuracy_score),
    ("CM: ", confusion_matrix),
]
print("Naive Bayes Model")
for label, metric in nb_metrics:
    print(label, metric(y_test, y_pred))