In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [None]:
# Read the loan-approval dataset from the working directory.
csv_path = "loan_approval_data.csv"
df = pd.read_csv(csv_path)

# Print the schema summary (column names, dtypes, non-null counts).
df.info()

# Handling Missing Values

In [None]:
# Shortlist columns by dtype: "object" columns are treated as categorical,
# "float64" columns as numerical.
categorical_cols, numerical_cols = (
    df.select_dtypes(include=[dtype_name]).columns
    for dtype_name in ("object", "float64")
)


In [None]:
# Total number of columns picked up by the two dtype selections.
len(numerical_cols) + len(categorical_cols)

# Filling the null values using sklearn

In [None]:
# SimpleImputer fills missing values column by column.
from sklearn.impute import SimpleImputer

# A missing target must not be imputed: filling it with the most frequent class
# would invent labels. Drop unlabeled rows first, so the imputers below leave
# the target column untouched. `.copy()` gives an independent frame to assign to.
df = df.dropna(subset=["Loan_Approved"]).copy()

# Numerical columns: replace NaN with the column mean.
# (Other SimpleImputer strategies: "median", "most_frequent", "constant".)
num_imp = SimpleImputer(strategy="mean")
df[numerical_cols] = num_imp.fit_transform(df[numerical_cols])

# Categorical columns: replace NaN with the most frequent category.
cat_imp = SimpleImputer(strategy="most_frequent")
df[categorical_cols] = cat_imp.fit_transform(df[categorical_cols])

In [None]:
# Drop the identifier column so it is not used as a model feature.
# errors="ignore" lets this cell be re-run after the column is already gone.
df = df.drop(columns="Applicant_ID", errors="ignore")

# Preview the result; as the last expression of the cell it is actually rendered.
# (df.isnull().sum() can be used here to confirm no missing values remain.)
df.head()


# EDA - data visualization

In [None]:
# How balanced are the classes?
classes_count = df["Loan_Approved"].value_counts()

# value_counts() orders its result by frequency, so the slice labels are taken
# from its index. A hard-coded ["No", "Yes"] would mislabel the slices whenever
# "Yes" is the more frequent class.
slice_labels = [str(label) for label in classes_count.index]

plt.pie(classes_count, labels=slice_labels, autopct="%1.1f%%")
plt.title("Is Loan approved or not?")

In [None]:
# Number of applicants per education level.
edu_cnt = df["Education_Level"].value_counts()

# Pass categories and counts explicitly as x / y. How seaborn interprets a bare
# positional Series depends on the seaborn version, whereas explicit x / y
# draws one bar per category consistently.
ax = sns.barplot(x=edu_cnt.index, y=edu_cnt.values)
ax.set(xlabel="Education_Level", ylabel="count")

# Annotate each bar with its count.
ax.bar_label(ax.containers[0])

In [None]:
# Distribution of applicant income, split into 20 bins.
income_hist_kwargs = {"x": "Applicant_Income", "bins": 20}
sns.histplot(data=df, **income_hist_kwargs)

In [None]:
# Box plot of applicant income per loan decision, used to spot outliers.
sns.boxplot(x="Loan_Approved", y="Applicant_Income", data=df)

# Multiplots

In [None]:
fig, axes = plt.subplots(2, 2)

# Employment_Status is a categorical column (it is one-hot encoded further down),
# so a box plot has no numeric axis to summarise. Show the number of applicants
# per employment status, split by loan decision, instead.
sns.countplot(ax=axes[0, 0], data=df, x="Employment_Status", hue="Loan_Approved")

# Numeric features: distribution per loan decision.
sns.boxplot(ax=axes[0, 1], data=df, x="Loan_Approved", y="Savings")
sns.boxplot(ax=axes[1, 0], data=df, x="Loan_Approved", y="Credit_Score")
sns.boxplot(ax=axes[1, 1], data=df, x="Loan_Approved", y="Applicant_Income")

plt.tight_layout()

In [None]:
fig, axes = plt.subplots(1, 2)

# One box plot per feature, each on its own panel, grouped by loan decision.
for panel, feature in zip(axes, ("Age", "Loan_Amount")):
    sns.boxplot(ax=panel, data=df, x="Loan_Approved", y=feature)

plt.tight_layout()

In [None]:
# Credit-score histogram with the bars of each loan decision drawn side by side.
sns.histplot(data=df, x="Credit_Score", hue="Loan_Approved", bins=20, multiple="dodge")

# Encoding

1. One-hot encoding — e.g. `pd.get_dummies`
2. Binary encoding — e.g. mapping values with `map`

## 1. Label encoder
## 2. One-hot encoder

In [None]:
# Re-check the schema before encoding, then preview the first five rows.
df.info()
df.head(n=5)

## Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# LabelEncoder assigns an integer (0 to n-1) to each category, in sorted order
# of the category values.
# NOTE(review): sorted order is not necessarily the real ranking of education
# levels — confirm the resulting codes match the intended ordinal order.
#
# Gender is deliberately not label-encoded here: it is nominal and is already
# in the column list passed to OneHotEncoder, so encoding it twice is redundant.
#
# Each column gets its own encoder so its fitted classes_ stay available.
label_encoders = {}
for col_name in ("Education_Level", "Loan_Approved"):
    label_encoders[col_name] = LabelEncoder()
    df[col_name] = label_encoders[col_name].fit_transform(df[col_name])

# Keep the `le` name bound for any cell that expects it; it is the target encoder.
le = label_encoders["Loan_Approved"]



# OneHotEncoder 

In [None]:
# OneHotEncoder creates binary indicator columns for each category; it is used
# for nominal data (categories with no inherent order).
cols = [
    "Employment_Status",
    "Marital_Status",
    "Loan_Purpose",
    "Property_Area",
    "Gender",
    "Employer_Category",
]

ohe = OneHotEncoder(handle_unknown="ignore", drop="first", sparse_output=False)

# fit_transform returns a dense 2-D array (sparse_output=False).
encoded = ohe.fit_transform(df[cols])

# Wrap the array in a DataFrame that shares df's index and carries the
# generated indicator column names.
encoded_df = pd.DataFrame(
    data=encoded,
    index=df.index,
    columns=ohe.get_feature_names_out(cols),
)

# Replace the original categorical columns with their indicator columns.
remaining_df = df.drop(columns=cols)
df = pd.concat([remaining_df, encoded_df], axis=1)

## Correlation heatmap

#### 1. It is a visual representation of the relationships between the numerical variables in a dataset.
#### 2. It shows the correlation coefficient (r) between each pair of numerical variables.

1. r ranges from -1 to 1.
2. 1 is a perfect positive correlation: as x1 increases, x2 increases.
3. -1 is a perfect negative correlation: as x1 increases, x2 decreases.
4. 0 means there is no linear correlation.


In [None]:
# Keep only the numeric columns and compute their pairwise correlation matrix.
num_cols = df.select_dtypes(include="number")
corr_metrix = num_cols.corr()

# Annotated heatmap of the correlation matrix, values shown with two decimals.
plt.figure(figsize=(15, 8))
heatmap_style = {"annot": True, "fmt": ".2f", "cmap": "coolwarm"}
sns.heatmap(corr_metrix, **heatmap_style)

In [None]:
# Correlation of every numeric column with the target, highest first.
target_corr = num_cols.corr()["Loan_Approved"]
target_corr.sort_values(ascending=False)


# Train-Test Split + Feature Engineering

In [None]:
# Split the frame into the feature matrix X and the target vector y.
# Work on a copy so df itself keeps the target column.
X = df.copy()
y = X.pop("Loan_Approved")

# Alternative considered: also excluding "Credit_Score" and "DTI_Ratio" from X.


In [None]:
# Hold out 20% of the rows for testing. stratify=y keeps the approved / rejected
# ratio the same in both splits, so the test metrics are not skewed by an
# unlucky draw on an imbalanced target. random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Naive Bayes model

In [None]:
# Naive Bayes
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler

# Scaling: the scaler is fitted on the training features only and then reused,
# unchanged, on the test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the Gaussian Naive Bayes classifier and predict on the held-out set.
nb_model = GaussianNB().fit(X_train_scaled, y_train)
y_pred = nb_model.predict(X_test_scaled)

# Evaluation: print each metric for the test predictions.
print("Naive Bayes Model")
for metric_label, metric_fn in (
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
    ("F1 score: ", f1_score),
    ("Accuracy: ", accuracy_score),
    ("CM: ", confusion_matrix),
):
    print(metric_label, metric_fn(y_test, y_pred))