# Loan Prediction
# - [Import Library](#Import-Library)
# - [Load dataset](#Load-dataset)
# - [Rename columns](#Rename-columns)
# - [Display Row and Column](#Display-Row-and-Column)
# - [Display Column_name](#Display-Column_name)
# - [Display Null Columns](#Display-Null-Columns)
# - [Remove Nan value](#Remove-Nan-value)
# - [Delete Duplicates](#Delete-Duplicates)
# - [Encode Categorical Data](#Encode-Categorical-Data)
# - [Handle Outlier](#Handle-Outlier)
# - [Binning](#Binning)
# - [Normalization](#Normalization)
# - [Visualisation](#Visualisation)
# - [Line Plot](#Line-Plot)
# - [Bar Plot](#Bar-Plot)
# - [Histogram](#Histogram)
# - [Box Plot](#Box-Plot)
# - [Area Plot](#Area-Plot)
# - [Scatter Plot](#Scatter-Plot)
# - [Hexagonal Bin Plot](#Hexagonal-Bin-Plot)
# - [Pie Chart](#Pie-Chart)
# - [HeatMap](#HeatMap)
# 

# Import Library


In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,ConfusionMatrixDisplay

# Load dataset

In [None]:
# Read the loan-application records from the CSV file (path is relative to the
# working directory)
loan_data=pd.read_csv("Loan Application Accept or Reject.csv")
# Preview the first rows
loan_data.head()

In [None]:
loan_data.tail()

# Rename columns

In [None]:
# Replace the CSV headers with clean snake_case names. The assignment is
# positional, so this list must follow the column order of the file.
renamed_columns = [
    'loan_id',
    'no_of_dependents',
    'education',
    'employed',
    'income_annum',
    'loan_amount',
    'loan_term',
    'cibil_score',
    'residential_assets_value',
    'commercial_assets_value',
    'luxury_assets_value',
    'bank_asset_value',
    'loan_status',
]
loan_data.columns = renamed_columns

In [None]:
loan_data.columns

In [None]:
loan_data['loan_status'].unique()

# Display Row and Column

In [None]:
# Report the dataset dimensions as a tuple and then one line per axis
row, column = loan_data.shape
print((row, column))
for axis_name, axis_len in (("rows", row), ("column", column)):
    print(f"total number of {axis_name}:", axis_len)

 # Display Column_name

In [None]:
# Show the column labels, then each column's dtype, separated by divider lines
print(loan_data.columns)
print("---------------XXXX------------")
print(loan_data.dtypes)
print("---------------XXXX------------")
# Total number of cells (rows * columns); shown as the cell output
loan_data.size

# Display Null Columns

In [None]:
loan_data.isnull().sum()

In [None]:
rows_with_nulls = loan_data[loan_data.isnull().any(axis=1)]
print(rows_with_nulls)

In [None]:
# List every column that contains missing values together with its NaN count
null_counts = loan_data.isnull().sum()
nan_column = loan_data.columns[null_counts > 0]
for col_name in nan_column:
    print("Columns name: ", col_name, ":", null_counts[col_name])

# Remove Nan value

In [None]:
loan_data.isnull().sum()

In [None]:
# Visualizing the missing data before removing null value
sns.heatmap(loan_data.isnull())

In [None]:
loan_data.tail(5)

In [None]:
# Append a synthetic record at the next integer label. It has NaN in
# income_annum, luxury_assets_value and bank_asset_value so that the
# missing-value handling below has something to work on.
loan_data.loc[len(loan_data)]=[4270,2,'Not Graduate', 'Yes',np.nan,10000,10,300,100000.0,100000.0,np.nan,np.nan,'Approved']

In [None]:
# Append a second synthetic record that is additionally missing its education
# and loan_status values
loan_data.loc[len(loan_data)]=[4271,2,np.nan, 'Yes',np.nan,10000,10,300,100000.0,100000.0,np.nan,np.nan,np.nan]

In [None]:
# Append a third synthetic record in which every column is NaN
loan_data.loc[len(loan_data)]=[np.nan]*len(loan_data.columns)

In [None]:
loan_data.tail(1)

In [None]:
# Drop only the rows in which EVERY column is NaN (how='all'); rows that are
# merely partially missing are kept. The frame is modified in place.
loan_data.dropna(how='all',inplace=True)

In [None]:
# Cast loan_id back to int: appending a row containing NaN turned the column
# into float, because NaN is a float value.
loan_data['loan_id']=loan_data['loan_id'].astype(int)

In [None]:
# Preview the last rows to confirm that the all-NaN row is gone (nothing is
# deleted by this cell)
loan_data.tail()

In [None]:
# Impute missing 'education' values with the mode (most frequent category).
# The column holds strings, so mean/median do not apply; those are used for
# the numeric columns further down.
# The result is assigned back rather than calling fillna(inplace=True) on the
# column selection: that chained in-place form is deprecated in pandas 2.x and
# stops updating loan_data once Copy-on-Write is enabled.
loan_data['education'] = loan_data['education'].fillna(loan_data['education'].mode()[0])

In [None]:
# Remove the row with index label 4269 and renumber the index from 0.
# NOTE(review): this drops a single row, presumably the first synthetic record
# appended above (confirm the original row count). The other synthetic record
# still holds NaNs and is only removed by the later dropna() call.
loan_data=loan_data.drop(4269).reset_index(drop=True)
loan_data.tail()

In [None]:
# Re-check which columns still contain missing values, with their NaN counts
remaining_nulls = loan_data.isnull().sum()
nan_column = loan_data.columns[remaining_nulls > 0]
for col_name in nan_column:
    print("Columns name: ", col_name, ":", remaining_nulls[col_name])

In [None]:
# Impute missing annual income with the column mean.
# Assign the result back: fillna(inplace=True) on a column selection is a
# chained in-place update, which is deprecated in pandas 2.x and has no effect
# on loan_data once Copy-on-Write is enabled.
loan_data['income_annum'] = loan_data['income_annum'].fillna(loan_data['income_annum'].mean())

In [None]:
# Impute missing residential asset values with the column median.
# Assign the result back: fillna(inplace=True) on a column selection is a
# chained in-place update, which is deprecated in pandas 2.x and has no effect
# on loan_data once Copy-on-Write is enabled.
loan_data['residential_assets_value'] = loan_data['residential_assets_value'].fillna(
    loan_data['residential_assets_value'].median()
)

In [None]:
# Drop every row that still has at least one NaN after the imputations above.
# The frame is modified in place and the index is not renumbered.
loan_data.dropna(inplace=True)

In [None]:
# Visualizing the missing data after removing null value
sns.heatmap(loan_data.isnull())

# Delete Duplicates

In [None]:
# Display the rows that are exact duplicates of an earlier row (nothing is
# removed here)
loan_data[loan_data.duplicated()]

In [None]:
loan_data.duplicated().sum()

In [None]:
loan_data.shape

In [None]:
# drop_duplicates() returns a new DataFrame and leaves the original untouched,
# so the result has to be assigned back for the duplicates to really be removed.
loan_data = loan_data.drop_duplicates()
# Show the de-duplicated frame as the cell output
loan_data

# Encode Categorical Data

In [None]:
loan_df=loan_data.copy()

In [None]:
loan_df.head()

In [None]:
# Print the distinct values of each categorical column before encoding
for categorical_col in ('education', 'employed', 'loan_status'):
    print(loan_data[categorical_col].unique())

In [None]:
# Label-encode the three categorical columns, keeping one fitted encoder per
# column so the integer codes can be mapped back to labels later.
# LabelEncoder numbers the classes in sorted label order (see classes_), so
# the code of a label follows from sorting, not from its meaning.
education_encoder = LabelEncoder()
employed_encoder = LabelEncoder()
loan_encoder = LabelEncoder()
column_encoders = (
    ('education', education_encoder),
    ('employed', employed_encoder),
    ('loan_status', loan_encoder),
)
for column_name, column_encoder in column_encoders:
    loan_df[column_name] = column_encoder.fit_transform(loan_df[column_name])

In [None]:
loan_df.head()

 # Handle Outlier

In [None]:
# Per-column outlier report using the IQR rule: a value is flagged when it
# lies more than 1.5*IQR below the first quartile or above the third quartile.
# select_dtypes picks every numeric column of loan_df.
numeric_cols=loan_df.select_dtypes(include=['number']).columns
for col in numeric_cols:
    q1=loan_df[col].quantile(0.25)
    q3=loan_df[col].quantile(0.75)
    IQR=q3-q1
    print("IQR value : ",IQR)
    lower_bound=q1-1.5*IQR
    upper_bound=q3+1.5*IQR
    is_outlier=(loan_df[col]<lower_bound) | (loan_df[col]>upper_bound)
    outliers=loan_df[is_outlier]
    # Count the flagged rows from the mask itself. Testing the selected rows
    # with any(axis=1) would count rows holding any truthy cell, which says
    # nothing about outliers.
    print("Total outlier:",int(is_outlier.sum()))
    print(f"outlier in {col}: ",outliers.shape,"\n")

In [None]:
# Per-column outlier report using z-scores: a value is flagged when it lies
# more than 3 standard deviations from the column mean.
numeric_cols=loan_df.select_dtypes(include=['number']).columns
for col in numeric_cols:
    # Absolute z-score of every value in the column (taken once)
    z=np.abs(stats.zscore(loan_df[col]))
    is_outlier=z>3
    outliers=loan_df[is_outlier]
    # Count the flagged rows from the mask itself rather than from the
    # truthiness of the selected rows' cells.
    print("Total outlier:",int(is_outlier.sum()))
    print(f"Outlier using z-szore: {col}:",outliers.shape)

In [None]:
# Show the outliers of every numeric column as a grid of boxplots
numeric_cols=loan_df.select_dtypes(include=['number']).columns
# Number of plots per grid row
plots_per_row=4
# Ceiling division for the number of grid rows; at least one row so that
# plt.subplots is never asked for an empty grid
num_rows=max(1,(len(numeric_cols)+plots_per_row-1)//plots_per_row)
# squeeze=False always returns a 2-D array of Axes, whatever the grid size, so
# it can be flattened unconditionally. (Branching on the number of columns is
# wrong: a 1 x 4 grid still yields four Axes even for a single column.)
fig,axes=plt.subplots(num_rows,plots_per_row,figsize=(5*plots_per_row,4*num_rows),squeeze=False)
axes=axes.flatten()
# One boxplot per numeric column; zip stops at the shorter sequence
for ax,col in zip(axes,numeric_cols):
    sns.boxplot(y=loan_df[col],ax=ax)
    ax.set_title(f'Boxplot of {col} ')
# Remove the grid cells that were not used
for unused_ax in axes[len(numeric_cols):]:
    fig.delaxes(unused_ax)
plt.tight_layout()
plt.show()

In [None]:
# Flag outlier rows across all numeric columns at once using quartile fences.
# NOTE: the fence multiplier here is 1.2, tighter than the 1.5 used in the
# per-column IQR report. Every numeric column of loan_df takes part, including
# loan_id and the label-encoded columns.
df = loan_df.copy()
numeric_cols = df.select_dtypes(include=['number']).columns
numeric_part = df[numeric_cols]
q1 = numeric_part.quantile(0.25)
q3 = numeric_part.quantile(0.75)
IQR = q3 - q1
lower_fence = q1 - 1.2 * IQR
upper_fence = q3 + 1.2 * IQR
# Boolean frame: True where a cell lies outside its column's fences
outliers = numeric_part.lt(lower_fence) | numeric_part.gt(upper_fence)
# A row is an outlier row when at least one of its cells is flagged
outlier_row = outliers.any(axis=1)
print(f"number of outliers detected:{outlier_row.sum()}")

In [None]:
# Keep only the rows not flagged as outliers and renumber the index from 0
df_cleaned = df.loc[~outlier_row].reset_index(drop=True)
for stage, frame in (("original", df), ("cleaned", df_cleaned)):
    print(f"{stage} dataset shape: {frame.shape}")

In [None]:
# For each numeric feature draw two boxplots side by side: the data before
# outlier removal (df) and after it (df_cleaned)
for feature in numeric_cols:
    fig, axes = plt.subplots(ncols=2, figsize=(12, 4))
    panels = ((df, "Before cleaning:"), (df_cleaned, "After cleaning: "))
    for panel_ax, (frame, title_prefix) in zip(axes, panels):
        sns.boxplot(x=frame[feature], ax=panel_ax)
        panel_ax.set_title(f"{title_prefix}{feature}")
    plt.tight_layout()
    plt.show()

In [None]:
df_cleaned.head()

# Binning



 Binning data is a common technique in data analysis where you group continuous data into categorical intervals, or bins, to gain insights into the distribution or trends within the data.

In [None]:
# Bin cibil_score into three ordered categories: Poor / Average / Good.
# pd.cut intervals are right-closed, so without include_lowest=True a score of
# exactly 300 (the lowest edge) belongs to no bin and becomes NaN.
# The top edge is stretched to the largest observed score (never below 850) so
# that scores above 850 land in 'Good' instead of becoming NaN.
cibil_top_edge = max(850, df_cleaned['cibil_score'].max())
df_cleaned['cibil_score_bin'] = pd.cut(
    df_cleaned['cibil_score'],
    bins=[300, 600, 700, cibil_top_edge],
    labels=['Poor', 'Average', 'Good'],
    include_lowest=True,
)

In [None]:
# Derive three more categorical bin columns with pd.cut

# loan_amount: five equal-width intervals, kept as unlabeled interval categories
df_cleaned['loan_amount_bin'] = pd.cut(df_cleaned['loan_amount'], bins=5)

# income_annum: three equal-width salary bands
salary_bands = ['low Salary', 'Medium Salary', 'High Salary']
df_cleaned['income_bin'] = pd.cut(
    df_cleaned['income_annum'], bins=len(salary_bands), labels=salary_bands
)

# loan_term: explicit edges with one label per interval
term_edges = [0, 5, 10, 15, 20, 30]
term_labels = ['<5', '5-10', '10-15', '15-20', '20-30']
df_cleaned['loan_term_bin'] = pd.cut(
    df_cleaned['loan_term'], bins=term_edges, labels=term_labels
)

In [None]:
df_cleaned['loan_amount_bin'].head()

In [None]:

df_cleaned['income_annum'].min()

# Normalization



Data normalization is the process of scaling numeric features to a standard range, preventing large values from dominating the learning process in machine learning models<br>
The MinMaxScaler() function scales each feature to a given range, typically [0, 1]

In [None]:
# Min-max scale the continuous features, overwriting them in df_cleaned.
# The fitted scaler is kept so new data can be put through the same transform.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so the test rows influence the scaling; consider fitting on the training
# split only.
scaler = MinMaxScaler()
columns_to_scale = [
    'loan_amount',
    'loan_term',
    'cibil_score',
    'residential_assets_value',
    'commercial_assets_value',
    'luxury_assets_value',
    'bank_asset_value',
    'income_annum',
]
df_cleaned[columns_to_scale] = scaler.fit_transform(df_cleaned[columns_to_scale])

# Visualisation

# Line Plot

In [None]:
# Line plot of loan_term with the applicants ordered by loan term
df_sorted=df_cleaned.sort_values('loan_term')
# Plot the bare values: when given a pandas Series, matplotlib uses its index
# as the x axis, and after sort_values that index is in shuffled order, which
# would draw a zigzag instead of a sorted trend.
plt.plot(df_sorted['loan_term'].to_numpy())
plt.title("loan Term trend")
plt.xlabel("Applicants")
plt.ylabel("Loan Term")
plt.show()

Inference:<br>
The plot shows all loan applicants sorted by loan term. It indicates that more people are interested in taking loans for a term of about 4 to 6 years.

# Bar Plot

In [None]:
# Count of applications per education level, split by loan status
ax=sns.countplot(x='education',hue='loan_status',data=df_cleaned)
# Label each bar with its count. Iterating the bar containers (one per hue
# level) labels only the real bars; walking ax.patches can also pick up
# zero-size helper patches, and formatting the raw height prints floats
# such as "123.0".
for bars in ax.containers:
    ax.bar_label(bars,fmt='%d')
plt.title('Loan Approval by education')
plt.show()

Inference:-<br>
According to bank policies, graduates have a higher chance of having their loan approved by the bank.

In [None]:
# Number of applications per loan_status value.
# NOTE(review): loan_status is label-encoded at this point, so the x axis
# shows integer codes; check the encoder's classes_ to see which code means
# approved.
sns.countplot(x='loan_status',data=df_cleaned)
plt.title("loan Approval Distribution")
plt.xlabel("Approval")
plt.ylabel("count")
plt.show()

Inference:- <br>
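The majority of applications are rejected, with a smaller number approved. This shows that the bank's rejection rate is higher than its approval rate. (Note: `loan_status` is label-encoded in this plot, so confirm which code denotes approval before relying on this reading.)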

# Histogram

In [None]:
# Histogram of annual income in 40 bins.
# NOTE: this reads df (the frame from before outlier removal and scaling),
# not df_cleaned, so the x axis is in the original income units.
df['income_annum'].hist(bins=40)
plt.title("Distribution of Annual income")
plt.xlabel("Income")
plt.ylabel("Frequency")
plt.show()

Inference:-<br>
The distribution of annual income is right-skewed, meaning most people earn less, with a few high earners. This might indicate income inequality within the applicant pool.

 Skewness is a key statistical measure that shows how data is spread out in a dataset. It tells us if the data points are skewed to the left (negative skew) or to the right (positive skew) in relation to the mean. It is important because it helps us to understand the shape of the data distribution which is important for accurate data analysis and helps in identifying outliers and finding the best statistical methods to use for analysis.
1. Positive Skewness (Right Skew)
In a positively skewed distribution, the right tail is longer than the left which means most data points are on the left with a few large values pulling the distribution to the right.
Relationship:
Mean > Median > Mode
2. Negative Skewness (Left Skew)
In a negatively skewed distribution, the left tail is longer which means most data points are on the right with a few smaller values pulling the distribution to the left.
Relationship:
Mean < Median < Mode
3. Zero Skewness (Symmetrical Distribution)
Zero skewness shows a perfectly symmetrical distribution where the mean, median and mode are equal. In a symmetrical distribution, the data points are evenly distributed around the central point.
Relationship:
Mean = Median = Mode


# Box Plot

In [None]:
# Distribution of loan_amount for each loan_status value
sns.boxplot(x='loan_status',y='loan_amount',data=df_cleaned)
plt.title("loan Amount by Approval")
plt.xlabel("Approval")
# The y axis shows the min-max scaled loan amount, not the approval
plt.ylabel("Loan Amount (normalized)")
plt.show()

Inference:-<br>
The box plot shows that the median loan amount is higher for approved loans. There are a few outliers with very large amounts, which might correspond to special cases or high-risk applications.

# Area Plot

In [None]:
# Stacked area plot of income and loan amount, applicants ordered by income.
# The index is renumbered after sorting: DataFrame.plot uses a numeric index
# as the x axis, so the shuffled index left by sort_values would scramble the
# areas instead of showing them in income order.
income_vs_loan=df_cleaned[['income_annum','loan_amount']].sort_values(by='income_annum').reset_index(drop=True)
income_vs_loan.plot.area()
plt.title("Area plot of Annual income and loan Amount")
plt.xlabel("Index")
plt.ylabel("values")
plt.show()

 Inference:- <br>
This area plot illustrates the cumulative distribution of annual income and loan amount. We observe a clear upward trend: higher income typically comes alongside larger loans.

# Scatter Plot

In [None]:
# Scatter of annual income against loan amount, coloured by loan_status.
# Both axes use the min-max scaled values stored in df_cleaned, and the
# legend shows the label-encoded status codes.
sns.scatterplot(x='income_annum',y='loan_amount',hue='loan_status',data=df_cleaned)
plt.title("Scatter plot of Annual income vs loan Amount by Approval")
plt.xlabel("Annual Income")
plt.ylabel("Loan Amount")
plt.legend(title='Approval')
plt.show()

Inference:-<br>
This scatter plot shows a positive correlation between annual income and loan amount. Approved applications are predominantly in the higher-income, higher-amount quadrant, while non-approved ones are more dispersed at lower values.


# Hexagonal Bin Plot

In [None]:
# Hexagonal-bin density plot of annual income against loan amount:
# darker cells contain more applicants (gridsize=25 hexagons along x)
df_cleaned.plot.hexbin(x='income_annum',y='loan_amount',gridsize=25,cmap='Blues')
plt.title("Hexbin plot of annual income vs loan amount")
plt.xlabel("Annual income")
plt.ylabel("Loan Amount")
plt.show()

Inference:-<br>
This hexbin plot highlights the concentration of data points. The densest clusters are at lower income and lower loan amount, which resonates with our observation from the scatter plot.

# Pie Chart

In [None]:
# Pie chart of the share of applicants in each income_bin category.
# explode needs one entry per slice (three income bins); the first slice
# drawn is pulled out of the pie.
explode=[0.1,0,0]
df_cleaned['income_bin'].value_counts().plot.pie(autopct='%1.1f%%',counterclock=False,startangle=50,explode=explode)
plt.title("Annual income distribution")
plt.show()

inference:-<br>
The pie chart shows annual income divided into three categories: low, medium and high salary. Most people earn a medium or low salary; fewer people lie in the high salary range.

# HeatMap

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns
# (numeric_only=True leaves out the non-numeric bin columns)
plt.figure(figsize=(10,8))
sns.heatmap(df_cleaned.corr(numeric_only=True),annot=True,cmap=sns.cubehelix_palette(as_cmap=True))
plt.title("Correlation heatmap")
plt.show()

Inference:-<br>
According to the correlation graph, loan status is significantly influenced by factors such as education level, employment status, and residential asset value. Individuals with higher annual income also tend to own more residential, commercial and luxury assets. Additionally, the loan amount is strongly correlated with annual income, indicating that higher earners are eligible for larger loans.

In [None]:
# Label-encode the four derived bin columns, one encoder per column.
# NOTE(review): these *_bin columns are dropped again before training, so the
# encoded values are not used by the model.
# income_bin
income_encoder=LabelEncoder()
df_cleaned['income_bin']=income_encoder.fit_transform(df_cleaned['income_bin'])
# cibil_score_bin
cibil_encoder=LabelEncoder()
df_cleaned['cibil_score_bin']=cibil_encoder.fit_transform(df_cleaned['cibil_score_bin'])
# loan_amount_bin
loan_amount_encoder=LabelEncoder()
df_cleaned['loan_amount_bin']=loan_amount_encoder.fit_transform(df_cleaned['loan_amount_bin'])
# loan_term_bin
loan_term_encoder=LabelEncoder()
df_cleaned['loan_term_bin']=loan_term_encoder.fit_transform(df_cleaned['loan_term_bin'])

In [None]:
df_cleaned.head()

In [None]:
# Drop the identifier and the four derived bin columns so that only the
# original features and the target remain for modelling
df_cleaned=df_cleaned.drop(['loan_id','cibil_score_bin','loan_amount_bin','income_bin','loan_term_bin'],axis=1)
df_cleaned.head()

In [None]:
# Separate the feature matrix (every column except loan_status) from the
# target vector (the label-encoded loan_status)
x=df_cleaned.drop(columns=['loan_status'])
y=df_cleaned['loan_status']

In [None]:

import joblib

In [None]:
# Hold out 20% of the rows as a test set (random_state fixed so the split is
# reproducible), then fit a logistic-regression classifier with default
# settings on the remaining 80%
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)
model=LogisticRegression()
model.fit(x_train,y_train)

In [None]:
# Persist the fitted model, then predict on the held-out test split.
# The path lives in one variable so the message always names the file that
# was really written; the file name on disk is kept as it was.
# NOTE(review): "load_model.pkl" looks like a typo for "loan_model.pkl";
# rename it here once every consumer of the file has been checked.
# NOTE(review): only the model is saved. The fitted scaler and label encoders
# are needed to prepare new inputs and are not persisted.
model_path="load_model.pkl"
joblib.dump(model,model_path)
print(f"model saved as {model_path}")
y_pred=model.predict(x_test)
y_pred.shape

In [None]:
# Build the confusion matrix (rows = actual class, columns = predicted class)
cm=confusion_matrix(y_test,y_pred)

# Plot it; the tick labels are the model's class codes (model.classes_),
# i.e. the label-encoded loan_status values
disp=ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=model.classes_)
disp.plot(cmap='Blues')
plt.title("Confusion Matrix")
plt.show()
# Also print the raw counts
print("Confusion Matrix",cm)

Inference:<br>
total true negative is 467<br>
total false positive is 39<br>
total false negative is 16 <br>
total true positive is 269<br>

Confusion Matrix:
A confusion matrix is a table used to evaluate the performance of a classification model, especially in binary and multi-class classification. <br>
                 predicted<br>
                  0      1 <br>
 Actual value  0  TN     FP<br>
               1  FN     TP <br>
 TP (True Positive): model predicted 1 (positive) and actual is also 1<br>
 TN (True Negative): model predicted 0 (negative) and actual is also 0<br>
 FP (False Positive): model predicted 1 but actual is 0 (Type I error)<br>
 FN (False Negative): model predicted 0 but actual is 1 (Type II error)<br>

In [None]:
# Report the accuracy (raw fraction and percentage) and the per-class
# precision / recall / F1 table for the test split
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
print("Accuracy Score:", test_accuracy)
print(f"Accuracy: {test_accuracy*100:.2f}%")
print("Classification Report:\n", report_text)

Classification Report:<br>
A classification report summarizes the performance of a classification model using key metrics for each class. <br>
Precision: out of all predicted positives, how many were actually correct (TP/(TP+FP))<br>
Recall: out of all actual positives, how many did the model correctly identify? (TP/(TP+FN)) <br>
 F1-score: balance between precision and recall (2*(precision * recall)/(precision+recall)) <br>
Support: number of actual instances of each class in the test set <br>
Accuracy: (TP+TN)/Total, i.e. overall how many predictions were correct<br>
Macro Avg: average of precision, recall and f1-score across all classes, weighted equally. <br>
Weighted Avg: average of precision, recall and f1-score weighted by support (sample count)

In [None]:
# Plot the actual and the predicted class code for every test sample.
# The code -> label text on the y axis is built from the fitted encoder:
# LabelEncoder numbers classes in sorted label order, so a hand-written
# "0=Reject,1=Approved" does not match the encoded data.
status_codes=", ".join(f"{code}={str(label).strip()}" for code,label in enumerate(loan_encoder.classes_))
sample_index=np.arange(len(y_test))
plt.figure(figsize=(8,4))
plt.plot(sample_index,y_test.values,marker='o',label="Actual")
plt.plot(sample_index,y_pred,marker='X',label="Predicted",linestyle='dashed')
plt.title("Actual vs predicted loan status")
plt.xlabel("Sample Index")
plt.ylabel(f"loan Status ({status_codes})")
plt.legend()
plt.grid()
plt.show()

In [None]:
# For User:
def predict_loan_status(user_input):
    """Predict whether a single loan application is approved.

    Args:
        user_input: Mapping of feature name to raw (unscaled) numeric value.
            It must contain every feature the model was trained on;
            'education' and 'employed' are the integer codes produced by
            their label encoders. Extra keys are ignored.

    Returns:
        "Loan Approved" or "Loan Rejected".

    Raises:
        KeyError: If a feature the model was trained on is missing.
    """
    # One-row frame with the columns in training order. Cast to float so the
    # scaled values can be written back without a dtype clash.
    user_df=pd.DataFrame([user_input])[list(model.feature_names_in_)].astype(float)
    # The model was fitted on min-max scaled features, so raw input must go
    # through the same fitted scaler (same columns, same order) first.
    scaled_cols=list(scaler.feature_names_in_)
    user_df[scaled_cols]=scaler.transform(user_df[scaled_cols])
    prediction=model.predict(user_df)[0]
    # Decode the class code with the fitted encoder instead of assuming that
    # 1 means approved: LabelEncoder numbers classes in sorted label order.
    status=str(loan_encoder.inverse_transform([prediction])[0]).strip().lower()
    return "Loan Approved" if status=="approved" else "Loan Rejected"

In [None]:
df_cleaned.columns

# for User input

In [None]:
# Read one applicant's raw feature values from the console and print the
# predicted loan decision.
def _code_hint(encoder):
    """Return a 'code=label' summary of a fitted LabelEncoder's classes."""
    return ", ".join(f"{code}={str(label).strip()}" for code,label in enumerate(encoder.classes_))


user_input={
    'no_of_dependents':int(input("Enter your number of dependents: ")),
    # The valid codes are shown straight from the fitted encoders, so the
    # prompt cannot disagree with how the training data was encoded
    # (LabelEncoder numbers classes in sorted label order).
    'education':int(input(f"Enter your education status ({_code_hint(education_encoder)}): ")),
    'employed':int(input(f"Enter your employed status ({_code_hint(employed_encoder)}): ")),
    'income_annum':int(input("Enter your per annum income: ")),
    'loan_amount':int(input("Enter your loan amount: ")),
    'loan_term':int(input("Enter your loan term: ")),
    # CIBIL scores range from 300 to 900
    'cibil_score':int(input("Enter your cibil score(300-900): ")),
    'residential_assets_value':int(input("Enter your Residential assets: ")),
    'commercial_assets_value':int(input("Enter your commercial assets value: ")),
    'luxury_assets_value':int(input("Enter your luxury assets value: ")),
    'bank_asset_value':int(input("Enter your bank assets value: ")),
}
print(predict_loan_status(user_input))