# Loan Prediction

The loan approval dataset is a collection of financial records and associated information used to determine whether individuals or organizations are eligible for a loan from a lending institution. It includes features such as CIBIL score, income, employment status, loan term, loan amount, and asset values, together with the loan status, which is the target to predict. This dataset is commonly used in machine learning and data analysis to develop models that predict the likelihood of loan approval from the given features.

## Importing the libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_palette(palette='viridis')
sns.set_style('whitegrid')
from sklearn.preprocessing import RobustScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Load the raw loan-approval records; the relative path means the CSV must sit
# in the notebook's working directory.
df = pd.read_csv('loan_approval_dataset.csv')
# Preview the first rows.
df.head()

In [None]:
# (rows, columns) of the raw frame.
df.shape

## Data Cleaning

In [None]:
# Count of missing values per column.
df.isna().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Raw column labels as read from the CSV.
df.columns

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Remove 'loan_id' so it is not passed to the models as a feature.
# NOTE(review): presumably a pure row identifier with no predictive value -- confirm.
df = df.drop('loan_id', axis=1)

## Exploratory Data Analysis

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Summary statistics of the frame's columns.
df.describe()

In [None]:
# Overwrite the column labels positionally with the names used in the rest of
# the notebook.
# NOTE(review): this depends on the CSV's column order matching this list
# exactly (after 'loan_id' was dropped); a reordered file would be silently
# mislabeled. Presumably the raw headers only differ by stray whitespace --
# confirm, and if so consider `df.columns.str.strip()` instead.
df.columns = ['no_of_dependents', 'education', 'self_employed',
       'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
       'residential_assets_value', 'commercial_assets_value',
       'luxury_assets_value', 'bank_asset_value', 'loan_status']


In [None]:
# Confirm the new labels.
df.columns

In [None]:
# Frequency of each distinct value in the low-cardinality columns, one cell per
# column so each result is displayed.
df['no_of_dependents'].value_counts()

In [None]:
df['education'].value_counts()

In [None]:
df['self_employed'].value_counts()

In [None]:
df['loan_term'].value_counts()

In [None]:
# Class balance of the prediction target.
df['loan_status'].value_counts()

In [None]:
# Columns drawn as count plots in the analysis below (includes the target).
categorical_cols = [
    'no_of_dependents',
    'education',
    'self_employed',
    'loan_status',
]

# Columns drawn as histograms / box plots in the analysis below.
numerical_cols = [
    'income_annum',
    'loan_amount',
    'loan_term',
    'cibil_score',
    'residential_assets_value',
    'commercial_assets_value',
    'luxury_assets_value',
    'bank_asset_value',
]

### Univariate Analysis

In [None]:
def univariate_categorical(ax, df, col):
  """Draw a count plot of column `col` of `df` on the Matplotlib axes `ax`.

  The axes get the title 'Count of <col>', the column name as x label and
  'count' as y label.
  """
  sns.countplot(data=df, x=col, ax=ax)
  ax.set(title=f'Count of {col}', xlabel=f'{col}', ylabel='count')


# One count plot per categorical column, laid out on a 2x2 grid.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(14, 10))
axes = axes.ravel()

for panel, col in zip(axes, categorical_cols):
  univariate_categorical(panel, df, col)

fig.tight_layout()
plt.show()

In [None]:
def univariate_numerical(ax, df, col):
  """Draw a histogram with a KDE curve for column `col` of `df` on `ax`.

  The axes get the title 'Distribution of <col>', the column name as x label
  and 'Frequency' as y label.
  """
  sns.histplot(data=df, x=col, kde=True, ax=ax)
  ax.set(title=f'Distribution of {col}', xlabel=f'{col}', ylabel='Frequency')

# One histogram per numerical column, laid out on a 4x2 grid.
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(14, 20))
axes = axes.ravel()

for panel, col in zip(axes, numerical_cols):
  univariate_numerical(panel, df, col)

fig.tight_layout()
plt.show()

### Bivariate Analysis

In [None]:
def bivariate_categorical(ax, df, col):
  """Draw the counts of `col`, split by loan status, on the axes `ax`.

  Args:
    ax: Matplotlib axes to draw on.
    df: DataFrame containing `col` and a 'loan_status' column.
    col: Name of the categorical column shown on the x axis.
  """
  sns.countplot(data=df, x=col, hue='loan_status', ax=ax)
  ax.set_title(f'{col} vs Loan Status')
  # The x axis carries the categories of `col` and the y axis the row counts;
  # loan status is encoded by bar colour (hue), not by an axis.
  ax.set_xlabel(f'{col}')
  ax.set_ylabel('count')

# Compare every categorical feature with the target. 'loan_status' itself is
# left out: with loan status as hue it would only plot the target against itself.
feature_cols = [col for col in categorical_cols if col != 'loan_status']

# One panel per remaining column in a single row; squeeze=False keeps `axes`
# an array even when only one column is left.
fig, axes = plt.subplots(1, len(feature_cols), figsize=(7 * len(feature_cols), 5), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, feature_cols):
  bivariate_categorical(ax, df, col)

plt.tight_layout()
plt.show()

In [None]:
def bivariate_numerical(ax, df, col):
  """Draw box plots of column `col` grouped by 'loan_status' on the axes `ax`.

  The axes get the title '<col> vs Loan Status', 'Loan Status' as x label and
  the column name as y label.
  """
  sns.boxplot(data=df, x='loan_status', y=col, ax=ax)
  ax.set(title=f'{col} vs Loan Status', xlabel='Loan Status', ylabel=f'{col}')

# One box plot per numerical column against the target, on a 4x2 grid.
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(14, 20))
axes = axes.ravel()

for panel, col in zip(axes, numerical_cols):
  bivariate_numerical(panel, df, col)

fig.tight_layout()
plt.show()

## Train-Test Split

In [None]:
# Feature matrix: every column except the target.
X = df.drop(columns='loan_status')

# Integer-encode the target. The fitted encoder is kept in `le` so the codes
# can be mapped back to the original labels through `le.classes_`.
le = LabelEncoder().fit(df['loan_status'])
y = le.transform(df['loan_status'])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Preprocessing

Scale the numerical features and one-hot encode the categorical features.

In [None]:
# Feature columns available to the models (the target has been removed).
X_train.columns

In [None]:
# Numerical column list used for the plots, for reference when choosing the
# features to scale below.
print(numerical_cols)

In [None]:
# Columns passed to the scaler. 'no_of_dependents' is treated as numeric here,
# although it was plotted as a categorical column above.
num_features = [
    'no_of_dependents',
    'income_annum',
    'loan_amount',
    'loan_term',
    'cibil_score',
    'residential_assets_value',
    'commercial_assets_value',
    'luxury_assets_value',
    'bank_asset_value',
]

# Columns passed to the one-hot encoder.
cat_features = [
    'education',
    'self_employed',
]

In [None]:
# Column-wise preprocessing shared by the models: robust scaling for the
# numeric features and one-hot encoding for the categorical ones.
preprocessor = ColumnTransformer([
    ('Scaling', RobustScaler(), num_features),
    ('Encoding', OneHotEncoder(), cat_features),
])

## Training and Evaluation

In [None]:
# Random-forest pipeline: preprocessing followed by the classifier, so the
# same transformations are applied at fit and predict time.
rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # Seed the forest so the reported metrics and the saved model are
    # reproducible, in line with the seeded train/test split.
    ('RandomForest', RandomForestClassifier(max_depth=None, n_estimators=100, random_state=42))
])

# Fit on the training split and predict the held-out rows for evaluation.
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

In [None]:
from sklearn.base import clone

# Logistic-regression pipeline. It gets its own unfitted copy of the
# preprocessor: Pipeline fits its steps in place, so sharing the instance with
# `rf` would make `lr.fit` refit the transformer inside the random-forest
# pipeline as well.
lr = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('LogisticRegression', LogisticRegression())
])

# Fit on the training split and predict the held-out rows for evaluation.
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

In [None]:
# Class names in encoder order: integer code i corresponds to le.classes_[i].
# Deriving the labels from the encoder keeps both heatmap axes in the same
# order as the rows/columns of the confusion matrix.
class_names = [str(name).strip() for name in le.classes_]
class_ids = list(range(len(class_names)))

print("Classification report of Random Forest: \n", classification_report(y_test, y_pred_rf, labels=class_ids, target_names=class_names))
print("Confusion matrix of Random Forest: \n")
# Rows are the true classes, columns the predicted classes.
cm_rf = confusion_matrix(y_test, y_pred_rf, labels=class_ids)
plt.figure(figsize=(7,5))
heatmap_ax = sns.heatmap(cm_rf, annot=True, fmt='d', yticklabels=class_names, xticklabels=class_names)
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
plt.show()

In [None]:
# Class names in encoder order: integer code i corresponds to le.classes_[i].
# Deriving the labels from the encoder keeps both heatmap axes in the same
# order as the rows/columns of the confusion matrix.
class_names = [str(name).strip() for name in le.classes_]
class_ids = list(range(len(class_names)))

print("Classification report of Logistic Regression: \n", classification_report(y_test, y_pred_lr, labels=class_ids, target_names=class_names))
print("Confusion matrix of Logistic Regression: \n")
# Rows are the true classes, columns the predicted classes.
cm_lr = confusion_matrix(y_test, y_pred_lr, labels=class_ids)
plt.figure(figsize=(7,5))
heatmap_ax = sns.heatmap(cm_lr, annot=True, fmt='d', yticklabels=class_names, xticklabels=class_names)
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
plt.show()

In [None]:
import joblib

# Persist the fitted random-forest pipeline (preprocessing + model) to disk.
filename = "rf_pipeline.pkl"
# The context manager closes the file handle once the dump has finished.
with open(filename, "wb") as model_file:
    joblib.dump(rf, model_file)

In [None]:
import sklearn
# Print the installed scikit-learn version.
# NOTE(review): presumably recorded because the pickled pipeline should be
# loaded with the same scikit-learn version -- confirm.
print(sklearn.__version__)

In [None]:
# Original target labels in encoded order (position i is integer code i).
print(le.classes_)