# Loan Approval Prediction

In [57]:
import pandas as pd
import numpy as np

In [58]:
df=pd.read_csv("dataset/Training Dataset.csv")
df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [59]:
df['Loan_Status'].value_counts()

Y    422
N    192
Name: Loan_Status, dtype: int64

In [60]:
df.describe()


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


### Checking Missing Value

In [61]:
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [62]:
df.LoanAmount=df.LoanAmount.fillna(df.LoanAmount.mean())

df.Credit_History=df.Credit_History.fillna(df.Credit_History.mean())

df.Loan_Amount_Term=df.Loan_Amount_Term.fillna(df.Loan_Amount_Term.mean())

df['Gender'].fillna(df['Gender'].value_counts().idxmax(), inplace=True)

df['Married'].fillna(df['Married'].value_counts().idxmax(), inplace=True)

df.Dependents.fillna(df.Dependents.value_counts().idxmax(), inplace=True)

df.Self_Employed.fillna(df.Self_Employed.value_counts().idxmax(), inplace=True)

In [63]:
df.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

### Checking duploicate enteries

In [64]:
df.duplicated().sum()

0

## Removing unnecessary columns

In [65]:
df.drop(['Loan_ID'],axis=1, inplace=True)

In [66]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,146.412162,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


## Outlier

In [67]:
def outlier_thresholds(df, col_name, q1=0.25, q3=0.75):
    quartile1 = df[col_name].quantile(q1)
    quartile3 = df[col_name].quantile(q3)
    interquantile_range = quartile3 - quartile1
    up_limit = quartile3 + 1.5 * interquantile_range
    low_limit = quartile1 - 1.5 * interquantile_range
    return low_limit, up_limit

numerical_cols = df.select_dtypes(include=[float, int]).columns.tolist()

def check_outlier(dataframe, col_name):
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    if dataframe[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)].any(axis=None):
        return True
    else:
        return False
    
cols_with_outliers = []
for i in numerical_cols:
    if check_outlier(df, i):
        cols_with_outliers.append(i)
print(cols_with_outliers)

['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']


### Now we will assign outliers with thresholds

In [68]:
def replace_with_thresholds(dataframe, variable):
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit
    dataframe.loc[(dataframe[variable] > up_limit), variable] = up_limit

for i in cols_with_outliers:
    replace_with_thresholds(df, i)

### Now lets convert categorical data to numerical data

In [69]:
from sklearn.preprocessing import OneHotEncoder

categorical_columns = df.select_dtypes(include=[object, 'category']).columns.tolist()
encoder = OneHotEncoder(sparse=False, drop='first') 
encoded_array = encoder.fit_transform(df[categorical_columns])
encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out(categorical_columns))
dataset = pd.concat([df.drop(columns=categorical_columns), encoded_df], axis=1)
dataset



Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_Yes,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_Y
0,5849.0,0.0,146.412162,360.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,4583.0,1508.0,128.000000,360.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3000.0,0.0,66.000000,360.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
3,2583.0,2358.0,120.000000,360.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
4,6000.0,0.0,141.000000,360.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,2900.0,0.0,71.000000,360.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
610,4106.0,0.0,40.000000,360.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
611,8072.0,240.0,253.000000,360.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
612,7583.0,0.0,187.000000,360.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0


In [70]:
df = dataset

## Now lets deal with imbalanced data

In [71]:
df['Loan_Status_Y'].value_counts()

1.0    422
0.0    192
Name: Loan_Status_Y, dtype: int64

In [72]:
!pip install imblearn



In [73]:
from imblearn.over_sampling import SMOTE, RandomOverSampler

In [74]:
# Initialize SMOTE
smote = SMOTE(random_state=1)

# Apply SMOTE to the training data
X_resampled_smote, y_resampled_smote = smote.fit_resample(df[df.columns.to_list()[:-1]], df['Loan_Status_Y'])

# Initialize RandomOverSampler
ros = RandomOverSampler(random_state=1)

# Apply Random Over-Sampling to the SMOTE-resampled data
X_resampled_final, y_resampled_final = ros.fit_resample(X_resampled_smote, y_resampled_smote)

X, y = X_resampled_final, y_resampled_final

In [75]:
y.value_counts()

1.0    422
0.0    422
Name: Loan_Status_Y, dtype: int64

## Train Test Split

In [76]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((675, 14), (169, 14), (675,), (169,))

## Standardize our data

In [77]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train_scaled.shape, X_test_scaled.shape

((675, 14), (169, 14))

## Apply Principal Component Analysis

In [78]:
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(X_train_scaled)
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance_ratio = np.cumsum(explained_variance_ratio)
cumulative_variance_ratio
        

array([0.16067077, 0.29461593, 0.42084613, 0.52058962, 0.61942424,
       0.71076131, 0.78284094, 0.84530353, 0.9028868 , 0.94305039,
       0.98068893, 1.        , 1.        , 1.        ])

- We can see that only first 12 compenents captures almost 99% of variance of dataset so we will take first 12 principal component

In [79]:
pca = PCA(n_components=12)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
X_train_pca.shape, X_test_pca.shape, y_train.shape, y_test.shape

((675, 12), (169, 12), (675,), (169,))

## Train our model using Random Forest Classifier

In [82]:

from sklearn.tree import DecisionTreeClassifier  
classifier = DecisionTreeClassifier()
classifier.fit(X_train_pca, y_train) 
y_pred = classifier.predict(X_test_pca) 



In [84]:
from sklearn.metrics import accuracy_score, confusion_matrix

print(accuracy_score(y_pred=y_pred, y_true=y_test)*100)
print(confusion_matrix(y_pred=y_pred, y_true=y_test))

63.905325443786985
[[53 31]
 [30 55]]
