# Importing the libaries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset

In [2]:
df = pd.read_csv('Loan_Default.csv')

# Data exploration

### Observing the first 5 rows of the dataset.

In [3]:
df.head()

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,...,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
0,24890,2019,cf,Sex Not Available,nopre,type1,p1,l1,nopc,nob/c,...,EXP,758,CIB,25-34,to_inst,98.728814,south,direct,1,45.0
1,24891,2019,cf,Male,nopre,type2,p1,l1,nopc,b/c,...,EQUI,552,EXP,55-64,to_inst,,North,direct,1,
2,24892,2019,cf,Male,pre,type1,p1,l1,nopc,nob/c,...,EXP,834,CIB,35-44,to_inst,80.019685,south,direct,0,46.0
3,24893,2019,cf,Male,nopre,type1,p4,l1,nopc,nob/c,...,EXP,587,CIB,45-54,not_inst,69.3769,North,direct,0,42.0
4,24894,2019,cf,Joint,pre,type1,p1,l1,nopc,nob/c,...,CRIF,602,EXP,25-34,not_inst,91.886544,North,direct,0,39.0


### Checking whether there is any missing data.

In [4]:
df.isnull().sum()

ID                               0
year                             0
loan_limit                    3344
Gender                           0
approv_in_adv                  908
loan_type                        0
loan_purpose                   134
Credit_Worthiness                0
open_credit                      0
business_or_commercial           0
loan_amount                      0
rate_of_interest             36439
Interest_rate_spread         36639
Upfront_charges              39642
term                            41
Neg_ammortization              121
interest_only                    0
lump_sum_payment                 0
property_value               15098
construction_type                0
occupancy_type                   0
Secured_by                       0
total_units                      0
income                        9150
credit_type                      0
Credit_Score                     0
co-applicant_credit_type         0
age                            200
submission_of_applic

- According to this, it appears that some of the data are missing.

### Before we get into the null value data, let's see what data we can remove (unnecessary data).

#### Summarizing the data (Observing data name and type).

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148670 entries, 0 to 148669
Data columns (total 34 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   ID                         148670 non-null  int64  
 1   year                       148670 non-null  int64  
 2   loan_limit                 145326 non-null  object 
 3   Gender                     148670 non-null  object 
 4   approv_in_adv              147762 non-null  object 
 5   loan_type                  148670 non-null  object 
 6   loan_purpose               148536 non-null  object 
 7   Credit_Worthiness          148670 non-null  object 
 8   open_credit                148670 non-null  object 
 9   business_or_commercial     148670 non-null  object 
 10  loan_amount                148670 non-null  int64  
 11  rate_of_interest           112231 non-null  float64
 12  Interest_rate_spread       112031 non-null  float64
 13  Upfront_charges            10

- According to our objective, which is to predict whether a customer would default on a loan, the ID, year, approve_in_advance, and submission_of_application may not be relevant. Therefore, it should be removed.

In [6]:
df.columns

Index(['ID', 'year', 'loan_limit', 'Gender', 'approv_in_adv', 'loan_type',
       'loan_purpose', 'Credit_Worthiness', 'open_credit',
       'business_or_commercial', 'loan_amount', 'rate_of_interest',
       'Interest_rate_spread', 'Upfront_charges', 'term', 'Neg_ammortization',
       'interest_only', 'lump_sum_payment', 'property_value',
       'construction_type', 'occupancy_type', 'Secured_by', 'total_units',
       'income', 'credit_type', 'Credit_Score', 'co-applicant_credit_type',
       'age', 'submission_of_application', 'LTV', 'Region', 'Security_Type',
       'Status', 'dtir1'],
      dtype='object')

#### Removing the unnecessary data.

In [7]:
df_dropped = df.drop(columns = ['ID', 'year', 'approv_in_adv', 'submission_of_application'])

#### Check the new dataset.

In [8]:
df_dropped.head()

Unnamed: 0,loan_limit,Gender,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,loan_amount,rate_of_interest,Interest_rate_spread,...,income,credit_type,Credit_Score,co-applicant_credit_type,age,LTV,Region,Security_Type,Status,dtir1
0,cf,Sex Not Available,type1,p1,l1,nopc,nob/c,116500,,,...,1740.0,EXP,758,CIB,25-34,98.728814,south,direct,1,45.0
1,cf,Male,type2,p1,l1,nopc,b/c,206500,,,...,4980.0,EQUI,552,EXP,55-64,,North,direct,1,
2,cf,Male,type1,p1,l1,nopc,nob/c,406500,4.56,0.2,...,9480.0,EXP,834,CIB,35-44,80.019685,south,direct,0,46.0
3,cf,Male,type1,p4,l1,nopc,nob/c,456500,4.25,0.681,...,11880.0,EXP,587,CIB,45-54,69.3769,North,direct,0,42.0
4,cf,Joint,type1,p1,l1,nopc,nob/c,696500,4.0,0.3042,...,10440.0,CRIF,602,EXP,25-34,91.886544,North,direct,0,39.0


# Data preprocessing

## Separate data for observing the prediction

In [9]:
X = df_dropped.drop(columns=['Status'], axis = 1)
y = df_dropped['Status'].values

In [10]:
print(X)

       loan_limit             Gender loan_type loan_purpose Credit_Worthiness  \
0              cf  Sex Not Available     type1           p1                l1   
1              cf               Male     type2           p1                l1   
2              cf               Male     type1           p1                l1   
3              cf               Male     type1           p4                l1   
4              cf              Joint     type1           p1                l1   
...           ...                ...       ...          ...               ...   
148665         cf  Sex Not Available     type1           p3                l1   
148666         cf               Male     type1           p1                l1   
148667         cf               Male     type1           p4                l1   
148668         cf             Female     type1           p4                l1   
148669         cf             Female     type1           p3                l1   

       open_credit business

In [11]:
print(y)

[1 1 0 ... 0 0 0]


- The object data needed to be encoded so that it can be used for the prediction.

- From this code, we will split the data into a 75% training set and a 25% test set.

## Encoding categorical data

### Encoding the X dataset

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
object_columns_indices = X.select_dtypes(include=['object']).columns
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), object_columns_indices)], remainder='passthrough')
X = np.array(ct.fit_transform(X))

#### Check the X

In [13]:
print(X)

[[  1.           0.           0.         ... 758.          98.72881356
   45.        ]
 [  1.           0.           0.         ... 552.                  nan
           nan]
 [  1.           0.           0.         ... 834.          80.01968504
   46.        ]
 ...
 [  1.           0.           0.         ... 702.          61.33241758
   49.        ]
 [  1.           0.           0.         ... 737.          70.68345324
   29.        ]
 [  1.           0.           0.         ... 830.          72.84946237
   44.        ]]


## Splitting the data into training set and test set

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (111502, 69)
y_train shape: (111502,)
X_test shape: (37168, 69)
y_test shape: (37168,)


## Filling the missing values with the data's average value.

- It should be noted that we fill the null value after splitting the data into test and training sets because we will fill the null value with the average value; therefore, doing so after splitting the data will not result in leaked data from the test set.

- Also, based on our prior checks for missing values, it appears that the Status value has no missing values; thus, there is no need to do it for the y_train and y_test datasets.

### Filling the X_train dataset

#### Numerical values

In [15]:
X_train_df = pd.DataFrame(X_train)
numerical_columns = X_train_df.select_dtypes(include=['float64', 'int64']).columns
X_train_df[numerical_columns] = X_train_df[numerical_columns].fillna(X_train_df[numerical_columns].mean())
X_train = X_train_df.values

- Fill the missing numerical values with the mean

#### Object values

In [16]:
X_train_df = pd.DataFrame(X_train)
X_train_df = X_train_df.ffill()
X_train = X_train_df.values

- Fill the missing object value with the previous value

#### Check the X_train dataset

In [17]:
np.isnan(X_train).sum()

0

- 0 Missing values

### Filling the X_test dataset

#### Numerical values

In [18]:
X_test_df = pd.DataFrame(X_test)
numerical_columns = X_test_df.select_dtypes(include=['float64', 'int64']).columns
X_test_df[numerical_columns] = X_test_df[numerical_columns].fillna(X_test_df[numerical_columns].mean())
X_test = X_test_df.values

- Fill the missing numerical values with the mean

#### Object values

In [19]:
X_test_df = pd.DataFrame(X_test)
X_test_df = X_test_df.ffill()
X_test = X_test_df.values

- Fill the missing object value with the previous value

#### Check the X_test dataset

In [20]:
np.isnan(X_test).sum()

0

## Feature Scaling

- To normalize the range of independent variables of data

In [21]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [22]:
print(X_train)

[[-3.18716341e+00  3.73452123e+00 -1.52463580e-01 ... -6.87778783e-01
  -1.03390558e-01 -1.22547787e-12]
 [ 3.13758622e-01 -2.67771942e-01 -1.52463580e-01 ...  1.04665197e+00
  -6.92701204e-01 -1.80628622e-01]
 [-3.18716341e+00 -2.67771942e-01  6.55894347e+00 ...  6.92862610e-01
   1.76498611e-01 -1.22547787e-12]
 ...
 [ 3.13758622e-01 -2.67771942e-01 -1.52463580e-01 ... -7.48181844e-01
  -4.84733300e-01  6.48059318e-01]
 [-3.18716341e+00  3.73452123e+00 -1.52463580e-01 ... -1.09334219e+00
  -1.61419836e-01 -2.84214614e-01]
 [ 3.13758622e-01 -2.67771942e-01 -1.52463580e-01 ... -2.56328348e-01
  -2.72152424e-02  2.65433630e-02]]


In [23]:
print(X_test)

[[-3.18716341e+00  3.73452123e+00 -1.52463580e-01 ...  1.05528098e+00
  -1.27080238e-03 -4.48255388e-03]
 [ 3.13758622e-01 -2.67771942e-01 -1.52463580e-01 ... -1.69737280e+00
  -1.45511039e-01 -1.00931656e+00]
 [ 3.13758622e-01 -2.67771942e-01 -1.52463580e-01 ... -3.51247444e-01
   5.69212619e-01  3.37301340e-01]
 ...
 [ 3.13758622e-01 -2.67771942e-01 -1.52463580e-01 ...  1.59027952e+00
   2.68165092e-01 -1.80628622e-01]
 [ 3.13758622e-01 -2.67771942e-01 -1.52463580e-01 ...  1.29689322e+00
   2.59310694e-01 -6.98558584e-01]
 [ 3.13758622e-01 -2.67771942e-01 -1.52463580e-01 ...  9.34474854e-01
   4.35118842e-01  6.48059318e-01]]


# Training using the Random Forest Classification model on the training set

In [24]:
print(X_train.shape)

(111502, 69)


In [25]:
print(y_train.shape)

(111502,)


In [26]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

## Predicting a new Test set result

In [27]:
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[1 1]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]


## Making the Confusion Matrix

In [28]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[27952     0]
 [ 4945  4271]]


0.8669554455445545

# Training the Naive Bayes Model on the Training set

In [29]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

## Predicting new Test set results

In [30]:
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[1 1]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]


## Making the Confusion Matrix

In [31]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[27951     1]
 [ 5322  3894]]


0.8567854068015497

# Summary

- Comparing 2 classification model which is the random forest and the naive bayes, both models have quite good accuracy scores, however the Random Forest model is slightly more accurate than Naive Bayes. Nevertheless, accuracy alone may not provide a complete picture, especially in imbalanced datasets or when the cost of misclassification varies between classes.

- Furthermore, Random Forest produces less false positives (4945) and false negatives (4271) than Naive Bayes (5322 false positives and 3894 false negatives). This shows that the Random Forest model strikes a better balance between false positives and false negatives, which is critical in loan default prediction because both types of errors have serious consequences.