In [35]:
%matplotlib inline 
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import random
import math 

# Load the raw HMEQ dataset from the working directory.
hmeq = pd.read_csv(filepath_or_buffer="hmeq.csv")
# Preview the first five rows to sanity-check the parse.
hmeq.head(n=5)

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,REASON,JOB,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16
0,1,1100,25860.0,39025.0,HomeImp,Other,10.5,0.0,0.0,94.366667,1.0,9.0,,,,,
1,1,1300,70053.0,68400.0,HomeImp,Other,7.0,0.0,2.0,121.833333,0.0,14.0,,,,,
2,1,1500,13500.0,16700.0,HomeImp,Other,4.0,0.0,0.0,149.466667,1.0,10.0,,,,,
3,1,1500,,,,,,,,,,,,,,,
4,0,1700,97800.0,112000.0,HomeImp,Office,3.0,0.0,0.0,93.333333,0.0,14.0,,,,,


In [36]:
# Drop the four trailing "Unnamed" columns; the preview of the raw frame shows
# them empty. The result is a new frame, so the raw `hmeq` stays untouched.
hmeq_prepared = hmeq.drop(
    labels=['Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16'],
    axis='columns',
)

In [37]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns. The percentiles are spelled out but are the pandas defaults.
# NOTE(review): the summary shows suspicious extremes (e.g. BAD max of 10,
# negative minimums for VALUE and DEBTINC) that none of the cells visible here
# clean up -- confirm whether these are data-entry errors.
hmeq_prepared.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
count,5962.0,5962.0,5444.0,5850.0,5254.0,5382.0,5654.0,5452.0,5740.0,4695.0
mean,0.201275,19726.05,73756.449706,258073.8,0.259136,0.449275,179.738555,1.237366,21.287456,35.185829
std,0.420589,85973.3,44450.895907,11769070.0,0.890592,1.12709,85.853356,3.615545,10.141083,73.596762
min,0.0,1100.0,2063.0,-25000.0,0.0,0.0,0.0,0.0,0.0,-21.719596
25%,0.0,11100.0,46292.0,66085.75,0.0,0.0,115.089691,0.0,14.0,29.136864
50%,0.0,16400.0,65019.0,89245.0,0.0,0.0,173.466667,1.0,20.0,34.81696
75%,0.0,23300.0,91482.0,119838.8,0.0,0.0,231.587389,2.0,26.0,39.008591
max,10.0,6600000.0,399550.0,900140000.0,20.0,15.0,1168.233561,229.518295,71.0,4537.512708


In [38]:
# YOJ was read as an object column; convert it to a numeric dtype.
# errors='coerce' turns any entry that cannot be parsed as a number into NaN.
hmeq_prepared = hmeq_prepared.assign(
    YOJ=pd.to_numeric(hmeq_prepared['YOJ'], errors='coerce')
)
hmeq_prepared.head(n=5)

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,REASON,JOB,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
0,1,1100,25860.0,39025.0,HomeImp,Other,10.5,0.0,0.0,94.366667,1.0,9.0,
1,1,1300,70053.0,68400.0,HomeImp,Other,7.0,0.0,2.0,121.833333,0.0,14.0,
2,1,1500,13500.0,16700.0,HomeImp,Other,4.0,0.0,0.0,149.466667,1.0,10.0,
3,1,1500,,,,,,,,,,,
4,0,1700,97800.0,112000.0,HomeImp,Office,3.0,0.0,0.0,93.333333,0.0,14.0,


In [39]:
# Replace every NaN in a numeric column with that column's mean.
# numeric_only=True is required here: the frame still contains the text
# columns REASON and JOB, and pandas >= 2.0 raises a TypeError when asked to
# average them (older releases dropped such columns from the result instead).
# Missing REASON/JOB entries are not imputed and stay NaN, as before.
hmeq_prepared = hmeq_prepared.fillna(hmeq_prepared.mean(numeric_only=True))
# Round all numeric values to two decimals to keep the saved file compact.
hmeq_prepared = hmeq_prepared.round(2)

In [40]:
# Inspect the first five rows of the imputed frame.
hmeq_prepared.head(n=5)

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,REASON,JOB,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
0,1,1100,25860.0,39025.0,HomeImp,Other,10.5,0.0,0.0,94.37,1.0,9.0,35.19
1,1,1300,70053.0,68400.0,HomeImp,Other,7.0,0.0,2.0,121.83,0.0,14.0,35.19
2,1,1500,13500.0,16700.0,HomeImp,Other,4.0,0.0,0.0,149.47,1.0,10.0,35.19
3,1,1500,73756.45,258073.79,,,8.92,0.26,0.45,179.74,1.24,21.29,35.19
4,0,1700,97800.0,112000.0,HomeImp,Office,3.0,0.0,0.0,93.33,0.0,14.0,35.19


In [41]:
# Persist the prepared frame as comma-separated UTF-8 text.
# index=False keeps the row index out of the file.
hmeq_prepared.to_csv(
    path_or_buf='hmeq_prepared.csv',
    index=False,
    encoding='utf-8',
    sep=',',
)

In [42]:
# Reload the file just written to confirm it round-trips through CSV.
hmeq_new = pd.read_csv(filepath_or_buffer="hmeq_prepared.csv")
# Preview the first five rows of the reloaded frame.
hmeq_new.head(n=5)

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,REASON,JOB,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
0,1,1100,25860.0,39025.0,HomeImp,Other,10.5,0.0,0.0,94.37,1.0,9.0,35.19
1,1,1300,70053.0,68400.0,HomeImp,Other,7.0,0.0,2.0,121.83,0.0,14.0,35.19
2,1,1500,13500.0,16700.0,HomeImp,Other,4.0,0.0,0.0,149.47,1.0,10.0,35.19
3,1,1500,73756.45,258073.79,,,8.92,0.26,0.45,179.74,1.24,21.29,35.19
4,0,1700,97800.0,112000.0,HomeImp,Office,3.0,0.0,0.0,93.33,0.0,14.0,35.19


In [43]:
# Pearson correlation between every pair of numeric features in hmeq_new,
# rendered as a colour-coded table.
# The frame is restricted to numeric columns first: REASON and JOB are text,
# and pandas >= 2.0 raises an error when corr() is given non-numeric columns
# (older releases silently skipped them). select_dtypes works on every pandas
# version and matches the column selection used for normalisation.
(
    hmeq_new
    .select_dtypes(include=[np.number])
    .corr(method='pearson')
    .style
    .format("{:.2}")  # two significant digits per cell
    # axis=1 scales the colour gradient within each row independently.
    .background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)
)

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
BAD,1.0,-0.016,-0.046,-0.006,-0.058,0.24,0.32,-0.16,0.075,-0.0068,0.023
LOAN,-0.016,1.0,0.018,0.00032,0.015,-0.00033,-0.0045,0.011,0.0023,0.0064,-0.0032
MORTDUE,-0.046,0.018,1.0,0.0076,-0.083,-0.049,-0.00094,0.13,-0.0042,0.31,0.01
VALUE,-0.006,0.00032,0.0076,1.0,0.023,0.011,-0.0053,-0.002,-0.001,0.0058,2.7e-05
YOJ,-0.058,0.015,-0.083,0.023,1.0,-0.056,0.037,0.19,-0.033,0.023,-0.017
DEROG,0.24,-0.00033,-0.049,0.011,-0.056,1.0,0.17,-0.084,0.35,0.046,-0.0024
DELINQ,0.32,-0.0045,-0.00094,-0.0053,0.037,0.17,1.0,0.022,0.025,0.16,-0.002
CLAGE,-0.16,0.011,0.13,-0.002,0.19,-0.084,0.022,1.0,-0.082,0.23,-0.015
NINQ,0.075,0.0023,-0.0042,-0.001,-0.033,0.35,0.025,-0.082,1.0,0.015,0.0075
CLNO,-0.0068,0.0064,0.31,0.0058,0.023,0.046,0.16,0.23,0.015,1.0,0.0012


In [46]:
# Mean-normalise the numeric columns of hmeq_new:
#     (value - column mean) / (column max - column min)
# NOTE(review): BAD is scaled along with the features -- confirm this is
# intended if BAD is the target label.
hmeq_new_num = hmeq_new.select_dtypes(include=[np.number])
hmeq_new_norm = hmeq_new_num.sub(hmeq_new_num.mean(), axis='columns').div(
    hmeq_new_num.max() - hmeq_new_num.min(), axis='columns'
)
# Keep four decimals for display and downstream use.
hmeq_new_norm_round = hmeq_new_norm.round(decimals=4)
hmeq_new_norm_round.head(n=5)

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
0,0.0799,-0.0028,-0.1205,-0.0002,0.0385,-0.013,-0.03,-0.0731,-0.001,-0.1731,0.0
1,0.0799,-0.0028,-0.0093,-0.0002,-0.0469,-0.013,0.1034,-0.0496,-0.0054,-0.1026,0.0
2,0.0799,-0.0028,-0.1516,-0.0003,-0.12,-0.013,-0.03,-0.0259,-0.001,-0.159,0.0
3,0.0799,-0.0028,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.0201,-0.0027,0.0605,-0.0002,-0.1444,-0.013,-0.03,-0.074,-0.0054,-0.1026,0.0
