# Neural Network to predict loan risk

Dataset used for training: https://www.kaggle.com/datasets/laotse/credit-risk-dataset

In [3]:
# Tensorflow / Keras
from tensorflow import keras # for building Neural Networks
print('Tensorflow/Keras: %s' % keras.__version__) # print version
from keras.models import Sequential # for creating a linear stack of layers for our Neural Network
from keras import Input # for instantiating a keras tensor
from keras.layers import Dense # for creating regular densely-connected NN layers.

# Data manipulation
import pandas as pd # for data manipulation
print('pandas: %s' % pd.__version__) # print version
import numpy as np # for data manipulation
print('numpy: %s' % np.__version__) # print version

# Sklearn
import sklearn # for model evaluation
print('sklearn: %s' % sklearn.__version__) # print version
from sklearn.model_selection import train_test_split # for splitting data into train and test samples
from sklearn.metrics import classification_report # for model evaluation metrics

Tensorflow/Keras: 2.12.0
pandas: 2.0.2
numpy: 1.23.5
sklearn: 1.2.2


In [43]:
# Read the credit-risk CSV from the local data/ folder and preview the first rows.
csv_path = "data/credit_risk_dataset.csv"
df = pd.read_csv(csv_path)
df.head(5)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [44]:
## Checking for Duplicates
# Mark every row that exactly repeats an earlier row, then tally the
# flagged (True) vs. unique (False) rows. The saved output shows 165 repeats.
dups = df.duplicated(keep='first')
dups.value_counts()

False    32416
True       165
Name: count, dtype: int64

In [45]:
## Removing the Duplicates
# Report the frame's shape, drop exact duplicate rows in place, then report again.
rows_before, cols_before = df.shape
print(f"Shape of Data before removing duplicates -----> ({rows_before},{cols_before}) \n")

df.drop_duplicates(inplace=True)

rows_after, cols_after = df.shape
print(f"Shape of Data after removing duplicates -----> ({rows_after},{cols_after})")

Shape of Data before removing duplicates -----> (32581,12) 

Shape of Data after removing duplicates -----> (32416,12)


#### Target variable is loan_status

In [46]:
# Share of each class in the target column (fractions, not raw counts).
target_share = df["loan_status"].value_counts(normalize=True)
target_share

loan_status
0    0.781312
1    0.218688
Name: proportion, dtype: float64

#### Split the data into train and test sets

In [47]:
# Separate the predictors from the target, then hold out 20% of the rows for testing.
# The split is shuffled with a fixed seed and stratified on the target so both
# parts keep the same class proportions.
# NOTE(review): the resulting frames are taken from `df` as it is at this point;
# later changes to `df` are not expected to propagate to X / X_test.
features = df.drop(columns='loan_status')
target = df['loan_status']

X, X_test, y, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
    shuffle=True,
    stratify=target,
)

In [48]:
# Peek at the first five training labels (index values are the original row labels).
y.head(5)

21415    0
12916    0
2938     0
19114    1
6057     0
Name: loan_status, dtype: int64

In [49]:
# Set Pandas options to display more columns
# Allow up to 50 columns when a wide frame is displayed.
pd.set_option("display.max_columns", 50)

# Count missing values per column to see which columns need imputing.
df.isna().sum()

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              887
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3095
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

In [50]:
# Columns that contain missing values (per the null counts of `df`).
IMPUTE_COLS = ['person_emp_length', 'loan_int_rate']

# Fill gaps in the full frame with each column's own mean (unchanged behaviour).
for col in IMPUTE_COLS:
    df[col] = df[col].fillna(df[col].mean())

# X / X_test were split off from `df` before this cell, so imputing `df` alone
# leaves NaNs in the frames that are actually fed to the model. Fill them too,
# using means computed on the training split only so that no statistics from
# the test rows leak into training.
train_means = X[IMPUTE_COLS].mean()
X = X.fillna(train_means)
X_test = X_test.fillna(train_means)

#### Convert categorical data to numeric

In [51]:
# List the distinct labels in each categorical column so they can be mapped to numbers.
categorical_cols = (
    'person_home_ownership',
    'loan_intent',
    'loan_grade',
    'cb_person_default_on_file',
)
for col in categorical_cols:
    print(df[col].unique())

['RENT' 'OWN' 'MORTGAGE' 'OTHER']
['PERSONAL' 'EDUCATION' 'MEDICAL' 'VENTURE' 'HOMEIMPROVEMENT'
 'DEBTCONSOLIDATION']
['D' 'B' 'C' 'A' 'E' 'F' 'G']
['Y' 'N']


In [52]:
# Integer code for every label of every categorical column.
# The keys must match the labels in the data exactly: the loan-intent label is
# 'EDUCATION' (not 'EDUCATIONAL'); a mismatched key silently leaves those rows
# as strings and the column as a mixed object column.
# Note: 'Y' -> 0 and 'N' -> 1 for cb_person_default_on_file, as originally encoded.
CATEGORY_CODES = {
    'person_home_ownership': {'RENT': 0, 'OWN': 1, 'MORTGAGE': 2, 'OTHER': 3},
    'loan_intent': {'PERSONAL': 0, 'EDUCATION': 1, 'MEDICAL': 2, 'VENTURE': 3,
                    'HOMEIMPROVEMENT': 4, 'DEBTCONSOLIDATION': 5},
    'loan_grade': {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6},
    'cb_person_default_on_file': {'Y': 0, 'N': 1},
}


def encode_categoricals(frame):
    """Return a copy of `frame` with its categorical columns integer-encoded.

    Each column named in CATEGORY_CODES has its labels replaced by the mapped
    integer. Values that are not keys of the mapping (for example codes from
    an earlier run of this cell) are left untouched, so re-running is safe.
    """
    encoded = frame.copy()
    for col, codes in CATEGORY_CODES.items():
        # Assign the result back instead of chained `inplace=True`, which
        # relies on the column selection being a view of the frame.
        encoded[col] = encoded[col].replace(codes)
    return encoded


df = encode_categoricals(df)

# X / X_test were split off from `df` earlier, so they must be encoded
# explicitly; otherwise the model inputs would still contain strings.
X = encode_categoricals(X)
X_test = encode_categoricals(X_test)

In [53]:
# Small feed-forward binary classifier, declared as one list of layers:
#   * input of 11 features (the input shape must be specified up front),
#   * one hidden layer of 2 units with softplus(x) = log(exp(x) + 1),
#   * one output unit with sigmoid(x) = 1 / (1 + exp(-x)).
model = Sequential(
    [
        Input(shape=(11,), name='Input-Layer'),
        Dense(2, activation='softplus', name='Hidden-Layer'),
        Dense(1, activation='sigmoid', name='Output-Layer'),
    ],
    name="Model-with-One-Input",
)

### Training a Model