In [2]:
# Imports
import numpy as np
import pandas as pd
import tensorflow as tf

In [3]:
ds = pd.read_csv('SBA Dataset.csv')
X = ds.iloc[:, 3:16].values
Y = ds.iloc[:, -1].values

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 10)


ds.head(3)

Unnamed: 0,LoanNr_ChkDgt,Name,City,State,Bank,...,DisbursementGross,BalanceGross,ApprovalDate,Zip,Term
0,1000014003,ABC HOBBYCRAFT,EVANSVILLE,IN,FIFTH THIRD BANK,...,"$60,000.00",$0.00,28-Feb-97,47711,84
1,1000024006,LANDMARK BAR & GRILLE (THE),NEW PARIS,IN,1ST SOURCE BANK,...,"$40,000.00",$0.00,28-Feb-97,46526,60
2,1000034009,"WHITLOCK DDS, TODD M.",BLOOMINGTON,IN,GRANT COUNTY STATE BANK,...,"$287,000.00",$0.00,28-Feb-97,47401,180


In [4]:
# For some reason there is 1976A so I edited the dataset
ds.RevLineCr.value_counts()

RevLineCr
N    420288
0    257602
Y    201397
T     15284
1        23
R        14
`        11
2         6
C         2
3         1
,         1
7         1
A         1
5         1
.         1
4         1
-         1
Q         1
Name: count, dtype: int64

In [5]:
ds['LowDoc'].describe()

count     896582
unique         8
top            N
freq      782822
Name: LowDoc, dtype: object

In [6]:
# Null Values in MIS Status
ds.dropna(subset=['MIS_Status'], inplace=True)
ds.dropna(subset=['State'], inplace=True)
ds.shape

(897154, 26)

In [7]:

    
# Convert LowDoc and RevLineCR 
#ds['Low_Doc'] = ds['LowDoc'].apply(yes_no_binary).astype('Int64')

In [8]:
print(X)
print(Y)

[['IN' 'FIFTH THIRD BANK' 'OH' ... 'N' 'Y' 'P I F']
 ['IN' '1ST SOURCE BANK' 'IN' ... 'N' 'Y' 'P I F']
 ['IN' 'GRANT COUNTY STATE BANK' 'IN' ... 'N' 'N' 'P I F']
 ...
 ['CA' 'RABOBANK, NATIONAL ASSOCIATION' 'CA' ... 'N' 'N' 'P I F']
 ['HI' 'BANK OF HAWAII' 'HI' ... 'N' 'Y' 'CHGOFF']
 ['HI' 'CENTRAL PACIFIC BANK' 'HI' ... 'N' 'N' 'P I F']]
[ 84  60 180 ... 108  60  48]


In [9]:
ds = ds.drop(ds[(ds['MIS_Status'] == 'P I F') & (ds['ApprovalFY'] >= 2014)].index)

# Cleaning up LowDoc Values 
val_to_drop = ['0', 'C', 'S', 'A', 'R', '1']
ds.dropna(subset=['LowDoc'], inplace=True)
#ds = ds[~ds['LowDoc'].isin(val_to_drop)]
ds = ds[ds['LowDoc'] != '0']
ds = ds[ds['LowDoc'] != 'C']
ds = ds[ds['LowDoc'] != 'S']
ds = ds[ds['LowDoc'] != 'A']
ds = ds[ds['LowDoc'] != 'R']
ds = ds[ds['LowDoc'] != '1']
ds.shape

(890896, 26)

In [10]:
val_to_drop_revlinecr = ['1', 'R', '`', '2', 'C', '3', ',', '7', 'A', '5', '.', '4', '-', 'Q']
ds = ds[~ds['RevLineCr'].isin(val_to_drop_revlinecr)]
ds.shape

(890831, 26)

In [11]:
ds.Term.describe()

count    890831.000000
mean        110.834972
std          78.880232
min           0.000000
25%          60.000000
50%          84.000000
75%         120.000000
max         569.000000
Name: Term, dtype: float64

## Encoding

In [13]:
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing

#figure out encoding for Low Doc, go down the columns
le = LabelEncoder()

X[:, 12] = le.fit_transform(X[:, 12])
X[:, 11] = le.fit_transform(X[:, 11])
X[:, 10] = le.fit_transform(X[:, 10])
X[:, 8] = le.fit_transform(X[:, 8])
X[:, 3] = le.fit_transform(X[:, 3])
X[:, 2] = le.fit_transform(X[:, 2])
X[:, 1] = le.fit_transform(X[:, 1])
X[:, 0] = le.fit_transform(X[:, 0])

# Print the first few rows to see the encoding
print(X)

[[15 2143 38 ... 12 7 1]
 [15 132 18 ... 12 7 1]
 [15 2859 18 ... 12 4 1]
 ...
 [4 4335 5 ... 12 4 1]
 [11 551 14 ... 12 7 0]
 [11 1077 14 ... 12 4 1]]


## Spliting Training set and Test set

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

### Feature Scaling

In [15]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Neural Network Architecture

In [16]:
model = tf.keras.models.Sequential()

In [21]:
# Input Layer
model.add(tf.keras.layers.Dense(units=64, activation='relu'))

# Second Hidden Layer
model.add(tf.keras.layers.Dense(units=32, activation='relu'))
#model.add(Dropout(0.5))

model.add(tf.keras.layers.Dense(units=16, activation='relu'))
#model.add(Dropout(0.5))
#Output Layer
model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Training NN

In [22]:
# Compiling
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [23]:
model.fit(X_train, Y_train, batch_size=64, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x2392d26d850>