In [10]:
import numpy as np
import pandas as pd
import imblearn
import tensorflow as tf
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
def preprocessing():
    # Removes the dollar signs and commas
    def custom_dollar_converter(dollar_str):
        if '#' in dollar_str:
            return np.nan
        else:
            dollar_str = dollar_str.replace('$', '').replace(',', '')
            return float(dollar_str)
    
    # Determines whether a business is a franchise or not
    # If the value is 0 or 1, the business is not a franchise
    def custom_franchise_converter(franchise_str):
        if franchise_str.strip() == '0' or franchise_str.strip() == '1':
            return 0
        else:
            return 1
    
    # Trims zip codes to the first two numbers
    def zip_trimmer(zip_str):
        return zip_str[:2]
    
    # reads in data, using the custom converters
    bank_converters = {
        'DisbursementGross': custom_dollar_converter,
        'BalanceGross': custom_dollar_converter,
        'ChgOffPrinGr': custom_dollar_converter,
        'GrAppv': custom_dollar_converter,
        'SBA_Appv': custom_dollar_converter,
        'FranchiseCode': custom_franchise_converter,
        'Zip': zip_trimmer,
    }
    
    bank_parse_dates = [ 'ApprovalDate', 'ChgOffDate', 'DisbursementDate' ]

    df = pd.read_csv(
        'SBAnational.csv',
        converters=bank_converters,
        parse_dates=bank_parse_dates,
        date_parser=pd.to_datetime,
    )

    # drops unnecessary columns
    drop_columns = [
        'Name', 'City', 'ChgOffDate', 'DisbursementDate',
        'LoanNr_ChkDgt', 'Bank', 'NAICS',
        'CreateJob', 'RetainedJob', 'ChgOffPrinGr',
        'RevLineCr', 'LowDoc',
    ]
    working_df = df.drop(columns=drop_columns)
    
    # removes all null values
    working_df = working_df.dropna()
    
    
    # label encode MIS_Status
    mis_label_encoder = LabelEncoder()
    mis_encoded = mis_label_encoder.fit_transform(working_df['MIS_Status'])
    working_df['MIS_Status'] = mis_encoded
    
    
    # makes the NewExist variable more intuitive
    # a value of 1 means the business is new
    # a value of 0 means the business is not new
    working_df['NewExist'] = working_df['NewExist'].replace({ 2:1, 1:0 })
    # one hot encoding NewExist
    new_exist_true = working_df['NewExist'] == 1
    new_exist_false = working_df['NewExist'] == 0
    working_df['NewExistTrue'] = new_exist_true
    working_df['NewExistFalse'] = new_exist_false
    working_df = working_df.drop(columns=['NewExist'])
    

    # handling datetime information
    approval_date_months = working_df['ApprovalDate'].dt.month
    approval_date_days = working_df['ApprovalDate'].dt.day
    working_df['ApprovalMonth'] = approval_date_months
    working_df['ApprovalDay'] = approval_date_days
    working_df = working_df.drop(columns=['ApprovalDate'])

    approval_years = []
    for date in working_df['ApprovalFY']:
        if date == '1976A':
            approval_years.append(1976)
        else:
            approval_years.append(int(date))

    working_df['ApprovalFY'] = np.array(approval_years).astype(np.int64)
    
    
    # label encoding state information
    state_label_encoder = LabelEncoder()
    state_encoded = state_label_encoder.fit_transform(working_df['State'])
    bank_state_encoded = state_label_encoder.fit_transform(working_df['BankState'])
    working_df['State'] = state_encoded
    working_df['BankState'] = bank_state_encoded
    
    
    # converting zip information to the right datatype
    working_df['Zip'] = pd.to_numeric(working_df['Zip'])
    
    
    # separate data by features and target
    X = working_df.drop(columns=['MIS_Status'])
    y = working_df['MIS_Status']
    # separate the testing data from the training data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    
    # balancing the training data based on MIS_Status
    sampler = RandomOverSampler(sampling_strategy='minority')
    df_without_status, df_with_status = X_train, y_train
    df_without_status_rebalanced, df_with_status_rebalanced = sampler.fit_resample(
        df_without_status, df_with_status
    )
    X_train, y_train = df_without_status_rebalanced, df_with_status_rebalanced
    
    
    
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = preprocessing()

  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(


In [26]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [27]:
model = tf.keras.models.Sequential()

In [28]:
# Input Layer
model.add(tf.keras.layers.Dense(units=300, activation='relu'))

# Second Hidden Layer
model.add(tf.keras.layers.Dense(units=200, activation='relu'))
#model.add(Dropout(0.5))
model.add(tf.keras.layers.Dense(units=100, activation='relu'))
#model.add(Dropout(0.5))
model.add(tf.keras.layers.Dense(units=50, activation='relu'))
#Output Layer
model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

In [29]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [30]:
model.fit(X_train, y_train, batch_size=50, epochs=60, verbose=1)

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


<keras.src.callbacks.History at 0x2ce5d57b350>