In [8]:
import numpy as np
import pandas as pd
import imblearn
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier


def preprocessing():
    # Removes the dollar signs and commas
    def custom_dollar_converter(dollar_str):
        if '#' in dollar_str:
            return np.nan
        else:
            dollar_str = dollar_str.replace('$', '').replace(',', '')
            return float(dollar_str)
    
    # Determines whether a business is a franchise or not
    # If the value is 0 or 1, the business is not a franchise
    def custom_franchise_converter(franchise_str):
        if franchise_str.strip() == '0' or franchise_str.strip() == '1':
            return 0
        else:
            return 1
    
    # Trims zip codes to the first two numbers
    def zip_trimmer(zip_str):
        return zip_str[:2]
    
    # reads in data, using the custom converters
    bank_converters = {
        'DisbursementGross': custom_dollar_converter,
        'BalanceGross': custom_dollar_converter,
        'ChgOffPrinGr': custom_dollar_converter,
        'GrAppv': custom_dollar_converter,
        'SBA_Appv': custom_dollar_converter,
        'FranchiseCode': custom_franchise_converter,
        'Zip': zip_trimmer,
    }

    bank_parse_dates = [ 'ApprovalDate', 'ChgOffDate', 'DisbursementDate' ]

    df = pd.read_csv(
        'SBAnational.csv',
        converters=bank_converters,
        parse_dates=bank_parse_dates,
        date_parser=pd.to_datetime,
    )

    # drops unnecessary columns
    drop_columns = [
        'Name', 'City', 'ChgOffDate', 'DisbursementDate',
        'LoanNr_ChkDgt', 'Bank', 'NAICS',
        'CreateJob', 'RetainedJob', 'ChgOffPrinGr',
        'RevLineCr', 'LowDoc',
    ]
    working_df = df.drop(columns=drop_columns)
    
    # removes all null values
    working_df = working_df.dropna()
    
    
    # label encode MIS_Status
    mis_label_encoder = LabelEncoder()
    mis_encoded = mis_label_encoder.fit_transform(working_df['MIS_Status'])
    working_df['MIS_Status'] = mis_encoded
    
    
    # makes the NewExist variable more intuitive
    # a value of 1 means the business is new
    # a value of 0 means the business is not new
    working_df['NewExist'] = working_df['NewExist'].replace({ 2:1, 1:0 })
    # one hot encoding NewExist
    new_exist_true = working_df['NewExist'] == 1
    new_exist_false = working_df['NewExist'] == 0
    working_df['NewExistTrue'] = new_exist_true
    working_df['NewExistFalse'] = new_exist_false
    working_df = working_df.drop(columns=['NewExist'])
    

    # handling datetime information
    approval_date_months = working_df['ApprovalDate'].dt.month
    approval_date_days = working_df['ApprovalDate'].dt.day
    working_df['ApprovalMonth'] = approval_date_months
    working_df['ApprovalDay'] = approval_date_days
    working_df = working_df.drop(columns=['ApprovalDate'])

    approval_years = []
    for date in working_df['ApprovalFY']:
        if date == '1976A':
            approval_years.append(1976)
        else:
            approval_years.append(int(date))

    working_df['ApprovalFY'] = np.array(approval_years).astype(np.int64)
    
    
    # label encoding state information
    state_label_encoder = LabelEncoder()
    state_encoded = state_label_encoder.fit_transform(working_df['State'])
    bank_state_encoded = state_label_encoder.fit_transform(working_df['BankState'])
    working_df['State'] = state_encoded
    working_df['BankState'] = bank_state_encoded
    
    
    # converting zip information to the right datatype
    working_df['Zip'] = pd.to_numeric(working_df['Zip'])
    
    
    # separate data by features and target
    X = working_df.drop(columns=['MIS_Status'])
    y = working_df['MIS_Status']
    # separate the testing data from the training data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    
    # balancing the training data based on MIS_Status
    sampler = RandomOverSampler(sampling_strategy='minority')
    df_without_status, df_with_status = X_train, y_train
    df_without_status_rebalanced, df_with_status_rebalanced = sampler.fit_resample(
        df_without_status, df_with_status
    )
    X_train, y_train = df_without_status_rebalanced, df_with_status_rebalanced
    
    
    
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = preprocessing()

  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(
  df = pd.read_csv(


In [9]:
model = CatBoostClassifier(iterations = 2, learning_rate= 1, depth = 2);
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
accuracy

0:	learn: 0.4910775	total: 99ms	remaining: 99ms
1:	learn: 0.4283798	total: 215ms	remaining: 0us


0.8515482797512032

In [16]:

model = CatBoostClassifier(iterations = 9 , learning_rate = 0.1, depth = 6);
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
accuracy

0:	learn: 0.6417551	total: 121ms	remaining: 965ms
1:	learn: 0.6001338	total: 276ms	remaining: 967ms
2:	learn: 0.5653032	total: 481ms	remaining: 961ms
3:	learn: 0.5366426	total: 662ms	remaining: 827ms
4:	learn: 0.5090879	total: 844ms	remaining: 675ms
5:	learn: 0.4875136	total: 1.04s	remaining: 521ms
6:	learn: 0.4672581	total: 1.22s	remaining: 349ms
7:	learn: 0.4511044	total: 1.39s	remaining: 174ms
8:	learn: 0.4367958	total: 1.55s	remaining: 0us


0.8511797746535493

In [4]:
model = CatBoostClassifier(iterations = 500 , learning_rate = 0.1, depth = 6);
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
accuracy

0:	learn: 0.6175587	total: 368ms	remaining: 3m 3s
1:	learn: 0.5613407	total: 747ms	remaining: 3m 6s
2:	learn: 0.5172752	total: 1.18s	remaining: 3m 14s
3:	learn: 0.4822994	total: 1.47s	remaining: 3m 2s
4:	learn: 0.4545762	total: 1.8s	remaining: 2m 57s
5:	learn: 0.4342318	total: 2.14s	remaining: 2m 56s
6:	learn: 0.4178105	total: 2.45s	remaining: 2m 52s
7:	learn: 0.4042374	total: 2.77s	remaining: 2m 50s
8:	learn: 0.3936385	total: 3.1s	remaining: 2m 49s
9:	learn: 0.3821812	total: 3.44s	remaining: 2m 48s
10:	learn: 0.3740790	total: 3.78s	remaining: 2m 48s
11:	learn: 0.3661048	total: 4.11s	remaining: 2m 47s
12:	learn: 0.3589571	total: 4.49s	remaining: 2m 48s
13:	learn: 0.3529250	total: 4.9s	remaining: 2m 50s
14:	learn: 0.3484937	total: 5.34s	remaining: 2m 52s
15:	learn: 0.3439298	total: 5.67s	remaining: 2m 51s
16:	learn: 0.3384730	total: 6s	remaining: 2m 50s
17:	learn: 0.3335199	total: 6.29s	remaining: 2m 48s
18:	learn: 0.3298405	total: 6.71s	remaining: 2m 50s
19:	learn: 0.3252797	total: 7.0

0.9313854674989671