In [213]:
#1.Undersampling 
#2.Oversampling 
#3.SMOTE
#4.Ensemble Method

In [299]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
%matplotlib inline

# Read data
df = pd.read_csv("/Users/victorialu/Desktop/Datasets/cleaned_bankruptcy.csv")
print(df[:5])
df.shape[1]

   status_label  Current_Ratio  Quick_Ratio  Gross_Profit_Margin  \
0             1       2.828383     1.184518            19.167191   
1             1       0.848425     0.606935            41.189988   
2             1       2.880662     2.830330            62.776229   
3             1       1.405975     1.405975            68.751104   
4             1       1.411216     0.655987             5.729787   

   Net_Income_Margin  Return_on_Assets  Return_on_Equity  \
0           0.892295          1.023152          1.666133   
1          13.700293          2.870049          9.510936   
2          36.223372          8.490236          9.246249   
3          37.201908         24.293459        118.248175   
4           0.972325          3.869466         12.264547   

   Debt_to_Assets_Ratio  Interest_Coverage_Ratio  Debt_to_EBITDA_Ratio  ...  \
0              0.116226                 0.485300              2.060581  ...   
1              0.281433                 0.195090              5.125840  

22

In [300]:
df.shape

(8465, 22)

In [372]:
# Testint set-containing 10% 0s and 10% 1s from the original dataset

#separate the positive and negative samples
pos_data = df[df['status_label'] == 1]
neg_data = df[df['status_label'] == 0]

#sample 10% of positive and negative samples
np.random.seed(567) 
pos_holdout_index = np.random.choice(pos_data.index, int(len(pos_data) * 0.1), replace=False)
neg_holdout_index = np.random.choice(neg_data.index, int(len(neg_data) * 0.1), replace=False)

#create the holdout indices
holdout_pos = pos_data.loc[pos_holdout_index]
holdout_neg = neg_data.loc[neg_holdout_index]

#concatenate to form final holdout sets
bankrupt_test = pd.concat([holdout_pos, holdout_neg])
bankrupt_test.shape
class_distribution = bankrupt_test["status_label"].value_counts()
print(class_distribution)

1    786
0     60
Name: status_label, dtype: int64


In [373]:
# Training set
training_pos = pos_data.drop(index=pos_holdout_index)
training_neg = neg_data.drop(index=neg_holdout_index)

bankrupt_train = pd.concat([training_pos, training_neg])

X_train = bankrupt_train.drop(columns=['status_label'])
y_train = bankrupt_train['status_label']
X_test = bankrupt_test.drop(columns=['status_label'])
y_test = bankrupt_test['status_label']
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(7619, 21)
(7619,)
(846, 21)
(846,)


In [394]:
# Class dist. for training and testing
train_class_distribution = y_train.value_counts()
print("Training Set Class Distribution:")
print(train_class_distribution)

test_class_distribution = y_test.value_counts()
print("\nTesting Set Class Distribution:")
print(test_class_distribution)

Training Set Class Distribution:
1    7076
0     543
Name: status_label, dtype: int64

Testing Set Class Distribution:
1    786
0     60
Name: status_label, dtype: int64


In [374]:
# Separate majority and minority classes
minority_class = bankrupt_train[bankrupt_train['status_label'] == 0]
majority_class = bankrupt_train[bankrupt_train['status_label'] == 1]

In [375]:
from sklearn.utils import resample

desired_ratio = 1
num_samples = len(minority_class) * desired_ratio

# Perform random undersampling on the majority class
undersampled_majority = resample(majority_class, n_samples=num_samples, random_state=1)

# Combine the classes
undersampled_data = pd.concat([undersampled_majority, minority_class])

# Split the undersampled data back into X_train and y_train
X_train_undersampled = undersampled_data.drop(columns=['status_label'])
y_train_undersampled = undersampled_data['status_label']

In [376]:
value_counts = pd.value_counts(y_train_undersampled) 
print(value_counts)

1    543
0    543
Name: status_label, dtype: int64


In [377]:
######### Oversampling #########
desired_ratio = 1
num_samples = len(majority_class) * desired_ratio

#perform random oversampling on the minority class
oversampled_minority = resample(minority_class, n_samples=num_samples, random_state=1)

#combine the classes 
oversampled_data = pd.concat([majority_class, oversampled_minority])

X_train_oversampled = oversampled_data.drop(columns=['status_label'])
y_train_oversampled = oversampled_data['status_label']

In [378]:
value_counts = pd.value_counts(y_train_oversampled) 
print(value_counts)

1    7076
0    7076
Name: status_label, dtype: int64


In [379]:
######### SMOTE #########
#!pip install -U imbalanced-learn

In [380]:
import imblearn
from imblearn.over_sampling import SMOTE

In [381]:
#create the SMOTE resampler
smote = SMOTE(sampling_strategy='auto', random_state=1) # auto-1:1 oversampling
X_train_smote, y_train_smote = smote.fit_resample(bankrupt_train.drop('status_label', axis=1), bankrupt_train['status_label'])


X_train_smote = pd.DataFrame(X_train_smote)
y_train_smote = pd.Series(y_train_smote)

In [382]:
value_counts = pd.value_counts(y_train_smote) 
print(value_counts)

1    7076
0    7076
Name: status_label, dtype: int64


In [383]:
######### Ensemble Method #########
#create three separate balanced datasets-majority vote when making predictions

#define a function to get a batch of training data with varying class distributions
def get_train_batch(df_majority, df_minority, start, end):
    df_train = pd.concat([df_majority[start:end], df_minority], axis=0)

    X_train = df_train.drop('status_label', axis='columns')
    y_train = df_train['status_label']
    return X_train, y_train


#6298/474 = 13-can generate 12 models
#randomly choose 3
import random

def generate_unique_random_numbers(seed=None):        
    if seed is not None:
        random.seed(seed)
        
    #generate a list of three unique random numbers between 0 and 12
    return random.sample(range(13), 3)

random_num = generate_unique_random_numbers(seed=123)
print(random_num)

X_train_edata1, y_train_edata1 = get_train_batch(majority_class, minority_class, random_num[0]*len(minority_class), 
                                                 random_num[0]*len(minority_class)+len(minority_class))
X_train_edata2, y_train_edata2 = get_train_batch(majority_class, minority_class, random_num[1]*len(minority_class), 
                                                 random_num[1]*len(minority_class)+len(minority_class))
X_train_edata3, y_train_edata3 = get_train_batch(majority_class, minority_class, random_num[2]*len(minority_class), 
                                                 random_num[2]*len(minority_class)+len(minority_class))

[0, 4, 1]


In [399]:
value_counts = pd.value_counts(y_train_edata1) 
print(value_counts)
value_counts = pd.value_counts(y_train_edata2) 
print(value_counts)

1    543
0    543
Name: status_label, dtype: int64
1    543
0    543
Name: status_label, dtype: int64


In [401]:
# !pip install numpy pandas xgboost scikit-learn

from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from imblearn.ensemble import BalancedBaggingClassifier


def evaluate_xgboost(X_train, y_train, X_test, y_test):
    #initialize the XGBoost classifier
    xgb = XGBClassifier(random_state=123)
    
    xgb.fit(X_train, y_train)
    
    ##########
    #get predicted probabilities for class 1
    y_pred_prob = xgb.predict_proba(X_test)[:, 1]
    print(y_pred_prob)
    
    #adjust the threshold
    y_pred = (y_pred_prob >= 0.50).astype(int)
    
    conf_matrix = confusion_matrix(y_test, y_pred)

    sensitivity = recall_score(y_test, y_pred)
    specificity = conf_matrix[0, 0] / (conf_matrix[0, 0] + conf_matrix[0, 1])
    misclass = 1 - accuracy_score(y_test, y_pred)
    return sensitivity, specificity, misclass

# Sensitivity: If good at identifying alive companies 
# Specificity: Correctly predicted bankrupt companies

In [402]:
# Comparing model performance on different training sets
# Unbalanced data
sensitivity, specificity, misclass = evaluate_xgboost(X_train, y_train, X_test, y_test)
print('metrics', sensitivity, specificity, misclass)
#0.9974554707379135 0.016666666666666666 0.0721040189125296

[0.99917006 0.9929295  0.98727477 0.878097   0.992961   0.9968189
 0.9656749  0.9971079  0.9923201  0.9755955  0.9698741  0.9650643
 0.70485127 0.99907935 0.5261377  0.96194404 0.9958313  0.92316383
 0.8884994  0.9946938  0.998923   0.999742   0.9967817  0.9975605
 0.99433714 0.95793873 0.91729796 0.99858284 0.99068624 0.9781148
 0.9886474  0.99855024 0.99850297 0.99788624 0.9952355  0.93967056
 0.99904436 0.91058713 0.9887644  0.8345164  0.9945293  0.97174555
 0.83484185 0.99939144 0.9813835  0.95088136 0.9517599  0.96670467
 0.9956969  0.99804723 0.9990533  0.9987225  0.999642   0.9973133
 0.9986035  0.9937761  0.9901286  0.93692803 0.92658657 0.8186249
 0.98275465 0.9758695  0.924284   0.8951169  0.9658663  0.9260426
 0.65396327 0.9583694  0.99865437 0.99295384 0.98486334 0.9741769
 0.9393638  0.99593097 0.997776   0.98996174 0.9964103  0.9964072
 0.99056715 0.9987771  0.9113033  0.98741907 0.91456205 0.98224026
 0.9475575  0.7924257  0.97894764 0.9547751  0.96190524 0.99926645
 0.9

In [403]:
# Undersampled data 
sensitivity, specificity, misclass = evaluate_xgboost(X_train_undersampled, y_train_undersampled, X_test, y_test)
print('metrics', sensitivity, specificity, misclass)
#0.5776081424936387 0.65 0.41725768321513

[0.94460773 0.4155201  0.5693606  0.02686688 0.60274947 0.9993166
 0.10579599 0.9727585  0.35946462 0.02035164 0.64111453 0.1086268
 0.05155915 0.9577952  0.0410782  0.27768975 0.9966304  0.06385197
 0.38207445 0.9660016  0.8824126  0.99908435 0.55610144 0.9769135
 0.5177677  0.4148035  0.07862664 0.8712629  0.8493357  0.5840977
 0.95262444 0.93786156 0.9522608  0.9612885  0.10942335 0.44481143
 0.94249237 0.5876088  0.09257105 0.569048   0.7998267  0.03535303
 0.10726528 0.9765496  0.04335463 0.18106246 0.93743455 0.02823821
 0.9672998  0.6032664  0.9414748  0.527022   0.99713683 0.92212325
 0.98900455 0.39770633 0.9569418  0.020941   0.01638636 0.15633394
 0.18464176 0.9179416  0.05602549 0.5599025  0.98758626 0.01805872
 0.633666   0.1560526  0.9850503  0.99588275 0.98801315 0.24627659
 0.21687971 0.90194654 0.8962776  0.5699415  0.78768975 0.55982596
 0.32982215 0.27411607 0.22555725 0.22093531 0.6953764  0.92366225
 0.9338383  0.0045768  0.83239526 0.7685037  0.27884743 0.8655449


In [404]:
# Oversampled data 
sensitivity, specificity, misclass = evaluate_xgboost(X_train_oversampled, y_train_oversampled, X_test, y_test)
print('metrics', sensitivity, specificity, misclass)
#0.9669211195928753 0.13333333333333333 0.09219858156028371

[0.99948055 0.69417894 0.97564507 0.9486155  0.9844168  0.998679
 0.9839199  0.9943626  0.9661778  0.8085253  0.99191576 0.96908027
 0.3926043  0.9965109  0.14696804 0.97778773 0.99481624 0.6143122
 0.70819813 0.99500763 0.9998746  0.999253   0.996367   0.99914944
 0.9891663  0.95894927 0.9052356  0.99843496 0.97331965 0.92961997
 0.99287355 0.9912152  0.99906677 0.97906816 0.99931324 0.74407965
 0.999752   0.8698368  0.923955   0.7796674  0.99711573 0.9412356
 0.65509856 0.9996233  0.8889628  0.5721253  0.802917   0.9489308
 0.9965095  0.9901453  0.9904266  0.9985012  0.9998449  0.9622222
 0.99790525 0.9322568  0.9942006  0.73094696 0.45636383 0.9566007
 0.9261595  0.9818255  0.8741952  0.96627796 0.94162655 0.8632734
 0.30927697 0.8339516  0.9912487  0.99760324 0.9970976  0.9703588
 0.7971054  0.9973482  0.99331313 0.9799212  0.98471826 0.98778373
 0.93565017 0.9867033  0.7314167  0.9597275  0.973169   0.9582998
 0.97499585 0.1961178  0.9630274  0.9598197  0.9445113  0.9974565
 0.998

In [405]:
# SMOTE
sensitivity, specificity, misclass = evaluate_xgboost(X_train_smote, y_train_smote, X_test, y_test)
print('metrics', sensitivity, specificity, misclass)
#0.926208651399491 0.3 0.11820330969267134

[0.9982999  0.95751095 0.93235874 0.67247653 0.9704598  0.9698229
 0.85613257 0.9981463  0.97398925 0.6402799  0.9732387  0.5813303
 0.89887595 0.99430746 0.44329336 0.61394364 0.9408942  0.649676
 0.30808794 0.98177755 0.9997149  0.99986815 0.9299455  0.99477804
 0.9812857  0.9951746  0.66545475 0.99145997 0.9796641  0.710858
 0.8938956  0.9959959  0.997813   0.99890995 0.9461746  0.9437099
 0.99702966 0.4079875  0.8533231  0.841605   0.9861668  0.97745836
 0.75724167 0.9994167  0.37493464 0.93792814 0.9114156  0.9553289
 0.9980824  0.9966798  0.99772567 0.9971987  0.99972016 0.9917127
 0.9933206  0.41226423 0.9871885  0.81054676 0.12572484 0.85816187
 0.7993977  0.92997956 0.36360887 0.93933827 0.88590485 0.96851593
 0.23776862 0.85489136 0.99629885 0.9686886  0.9751799  0.97484493
 0.55262446 0.9982224  0.99474394 0.9525643  0.8363729  0.99836594
 0.8902972  0.9894688  0.9409363  0.9904484  0.98669565 0.9516597
 0.6427118  0.19526163 0.9758446  0.66451234 0.9536584  0.9986916
 0.941

In [406]:
# Emseble
from collections import Counter

def evaluate_xgboost(X_train, y_train, X_test):
    xgb = XGBClassifier(random_state=42)
    xgb.fit(X_train, y_train)
    y_pred = xgb.predict(X_test)
    return y_pred

predictions = []

for X_train_e, y_train_e in [(X_train_edata1, y_train_edata1), 
                            (X_train_edata2, y_train_edata2), 
                            (X_train_edata3, y_train_edata3)]:
    y_pred = evaluate_xgboost(X_train_e, y_train_e, X_test)
    predictions.append(y_pred)

#combine predictions using a majority vote
ensemble_predictions = []
for i in range(len(X_test)):
    votes = [pred[i] for pred in predictions]
    majority_vote = Counter(votes).most_common(1)[0][0]
    ensemble_predictions.append(majority_vote)
    
misclass = 1-accuracy_score(y_test, ensemble_predictions)
sensitivity = recall_score(y_test, ensemble_predictions)
conf_matrix = confusion_matrix(y_test, ensemble_predictions)
specificty = conf_matrix[0, 0] / (conf_matrix[0, 0] + conf_matrix[0, 1])

print('metrics', sensitivity, specificity, misclass)
#0.5216284987277354 0.2 0.12411347517730498

metrics 0.5368956743002544 0.3 0.458628841607565


In [393]:
# Export the balanced training set
combined_undersampled_training_data = pd.concat([X_train_smote, y_train_smote], axis=1)
combined_test_data = pd.concat([X_test, y_test], axis=1)
#training_path = "/Users/victorialu/Desktop/Datasets/bankruptcy_training.csv"
#testing_path = "/Users/victorialu/Desktop/Datasets/bankruptcy_testing.csv"

training_path = "/capstone_project/datasets/bankruptcy_training.csv"
testing_path = "/capstone_project/datasets/bankruptcy_testing.csv"

combined_undersampled_training_data.to_csv(training_path, index=False)  # Set index=False to exclude row numbers
combined_test_data.to_csv(testing_path, index=False)

In [119]:
# Experimenting with hreshold adjustment-not included in project

thresholds = [0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.96, 0.97, 0.98]

threshold_values = []
accuracy_values = []
precision_values = []
sensitivity_values = []
specificity_values = []

for threshold in thresholds:
    xgb = XGBClassifier(random_state=123)
    xgb.fit(X_train, y_train)
    
    #get predicted probabilities for class 1
    y_pred_prob = xgb.predict_proba(X_test)[:, 1]
    
    #adjust the threshold
    y_pred = (y_pred_prob >= threshold).astype(int)
    
    #calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    true_negatives = conf_matrix[0, 0]
    false_positives = conf_matrix[0, 1]
    false_negatives = conf_matrix[1, 0]
    true_positives = conf_matrix[1, 1]
    sensitivity = true_positives / (true_positives + false_negatives)
    specificity = true_negatives / (true_negatives + false_positives)
    
    #store values
    threshold_values.append(threshold)
    accuracy_values.append(accuracy)
    precision_values.append(precision)
    sensitivity_values.append(sensitivity)
    specificity_values.append(specificity)

In [90]:
metrics_df = pd.DataFrame({
    'Threshold': threshold_values,
    'Accuracy': accuracy_values,
    'Precision': precision_values,
    'Sensitivity': sensitivity_values,
    'Specificity': specificity_values
})
print(metrics_df)

   Threshold  Accuracy  Precision  Sensitivity  Specificity
0       0.50  0.925532   0.928826     0.996183     0.000000
1       0.60  0.924350   0.930788     0.992366     0.033333
2       0.70  0.920804   0.931573     0.987277     0.050000
3       0.80  0.914894   0.936430     0.974555     0.133333
4       0.90  0.869976   0.938961     0.919847     0.216667
5       0.95  0.795508   0.950073     0.823155     0.433333
6       0.96  0.764775   0.950845     0.787532     0.466667
7       0.97  0.703310   0.948074     0.720102     0.483333
8       0.98  0.607565   0.946850     0.611959     0.550000
