In [1]:
!pip install imbalanced-learn



In [2]:
# Step 1: Import your libraries
import pandas as pd
import numpy as np

# Sklearn ML libraries
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import f1_score, confusion_matrix, classification_report

#imbalanced-learn ML libraries
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTEENN, SMOTETomek

#turn off warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Step 2: Read CSV from Part II
df = pd.read_csv('Taiwan_data_ENG_95.csv', index_col=False)

In [4]:
# Step 3: Prepare your independent and dependent variables
X = df.drop('Flag', axis=1)
y = df['Flag']

In [5]:
# Step 4: Split your data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [6]:
# Step 5: Upsample your train data
sm = SMOTE(sampling_strategy='auto', k_neighbors=1, random_state=42)
X_sm, y_sm = sm.fit_resample(X_train, y_train)

In [7]:
# Optional: Measure length of the new upsample data
print("X: Before resample: ",len(X_train), " After resample: ",len(X_sm))
print("y: Before resample: ",len(y_train), " After resample: ",len(y_sm))

X: Before resample:  5455  After resample:  10558
y: Before resample:  5455  After resample:  10558


In [8]:
def performance(model_name, actual, prediction):
    print(f'Scores for {model_name}:')
    f1 = f1_score(actual, prediction)
#     print('\nF1 Score: ', f1)
#     print('\nConfusion Matrix:\n ', confusion_matrix(actual, prediction))
    cm = confusion_matrix(actual, prediction).flatten().tolist()
    print('\nClassification Report:\n ', classification_report(actual, prediction))
    
    return f1, cm

In [9]:
def train_access_models(models_list, X_fit, y_fit):
    model_scores = []
    model_cm = []
    for model in models_list:
        model.fit(X_fit, y_fit)
        pred = model.predict(X_test)
        f1, cm = performance(type(model).__name__, y_test, pred)
        model_scores.append(f1)
        model_cm.append(cm)
    
    return model_scores, model_cm

In [10]:
# train_access_models([DummyClassifier, LogisticRegression, RandomForestClassifier, GradientBoostingClassifier])
dummy_clf = DummyClassifier()
logistic_reg = LogisticRegression(solver='liblinear')
rf_clf = RandomForestClassifier()
gb_clf = GradientBoostingClassifier()

SMOTE_scores, SMOTE_cm = train_access_models([dummy_clf, logistic_reg, rf_clf, gb_clf], X_sm, y_sm)

Scores for DummyClassifier:

Classification Report:
                precision    recall  f1-score   support

           0       0.97      0.49      0.65      1320
           1       0.03      0.48      0.06        44

    accuracy                           0.49      1364
   macro avg       0.50      0.49      0.36      1364
weighted avg       0.94      0.49      0.63      1364

Scores for LogisticRegression:

Classification Report:
                precision    recall  f1-score   support

           0       0.99      0.87      0.93      1320
           1       0.18      0.86      0.30        44

    accuracy                           0.87      1364
   macro avg       0.59      0.87      0.62      1364
weighted avg       0.97      0.87      0.91      1364

Scores for RandomForestClassifier:

Classification Report:
                precision    recall  f1-score   support

           0       0.98      0.98      0.98      1320
           1       0.49      0.45      0.47        44

    accura

In [12]:
# Upsample train data with ADASYN
ada = ADASYN(sampling_strategy='auto', n_neighbors=5, random_state=42)
X_ada, y_ada = ada.fit_resample(X_train, y_train)

ADASYN_scores, ADASYN_cm = train_access_models([dummy_clf, logistic_reg, rf_clf, gb_clf], X_ada, y_ada)

Scores for DummyClassifier:

Classification Report:
                precision    recall  f1-score   support

           0       0.97      0.49      0.65      1320
           1       0.03      0.48      0.06        44

    accuracy                           0.49      1364
   macro avg       0.50      0.49      0.36      1364
weighted avg       0.94      0.49      0.63      1364

Scores for LogisticRegression:

Classification Report:
                precision    recall  f1-score   support

           0       0.99      0.86      0.93      1320
           1       0.18      0.86      0.29        44

    accuracy                           0.86      1364
   macro avg       0.58      0.86      0.61      1364
weighted avg       0.97      0.86      0.90      1364

Scores for RandomForestClassifier:

Classification Report:
                precision    recall  f1-score   support

           0       0.98      0.97      0.98      1320
           1       0.41      0.55      0.47        44

    accura

In [13]:
# Upsample train data with SMOTEENN
sme = SMOTEENN(sampling_strategy='auto',random_state=42)
X_sme, y_sme = sme.fit_resample(X_train, y_train)

SMOTEENN_scores, SMOTEENN_cm = train_access_models([dummy_clf, logistic_reg, rf_clf, gb_clf], X_sme, y_sme)

Scores for DummyClassifier:

Classification Report:
                precision    recall  f1-score   support

           0       0.97      0.49      0.65      1320
           1       0.04      0.59      0.07        44

    accuracy                           0.49      1364
   macro avg       0.50      0.54      0.36      1364
weighted avg       0.94      0.49      0.63      1364

Scores for LogisticRegression:

Classification Report:
                precision    recall  f1-score   support

           0       1.00      0.85      0.92      1320
           1       0.16      0.91      0.28        44

    accuracy                           0.85      1364
   macro avg       0.58      0.88      0.60      1364
weighted avg       0.97      0.85      0.89      1364

Scores for RandomForestClassifier:

Classification Report:
                precision    recall  f1-score   support

           0       0.99      0.95      0.97      1320
           1       0.32      0.75      0.45        44

    accura

In [14]:
# Upsample train data with SMOTETomek
smt = SMOTETomek(sampling_strategy='auto',random_state=42)
X_smt, y_smt = smt.fit_resample(X_train, y_train)

SMOTETomek_scores, SMOTETomek_cm = train_access_models([dummy_clf, logistic_reg, rf_clf, gb_clf], X_smt, y_smt)

Scores for DummyClassifier:

Classification Report:
                precision    recall  f1-score   support

           0       0.97      0.46      0.63      1320
           1       0.03      0.50      0.06        44

    accuracy                           0.47      1364
   macro avg       0.50      0.48      0.34      1364
weighted avg       0.94      0.47      0.61      1364

Scores for LogisticRegression:

Classification Report:
                precision    recall  f1-score   support

           0       0.99      0.87      0.93      1320
           1       0.18      0.86      0.30        44

    accuracy                           0.87      1364
   macro avg       0.59      0.87      0.62      1364
weighted avg       0.97      0.87      0.91      1364

Scores for RandomForestClassifier:

Classification Report:
                precision    recall  f1-score   support

           0       0.98      0.97      0.98      1320
           1       0.37      0.52      0.43        44

    accura

<details>
    <summary><strong>Cick once to see which upsampler worked best</strong></summary>
    <div>For us, SMOTETomek did the best. This is ours, but don't worry if your outcome is different</div>
    <br>
    <img src = 'https://uplevelsg.s3.ap-southeast-1.amazonaws.com/ProjectBankruptcyPrediction/SMOTETomekPerformance.png'>
</details>

In [15]:
df_scores = pd.DataFrame({'SMOTE': SMOTE_scores,
                         'ADASYN': ADASYN_scores,
                         'SMOTEENN': SMOTEENN_scores,
                         'SMOTETOMEK': SMOTETomek_scores},
                        index=['Dummy', 'Logistic', 'RandomForest', 'GradientBoosting'])
df_scores

Unnamed: 0,SMOTE,ADASYN,SMOTEENN,SMOTETOMEK
Dummy,0.057299,0.057221,0.069799,0.056921
Logistic,0.302789,0.291188,0.278746,0.304
RandomForest,0.470588,0.470588,0.452055,0.429907
GradientBoosting,0.384106,0.355828,0.341969,0.368098


In [16]:
def getMatrix(cm_list, str_index):
    cm_array = np.array(cm_list)
    
    cm_multiindex = pd.MultiIndex.from_product([[str_index],
                                  ['Dummy', 'Logistic', 'RandomForest', 'GradientBoosting']],
                                 names=['Upsampler', 'Model'])
    
    df_cm = pd.DataFrame({'true_negative': cm_array[:, 0].tolist(),
                          'false_positive': cm_array[:, 1].tolist(),
                          'false_negative': cm_array[:, 2].tolist(),
                          'true_positive': cm_array[:, 3].tolist()},
                         index=cm_multiindex)

    return df_cm

In [17]:
df_cm1 = getMatrix(SMOTE_cm, 'SMOTE')
df_cm2 = getMatrix(ADASYN_cm, 'ADASYN')
df_cm3 = getMatrix(SMOTEENN_cm, 'SMOTEENN')
df_cm4 = getMatrix(SMOTETomek_cm, 'SMOTETOMEK')

In [18]:
df_cm = pd.concat([df_cm1, df_cm2, df_cm3, df_cm4])
df_cm

Unnamed: 0_level_0,Unnamed: 1_level_0,true_negative,false_positive,false_negative,true_positive
Upsampler,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SMOTE,Dummy,652,668,23,21
SMOTE,Logistic,1151,169,6,38
SMOTE,RandomForest,1299,21,24,20
SMOTE,GradientBoosting,1242,78,15,29
ADASYN,Dummy,651,669,23,21
ADASYN,Logistic,1141,179,6,38
ADASYN,RandomForest,1286,34,20,24
ADASYN,GradientBoosting,1230,90,15,29
SMOTEENN,Dummy,645,675,18,26
SMOTEENN,Logistic,1117,203,4,40


In [19]:
df_cm.sort_values(by='false_negative')

Unnamed: 0_level_0,Unnamed: 1_level_0,true_negative,false_positive,false_negative,true_positive
Upsampler,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SMOTEENN,Logistic,1117,203,4,40
SMOTE,Logistic,1151,169,6,38
ADASYN,Logistic,1141,179,6,38
SMOTETOMEK,Logistic,1152,168,6,38
SMOTEENN,RandomForest,1251,69,11,33
SMOTEENN,GradientBoosting,1204,116,11,33
SMOTETOMEK,GradientBoosting,1231,89,14,30
SMOTE,GradientBoosting,1242,78,15,29
ADASYN,GradientBoosting,1230,90,15,29
SMOTEENN,Dummy,645,675,18,26


In [20]:
df_cm.sort_values(by='true_positive', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,true_negative,false_positive,false_negative,true_positive
Upsampler,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SMOTEENN,Logistic,1117,203,4,40
SMOTE,Logistic,1151,169,6,38
ADASYN,Logistic,1141,179,6,38
SMOTETOMEK,Logistic,1152,168,6,38
SMOTEENN,RandomForest,1251,69,11,33
SMOTEENN,GradientBoosting,1204,116,11,33
SMOTETOMEK,GradientBoosting,1231,89,14,30
SMOTE,GradientBoosting,1242,78,15,29
ADASYN,GradientBoosting,1230,90,15,29
SMOTEENN,Dummy,645,675,18,26


In [21]:
# Step 10: Import GridSearchCV
from sklearn.model_selection import GridSearchCV

In [22]:
# Step 11: Define the parameter grid
n_estimators = [50, 100, 150]
max_depth = [3, 4, 5]
max_features = [13, 15, 17]
param_grid = {'n_estimators': n_estimators,
             'max_depth': max_depth,
             'max_features': max_features}

In [23]:
# Step 12: Declare a GridSearchCV object
gb = GradientBoostingClassifier()
grid_search = GridSearchCV(estimator = gb, param_grid = param_grid, 
                          cv = 5, n_jobs = 4, scoring='precision', verbose = 2)

In [24]:
# Step 13: Fit your upsampled train data with your GridSearchCV object
grid_search.fit(X_sme, y_sme)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  4.1min
[Parallel(n_jobs=4)]: Done 135 out of 135 | elapsed: 20.0min finished


GridSearchCV(cv=5, estimator=GradientBoostingClassifier(), n_jobs=4,
             param_grid={'max_depth': [3, 4, 5], 'max_features': [13, 15, 17],
                         'n_estimators': [50, 100, 150]},
             scoring='precision', verbose=2)

In [25]:
# Step 14: Get your best parameters
print(grid_search.best_params_)
print(grid_search.best_score_)

{'max_depth': 5, 'max_features': 15, 'n_estimators': 150}
0.9804193569208856


In [26]:
# Define the parameter grid for Random Forest
n_estimators = [50, 100]
max_depth = [30, 40, 50]
max_features = [15, 30, 45]
forest_params = {'bootstrap': [True], 'n_estimators': n_estimators,
                  'max_depth': max_depth, 'max_features': max_features}

In [27]:
# Step 12: Declare a GridSearchCV object
forest = RandomForestClassifier()
grid_search2 = GridSearchCV(estimator = forest, param_grid = forest_params, 
                          cv = 5, n_jobs = 4, scoring='precision', verbose = 2)

In [28]:
# Step 13: Fit your upsampled train data with your GridSearchCV object
grid_search2.fit(X_sme, y_sme)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed: 14.8min
[Parallel(n_jobs=4)]: Done  90 out of  90 | elapsed: 40.8min finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=4,
             param_grid={'bootstrap': [True], 'max_depth': [30, 40, 50],
                         'max_features': [15, 30, 45],
                         'n_estimators': [50, 100]},
             scoring='precision', verbose=2)

In [29]:
# Step 14: Get your besr parameters
print(grid_search2.best_params_)
print(grid_search2.best_score_)

{'bootstrap': True, 'max_depth': 30, 'max_features': 15, 'n_estimators': 50}
0.9824104821741155


In [33]:
# Define the parameter grid for Logistic Regression
solver =  ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
log_params = {'solver': solver,
                  'penalty': penalty, 'C': c_values}

In [34]:
# Step 12: Declare a GridSearchCV object
logreg = LogisticRegression()
grid_search3 = GridSearchCV(estimator = logreg, param_grid = log_params, 
                          cv = 5, n_jobs = 4, scoring='precision', verbose = 2)

In [35]:
# Step 13: Fit your upsampled train data with your GridSearchCV object
grid_search3.fit(X_sme, y_sme)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  1.4min
[Parallel(n_jobs=4)]: Done  75 out of  75 | elapsed:  1.7min finished


GridSearchCV(cv=5, estimator=LogisticRegression(), n_jobs=4,
             param_grid={'C': [100, 10, 1.0, 0.1, 0.01], 'penalty': ['l2'],
                         'solver': ['newton-cg', 'lbfgs', 'liblinear']},
             scoring='precision', verbose=2)

In [36]:
# Step 14: Get your besr parameters
print(grid_search3.best_params_)
print(grid_search3.best_score_)

{'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.9219575849912254


In [37]:
# Train GB model and RandomForest model with the best parameters
# fit and assess using unsampled data
gb_best1 = GradientBoostingClassifier(max_depth=5, max_features=15, n_estimators= 150)
rf_best1 = RandomForestClassifier(bootstrap=True, max_depth=30, max_features=15, n_estimators=50)
log_best1 = LogisticRegression(solver='newton-cg' , penalty='l2' , C=100 )

In [38]:
#Using original data
unsampled_scores, unsampled_cm = train_access_models([rf_best1, gb_best1, log_best1], X_train, y_train)

Scores for RandomForestClassifier:

Classification Report:
                precision    recall  f1-score   support

           0       0.98      1.00      0.99      1320
           1       0.75      0.27      0.40        44

    accuracy                           0.97      1364
   macro avg       0.86      0.63      0.69      1364
weighted avg       0.97      0.97      0.97      1364

Scores for GradientBoostingClassifier:

Classification Report:
                precision    recall  f1-score   support

           0       0.98      1.00      0.99      1320
           1       0.69      0.25      0.37        44

    accuracy                           0.97      1364
   macro avg       0.83      0.62      0.68      1364
weighted avg       0.97      0.97      0.97      1364

Scores for LogisticRegression:

Classification Report:
                precision    recall  f1-score   support

           0       0.97      1.00      0.98      1320
           1       0.58      0.16      0.25        44


In [46]:
#Using upsampled data
sampled_scores, sampled_cm = train_access_models([rf_best1, gb_best1, log_best1], X_sme, y_sme)

Scores for RandomForestClassifier:

Classification Report:
                precision    recall  f1-score   support

           0       0.99      0.95      0.97      1320
           1       0.32      0.70      0.44        44

    accuracy                           0.94      1364
   macro avg       0.65      0.83      0.70      1364
weighted avg       0.97      0.94      0.95      1364

Scores for GradientBoostingClassifier:

Classification Report:
                precision    recall  f1-score   support

           0       0.99      0.94      0.96      1320
           1       0.27      0.73      0.40        44

    accuracy                           0.93      1364
   macro avg       0.63      0.83      0.68      1364
weighted avg       0.97      0.93      0.94      1364

Scores for LogisticRegression:

Classification Report:
                precision    recall  f1-score   support

           0       0.99      0.87      0.93      1320
           1       0.18      0.84      0.29        44


In [47]:
# Step 15b: Assess your model performance
df_bestscores = pd.DataFrame({'No upsampling': unsampled_scores,
                              'With Upsampling': sampled_scores},
                         index=['RandomForest', 'GradientBoosting', 'LogisticRegression'])
df_bestscores

Unnamed: 0,No upsampling,With Upsampling
RandomForest,0.4,0.439716
GradientBoosting,0.366667,0.397516
LogisticRegression,0.25,0.291339


In [48]:
def getMatrix2(cm_list, str_index):
    cm_array = np.array(cm_list)
    
    cm_multiindex = pd.MultiIndex.from_product([[str_index],
                                  ['RandomForest', 'GradientBoosting', 'LogisticRegression']],
                                 names=['Sampling', 'Model'])
    
    df_cm = pd.DataFrame({'true_negative': cm_array[:, 0].tolist(),
                          'false_positive': cm_array[:, 1].tolist(),
                          'false_negative': cm_array[:, 2].tolist(),
                          'true_positive': cm_array[:, 3].tolist()},
                         index=cm_multiindex)

    return df_cm

In [49]:
df_cm_no = getMatrix2(unsampled_cm, 'No')
df_cm_yes = getMatrix2(sampled_cm, 'Yes')

In [50]:
df_cm_final = pd.concat([df_cm_no, df_cm_yes])
df_cm_final

Unnamed: 0_level_0,Unnamed: 1_level_0,true_negative,false_positive,false_negative,true_positive
Sampling,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
No,RandomForest,1316,4,32,12
No,GradientBoosting,1315,5,33,11
No,LogisticRegression,1315,5,37,7
Yes,RandomForest,1254,66,13,31
Yes,GradientBoosting,1235,85,12,32
Yes,LogisticRegression,1147,173,7,37


In [51]:
df_cm_final.sort_values(by='false_negative')

Unnamed: 0_level_0,Unnamed: 1_level_0,true_negative,false_positive,false_negative,true_positive
Sampling,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Yes,LogisticRegression,1147,173,7,37
Yes,GradientBoosting,1235,85,12,32
Yes,RandomForest,1254,66,13,31
No,RandomForest,1316,4,32,12
No,GradientBoosting,1315,5,33,11
No,LogisticRegression,1315,5,37,7


In [52]:
df_cm_final.sort_values(by='true_positive', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,true_negative,false_positive,false_negative,true_positive
Sampling,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Yes,LogisticRegression,1147,173,7,37
Yes,GradientBoosting,1235,85,12,32
Yes,RandomForest,1254,66,13,31
No,RandomForest,1316,4,32,12
No,GradientBoosting,1315,5,33,11
No,LogisticRegression,1315,5,37,7
