In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
# Colab-only step: upload the dataset CSV from the local machine into the
# runtime. The import is kept in this cell because `google.colab` only exists
# inside a Colab runtime.
from google.colab import files
# `uploaded` keeps whatever files.upload() returns for the chosen file(s).
uploaded = files.upload()

Saving Taiwan_bankruptcy.csv to Taiwan_bankruptcy.csv


In [3]:
# Load the uploaded CSV into a DataFrame (this cell only reads the file; no
# exploration is performed here).
# NOTE(review): the absolute /content path presumably matches the Colab upload
# directory -- confirm before running outside Colab.
bankruptcy_data = pd.read_csv('/content/Taiwan_bankruptcy.csv')

In [8]:
# Strip leading/trailing whitespace from every column header so the names
# listed below can be matched exactly.
bankruptcy_data.columns = bankruptcy_data.columns.str.strip()

# Financial ratios and flags used as model inputs.
selected_features = [
    'Debt ratio %', 'Working Capital to Total Assets', 'Quick Ratio',
    'Liability-Assets Flag', 'Net Income to Total Assets', 'Net Income Flag',
]

# Target vector first, then the feature matrix restricted to the chosen columns.
y = bankruptcy_data['Bankrupt?']
X = bankruptcy_data[selected_features]


In [10]:
# Split the data: hold out 20% of the rows as the test set. `stratify=y` keeps
# the class proportions of `y` the same in both partitions, and the fixed
# random_state makes the split repeatable.
# train_test_split and StandardScaler are already imported in the first cell,
# so they are not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Feature scaling: the scaler's statistics are learned from the training rows
# only and then applied unchanged to the test rows, so no test-set information
# reaches the fitted models.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
# MODEL 1: Logistic Regression
# LogisticRegression and the metric helpers are already imported in the first
# cell, so they are not re-imported here.
logreg = LogisticRegression(max_iter=1000, random_state=42)

# Fit the model using the scaled training data
logreg.fit(X_train_scaled, y_train)

# Predicted class labels
y_pred_logreg = logreg.predict(X_test_scaled)

# Predicted probability of the positive class (column 1), kept for ROC AUC and
# threshold tuning later
y_proba_logreg = logreg.predict_proba(X_test_scaled)[:, 1]

# Confusion matrix (rows = true class, columns = predicted class) and
# per-class precision / recall / F1
print("Logistic Regression Performance:")
print(confusion_matrix(y_test, y_pred_logreg))
print(classification_report(y_test, y_pred_logreg))

# ROC AUC is computed from the probabilities, not from the hard labels
roc_auc = roc_auc_score(y_test, y_proba_logreg)
print(f"ROC AUC Score: {roc_auc:.4f}")

Logistic Regression Performance:
[[1315    5]
 [  39    5]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1320
           1       0.50      0.11      0.19        44

    accuracy                           0.97      1364
   macro avg       0.74      0.55      0.58      1364
weighted avg       0.96      0.97      0.96      1364

ROC AUC Score: 0.8937


In [14]:
# MODEL 2: K-Nearest Neighbors (KNN)
# KNeighborsClassifier and the metric helpers are already imported in the first
# cell, so they are not re-imported here.

# Baseline KNN with k = 5
knn = KNeighborsClassifier(n_neighbors=5)

# Fit on the scaled training data
knn.fit(X_train_scaled, y_train)

# Predicted class labels
y_pred_knn = knn.predict(X_test_scaled)

# Predicted probability of the positive class (column 1) for ROC AUC
y_proba_knn = knn.predict_proba(X_test_scaled)[:, 1]

# Confusion matrix (rows = true class, columns = predicted class) and
# per-class precision / recall / F1
print("KNN Performance:")
print(confusion_matrix(y_test, y_pred_knn))
print(classification_report(y_test, y_pred_knn))

roc_auc_knn = roc_auc_score(y_test, y_proba_knn)
print(f"ROC AUC Score: {roc_auc_knn:.4f}")

KNN Performance:
[[1306   14]
 [  35    9]]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1320
           1       0.39      0.20      0.27        44

    accuracy                           0.96      1364
   macro avg       0.68      0.60      0.63      1364
weighted avg       0.96      0.96      0.96      1364

ROC AUC Score: 0.8212


In [16]:
# Tune k for KNN with a 5-fold cross-validated grid search, selecting on recall
# of the positive class.
# A fresh estimator is passed inline instead of rebinding `knn`, so the fitted
# baseline model from the previous KNN cell is not overwritten by an unfitted
# one.

# Candidate neighbourhood sizes: k = 1 .. 20
param_grid = {
    'n_neighbors': list(range(1, 21))
}

grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    scoring='recall',
    cv=5,
    n_jobs=-1
)

# Fit on scaled training data.
# NOTE(review): X_train_scaled was standardized with statistics from the whole
# training set, so each CV validation fold has already influenced the scaling.
# Putting the scaler and KNN in a Pipeline inside the search would avoid that.
grid_search.fit(X_train_scaled, y_train)

best_k = grid_search.best_params_['n_neighbors']
print(f"Best k (based on recall): {best_k}")

# With the default refit=True, GridSearchCV has already refit the winning
# configuration on the full training data, so reuse it rather than training an
# identical model a second time.
best_knn = grid_search.best_estimator_

# Predictions
y_pred_best_knn = best_knn.predict(X_test_scaled)
# NOTE(review): when the selected k is 1, predict_proba can only return 0 or 1,
# so the ROC AUC below describes a single operating point rather than a ranking.
# Consider a ranking-aware scorer (e.g. 'roc_auc' or 'f1') for model selection.
y_proba_best_knn = best_knn.predict_proba(X_test_scaled)[:, 1]

# Confusion matrix (rows = true class, columns = predicted class) and
# per-class precision / recall / F1
print("Tuned KNN Performance:")
print(confusion_matrix(y_test, y_pred_best_knn))
print(classification_report(y_test, y_pred_best_knn))
print(f"ROC AUC Score: {roc_auc_score(y_test, y_proba_best_knn):.4f}")


Best k (based on recall): 1
Tuned KNN Performance:
[[1284   36]
 [  29   15]]
              precision    recall  f1-score   support

           0       0.98      0.97      0.98      1320
           1       0.29      0.34      0.32        44

    accuracy                           0.95      1364
   macro avg       0.64      0.66      0.65      1364
weighted avg       0.96      0.95      0.95      1364

ROC AUC Score: 0.6568


In [17]:
# MODEL 3: Random Forest
# RandomForestClassifier and the metric helpers are already imported in the
# first cell, so they are not re-imported here.

# 100 trees with a fixed seed; class_weight='balanced' weights classes
# inversely to their frequency in the training labels.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced'
)

# Fit the model.
# NOTE(review): the scaled features are presumably used here only to keep the
# inputs identical to the other models -- confirm.
rf.fit(X_train_scaled, y_train)

# Predicted class labels
y_pred_rf = rf.predict(X_test_scaled)

# Predicted probability of the positive class (column 1)
y_proba_rf = rf.predict_proba(X_test_scaled)[:, 1]

# Confusion matrix (rows = true class, columns = predicted class) and
# per-class precision / recall / F1
print("Random Forest Performance:")
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))
print(f"ROC AUC Score: {roc_auc_score(y_test, y_proba_rf):.4f}")

Random Forest Performance:
[[1315    5]
 [  35    9]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.99      1320
           1       0.64      0.20      0.31        44

    accuracy                           0.97      1364
   macro avg       0.81      0.60      0.65      1364
weighted avg       0.96      0.97      0.96      1364

ROC AUC Score: 0.8614
