In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv('american_bankruptcy.csv')

In [None]:
df = df.sort_values(by=['company_name', 'year'])
df['bankrupt_next_year'] = df.groupby('company_name')['status_label'].shift(-1) == 'failed'
df['bankrupt_next_year'] = df['bankrupt_next_year'].astype(int)

In [None]:
features = df.drop(columns=['company_name', 'status_label', 'year', 'bankrupt_next_year'])
target = df['bankrupt_next_year']

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, stratify=target, random_state=42
)

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score

In [None]:
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42),
    "Logistic Regression": LogisticRegression(class_weight='balanced', max_iter=1000),
    "MLP": MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=200, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

results = {}

for name, model in models.items():
    print(f"\n--- {name} ---")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else y_pred

    report = classification_report(y_test, y_pred, output_dict=True)
    conf_matrix = confusion_matrix(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_proba)

    results[name] = {
        "report": report,
        "confusion_matrix": conf_matrix,
        "roc_auc": roc_auc
    }

    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", conf_matrix)
    print("ROC AUC Score:", roc_auc)


--- Random Forest ---
Classification Report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97     14815
           1       0.95      0.04      0.08       922

    accuracy                           0.94     15737
   macro avg       0.95      0.52      0.53     15737
weighted avg       0.94      0.94      0.92     15737

Confusion Matrix:
 [[14813     2]
 [  882    40]]
ROC AUC Score: 0.8686226292019507

--- Logistic Regression ---
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.31      0.47     14815
           1       0.07      0.86      0.13       922

    accuracy                           0.34     15737
   macro avg       0.52      0.58      0.30     15737
weighted avg       0.92      0.34      0.45     15737

Confusion Matrix:
 [[ 4605 10210]
 [  131   791]]
ROC AUC Score: 0.6550634982572481

--- MLP ---




Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97     14815
           1       0.64      0.07      0.13       922

    accuracy                           0.94     15737
   macro avg       0.79      0.53      0.55     15737
weighted avg       0.93      0.94      0.92     15737

Confusion Matrix:
 [[14779    36]
 [  858    64]]
ROC AUC Score: 0.7478185400122845

--- XGBoost ---


Parameters: { "use_label_encoder" } are not used.



Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97     14815
           1       0.65      0.08      0.15       922

    accuracy                           0.94     15737
   macro avg       0.80      0.54      0.56     15737
weighted avg       0.93      0.94      0.92     15737

Confusion Matrix:
 [[14774    41]
 [  846    76]]
ROC AUC Score: 0.8243901099826273


**En équilibrant la datset pour améliorer la performance des modèles**

In [None]:
#équilibrage du dataset en appliquant smote
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [None]:
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42),
    "Logistic Regression": LogisticRegression(class_weight='balanced', max_iter=1000),
    "MLP": MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=200, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

results = {}

for name, model in models.items():
    print(f"\n--- {name} ---")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else y_pred

    report = classification_report(y_test, y_pred, output_dict=True)
    conf_matrix = confusion_matrix(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_proba)

    results[name] = {
        "report": report,
        "confusion_matrix": conf_matrix,
        "roc_auc": roc_auc
    }

    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", conf_matrix)
    print("ROC AUC Score:", roc_auc)


--- Random Forest ---
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.96      0.96     14815
           1       0.42      0.43      0.43       922

    accuracy                           0.93     15737
   macro avg       0.69      0.70      0.69     15737
weighted avg       0.93      0.93      0.93     15737

Confusion Matrix:
 [[14259   556]
 [  522   400]]
ROC AUC Score: 0.8676942229653799

--- Logistic Regression ---
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.32      0.48     14815
           1       0.07      0.84      0.13       922

    accuracy                           0.35     15737
   macro avg       0.52      0.58      0.31     15737
weighted avg       0.92      0.35      0.46     15737

Confusion Matrix:
 [[ 4705 10110]
 [  144   778]]
ROC AUC Score: 0.6560633203581702

--- MLP ---




Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.75      0.84     14815
           1       0.13      0.61      0.21       922

    accuracy                           0.74     15737
   macro avg       0.55      0.68      0.53     15737
weighted avg       0.92      0.74      0.81     15737

Confusion Matrix:
 [[11072  3743]
 [  362   560]]
ROC AUC Score: 0.7438035847762315

--- XGBoost ---


Parameters: { "use_label_encoder" } are not used.



Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.83      0.89     14815
           1       0.18      0.61      0.28       922

    accuracy                           0.81     15737
   macro avg       0.57      0.72      0.58     15737
weighted avg       0.92      0.81      0.86     15737

Confusion Matrix:
 [[12235  2580]
 [  363   559]]
ROC AUC Score: 0.8008298296488214


# **Optimizing the models**

In [None]:
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score,GridSearchCV

In [None]:
def build_pipeline(model):
    return ImbPipeline(steps=[
        ("scaler", StandardScaler()),
        ("smote", SMOTE(random_state=42)),
        ("model", model)
    ])

# Models
models = {
    "Random Forest": RandomForestClassifier(n_estimators=200, max_depth=15, class_weight='balanced', random_state=42),
    "Logistic Regression": LogisticRegression(C=0.5, class_weight='balanced', solver='liblinear', max_iter=1000),
    "MLP": MLPClassifier(hidden_layer_sizes=(128, 64, 32), alpha=0.0005, max_iter=300, early_stopping=True, random_state=42),
    "XGBoost" : XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=5, use_label_encoder=False, eval_metric='logloss')
}


for name, clf in models.items():
    print(f"\n--- {name} ---")
    pipeline = build_pipeline(clf)
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    y_proba = pipeline.predict_proba(X_test)[:, 1]

    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("ROC AUC Score:", roc_auc_score(y_test, y_proba))
    print("F1 Score:", f1_score(y_test, y_pred))



--- Random Forest ---
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.86      0.91     14815
           1       0.21      0.60      0.31       922

    accuracy                           0.84     15737
   macro avg       0.59      0.73      0.61     15737
weighted avg       0.93      0.84      0.87     15737

Confusion Matrix:
 [[12672  2143]
 [  369   553]]
ROC AUC Score: 0.8189537923617604
F1 Score: 0.3056937534549475

--- Logistic Regression ---
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.32      0.48     14815
           1       0.07      0.84      0.13       922

    accuracy                           0.35     15737
   macro avg       0.52      0.58      0.30     15737
weighted avg       0.92      0.35      0.46     15737

Confusion Matrix:
 [[ 4697 10118]
 [  145   777]]
ROC AUC Score: 0.6558319051380621
F1 Score: 0.1315054582381315

--- MLP ---
Classif

Parameters: { "use_label_encoder" } are not used.



Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.71      0.82     14815
           1       0.12      0.65      0.21       922

    accuracy                           0.71     15737
   macro avg       0.55      0.68      0.52     15737
weighted avg       0.92      0.71      0.79     15737

Confusion Matrix:
 [[10589  4226]
 [  321   601]]
ROC AUC Score: 0.7539019197726406
F1 Score: 0.20907983997216908


In [None]:
# Example: Cross-validated ROC AUC
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



XGBoost CV ROC AUC: 0.8489 ± 0.0020
