In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings('ignore')

# Load the encoded bankruptcy dataset from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/encoded-bankruptcy-data/encoded_bankruptcy_data.csv')

# Preview the first rows of the frame.
print(df.head())
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called bare: wrapping it in print() appends a stray "None" line.
df.info()

  company_name  Financial_Year  Bankruptcy_Status  Current_Assets  \
0          C_1          1999.0                  0        511267.0   
1          C_1          2000.0                  0        485856.0   
2          C_1          2001.0                  0        436656.0   
3          C_1          2002.0                  0        396412.0   
4          C_1          2003.0                  0        432204.0   

   Cost_of_Goods_Sold  Depreciation_Amortization    EBITDA  Inventory  \
0          740998.000                   833107.0  180447.0   18373.00   
1             701.854                   713811.0  179987.0   18577.00   
2          710199.000                   526477.0  217699.0   22496.00   
3             686.621                   496747.0  164658.0   27172.00   
4             709.292                   523302.0  248666.0      26.68   

   Net_Income  Total_Receivables  ...  Group_78  Group_79  Group_80  Group_81  \
0    70658.00          89031.000  ...     False     False     Fal

## DATA PREPROCESSING

In [4]:
# Separate the label from the predictors. `company_name` is removed together
# with the label, so every remaining column is fed to the models.
# NOTE(review): the data preview shows several yearly rows for the same company;
# a row-level split can place one company in both partitions — confirm this is
# acceptable or switch to a group-aware split.
y = df['Bankruptcy_Status']
X = df.drop(columns=['company_name', 'Bankruptcy_Status'])

# Hold out 20% of the rows; `stratify=y` keeps the class ratio the same in both
# partitions and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report partition sizes and the per-class counts of each partition.
print(f"\nTrain set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"\nTrain target distribution:\n{y_train.value_counts()}")
print(f"Test target distribution:\n{y_test.value_counts()}")


Train set: 50231 samples
Test set: 12558 samples

Train target distribution:
Bankruptcy_Status
0    46869
1     3362
Name: count, dtype: int64
Test target distribution:
Bankruptcy_Status
0    11717
1      841
Name: count, dtype: int64


## FEATURE ENGINEERING (SCALING)

In [5]:
# Standardize the features. The scaler statistics are learned from the training
# partition only and then reused unchanged on the test partition.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Sanity check: overall mean / standard deviation of each scaled matrix.
print(f"\nScaled train set - Mean: {X_train_scaled.mean():.6f}, Std: {X_train_scaled.std():.6f}")
print(f"Scaled test set - Mean: {X_test_scaled.mean():.6f}, Std: {X_test_scaled.std():.6f}")


Scaled train set - Mean: -0.000000, Std: 1.000000
Scaled test set - Mean: -0.000614, Std: 0.986448


## OPTIMIZATION FOR IMBALANCED DATA

In [6]:
# Class-imbalance weight: number of training rows labelled 0 divided by the
# number labelled 1. It is passed to XGBoost as `scale_pos_weight`.
train_class_counts = y_train.value_counts()
scale_pos_weight = train_class_counts[0] / train_class_counts[1]
print(f"\nClass imbalance ratio (scale_pos_weight): {scale_pos_weight:.2f}")


Class imbalance ratio (scale_pos_weight): 13.94


## XGBOOST MODEL TRAINING 

In [7]:
# XGBoost: exhaustive hyper-parameter search with 5-fold CV, scored by F1.
# `use_label_encoder` is intentionally not passed: it is deprecated in current
# XGBoost releases and has no effect there, so it only triggers an
# "unused parameter" warning.
xgb = XGBClassifier(
    objective='binary:logistic',
    scale_pos_weight=scale_pos_weight,  # up-weight the positive class
    eval_metric='logloss',
    random_state=42,
    tree_method='hist'  # Faster training
)

# 2 * 3 * 3 * 2 * 3 = 108 candidate combinations.
xgb_params = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'min_child_weight': [1, 3, 5]  # Added to prevent overfitting
}


print("\nFitting the XGBoost model using GridSearchCV......")
xgb_grid = GridSearchCV(
    xgb, xgb_params, cv=5, scoring='f1', n_jobs=-1, verbose=1
)
xgb_grid.fit(X_train_scaled, y_train)
# Best candidate, as exposed by the fitted search object.
xgb_best = xgb_grid.best_estimator_

print(f"\nBest XGBoost params: {xgb_grid.best_params_}")
print(f"Best XGBoost CV F1 score: {xgb_grid.best_score_:.4f}")


Fitting the XGBoost model using GridSearchCV......
Fitting 5 folds for each of 108 candidates, totalling 540 fits

Best XGBoost params: {'learning_rate': 0.2, 'max_depth': 7, 'min_child_weight': 5, 'n_estimators': 200, 'subsample': 1.0}
Best XGBoost CV F1 score: 0.4118


## LightGBM Model Training

In [8]:

# LightGBM: exhaustive hyper-parameter search with 5-fold CV, scored by F1.
# Only `min_child_samples` is set here. It is an alias of LightGBM's native
# `min_data_in_leaf`; when both are supplied the native name takes priority, so
# a fixed `min_data_in_leaf=20` would silently override every
# `min_child_samples` value tried by the grid below.
lgbm = LGBMClassifier(
    class_weight='balanced',
    random_state=42,
    force_col_wise=True,  # Fix for auto-choosing warnings
    verbose=-1,  # Suppress iteration logs
    min_child_samples=20  # Prevent overfitting (tuned by the grid below)
)

# 2 * 3 * 3 * 3 * 3 * 3 = 486 candidate combinations.
lgbm_params = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'num_leaves': [15, 31, 50],  # Adjusted range
    'min_child_samples': [20, 30, 40],  # Added
    'feature_fraction': [0.8, 0.9, 1.0]  # Added for regularization
}

lgbm_grid = GridSearchCV(
    lgbm, lgbm_params, cv=5, scoring='f1', n_jobs=-1, verbose=1
)

print("\nFitting LightGBM GridSearchCV...")
lgbm_grid.fit(X_train_scaled, y_train)
# Best candidate, as exposed by the fitted search object.
lgbm_best = lgbm_grid.best_estimator_

print(f"\nBest LightGBM params: {lgbm_grid.best_params_}")
print(f"Best LightGBM CV F1 score: {lgbm_grid.best_score_:.4f}")


Fitting LightGBM GridSearchCV...
Fitting 5 folds for each of 486 candidates, totalling 2430 fits

Best LightGBM params: {'feature_fraction': 0.9, 'learning_rate': 0.2, 'max_depth': 7, 'min_child_samples': 20, 'n_estimators': 200, 'num_leaves': 50}
Best LightGBM CV F1 score: 0.4081


## Cross Validation (Metric => F1-Score) 

In [9]:
# Score each tuned estimator with 5-fold cross-validation (F1) on the scaled
# training data; XGBoost is evaluated first, then LightGBM.
xgb_cv_f1, lgbm_cv_f1 = (
    cross_val_score(tuned_model, X_train_scaled, y_train, cv=5, scoring='f1')
    for tuned_model in (xgb_best, lgbm_best)
)

# Report the mean fold score with a band of two standard deviations.
print(f"\nXGBoost CV F1: {xgb_cv_f1.mean():.4f} (+/- {xgb_cv_f1.std() * 2:.4f})")
print(f"LightGBM CV F1: {lgbm_cv_f1.mean():.4f} (+/- {lgbm_cv_f1.std() * 2:.4f})")


XGBoost CV F1: 0.4118 (+/- 0.0217)
LightGBM CV F1: 0.4081 (+/- 0.0245)


## CHECKING THE MODELS ON TEST DATA

In [10]:
# Evaluation on test set

def evaluate(model, X_test, y_test, model_name):
    """Print a test-set evaluation report for a fitted classifier.

    Args:
        model: Fitted estimator; only its ``predict`` method is used.
        X_test: Feature matrix passed to ``model.predict``.
        y_test: True labels compared against the predictions.
        model_name: Label shown in the report banner.

    Returns:
        None. Accuracy, precision, recall, F1 and the full classification
        report are written to stdout.
    """
    banner = "=" * 80
    print("\n" + banner)
    print(f"{model_name} TEST SET EVALUATION")
    print(banner)

    y_pred = model.predict(X_test)

    # zero_division=0 makes an undefined metric evaluate to 0 rather than warn.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    print(f"\nAccuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1:.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))

## SAVING MODELS

In [11]:
import joblib

# Save both trained models
joblib.dump(xgb_best, "xgb_best_model.pkl")
joblib.dump(lgbm_best, "lgbm_best_model.pkl")

# Both models were fitted on StandardScaler output, so the fitted scaler is
# persisted as well; without it the saved models cannot be applied to raw,
# unscaled feature rows later on.
joblib.dump(scaler, "feature_scaler.pkl")

print("Models saved successfully!")

Models saved successfully!


In [12]:
# Restore both persisted models from disk, XGBoost first and then LightGBM.
# joblib files are pickles: only load artifacts produced by this notebook.
xgb_best, lgbm_best = (
    joblib.load(model_path)
    for model_path in ("xgb_best_model.pkl", "lgbm_best_model.pkl")
)

print("Models loaded successfully!")

Models loaded successfully!


In [13]:
# Report held-out test metrics for each reloaded model, XGBoost first.
for fitted_model, report_title in ((xgb_best, "XGBOOST"), (lgbm_best, "LIGHTGBM")):
    evaluate(fitted_model, X_test_scaled, y_test, report_title)


XGBOOST TEST SET EVALUATION

Accuracy:  0.9051
Precision: 0.3669
Recall:    0.5755
F1 Score:  0.4481

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.93      0.95     11717
           1       0.37      0.58      0.45       841

    accuracy                           0.91     12558
   macro avg       0.67      0.75      0.70     12558
weighted avg       0.93      0.91      0.91     12558


LIGHTGBM TEST SET EVALUATION

Accuracy:  0.8946
Precision: 0.3378
Recall:    0.5981
F1 Score:  0.4318

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.92      0.94     11717
           1       0.34      0.60      0.43       841

    accuracy                           0.89     12558
   macro avg       0.65      0.76      0.69     12558
weighted avg       0.93      0.89      0.91     12558

