In [None]:
%pip install optuna

Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m400.9/400.9 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.5.0


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                            f1_score, classification_report, confusion_matrix)
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
import optuna
from optuna.samplers import TPESampler
import joblib
import warnings
warnings.filterwarnings('ignore')

# Suppress Optuna logging
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Load data
print("="*80)
print("LOADING DATA")
print("="*80)
df = pd.read_csv('/content/encoded_bankruptcy_data.csv')
print(f"\nDataset shape: {df.shape}")
print(f"\nFirst few rows:")
print(df.head())
print(f"\nDataset info:")
print(df.info())

# ============================================
# DATA PREPROCESSING
# ============================================
print("\n" + "="*80)
print("DATA PREPROCESSING")
print("="*80)

X = df.drop(['company_name', 'Bankruptcy_Status'], axis=1)
y = df['Bankruptcy_Status']

print(f"\nFeatures shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nTarget distribution:\n{y.value_counts()}")
print(f"Class imbalance ratio: {y.value_counts()[0]/y.value_counts()[1]:.2f}:1")

# Check for missing values
print("\n" + "="*80)
print("CHECKING FOR MISSING VALUES")
print("="*80)
missing_counts = X.isnull().sum()
missing_features = missing_counts[missing_counts > 0]
if len(missing_features) > 0:
    print(f"\nFeatures with missing values:")
    print(missing_features)
    print(f"\nTotal features with missing values: {len(missing_features)}")
    print(f"Total missing values: {missing_counts.sum()}")
else:
    print("\nNo missing values found")

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\n" + "="*80)
print("TRAIN-TEST SPLIT")
print("="*80)
print(f"\nTrain set: {X_train.shape[0]} samples ({X_train.shape[0]/len(df)*100:.1f}%)")
print(f"Test set: {X_test.shape[0]} samples ({X_test.shape[0]/len(df)*100:.1f}%)")
print(f"\nTrain target distribution:\n{y_train.value_counts()}")
print(f"Test target distribution:\n{y_test.value_counts()}")

# ============================================
# HANDLE MISSING VALUES
# ============================================
print("\n" + "="*80)
print("HANDLING MISSING VALUES")
print("="*80)

imputer = SimpleImputer(strategy='median')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

print("‚úì Missing values imputed using median strategy")

# ============================================
# FEATURE SCALING
# ============================================
print("\n" + "="*80)
print("FEATURE SCALING")
print("="*80)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

print(f"\n‚úì Features standardized (mean=0, std=1)")
print(f"Train set - Mean: {X_train_scaled.mean():.6f}, Std: {X_train_scaled.std():.6f}")
print(f"Test set - Mean: {X_test_scaled.mean():.6f}, Std: {X_test_scaled.std():.6f}")

# Verify no NaN or inf values
assert not np.isnan(X_train_scaled).any(), "NaN values detected in train set"
assert not np.isinf(X_train_scaled).any(), "Inf values detected in train set"
print(f"\n‚úì Data validation passed (no NaN or Inf values)")

# ============================================
# APPLY SMOTE FOR OVERSAMPLING
# ============================================
print("\n" + "="*80)
print("APPLYING SMOTE FOR CLASS BALANCING")
print("="*80)

print(f"\nBefore SMOTE:")
print(y_train.value_counts())

smote = SMOTE(random_state=42, k_neighbors=5)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

print(f"\nAfter SMOTE:")
print(pd.Series(y_train_resampled).value_counts())
print(f"\n‚úì Train set balanced: {X_train_resampled.shape[0]} samples")

# ============================================
# EVALUATION FUNCTION
# ============================================
def evaluate_model(model, X_test, y_test, model_name):
    """Comprehensive model evaluation"""
    print("\n" + "=" * 80)
    print(f"{model_name} - TEST SET EVALUATION")
    print("=" * 80)

    y_pred = model.predict(X_test)

    # Calculate metrics
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    print(f"\n{'Metric':<15} {'Score':<10}")
    print("-" * 25)
    print(f"{'Accuracy':<15} {acc:.4f}")
    print(f"{'Precision':<15} {prec:.4f}")
    print(f"{'Recall':<15} {rec:.4f}")
    print(f"{'F1 Score':<15} {f1:.4f}")

    print("\n" + "-" * 80)
    print("CLASSIFICATION REPORT")
    print("-" * 80)
    print(classification_report(y_test, y_pred, zero_division=0))

    print("CONFUSION MATRIX")
    print("-" * 80)
    cm = confusion_matrix(y_test, y_pred)
    print(f"True Negatives:  {cm[0,0]:>6}  |  False Positives: {cm[0,1]:>6}")
    print(f"False Negatives: {cm[1,0]:>6}  |  True Positives:  {cm[1,1]:>6}")

    return {'accuracy': acc, 'precision': prec, 'recall': rec, 'f1': f1}

# ============================================
# XGBOOST OPTIMIZATION
# ============================================
print("\n" + "="*80)
print("XGBOOST HYPERPARAMETER OPTIMIZATION")
print("="*80)

def objective_xgb(trial):
    """Optuna objective function for XGBoost"""
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500, step=50),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 2),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 2),
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'random_state': 42,
        'tree_method': 'hist',
        'verbosity': 0
    }

    model = XGBClassifier(**params)
    cv_scores = cross_val_score(
        model, X_train_resampled, y_train_resampled,
        cv=5, scoring='f1', n_jobs=-1
    )

    return cv_scores.mean()

study_xgb = optuna.create_study(
    direction='maximize',
    sampler=TPESampler(seed=42)
)

print("\nüîç Optimizing XGBoost hyperparameters (50 trials)...")
study_xgb.optimize(objective_xgb, n_trials=50, show_progress_bar=True)

print("\n" + "="*80)
print("XGBOOST OPTIMIZATION RESULTS")
print("="*80)
print(f"\nBest CV F1 Score: {study_xgb.best_value:.4f}")
print(f"\nBest Hyperparameters:")
for key, value in study_xgb.best_params.items():
    print(f"  {key:<20} {value}")

# Train final XGBoost model
print("\n" + "="*80)
print("TRAINING FINAL XGBOOST MODEL")
print("="*80)

best_params_xgb = study_xgb.best_params.copy()
best_params_xgb.update({
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'random_state': 42,
    'tree_method': 'hist',
    'verbosity': 0
})

xgb_best = XGBClassifier(**best_params_xgb)
xgb_best.fit(X_train_resampled, y_train_resampled)

xgb_cv_f1 = cross_val_score(
    xgb_best, X_train_resampled, y_train_resampled,
    cv=5, scoring='f1', n_jobs=-1
)
print(f"\n‚úì XGBoost CV F1 Score: {xgb_cv_f1.mean():.4f} (¬±{xgb_cv_f1.std() * 2:.4f})")

xgb_metrics = evaluate_model(xgb_best, X_test_scaled, y_test, "XGBOOST (SMOTE + OPTUNA)")

# ============================================
# LIGHTGBM OPTIMIZATION
# ============================================
print("\n" + "="*80)
print("LIGHTGBM HYPERPARAMETER OPTIMIZATION")
print("="*80)

def objective_lgbm(trial):
    """Optuna objective function for LightGBM"""
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500, step=50),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3, 10, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 2),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 2),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'objective': 'binary',
        'metric': 'binary_logloss',
        'random_state': 42,
        'verbosity': -1,
        'force_col_wise': True
    }

    model = LGBMClassifier(**params)
    cv_scores = cross_val_score(
        model, X_train_resampled, y_train_resampled,
        cv=5, scoring='f1', n_jobs=-1
    )

    return cv_scores.mean()

study_lgbm = optuna.create_study(
    direction='maximize',
    sampler=TPESampler(seed=42)
)

print("\nüîç Optimizing LightGBM hyperparameters (50 trials)...")
study_lgbm.optimize(objective_lgbm, n_trials=50, show_progress_bar=True)

print("\n" + "="*80)
print("LIGHTGBM OPTIMIZATION RESULTS")
print("="*80)
print(f"\nBest CV F1 Score: {study_lgbm.best_value:.4f}")
print(f"\nBest Hyperparameters:")
for key, value in study_lgbm.best_params.items():
    print(f"  {key:<20} {value}")

# Train final LightGBM model
print("\n" + "="*80)
print("TRAINING FINAL LIGHTGBM MODEL")
print("="*80)

best_params_lgbm = study_lgbm.best_params.copy()
best_params_lgbm.update({
    'objective': 'binary',
    'metric': 'binary_logloss',
    'random_state': 42,
    'verbosity': -1,
    'force_col_wise': True
})

lgbm_best = LGBMClassifier(**best_params_lgbm)
lgbm_best.fit(X_train_resampled, y_train_resampled)

lgbm_cv_f1 = cross_val_score(
    lgbm_best, X_train_resampled, y_train_resampled,
    cv=5, scoring='f1', n_jobs=-1
)
print(f"\n‚úì LightGBM CV F1 Score: {lgbm_cv_f1.mean():.4f} (¬±{lgbm_cv_f1.std() * 2:.4f})")

lgbm_metrics = evaluate_model(lgbm_best, X_test_scaled, y_test, "LIGHTGBM (SMOTE + OPTUNA)")

# ============================================
# MODEL COMPARISON
# ============================================
print("\n" + "="*80)
print("MODEL COMPARISON SUMMARY")
print("="*80)

comparison_df = pd.DataFrame({
    'Model': ['XGBoost', 'LightGBM'],
    'Accuracy': [xgb_metrics['accuracy'], lgbm_metrics['accuracy']],
    'Precision': [xgb_metrics['precision'], lgbm_metrics['precision']],
    'Recall': [xgb_metrics['recall'], lgbm_metrics['recall']],
    'F1 Score': [xgb_metrics['f1'], lgbm_metrics['f1']]
})

print("\n", comparison_df.to_string(index=False))

best_model_idx = comparison_df['F1 Score'].idxmax()
best_model_name = comparison_df.loc[best_model_idx, 'Model']
best_f1 = comparison_df.loc[best_model_idx, 'F1 Score']

print(f"\nüèÜ Best Model: {best_model_name} (F1 Score: {best_f1:.4f})")

# ============================================
# SAVE MODELS
# ============================================
print("\n" + "="*80)
print("SAVING MODELS AND PREPROCESSORS")
print("="*80)

try:
    joblib.dump(xgb_best, 'xgboost_bankruptcy_model.pkl')
    joblib.dump(lgbm_best, 'lightgbm_bankruptcy_model.pkl')
    joblib.dump(scaler, 'scaler.pkl')
    joblib.dump(imputer, 'imputer.pkl')
    print("\n‚úì Models and preprocessors saved successfully:")
    print("  - xgboost_bankruptcy_model.pkl")
    print("  - lightgbm_bankruptcy_model.pkl")
    print("  - scaler.pkl")
    print("  - imputer.pkl")
except Exception as e:
    print(f"\n‚úó Error saving models: {e}")

print("\n" + "="*80)
print("PIPELINE COMPLETED SUCCESSFULLY")
print("="*80)

LOADING DATA

Dataset shape: (62789, 104)

First few rows:
  company_name  Financial_Year  Bankruptcy_Status  Current_Assets  \
0          C_1          1999.0                  0        511267.0   
1          C_1          2000.0                  0        485856.0   
2          C_1          2001.0                  0        436656.0   
3          C_1          2002.0                  0        396412.0   
4          C_1          2003.0                  0        432204.0   

   Cost_of_Goods_Sold  Depreciation_Amortization    EBITDA  Inventory  \
0          740998.000                   833107.0  180447.0   18373.00   
1             701.854                   713811.0  179987.0   18577.00   
2          710199.000                   526477.0  217699.0   22496.00   
3             686.621                   496747.0  164658.0   27172.00   
4             709.292                   523302.0  248666.0      26.68   

   Net_Income  Total_Receivables  ...  Group_78  Group_79  Group_80  Group_81  \
0    7

  0%|          | 0/50 [00:00<?, ?it/s]


XGBOOST OPTIMIZATION RESULTS

Best CV F1 Score: 0.9552

Best Hyperparameters:
  n_estimators         300
  max_depth            10
  learning_rate        0.2471009793656904
  subsample            0.9815175554337736
  colsample_bytree     0.7499444636304351
  min_child_weight     8
  gamma                0.05243099536025664
  reg_alpha            0.22207577681234913
  reg_lambda           1.2806933972893573

TRAINING FINAL XGBOOST MODEL

‚úì XGBoost CV F1 Score: 0.9552 (¬±0.0911)

XGBOOST (SMOTE + OPTUNA) - TEST SET EVALUATION

Metric          Score     
-------------------------
Accuracy        0.9251
Precision       0.4245
Recall          0.3341
F1 Score        0.3739

--------------------------------------------------------------------------------
CLASSIFICATION REPORT
--------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.95      0.97      0.96     11717
           1       0.42    

  0%|          | 0/50 [00:00<?, ?it/s]


LIGHTGBM OPTIMIZATION RESULTS

Best CV F1 Score: 0.9567

Best Hyperparameters:
  n_estimators         300
  max_depth            9
  learning_rate        0.24612964005707397
  subsample            0.7523714481169688
  colsample_bytree     0.6409991817629551
  min_child_weight     0.001742391782691527
  reg_alpha            0.332120778304002
  reg_lambda           0.1808300556050852
  num_leaves           128
  min_child_samples    21

TRAINING FINAL LIGHTGBM MODEL

‚úì LightGBM CV F1 Score: 0.9567 (¬±0.0993)

LIGHTGBM (SMOTE + OPTUNA) - TEST SET EVALUATION

Metric          Score     
-------------------------
Accuracy        0.9299
Precision       0.4673
Recall          0.3317
F1 Score        0.3880

--------------------------------------------------------------------------------
CLASSIFICATION REPORT
--------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.95      0.97      0.96     11

In [None]:
import joblib
import pandas as pd

# ===============================
# LOAD SAVED ARTIFACTS
# ===============================
print("="*80)
print("LOADING SAVED MODEL AND PREPROCESSORS")
print("="*80)

# Load the saved LightGBM model, imputer, and scaler
lgbm_model = joblib.load('lightgbm_bankruptcy_model.pkl')
imputer = joblib.load('imputer.pkl')
scaler = joblib.load('scaler.pkl')

# ===============================
# LOAD TEST DATA (WITHOUT TARGET)
# ===============================
print("\n" + "="*80)
print("LOADING TEST DATA")
print("="*80)

test_df = pd.read_csv('/content/encoded_bankruptcy_test.csv')  # üëà change path if needed
print(f"Test data shape: {test_df.shape}")
print(test_df.head())

# Keep company_name for final output
company_names = test_df['company_name']

# Drop non-feature columns
X_test = test_df.drop(['company_name'], axis=1)

# ===============================
# PREPROCESS TEST DATA
# ===============================
print("\n" + "="*80)
print("PREPROCESSING TEST DATA")
print("="*80)

# Handle missing values and scale features
X_test_imputed = imputer.transform(X_test)
X_test_scaled = scaler.transform(X_test_imputed)

print("‚úì Missing values handled and data scaled successfully.")

# ===============================
# MAKE PREDICTIONS
# ===============================
print("\n" + "="*80)
print("MAKING PREDICTIONS WITH LIGHTGBM")
print("="*80)

y_pred = lgbm_model.predict(X_test_scaled)

# Convert predictions to 0/1 integers if needed
y_pred = [int(p) for p in y_pred]

# ===============================
# SAVE PREDICTIONS TO CSV
# ===============================
print("\n" + "="*80)
print("SAVING PREDICTIONS TO CSV")
print("="*80)

output_df = pd.DataFrame({
    'company_name': company_names,
    'Predicted_Bankruptcy_Status': y_pred
})

output_df.to_csv('bankruptcy_predictions.csv', index=False)
print("Predictions saved to 'bankruptcy_predictions.csv' successfully.")

print("\nSample output:")
print(output_df.head())


LOADING SAVED MODEL AND PREPROCESSORS

LOADING TEST DATA
Test data shape: (15893, 103)
  company_name  Financial_Year  Current_Assets  Cost_of_Goods_Sold  \
0          C_3            1999        9757.000            13986.00   
1          C_3            2000           7.884            11608.00   
2          C_3            2001        6494.000             8635.00   
3          C_3            2002        5938.000                7.85   
4          C_3            2004        5807.000             6245.00   

   Depreciation_Amortization  EBITDA  Inventory  Net_Income  \
0                    19796.0  5974.0    667.000    -932.000   
1                    16506.0  4875.0      0.700      -0.028   
2                       15.7  3873.0      0.761      -0.380   
3                    12919.0  2546.0    355.000     356.000   
4                    12018.0   222.0      0.160    1454.000   

   Total_Receivables  Market_Value  ...  Group_78  Group_79  Group_80  \
0             -265.0      9574.000  ... 

In [20]:
predicted_df = pd.read_csv('/content/bankruptcy_predictions.csv')

# Replace 0 and 1 with text labels in the target column
predicted_df['Predicted_Bankruptcy_Status'] = predicted_df['Predicted_Bankruptcy_Status'].replace({0: 'alive', 1: 'failed'})

# Optional: Save the updated CSV
predicted_df.to_csv('/content/bankruptcy_predictions_labeled.csv', index=False)

# Check result
predicted_df['Predicted_Bankruptcy_Status'].value_counts()

Unnamed: 0_level_0,count
Predicted_Bankruptcy_Status,Unnamed: 1_level_1
alive,15231
failed,662
