In [None]:
import warnings
from pathlib import Path

import joblib
import numpy as np
import optuna
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from optuna.samplers import TPESampler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
warnings.filterwarnings('ignore')

# Load data
df = pd.read_csv('/content/encoded_bankruptcy_data.csv')

print(df.head())
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it must not be wrapped in print() (that would emit a stray "None" line).
df.info()

# ============================================
# DATA PREPROCESSING
# ============================================
# Features are every column except the company identifier and the label.
X = df.drop(columns=['company_name', 'Bankruptcy_Status'])
y = df['Bankruptcy_Status']

# Report which feature columns contain missing values (if any)
print("\n" + "=" * 80, "CHECKING FOR MISSING VALUES", "=" * 80, sep="\n")
print("\nMissing values in features:")
missing_counts = X.isna().sum()
missing_features = missing_counts[missing_counts > 0]
if missing_features.empty:
    print("No missing values found")
else:
    print(missing_features)
    print(f"\nTotal features with missing values: {len(missing_features)}")

# Stratified 80/20 hold-out split so both parts keep the label ratio.
# NOTE(review): the data appears to hold several yearly rows per company, and
# this row-level split can put the same company in both train and test --
# confirm whether a company-grouped or time-based split is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"\nTrain set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")
print(f"\nTrain target distribution BEFORE SMOTE:\n{y_train.value_counts()}")
print(f"Test target distribution:\n{y_test.value_counts()}")

# ============================================
# HANDLE MISSING VALUES
# ============================================
print("\n" + "=" * 80, "HANDLING MISSING VALUES", "=" * 80, sep="\n")

# Median imputation; statistics are learned from the training split only
# and then reused unchanged on the hold-out split.
imputer = SimpleImputer(strategy='median')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

print("Missing values imputed using median strategy")

# ============================================
# FEATURE SCALING
# ============================================
# Standardization is likewise fitted on the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Global mean / std over all cells of each scaled matrix (sanity check)
train_mean, train_std = X_train_scaled.mean(), X_train_scaled.std()
test_mean, test_std = X_test_scaled.mean(), X_test_scaled.std()
print(f"\nScaled train set - Mean: {train_mean:.6f}, Std: {train_std:.6f}")
print(f"Scaled test set - Mean: {test_mean:.6f}, Std: {test_std:.6f}")

# Count non-finite entries left in the scaled training matrix
nan_count = np.isnan(X_train_scaled).sum()
inf_count = np.isinf(X_train_scaled).sum()
print(f"\nNaN in scaled train: {nan_count}")
print(f"Inf in scaled train: {inf_count}")

# ============================================
# APPLY SMOTE FOR OVERSAMPLING
# ============================================
print("\n" + "=" * 80, "APPLYING SMOTE FOR OVERSAMPLING", "=" * 80, sep="\n")

# Oversample the scaled training split only; the hold-out split is never resampled.
smote = SMOTE(k_neighbors=5, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

resampled_counts = pd.Series(y_train_resampled).value_counts()
print(f"\nTrain target distribution AFTER SMOTE:\n{resampled_counts}")
print(f"New train set size: {X_train_resampled.shape[0]} samples")

# ============================================
# OPTUNA HYPERPARAMETER OPTIMIZATION
# ============================================
print("\n" + "=" * 80, "STARTING OPTUNA HYPERPARAMETER OPTIMIZATION", "=" * 80, sep="\n")

def objective(trial):
    """Optuna objective: mean 5-fold cross-validated F1 of an XGBoost classifier.

    SMOTE runs *inside* the cross-validation pipeline, on the un-resampled
    training split. imblearn's Pipeline applies samplers only while fitting,
    so each fold oversamples just its own training portion and is scored on
    real, imbalanced validation rows. Cross-validating on data that was
    oversampled beforehand lets synthetic neighbours of validation rows leak
    into the training folds and yields an inflated F1.

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        The mean F1 score over the 5 folds (the study maximizes this value).
    """

    # Define hyperparameter search space
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 300, step=50),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 2),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 2),
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'random_state': 42,
        'tree_method': 'hist'
    }

    # Oversampling + model as one estimator so resampling is refit per fold.
    # The SMOTE settings mirror the ones used for the final training set.
    model = ImbPipeline([
        ('smote', SMOTE(random_state=42, k_neighbors=5)),
        ('xgb', XGBClassifier(**params)),
    ])

    # Cross-validation with F1 score on the original (un-resampled) training data
    cv_scores = cross_val_score(
        model, X_train_scaled, y_train,
        cv=5, scoring='f1', n_jobs=-1
    )

    return cv_scores.mean()

# Build a maximizing study with a seeded TPE sampler for repeatable suggestions
tpe_sampler = TPESampler(seed=42)
study = optuna.create_study(direction='maximize', sampler=tpe_sampler)

# Run the search: 50 trials of the objective, with a progress bar
print("\nOptimizing hyperparameters (this may take several minutes)...")
study.optimize(objective, n_trials=50, show_progress_bar=True)

# Report the best score and the hyperparameters that produced it
print("\n" + "=" * 80, "OPTIMIZATION RESULTS", "=" * 80, sep="\n")
print(f"\nBest F1 Score: {study.best_value:.4f}")
print("\nBest Parameters:")
for param_name, param_value in study.best_params.items():
    print(f"  {param_name}: {param_value}")

# ============================================
# TRAIN FINAL MODEL WITH BEST PARAMETERS
# ============================================
print("\n" + "="*80)
print("TRAINING FINAL MODEL WITH BEST PARAMETERS")
print("="*80)

# Tuned hyperparameters plus the fixed settings used during the search
best_params = study.best_params.copy()
best_params.update({
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'random_state': 42,
    'tree_method': 'hist'
})

# The final model is trained on the full SMOTE-balanced training set
xgb_best = XGBClassifier(**best_params)
xgb_best.fit(X_train_resampled, y_train_resampled)

# Cross-validated F1 estimate. SMOTE is placed inside the pipeline so it is
# refit on each fold's training portion only; scoring folds that were already
# oversampled would mix synthetic rows into validation and overstate F1.
cv_pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=42, k_neighbors=5)),
    ('xgb', XGBClassifier(**best_params)),
])
xgb_cv_f1 = cross_val_score(
    cv_pipeline, X_train_scaled, y_train,
    cv=5, scoring='f1'
)
print(f"\nXGBoost CV F1 (SMOTE within training folds only): {xgb_cv_f1.mean():.4f} (+/- {xgb_cv_f1.std() * 2:.4f})")

# ============================================
# EVALUATION FUNCTION
# ============================================
def evaluate(model, X_test, y_test, model_name):
    """Print hold-out metrics for a fitted classifier.

    Predicts on ``X_test`` and prints accuracy, precision, recall and F1
    against ``y_test``, followed by the full classification report.

    Args:
        model: fitted estimator exposing ``predict``.
        X_test: feature matrix to predict on.
        y_test: true labels for ``X_test``.
        model_name: label shown in the banner heading.

    Returns:
        None; the results are only printed.
    """
    rule = "=" * 80
    print("\n" + rule)
    print(f"{model_name} TEST SET EVALUATION")
    print(rule)

    y_pred = model.predict(X_test)

    # (label, metric function, extra keyword arguments); each metric is
    # computed right before it is printed, in this order.
    metric_rows = (
        ("\nAccuracy:  ", accuracy_score, {}),
        ("Precision: ", precision_score, {"zero_division": 0}),
        ("Recall:    ", recall_score, {"zero_division": 0}),
        ("F1 Score:  ", f1_score, {"zero_division": 0}),
    )
    for label, metric_fn, extra_kwargs in metric_rows:
        print(f"{label}{metric_fn(y_test, y_pred, **extra_kwargs):.4f}")

    print("\nClassification Report:")
    report_text = classification_report(y_test, y_pred, zero_division=0)
    print(report_text)

# Score the tuned model on the scaled hold-out split (never resampled)
evaluate(
    model=xgb_best,
    X_test=X_test_scaled,
    y_test=y_test,
    model_name="XGBOOST (SMOTE + OPTUNA)",
)

  company_name  Financial_Year  Bankruptcy_Status  Current_Assets  \
0          C_1          1999.0                  0        511267.0   
1          C_1          2000.0                  0        485856.0   
2          C_1          2001.0                  0        436656.0   
3          C_1          2002.0                  0        396412.0   
4          C_1          2003.0                  0        432204.0   

   Cost_of_Goods_Sold  Depreciation_Amortization    EBITDA  Inventory  \
0          740998.000                   833107.0  180447.0   18373.00   
1             701.854                   713811.0  179987.0   18577.00   
2          710199.000                   526477.0  217699.0   22496.00   
3             686.621                   496747.0  164658.0   27172.00   
4             709.292                   523302.0  248666.0      26.68   

   Net_Income  Total_Receivables  ...  Group_78  Group_79  Group_80  Group_81  \
0    70658.00          89031.000  ...     False     False     Fal

[I 2025-11-03 06:07:14,453] A new study created in memory with name: no-name-b4a6fa6e-2bb5-49a8-a6c3-90e98d3fcf7a



Train target distribution AFTER SMOTE:
Bankruptcy_Status
0    46869
1    46869
Name: count, dtype: int64
New train set size: 93738 samples

STARTING OPTUNA HYPERPARAMETER OPTIMIZATION

Optimizing hyperparameters (this may take several minutes)...


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-11-03 06:07:47,447] Trial 0 finished with value: 0.9415803690006396 and parameters: {'n_estimators': 150, 'max_depth': 10, 'learning_rate': 0.1205712628744377, 'subsample': 0.8394633936788146, 'colsample_bytree': 0.6624074561769746, 'min_child_weight': 2, 'gamma': 0.2904180608409973, 'reg_alpha': 1.7323522915498704, 'reg_lambda': 1.2022300234864176}. Best is trial 0 with value: 0.9415803690006396.
[I 2025-11-03 06:08:05,716] Trial 1 finished with value: 0.897044428101108 and parameters: {'n_estimators': 250, 'max_depth': 3, 'learning_rate': 0.2708160864249968, 'subsample': 0.9329770563201687, 'colsample_bytree': 0.6849356442713105, 'min_child_weight': 2, 'gamma': 0.9170225492671691, 'reg_alpha': 0.6084844859190754, 'reg_lambda': 1.0495128632644757}. Best is trial 0 with value: 0.9415803690006396.
[I 2025-11-03 06:08:25,909] Trial 2 finished with value: 0.8875408754684733 and parameters: {'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.08012737503998542, 'subsample': 0.6

In [None]:
# ============================================
# SAVE MODEL, SCALER, AND IMPUTER
# ============================================
# joblib.dump does not create missing directories, so make sure the output
# folder exists first (it is absent on a fresh runtime).
OUTPUT_DIR = Path('/content/Output_prediction')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

joblib.dump(xgb_best, OUTPUT_DIR / 'xgb_smote_optuna_model.pkl')
joblib.dump(scaler, OUTPUT_DIR / 'scaler.pkl')
joblib.dump(imputer, OUTPUT_DIR / 'imputer.pkl')

print("\n" + "="*80)
print("MODEL, SCALER, AND IMPUTER SAVED SUCCESSFULLY!")
print("="*80)
print("Files saved:")
print("  - xgb_smote_optuna_model.pkl")
print("  - scaler.pkl")
print("  - imputer.pkl")

# ============================================
# PREDICT ON TEST DATA
# ============================================
print("\n" + "="*80)
print("GENERATING PREDICTIONS FOR TEST DATA")
print("="*80)

# Load test data
test_df = pd.read_csv("/content/encoded_bankruptcy_test.csv")
print(f"\nTest data shape: {test_df.shape}")

# Prepare features: take exactly the columns the imputer/scaler/model were
# fitted on, in training order. Deriving the list from the test file instead
# would break (or silently misalign) if the file carries extra columns such
# as the label, or orders its columns differently.
FEATURE_COLUMNS = list(X_train.columns)
missing_columns = [col for col in FEATURE_COLUMNS if col not in test_df.columns]
if missing_columns:
    raise ValueError(
        f"Test data is missing {len(missing_columns)} training feature column(s), "
        f"e.g. {missing_columns[:10]}"
    )
X_test_final = test_df[FEATURE_COLUMNS]

# Apply same preprocessing pipeline (transform only; nothing is refitted)
X_test_final_imputed = imputer.transform(X_test_final)
X_test_final_scaled = scaler.transform(X_test_final_imputed)

# Make predictions
predictions = xgb_best.predict(X_test_final_scaled)

# Create output dataframe
test_df['predicted_output'] = predictions
final_df = test_df[['company_name', 'predicted_output']]

# Save predictions (create the target folder if it does not exist yet)
OUTPUT_PATH = "/content/Output_prediction/final_predictions_smote_optuna.csv"
Path(OUTPUT_PATH).parent.mkdir(parents=True, exist_ok=True)
final_df.to_csv(OUTPUT_PATH, index=False)

print(f"\n✅ Predictions saved to: {OUTPUT_PATH}")
print(f"\nPrediction distribution:")
print(final_df['predicted_output'].value_counts())


MODEL, SCALER, AND IMPUTER SAVED SUCCESSFULLY!
Files saved:
  - xgb_smote_optuna_model.pkl
  - scaler.pkl
  - imputer.pkl

GENERATING PREDICTIONS FOR TEST DATA

Test data shape: (15893, 103)

✅ Predictions saved to: /content/Output_prediction/final_predictions_smote_optuna.csv

Prediction distribution:
predicted_output
0    15153
1      740
Name: count, dtype: int64


In [None]:
%pip install optuna

Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.5.0
