In [33]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import f1_score, classification_report
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Load
train = pd.read_csv("dataset/train.csv")
test = pd.read_csv("dataset/df1_matches.csv")

train.drop(columns=["Unnamed: 0"], inplace=True, errors="ignore")

# Encode labels
train["status_label"] = train["status_label"].map({'alive': 0, 'failed': 1})
test["status_label"] = test["status_label"].map({'alive': 0, 'failed': 1})

# Encode categories
le_div = LabelEncoder()
le_major = LabelEncoder()
train["Division"] = le_div.fit_transform(train["Division"])
test["Division"] = le_div.transform(test["Division"])
train["MajorGroup"] = le_major.fit_transform(train["MajorGroup"])
test["MajorGroup"] = le_major.transform(test["MajorGroup"])

# Feature / target split
X_train = train.drop(columns=["company_name", "fyear", "status_label"])
y_train = train["status_label"]
X_test  = test.drop(columns=["company_name", "fyear", "status_label"])
y_test  = test["status_label"]

# Scale
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test  = scaler.transform(X_test)

# Ratio of classes for imbalance weighting
scale_pos_weight = (len(y_train) - sum(y_train)) / sum(y_train)
print("scale_pos_weight =", round(scale_pos_weight,2))


scale_pos_weight = 13.94


In [34]:
xgb = XGBClassifier(
    n_estimators=1200,
    learning_rate=0.03,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    scale_pos_weight=scale_pos_weight,
    tree_method="hist",
    random_state=42,
    eval_metric="logloss"
)

xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

print("\n=== XGBoost ===")
print("Macro F1:", f1_score(y_test, y_pred_xgb, average='macro'))
print(classification_report(y_test, y_pred_xgb, digits=4))



=== XGBoost ===
Macro F1: 0.5628426474064498
              precision    recall  f1-score   support

           0     0.9426    0.9665    0.9544     14876
           1     0.2215    0.1396    0.1713      1017

    accuracy                         0.9135     15893
   macro avg     0.5821    0.5530    0.5628     15893
weighted avg     0.8965    0.9135    0.9043     15893



In [36]:
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import f1_score, classification_report

# Compute manual class weights
neg, pos = np.bincount(y_train)
weight_for_0 = 1.0
weight_for_1 = (neg / pos) * 1.5  # amplify a bit beyond balance

print("Manual class weights:", weight_for_0, weight_for_1)

train_pool = Pool(X_train, y_train)
test_pool  = Pool(X_test, y_test)

cat = CatBoostClassifier(
    iterations=2000,
    learning_rate=0.02,
    depth=10,
    l2_leaf_reg=5,
    border_count=128,
    class_weights=[weight_for_0, weight_for_1],
    eval_metric='F1',
    random_seed=42,
    verbose=200,
    early_stopping_rounds=200,
)

cat.fit(train_pool, eval_set=test_pool, use_best_model=True)

# Predict with tuned threshold
proba = cat.predict_proba(X_test)[:, 1]

best_f1, best_thr = 0, 0.5
for thr in np.linspace(0.1, 0.9, 17):
    f1 = f1_score(y_test, (proba > thr).astype(int), average='macro')
    if f1 > best_f1:
        best_f1, best_thr = f1, thr

y_pred_opt = (proba > best_thr).astype(int)
print(f"\nBest threshold = {best_thr:.2f}")
print("Macro F1 =", best_f1)
print(classification_report(y_test, y_pred_opt, digits=4))


Manual class weights: 1.0 20.908636688079945
0:	learn: 0.7557554	test: 0.7210278	best: 0.7210278 (0)	total: 53.2ms	remaining: 1m 46s
200:	learn: 0.8533557	test: 0.7118600	best: 0.7383829 (38)	total: 9.72s	remaining: 1m 27s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.7383829248
bestIteration = 38

Shrink model to first 39 iterations.

Best threshold = 0.60
Macro F1 = 0.5565755204195643
              precision    recall  f1-score   support

           0     0.9503    0.8684    0.9075     14876
           1     0.1483    0.3353    0.2057      1017

    accuracy                         0.8343     15893
   macro avg     0.5493    0.6018    0.5566     15893
weighted avg     0.8990    0.8343    0.8626     15893



In [None]:
# ============================================
# Bankruptcy Classification â€” Clean Final Script
# ============================================

import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from catboost import CatBoostClassifier
from collections import Counter

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# -----------------------------
# 1) Load
# -----------------------------
train = pd.read_csv("dataset/train.csv")
test  = pd.read_csv("dataset/df1_matches.csv")

# Align label column name if needed (your text shows both status_label and Bankruptcy_Status)
LABEL_COL = "status_label" if "status_label" in train.columns else "Bankruptcy_Status"

# Map labels to {alive:0, failed:1}
label_map = {"alive": 0, "failed": 1}
train[LABEL_COL] = train[LABEL_COL].map(label_map)
test[LABEL_COL]  = test[LABEL_COL].map(label_map)

# -----------------------------
# 2) Basic cleaning
# -----------------------------
# Ensure Division is string for CatBoost categorical
if "Division" in train.columns:
    train["Division"] = train["Division"].astype(str)
    test["Division"]  = test["Division"].astype(str)

# Drop obvious non-features later; keep for grouping right now
DROP_COLS = ["Unnamed: 0", "company_name", "fyear"]

# -----------------------------
# 3) Feature engineering
# -----------------------------
def add_core_ratios(df):
    eps = 1e-6

    # Readable aliases
    CA  = df["X1"]   # Current Assets
    COGS= df["X2"]   # Cost of Goods Sold
    DA  = df["X3"]   # Depreciation & Amortization
    EBITDA = df["X4"]
    INV = df["X5"]
    NI  = df["X6"]   # Net Income
    AR  = df["X7"]   # Total Receivables
    MV  = df["X8"]   # Market Value
    SALES = df["X9"] # Net Sales
    TA  = df["X10"]  # Total Assets
    LTD = df["X11"]  # Total Long Term Debt
    EBIT = df["X12"]
    GP  = df["X13"]  # Gross Profit
    TCL = df["X14"]  # Total Current Liabilities
    RE  = df["X15"]  # Retained Earnings
    TR  = df["X16"]  # Total Revenue
    TL  = df["X17"]  # Total Liabilities
    TOE = df["X18"]  # Total Operating Expenses

    # Liquidity
    df["Current_Ratio"] = CA / (TCL + eps)
    df["Quick_Ratio"]   = (CA - INV) / (TCL + eps)
    df["Working_Capital"] = CA - TCL
    df["WC_to_Assets"]  = (df["Working_Capital"]) / (TA + eps)

    # Leverage
    df["Debt_to_Assets"] = TL / (TA + eps)
    df["Debt_to_Equity"] = TL / (np.maximum(TA - TL, 0) + eps)
    df["LTD_to_Assets"]  = LTD / (TA + eps)

    # Profitability
    df["ROA"] = NI / (TA + eps)
    df["ROS"] = NI / (SALES + eps)
    df["Gross_Margin"] = GP / (SALES + eps)
    df["EBITDA_Margin"] = EBITDA / (SALES + eps)

    # Efficiency
    df["Asset_Turnover"] = SALES / (TA + eps)
    df["Inventory_Turnover"] = SALES / (INV + eps)
    df["Receivables_Turnover"] = SALES / (AR + eps)

    # Size
    df["Log_TA"] = np.log(TA + eps)
    df["Log_Sales"] = np.log(SALES + eps)

    return df


def add_bankruptcy_scores(df):
    eps = 1e-6
    TA  = df["X10"]
    SALES = df["X9"]
    TL  = df["X17"]
    TCL = df["X14"]
    EBIT = df["X12"]
    RE   = df["X15"]

    # Altman Z (manufacturing version, using available proxies)
    WC = df["Working_Capital"]
    df["Altman_Z"] = (
        1.2 * (WC / (TA + eps)) +
        1.4 * (RE / (TA + eps)) +
        3.3 * (EBIT / (TA + eps)) +
        0.6 * (SALES / (TL + eps)) +
        1.0 * (SALES / (TA + eps))
    )

    # Ohlson O-Score (partial; uses key drivers with available variables)
    df["Ohlson_O"] = (
        -1.32
        - 0.407 * np.log(TA + eps)
        + 6.03 * (TCL / (TA + eps))
        - 1.43 * (TL  / (TA + eps))
        + 0.0757 * (TL / (SALES + eps))
    )

    # Shumway-like linear index
    df["Shumway"] = (
        df["ROA"] + df["Asset_Turnover"] - df["Debt_to_Assets"]
    )

    return df


def add_trend_features(df):
    # Sort to compute within-company trends
    if not {"company_name", "fyear"}.issubset(df.columns):
        return df

    df = df.sort_values(["company_name", "fyear"])
    cols_to_trend = ["X6", "X9", "X10", "Altman_Z", "Ohlson_O", "Shumway", "ROA", "Debt_to_Assets"]
    for c in cols_to_trend:
        if c in df.columns:
            df[f"{c}_chg"] = df.groupby("company_name")[c].pct_change().replace([np.inf, -np.inf], np.nan).fillna(0.0)
    return df


def build_features(df):
    df = add_core_ratios(df.copy())
    df = add_bankruptcy_scores(df)
    df = add_trend_features(df)
    # Clean up numerical issues
    for col in df.columns:
        if df[col].dtype.kind in "fc":
            df[col] = df[col].replace([np.inf, -np.inf], np.nan).fillna(0.0)
    return df


train_fe = build_features(train)
test_fe  = build_features(test)

# -----------------------------
# 4) Prepare matrices
# -----------------------------
# Keep company_name for grouping; drop from features later
feature_cols = [c for c in train_fe.columns if c not in DROP_COLS + [LABEL_COL]]
X = train_fe[feature_cols]
y = train_fe[LABEL_COL].astype(int)
X_test = test_fe[feature_cols]
y_test = test_fe[LABEL_COL].astype(int)  # assuming available for evaluation

# Identify categorical columns for CatBoost (Division only here)
cat_features = []
if "Division" in feature_cols:
    cat_features.append(feature_cols.index("Division"))  # index within X's columns

# -----------------------------
# 5) CV with GroupKFold (no company leakage)
# -----------------------------
groups = train["company_name"]
gkf = GroupKFold(n_splits=5)

# Class weights based on training distribution
counter = Counter(y)
neg, pos = counter[0], counter[1]
class_weights = [1.0, max(1.0, neg / max(pos, 1))]  # e.g., ~14 if 58.5k/4.2k

print(f"Class distribution (train): {counter}, class_weights={class_weights}")

oof_pred = np.zeros(len(X))
test_pred = np.zeros(len(X_test))

fold = 1
for tr_idx, va_idx in gkf.split(X, y, groups=groups):
    print(f"\n===== Fold {fold} =====")
    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

    model = CatBoostClassifier(
        loss_function="Logloss",
        eval_metric="F1",
        depth=8,
        iterations=2000,
        learning_rate=0.02,
        l2_leaf_reg=6.0,
        random_seed=RANDOM_STATE + fold,
        class_weights=class_weights,
        verbose=False,
        thread_count=-1
    )

    model.fit(
        X_tr, y_tr,
        eval_set=(X_va, y_va),
        cat_features=cat_features,
        use_best_model=True
    )

    oof_pred[va_idx] = model.predict_proba(X_va)[:, 1]
    test_pred += model.predict_proba(X_test)[:, 1] / gkf.n_splits

    # Quick per-fold check
    best_f1, best_t = 0.0, 0.5
    for t in np.linspace(0.05, 0.95, 91):
        f1 = f1_score(y_va, (oof_pred[va_idx] > t).astype(int), average="macro")
        if f1 > best_f1:
            best_f1, best_t = f1, t
    print(f"Fold {fold} best macro-F1={best_f1:.4f} at threshold={best_t:.2f}")

    fold += 1

# -----------------------------
# 6) Global threshold optimization on OOF
# -----------------------------
best_f1, best_t = 0.0, 0.5
for t in np.linspace(0.05, 0.95, 181):
    preds = (oof_pred > t).astype(int)
    f1 = f1_score(y, preds, average="macro")
    if f1 > best_f1:
        best_f1, best_t = f1, t

print("\n===== OOF Performance =====")
print(f"Optimal threshold on OOF: {best_t:.3f}")
print(f"OOF Macro F1: {best_f1:.6f}")

# -----------------------------
# 7) Final Test Evaluation
# -----------------------------
y_test_pred = (test_pred > best_t).astype(int)

print("\n===== Final Test Performance =====")
print(f"Macro F1: {f1_score(y_test, y_test_pred, average='macro'):.6f}")
print(classification_report(y_test, y_test_pred, digits=4))
print("Confusion matrix:\n", confusion_matrix(y_test, y_test_pred))

# -----------------------------
# 8) (Optional) Inspect top features by CatBoost importance
# -----------------------------
# (Train a final model on full train to get importances)
final_model = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="F1",
    depth=8,
    iterations=2000,
    learning_rate=0.02,
    l2_leaf_reg=6.0,
    random_seed=RANDOM_STATE,
    class_weights=class_weights,
    verbose=False,
    thread_count=-1
)
final_model.fit(X, y, cat_features=cat_features)

importances = final_model.get_feature_importance(prettified=True)
print("\nTop 20 features by importance:")
print(importances.sort_values("Importances", ascending=False).head(20))

# If you want to save predictions:
# pd.DataFrame({"company_name": test["company_name"], "pred_prob_failed": test_pred, "pred_label": y_test_pred}).to_csv("predictions.csv", index=False)


Class distribution (train): Counter({0: 58586, 1: 4203}), class_weights=[1.0, 13.939091125386629]

===== Fold 1 =====
