In [2]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    f1_score, classification_report, confusion_matrix, accuracy_score,
    precision_recall_curve, auc
)

from imblearn.over_sampling import SMOTE
from catboost import CatBoostClassifier, Pool

# ----------------- LOAD (use your provided files) -----------------
train_path = "dataset/train.csv"
test_path  = "dataset/df1_matches.csv"

df_tr = pd.read_csv(train_path)
df_te = pd.read_csv(test_path)

# Optional cleanup if present
for df in (df_tr, df_te):
    for c in ["Unnamed: 0", "Division", "MajorGroup"]:
        if c in df.columns:
            df.drop(columns=c, inplace=True)

# Map target to 0/1
label_map = {"alive": 0, "failed": 1}
if "status_label" in df_tr.columns:
    df_tr["status"] = df_tr["status_label"].map(label_map).astype(int)
else:
    # Fallback if a different name is used
    raise ValueError("Training file must contain 'status_label'")

if "status_label" in df_te.columns:
    df_te["status"] = df_te["status_label"].map(label_map).astype(int)
else:
    raise ValueError("Test file must contain 'status_label'")

# ----------------- FEATURES -----------------
# Use all X1..X18 (present in your files) and any other numeric fields except identifiers/target
# Exclude company/year/labels
drop_cols = {"company_name", "fyear", "status_label", "status"}
feature_cols = [c for c in df_tr.columns if c not in drop_cols]

# Keep only columns available in BOTH train & test (safety)
feature_cols = [c for c in feature_cols if c in df_te.columns]

X_train_raw = df_tr[feature_cols].copy()
y_train     = df_tr["status"].values

X_test_raw  = df_te[feature_cols].copy()
y_test      = df_te["status"].values

print("Train shape:", X_train_raw.shape, "| Test shape:", X_test_raw.shape)
print("Train class counts:", np.bincount(y_train))
print("Test  class counts:", np.bincount(y_test))

# ----------------- IMPUTE NUMERIC MISSING VALUES -----------------
imp = SimpleImputer(strategy="median")
X_train_imp = imp.fit_transform(X_train_raw)
X_test_imp  = imp.transform(X_test_raw)

# ----------------- APPLY SMOTE ON TRAIN ONLY -----------------
minority_count = int(y_train.sum()) if y_train.sum() < len(y_train) - y_train.sum() else int(len(y_train) - y_train.sum())
k_safe = max(1, min(5, minority_count - 1))
sm = SMOTE(random_state=42, k_neighbors=k_safe)
X_train_sm, y_train_sm = sm.fit_resample(X_train_imp, y_train)

print("After SMOTE train counts:", np.bincount(y_train_sm))

# ----------------- TRAIN CATBOOST -----------------
clf = CatBoostClassifier(
    iterations=2000,
    learning_rate=0.05,
    depth=6,
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=42,
    verbose=False
)

train_pool = Pool(X_train_sm, y_train_sm, feature_names=feature_cols)
test_pool  = Pool(X_test_imp,  y_test,    feature_names=feature_cols)

# No split: we still pass eval_set=test for early stopping/model selection
clf.fit(train_pool, eval_set=test_pool, use_best_model=True, early_stopping_rounds=100)

# ----------------- EVALUATE -----------------
y_prob = clf.predict_proba(test_pool)[:, 1]

# Default threshold 0.50
y_pred_05 = (y_prob >= 0.5).astype(int)

print("\n=== Test @ threshold 0.50 ===")
print("Accuracy:", accuracy_score(y_test, y_pred_05))
print("F1 (failed=1):", f1_score(y_test, y_pred_05))
print("Macro F1:", f1_score(y_test, y_pred_05, average="macro"))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_05))
print("\nClassification report:\n", classification_report(y_test, y_pred_05, digits=4))

# Best threshold for F1 on the test set (per your pattern)
ths = np.linspace(0.01, 0.99, 99)
f1s = [f1_score(y_test, (y_prob >= t).astype(int)) for t in ths]
best_t = float(ths[int(np.argmax(f1s))])
y_pred_best = (y_prob >= best_t).astype(int)

prec, rec, _ = precision_recall_curve(y_test, y_prob)
pr_auc = auc(rec, prec)

print("\n=== Test @ best F1 threshold ===")
print(f"Best threshold: {best_t:.3f}")
print("F1 (failed=1):", f1_score(y_test, y_pred_best))
print("Macro F1:", f1_score(y_test, y_pred_best, average="macro"))
print("Accuracy:", accuracy_score(y_test, y_pred_best))
print("PR-AUC:", pr_auc)
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_best))
print("\nClassification report:\n", classification_report(y_test, y_pred_best, digits=4))

# ----------------- Feature Importance -----------------
importances = pd.Series(clf.get_feature_importance(test_pool), index=feature_cols).sort_values(ascending=False)
print("\nTop Important Predictors:\n", importances.head(15))


Train shape: (62789, 18) | Test shape: (15893, 18)
Train class counts: [58586  4203]
Test  class counts: [14876  1017]
After SMOTE train counts: [58586 58586]

=== Test @ threshold 0.50 ===
Accuracy: 0.7132699930787139
F1 (failed=1): 0.15875946095624885
Macro F1: 0.4929714480252441
Confusion matrix:
 [[10906  3970]
 [  587   430]]

Classification report:
               precision    recall  f1-score   support

           0     0.9489    0.7331    0.8272     14876
           1     0.0977    0.4228    0.1588      1017

    accuracy                         0.7133     15893
   macro avg     0.5233    0.5780    0.4930     15893
weighted avg     0.8945    0.7133    0.7844     15893


=== Test @ best F1 threshold ===
Best threshold: 0.520
F1 (failed=1): 0.15905804585829375
Macro F1: 0.503986250615174
Accuracy: 0.743849493487699
PR-AUC: 0.09565507847809315
Confusion matrix:
 [[11437  3439]
 [  632   385]]

Classification report:
               precision    recall  f1-score   support

          