# Import Libraries

In [11]:
import pandas as pd
import numpy as np

# Load Dataset

In [4]:
train = pd.read_csv("/kaggle/input/playground-series-s5e8/train.csv")
test = pd.read_csv("/kaggle/input/playground-series-s5e8/test.csv")
submission = pd.read_csv("/kaggle/input/playground-series-s5e8/sample_submission.csv")

train.head()

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,0,42,technician,married,secondary,no,7,no,no,cellular,25,aug,117,3,-1,0,unknown,0
1,1,38,blue-collar,married,secondary,no,514,no,no,unknown,18,jun,185,1,-1,0,unknown,0
2,2,36,blue-collar,married,secondary,no,602,yes,no,unknown,14,may,111,2,-1,0,unknown,0
3,3,27,student,single,secondary,no,34,yes,no,unknown,28,may,10,2,-1,0,unknown,0
4,4,26,technician,married,secondary,no,889,yes,no,cellular,3,feb,902,1,-1,0,unknown,1


In [6]:
train.describe()

Unnamed: 0,id,age,balance,day,duration,campaign,pdays,previous,y
count,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0
mean,374999.5,40.926395,1204.067397,16.117209,256.229144,2.577008,22.412733,0.298545,0.120651
std,216506.495284,10.098829,2836.096759,8.250832,272.555662,2.718514,77.319998,1.335926,0.325721
min,0.0,18.0,-8019.0,1.0,1.0,1.0,-1.0,0.0,0.0
25%,187499.75,33.0,0.0,9.0,91.0,1.0,-1.0,0.0,0.0
50%,374999.5,39.0,634.0,17.0,133.0,2.0,-1.0,0.0,0.0
75%,562499.25,48.0,1390.0,21.0,361.0,3.0,-1.0,0.0,0.0
max,749999.0,95.0,99717.0,31.0,4918.0,63.0,871.0,200.0,1.0


## Merge train with original data and remove duplicates

In [7]:
orig = pd.read_csv("/kaggle/input/bank-marketing-dataset-full/bank-full.csv", sep=';')
orig['y'] = orig['y'].map({'no': 0, 'yes': 1})

In [8]:
train = pd.concat([train, orig], ignore_index=True)
train = train.drop_duplicates()

In [9]:
train.describe()

Unnamed: 0,id,age,balance,day,duration,campaign,pdays,previous,y
count,750000.0,795211.0,795211.0,795211.0,795211.0,795211.0,795211.0,795211.0,795211.0
mean,374999.5,40.926953,1213.06198,16.09954,256.339096,2.58763,23.423889,0.314566,0.120442
std,216506.495284,10.129098,2848.603881,8.255231,271.723766,2.741838,78.901465,1.410369,0.325478
min,0.0,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0,0.0
25%,187499.75,33.0,0.0,9.0,91.0,1.0,-1.0,0.0,0.0
50%,374999.5,39.0,624.0,17.0,135.0,2.0,-1.0,0.0,0.0
75%,562499.25,48.0,1390.0,21.0,347.5,3.0,-1.0,0.0,0.0
max,749999.0,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0,1.0


# EDA

## Quick structure + target, duplicates, missing/“unknown”

In [12]:
TARGET = "y"
ID_COL = "id"

feat_cols = [c for c in train.columns if c not in [TARGET, ID_COL]]
num_cols = train[feat_cols].select_dtypes(include=["number"]).columns.tolist()
cat_cols = [c for c in feat_cols if c not in num_cols]

print("Shape:", train.shape, "| Test:", test.shape)
print("\nDtypes\n", train.dtypes)

# Target balance
print("\nTarget distribution")
print(train[TARGET].value_counts().rename("count"))
print((train[TARGET].value_counts(normalize=True)*100).round(2).rename("pct %"))

# Duplicates (excluding id)
dup_rows = train.duplicated(subset=feat_cols).sum()
print(f"\nPotential duplicate rows (excluding id): {dup_rows}")

# Missing + 'unknown' audit
def unknown_count(s):
    return (s.astype(str).str.lower() == "unknown").sum()

summary_rows = []
for c in train.columns:
    summary_rows.append({
        "col": c,
        "dtype": train[c].dtype,
        "n_unique": train[c].nunique(dropna=True),
        "missing": train[c].isna().sum(),
        "missing_%": train[c].isna().mean()*100,
        "unknown_cnt": unknown_count(train[c]) if c in cat_cols else np.nan,
        "unknown_%": (unknown_count(train[c]) / len(train) * 100) if c in cat_cols else np.nan,
        "example_values": train[c].dropna().astype(str).unique()[:5]
    })
eda_summary = pd.DataFrame(summary_rows).sort_values(["dtype","col"])
eda_summary


Shape: (795211, 18) | Test: (250000, 17)

Dtypes
 id           float64
age            int64
job           object
marital       object
education     object
default       object
balance        int64
housing       object
loan          object
contact       object
day            int64
month         object
duration       int64
campaign       int64
pdays          int64
previous       int64
poutcome      object
y              int64
dtype: object

Target distribution
y
0    699434
1     95777
Name: count, dtype: int64
y
0    87.96
1    12.04
Name: pct %, dtype: float64

Potential duplicate rows (excluding id): 21


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,col,dtype,n_unique,missing,missing_%,unknown_cnt,unknown_%,example_values
1,age,int64,78,0,0.0,,,"[42, 38, 36, 27, 26]"
6,balance,int64,8393,0,0.0,,,"[7, 514, 602, 34, 889]"
13,campaign,int64,52,0,0.0,,,"[3, 1, 2, 25, 5]"
10,day,int64,31,0,0.0,,,"[25, 18, 14, 28, 3]"
12,duration,int64,1790,0,0.0,,,"[117, 185, 111, 10, 902]"
14,pdays,int64,612,0,0.0,,,"[-1, 175, 91, 181, 252]"
15,previous,int64,51,0,0.0,,,"[0, 3, 4, 2, 1]"
17,y,int64,2,0,0.0,,,"[0, 1]"
0,id,float64,750000,45211,5.685409,,,"[0.0, 1.0, 2.0, 3.0, 4.0]"
9,contact,object,3,0,0.0,244647.0,30.765042,"[cellular, unknown, telephone]"


## Target relationships (cats & nums)

In [13]:
# Category-wise target rates
def target_rate_table(df, col, topn=15, min_n=50):
    g = df.groupby(col)[TARGET].agg(rate="mean", n="size").reset_index()
    g = g[g["n"] >= min_n].sort_values("rate", ascending=False)
    return g.head(topn)

cat_insights = {c: target_rate_table(train, c) for c in cat_cols}
cat_insights["job"].head(10), cat_insights["contact"].head(10), cat_insights["poutcome"].head(10)


(              job      rate       n
 8         student  0.336797   12705
 5         retired  0.245133   37449
 10     unemployed  0.178117   18937
 4      management  0.149736  184999
 6   self-employed  0.128598   20599
 11        unknown  0.120437    3205
 9      technician  0.117917  145704
 0          admin.  0.116786   86663
 3       housemaid  0.084888   17152
 7        services  0.083086   68363,
      contact      rate       n
 0   cellular  0.156160  515940
 1  telephone  0.136582   34624
 2    unknown  0.042833  244647,
   poutcome      rate       n
 2  success  0.754817   19202
 1    other  0.166848   16584
 0  failure  0.132138   50016
 3  unknown  0.101362  709409)

## Simple informativeness scores (Mutual Information)

In [14]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import mutual_info_classif

X = train[feat_cols].copy()
y = train[TARGET].values

# Ordinal-encode categories for MI (safe for ranking, not for final model)
oe = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
X_enc = X.copy()
X_enc[cat_cols] = oe.fit_transform(X_enc[cat_cols])

mi = mutual_info_classif(
    X_enc, y,
    discrete_features=[X_enc.columns.get_loc(c) for c in cat_cols],
    random_state=42
)
mi_series = pd.Series(mi, index=feat_cols).sort_values(ascending=False)
mi_series.head(15)


duration     0.148780
balance      0.068309
pdays        0.029613
poutcome     0.027729
campaign     0.026107
month        0.024863
age          0.018548
contact      0.014944
day          0.014419
previous     0.011933
housing      0.011696
job          0.010970
loan         0.003875
education    0.003825
marital      0.003769
dtype: float64

## Train–Test sanity: unseen categories & distribution shift

In [15]:
# Unseen categories in test
def unseen_levels(train_df, test_df, cols):
    rows=[]
    for c in cols:
        tr = set(train_df[c].astype(str).unique())
        te = set(test_df[c].astype(str).unique())
        unseen = sorted(te - tr)
        rows.append({"col": c, "n_unseen_in_test": len(unseen), "example_unseen": unseen[:5]})
    return pd.DataFrame(rows).sort_values("n_unseen_in_test", ascending=False)

unseen_df = unseen_levels(train, test, cat_cols)
unseen_df.head(20)


Unnamed: 0,col,n_unseen_in_test,example_unseen
0,job,0,[]
1,marital,0,[]
2,education,0,[]
3,default,0,[]
4,housing,0,[]
5,loan,0,[]
6,contact,0,[]
7,month,0,[]
8,poutcome,0,[]


In [16]:
# KS test for numeric shift
from scipy.stats import ks_2samp

rows=[]
for c in num_cols:
    t = train[c].dropna()
    u = test[c].dropna()
    if len(t) > 0 and len(u) > 0:
        stat,p = ks_2samp(t,u)
        rows.append({"col": c, "ks_stat": stat, "p_value": p, "train_median": t.median(), "test_median": u.median()})
ks_report = pd.DataFrame(rows).sort_values("ks_stat", ascending=False)
ks_report


Unnamed: 0,col,ks_stat,p_value,train_median,test_median
3,duration,0.009956,8.34568e-17,135.0,133.0
1,balance,0.00739,1.890388e-09,624.0,631.0
5,pdays,0.004437,0.001113161,-1.0,-1.0
6,previous,0.004369,0.001402264,0.0,0.0
4,campaign,0.002355,0.2416873,2.0,2.0
0,age,0.002215,0.3078053,39.0,39.0
2,day,0.001537,0.75915,17.0,17.0


## Leakage check for duration

In [17]:
from sklearn.metrics import roc_auc_score

# Using raw feature as a score is valid for AUC ranking
auc_duration = roc_auc_score(y, train["duration"])
auc_pdays     = roc_auc_score(y, train["pdays"].replace(-1, np.nan).fillna(train["pdays"].max()+1))
auc_campaign  = roc_auc_score(y, train["campaign"])
auc_prev      = roc_auc_score(y, train["previous"])
print(f"AUC(duration) ~ {auc_duration:.3f}, AUC(pdays) ~ {auc_pdays:.3f}, AUC(campaign) ~ {auc_campaign:.3f}, AUC(previous) ~ {auc_prev:.3f}")


AUC(duration) ~ 0.886, AUC(pdays) ~ 0.415, AUC(campaign) ~ 0.422, AUC(previous) ~ 0.581


# EDA Key takeaways:

1. **Severe class imbalance**
* `y=1` is only **12.04%** → use **StratifiedKFold**, and set **class weights / scale\_pos\_weight ≈ 7.3** (699,434 / 95,777) in XGB/LGBM/CatBoost. Optimize **ROC‑AUC** (Kaggle metric) but also watch **PR‑AUC** for sanity.

2. **`duration` is massive leakage**

* `AUC(duration) ≈ 0.886` (alone!). Build **two runs**:

  * **LB‑max**: keep `duration`.
  * **Realistic**: **drop `duration`** in FE (`use_duration=False`).

3. **Huge “unknown” prevalence in a few columns**

* `contact`: **30.77% unknown** with **low target rate (4.28%)** → very informative “unknown”. Keep as a **distinct level** + add a **flag** feature.
* `poutcome`: **89.21% unknown**; but when **success**, target rate is **0.755** (very predictive). Engineer features that capture “previous success” cleanly (see FE plan).

4. **`pdays == -1` sentinel**

* Classic UCI convention meaning “never contacted.” Convert `-1 → NaN`, and add **`ever_contacted = 1(pdays != -1)`**.

5. **`previous` and `campaign` matter**

* `previous` alone yields **AUC ≈ 0.581** (solid). Create **`has_previous = 1(previous>0)`** and **`prev_success = 1(poutcome=="success")`**, forcing that to `0` when `previous==0`.
* `campaign` is heavy‑tailed; helpful but weaker (AUC \~0.422). For linear/NNs I’d log/winsorize, but trees can handle it; optional `log1p(campaign)` won’t hurt.

6. **Mild train–test drift**

* KS stats are tiny across numerics (e.g., `duration` 0.0099; medians close). No special reweighting needed.

7. **`id` is float with \~5.7% missing**

* Not a feature; **drop it** from modeling. Keep `test.id` only for submission mapping.

8. **Duplicates: 21 rows (excluding id)**

* Tiny volume. **Drop exact duplicate rows**; if you find duplicate features with **conflicting targets**, drop those pairs to avoid noisy supervision.

9. **Categoricals are modest in cardinality**

* OHE with rare‑category folding is fine. Keep **`handle_unknown="ignore"`** (you’ll have unseen levels in test).

# Concrete data cleaning & feature engineering (do this next)

**Always**

* Drop: `id` (feature), optionally `day` if it adds noise (I usually keep it; it’s harmless).
* Impute: numerics **median**, categoricals **most frequent**.
* Encode: OHE with `min_frequency=0.01` (merge very rare levels to “other”).
* Class imbalance: set model weights `(neg/pos) ≈ 7.3`.

**Sentinels and flags**

* `pdays`: `ever_contacted = 1(pdays != -1)`; then `pdays=-1 → NaN` (median impute).
* `previous`: `has_previous = 1(previous > 0)`.
* `poutcome`: `prev_success = 1(poutcome=="success")`; if `previous==0`, force `prev_success=0`. (Optionally also `prev_failure`, `prev_other`.)
* `contact`: `contact_unknown = 1(contact=="unknown")`.

**Transformations**

* `balance_log = log1p(balance)` (very skewed). Keep original `balance` too; trees will decide.
* `month_num` (map `jan..dec → 1..12`) + cyclical `sin/cos` to capture seasonality; keep the original `month` categorical as well.
* Optional: age bins (`[18,25,35,45,55,65,100]`) as an extra categorical (`age_bin`).
* Optional (linear‑friendly): `log1p(campaign)`, `log1p(previous)`.

**Leakage handling**

* Two FE modes:

  * **`use_duration=True`** (for LB probing).
  * **`use_duration=False`** (deployment‑realistic).

**Duplicates**

* Remove exact duplicates (`subset=all_features_incl_y`).
* If duplicates with different `y` exist, remove both sides of the conflict (they’re label noise).


# Data Cleaning + Feature Engineering

In [21]:
import numpy as np
import pandas as pd
from typing import Tuple, Dict, Any, Optional, List

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

## Utility: Duplicate handling

In [22]:
def drop_exact_duplicates(df: pd.DataFrame, target: str, id_col: Optional[str]=None) -> pd.DataFrame:
    """
    Drop exact duplicate rows based on all columns including y (if present),
    but excluding id_col if provided.
    If there are duplicates with conflicting target labels, drop all copies of those rows.
    """
    cols = [c for c in df.columns if c != id_col] if id_col in df.columns else df.columns.tolist()

    # duplicates including target
    dup_mask = df.duplicated(subset=cols, keep="first")
    df1 = df.loc[~dup_mask].copy()

    # Detect conflicting duplicates (same features, different y)
    if target in df.columns:
        feat_cols = [c for c in df.columns if c not in [target, id_col]]
        # group by features, check if multiple unique targets
        g = df.groupby(feat_cols, dropna=False)[target].nunique()
        conflict_keys = g[g > 1].index
        if len(conflict_keys) > 0:
            # Drop all rows whose feature tuple is in conflict_keys
            feat_tuple = df[feat_cols].apply(lambda r: tuple(r.values.tolist()), axis=1)
            conflict_set = set(conflict_keys)
            keep_mask = ~feat_tuple.isin(conflict_set)
            df1 = df.loc[keep_mask].copy()

    return df1

train = drop_exact_duplicates(train, TARGET, ID_COL)


## Feature engineering transformer

In [23]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """
    - Drops ID column (if present)
    - Maps yes/no to 1/0 for ['default','housing','loan']
    - pdays: add ever_contacted, convert -1 -> NaN
    - previous/poutcome: has_previous, prev_success (zeroed when has_previous==0)
    - contact: contact_unknown flag
    - month: month_num + sin/cos
    - balance: balance_log = log1p(balance)
    - optional age bins
    - optional drop of 'duration' to avoid leakage
    """
    def __init__(self, use_duration: bool = False, age_bins: Optional[List[int]] = None):
        self.use_duration = use_duration
        self.age_bins = age_bins
        self.month_map = {m:i for i,m in enumerate(
            ["jan","feb","mar","apr","may","jun","jul","aug","sep","oct","nov","dec"], start=1)}

    def fit(self, X: pd.DataFrame, y=None):
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        X = X.copy()

        # Drop ID if present
        if "id" in X.columns:
            X = X.drop(columns=["id"])

        # yes/no → 1/0
        for c in ["default","housing","loan"]:
            if c in X.columns:
                X[c] = X[c].map({"yes":1, "no":0}).astype("float")

        # pdays engineering
        if "pdays" in X.columns:
            X["ever_contacted"] = (X["pdays"] != -1).astype(int)
            X["pdays"] = X["pdays"].replace(-1, np.nan)

        # previous / poutcome
        if "previous" in X.columns:
            X["has_previous"] = (X["previous"] > 0).astype(int)
        if "poutcome" in X.columns:
            X["prev_success"] = (X["poutcome"].astype(str).str.lower() == "success").astype(int)
            if "has_previous" in X.columns:
                X.loc[X["has_previous"] == 0, "prev_success"] = 0

        # contact unknown flag
        if "contact" in X.columns:
            X["contact_unknown"] = (X["contact"].astype(str).str.lower() == "unknown").astype(int)

        # month features
        if "month" in X.columns:
            X["month_num"] = X["month"].astype(str).str[:3].str.lower().map(self.month_map)
            X["month_sin"] = np.sin(2*np.pi*X["month_num"]/12)
            X["month_cos"] = np.cos(2*np.pi*X["month_num"]/12)

        # balance log
        if "balance" in X.columns:
            # guard against negatives; UCI balance can be negative; log1p handles >= -1 safely if we clip
            X["balance_log"] = np.log1p(np.clip(X["balance"], a_min=0, a_max=None))

        # optional age bins
        if self.age_bins is not None and "age" in X.columns:
            X["age_bin"] = pd.cut(X["age"], bins=self.age_bins, include_lowest=True).astype(str)

        # optionally drop duration to avoid leakage
        if not self.use_duration and "duration" in X.columns:
            X = X.drop(columns=["duration"])

        return X


In [24]:
# Two FE variants
fe_no_leak   = FeatureEngineer(use_duration=False, age_bins=[17,25,35,45,55,65,120])
fe_with_leak = FeatureEngineer(use_duration=True,  age_bins=[17,25,35,45,55,65,120])

## ColumnTransformer (impute + encode +scale)

In [25]:
numeric_selector = selector(dtype_include=np.number)
categorical_selector = selector(dtype_include=["object","category"])

num_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("scale", RobustScaler())
])

cat_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    # min_frequency folds ultra-rare levels (helps generalisation, keeps matrix compact)
    ("ohe", OneHotEncoder(handle_unknown="ignore", min_frequency=0.01, sparse=False))
])

preprocess = ColumnTransformer([
    ("num", num_pipe, numeric_selector),
    ("cat", cat_pipe, categorical_selector),
], verbose_feature_names_out=False)

## Build preprocessing pipelines (without model for now)

In [26]:
prep_no_leak   = Pipeline([("fe", fe_no_leak),   ("prep", preprocess)])
prep_with_leak = Pipeline([("fe", fe_with_leak), ("prep", preprocess)])

## Prepare matrices for modeling (choose one)

In [27]:
def prepare_data(pipeline: Pipeline, train_df: pd.DataFrame, test_df: pd.DataFrame,
                 target: str) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, List[str]]:
    """
    Fits the given preprocessing pipeline on train, transforms both train and test,
    and returns (X_train, y_train, X_test, test_ids, feature_names).
    """
    X = train_df.drop(columns=[target])
    y = train_df[target].values
    pipe = pipeline.fit(X, y)

    X_train = pipe.transform(X)
    X_test  = pipe.transform(test_df)

    # Try to expose generated feature names (Sklearn >=1.0 on OneHot)
    try:
        feature_names = pipe.get_feature_names_out().tolist()
    except Exception:
        feature_names = [f"f{i}" for i in range(X_train.shape[1])]

    test_ids = test_df[ID_COL].values if ID_COL in test_df.columns else np.arange(len(test_df))
    return X_train, y, X_test, test_ids, feature_names

# Example: create both preprocessed datasets for future modeling
X_train_no_leak, y, X_test_no_leak, test_ids, feat_names_no_leak = prepare_data(
    prep_no_leak, train, test, TARGET
)
X_train_with_leak, _, X_test_with_leak, _, feat_names_with_leak = prepare_data(
    prep_with_leak, train, test, TARGET
)

print("X_train_no_leak:", X_train_no_leak.shape, "| X_test_no_leak:", X_test_no_leak.shape)
print("X_train_with_leak:", X_train_with_leak.shape, "| X_test_with_leak:", X_test_with_leak.shape)




X_train_no_leak: (795169, 60) | X_test_no_leak: (250000, 60)
X_train_with_leak: (795169, 61) | X_test_with_leak: (250000, 61)


## Class weight helper for imbalanced y

In [28]:
def compute_scale_pos_weight(y: np.ndarray) -> float:
    neg = (y == 0).sum()
    pos = (y == 1).sum()
    return float(neg / max(pos, 1))

scale_pos_weight = compute_scale_pos_weight(y)
print(f"scale_pos_weight ≈ {scale_pos_weight:.3f}  (use in XGB/LGBM/Cat)")

scale_pos_weight ≈ 7.304  (use in XGB/LGBM/Cat)


# Modeling + Submission

In [40]:
import logging, time
from tqdm.auto import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    datefmt="%H:%M:%S"
)

## Define base models

In [41]:
xgb_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="auc",
    tree_method="hist",
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    n_jobs=-1,
)

lgbm_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=-1,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary",
    verbosity = -1,
    metric="auc",
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    n_jobs=-1,
)

cat_params = dict(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    eval_metric="AUC",
    scale_pos_weight=scale_pos_weight,
    random_seed=42,
    verbose=0,
    thread_count=-1,
)

## Cross-validation training function

In [42]:
def _make_models(early_stopping_rounds: int):
    # XGB: put early_stopping_rounds in constructor to avoid deprecation warning
    xgb = XGBClassifier(**{**xgb_params, "early_stopping_rounds": early_stopping_rounds})
    # LGBM: callbacks handle early stopping & logging silence
    lgbm = LGBMClassifier(**{**lgbm_params})
    # CatBoost: use od_* for early stopping (robust across versions)
    cat = CatBoostClassifier(**{**cat_params, "od_type": "Iter", "od_wait": early_stopping_rounds})
    return xgb, lgbm, cat

def _best_iter_safe(model):
    for attr in ["best_iteration_", "best_iteration", "best_ntree_limit"]:
        if hasattr(model, attr) and getattr(model, attr) is not None:
            return getattr(model, attr)
    try:
        return model.get_booster().best_ntree_limit
    except Exception:
        return None

def train_and_predict(
    X, y, X_test, label="no_leak",
    n_splits=5, early_stopping_rounds=100
):
    logging.info(f"Start CV training: label='{label}', folds={n_splits}, samples={len(y):,}, features={X.shape[1]}")
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    oof_preds = np.zeros(len(y), dtype=float)
    test_preds = np.zeros(len(X_test), dtype=float)

    for fold, (tr_idx, va_idx) in enumerate(tqdm(skf.split(X, y), total=n_splits, desc=f"CV ({label})"), start=1):
        t0 = time.time()
        X_tr, y_tr = X[tr_idx], y[tr_idx]
        X_va, y_va = X[va_idx], y[va_idx]

        model_xgb, model_lgb, model_cat = _make_models(early_stopping_rounds)

        # --- XGB (early stopping via constructor) ---
        model_xgb.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            verbose=False
        )

        # --- LGBM (callbacks for ES + silence) ---
        model_lgb.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            callbacks=[
                lgb.early_stopping(early_stopping_rounds),
                lgb.log_evaluation(period=0),  # 0 = silence
            ],
        )

        # --- CatBoost (od_wait for ES; silent training) ---
        model_cat.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            use_best_model=True,
            verbose=False
        )

        # Validation probs (soft vote)
        val_pred = (
            model_xgb.predict_proba(X_va)[:, 1] +
            model_lgb.predict_proba(X_va)[:, 1] +
            model_cat.predict_proba(X_va)[:, 1]
        ) / 3.0
        oof_preds[va_idx] = val_pred
        fold_auc = roc_auc_score(y_va, val_pred)

        # Test probs averaged across folds
        test_fold_pred = (
            model_xgb.predict_proba(X_test)[:, 1] +
            model_lgb.predict_proba(X_test)[:, 1] +
            model_cat.predict_proba(X_test)[:, 1]
        ) / 3.0
        test_preds += test_fold_pred / n_splits

        sec = time.time() - t0
        logging.info(
            f"Fold {fold}/{n_splits} | AUC={fold_auc:.5f} | "
            f"best_iter(xgb={_best_iter_safe(model_xgb)}, lgb={_best_iter_safe(model_lgb)}, cat={_best_iter_safe(model_cat)}) | "
            f"{sec:.1f}s"
        )

    overall_auc = roc_auc_score(y, oof_preds)
    logging.info(f"Finished CV: {label} | Overall AUC={overall_auc:.5f}")

    # Save submission with clear name
    sub_df = pd.DataFrame({"id": test_ids, "y": test_preds})
    filename = f"submission_{label}.csv"
    sub_df.to_csv(filename, index=False)
    logging.info(f"Saved {filename}")

    return oof_preds, test_preds

## Train both pipelines

In [43]:
print("=== Training WITHOUT duration (realistic) ===")
oof_no_leak, test_no_leak = train_and_predict(
    X_train_no_leak, y, X_test_no_leak, label="no_leak"
)

=== Training WITHOUT duration (realistic) ===


CV (no_leak):   0%|          | 0/5 [00:00<?, ?it/s]

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's auc: 0.849787
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's auc: 0.851939
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[494]	valid_0's auc: 0.856462
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's auc: 0.855729
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's auc: 0.855926


In [44]:
print("\n=== Training WITH duration (leakage version) ===")
oof_with_leak, test_with_leak = train_and_predict(
    X_train_with_leak, y, X_test_with_leak, label="with_leak"
)


=== Training WITH duration (leakage version) ===


CV (with_leak):   0%|          | 0/5 [00:00<?, ?it/s]

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's auc: 0.965653
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[499]	valid_0's auc: 0.966131
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[497]	valid_0's auc: 0.966566
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's auc: 0.966222
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's auc: 0.966545
