In [1]:
pip install pytorch-tabnet torch

Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

import lightgbm as lgb
from catboost import CatBoostClassifier, Pool

# TabNet
try:
    import torch
    from pytorch_tabnet.tab_model import TabNetClassifier
    TABNET_AVAILABLE = True
except:
    print("‚ö†Ô∏è TabNet not available. Install: pip install pytorch-tabnet torch")
    TABNET_AVAILABLE = False


# ============================================================
# 0) Load
# ============================================================
TRAIN_PATH = "../data/train.csv"
TEST_PATH  = "../data/test.csv"
SUB_PATH   = "../data/sample_submission.csv"
OUT_PATH   = "../outputs/06_ML&DL_ensemble_all.csv"

if not os.path.exists(TRAIN_PATH) and os.path.exists("/mnt/data/train.csv"):
    TRAIN_PATH = "/mnt/data/train.csv"

train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)
sub   = pd.read_csv(SUB_PATH)

TARGET_COL = "ÏûÑÏã† ÏÑ±Í≥µ Ïó¨Î∂Ä"
ID_COL = "ID"
SUB_ID_COL = sub.columns[0]
SUB_PRED_COL = sub.columns[1]


# ============================================================
# 1) Feature engineering
# ============================================================
EMBRYO_STAGE_COLS = [
    "Îã®Ïùº Î∞∞ÏïÑ Ïù¥Ïãù Ïó¨Î∂Ä","Ï∞©ÏÉÅ Ï†Ñ Ïú†Ï†Ñ ÏßÑÎã® ÏÇ¨Ïö© Ïó¨Î∂Ä","Î∞∞ÏïÑ ÏÉùÏÑ± Ï£ºÏöî Ïù¥Ïú†",
    "Ï¥ù ÏÉùÏÑ± Î∞∞ÏïÑ Ïàò","ÎØ∏ÏÑ∏Ï£ºÏûÖÎêú ÎÇúÏûê Ïàò","ÎØ∏ÏÑ∏Ï£ºÏûÖÏóêÏÑú ÏÉùÏÑ±Îêú Î∞∞ÏïÑ Ïàò",
    "Ïù¥ÏãùÎêú Î∞∞ÏïÑ Ïàò","ÎØ∏ÏÑ∏Ï£ºÏûÖ Î∞∞ÏïÑ Ïù¥Ïãù Ïàò","Ï†ÄÏû•Îêú Î∞∞ÏïÑ Ïàò",
    "ÎØ∏ÏÑ∏Ï£ºÏûÖ ÌõÑ Ï†ÄÏû•Îêú Î∞∞ÏïÑ Ïàò","Ìï¥ÎèôÎêú Î∞∞ÏïÑ Ïàò","Ìï¥Îèô ÎÇúÏûê Ïàò",
    "ÏàòÏßëÎêú Ïã†ÏÑ† ÎÇúÏûê Ïàò","Ï†ÄÏû•Îêú Ïã†ÏÑ† ÎÇúÏûê Ïàò","ÌòºÌï©Îêú ÎÇúÏûê Ïàò",
    "ÌååÌä∏ÎÑà Ï†ïÏûêÏôÄ ÌòºÌï©Îêú ÎÇúÏûê Ïàò","Í∏∞Ï¶ùÏûê Ï†ïÏûêÏôÄ ÌòºÌï©Îêú ÎÇúÏûê Ïàò",
    "ÎèôÍ≤∞ Î∞∞ÏïÑ ÏÇ¨Ïö© Ïó¨Î∂Ä","Ïã†ÏÑ† Î∞∞ÏïÑ ÏÇ¨Ïö© Ïó¨Î∂Ä","Í∏∞Ï¶ù Î∞∞ÏïÑ ÏÇ¨Ïö© Ïó¨Î∂Ä","ÎåÄÎ¶¨Î™® Ïó¨Î∂Ä",
]

INFERTILITY_COLS = [
    "ÎÇ®ÏÑ± Ï£º Î∂àÏûÑ ÏõêÏù∏","ÎÇ®ÏÑ± Î∂Ä Î∂àÏûÑ ÏõêÏù∏","Ïó¨ÏÑ± Ï£º Î∂àÏûÑ ÏõêÏù∏","Ïó¨ÏÑ± Î∂Ä Î∂àÏûÑ ÏõêÏù∏",
    "Î∂ÄÎ∂Ä Ï£º Î∂àÏûÑ ÏõêÏù∏","Î∂ÄÎ∂Ä Î∂Ä Î∂àÏûÑ ÏõêÏù∏","Î∂àÎ™ÖÌôï Î∂àÏûÑ ÏõêÏù∏",
    "Î∂àÏûÑ ÏõêÏù∏ - ÎÇúÍ¥Ä ÏßàÌôò","Î∂àÏûÑ ÏõêÏù∏ - ÎÇ®ÏÑ± ÏöîÏù∏","Î∂àÏûÑ ÏõêÏù∏ - Î∞∞ÎûÄ Ïû•Ïï†",
    "Î∂àÏûÑ ÏõêÏù∏ - Ïó¨ÏÑ± ÏöîÏù∏","Î∂àÏûÑ ÏõêÏù∏ - ÏûêÍ∂ÅÍ≤ΩÎ∂Ä Î¨∏Ï†ú","Î∂àÏûÑ ÏõêÏù∏ - ÏûêÍ∂ÅÎÇ¥ÎßâÏ¶ù",
    "Î∂àÏûÑ ÏõêÏù∏ - Ï†ïÏûê ÎÜçÎèÑ","Î∂àÏûÑ ÏõêÏù∏ - Ï†ïÏûê Î©¥Ïó≠ÌïôÏ†Å ÏöîÏù∏","Î∂àÏûÑ ÏõêÏù∏ - Ï†ïÏûê Ïö¥ÎèôÏÑ±",
    "Î∂àÏûÑ ÏõêÏù∏ - Ï†ïÏûê ÌòïÌÉú"
]

# üÜï Í≥†Í≤∞Ï∏° Î≥ÄÏàò Ï†úÍ±∞ Î¶¨Ïä§Ìä∏ (99%+)
HIGH_MISSING_COLS = [
    'ÎÇúÏûê Ìï¥Îèô Í≤ΩÍ≥ºÏùº',           # 99.4%
    'PGS ÏãúÏà† Ïó¨Î∂Ä',             # 99.2%
    'PGD ÏãúÏà† Ïó¨Î∂Ä',             # 99.1%
    'Ï∞©ÏÉÅ Ï†Ñ Ïú†Ï†Ñ Í≤ÄÏÇ¨ ÏÇ¨Ïö© Ïó¨Î∂Ä', # 98.9%
    'ÏûÑÏã† ÏãúÎèÑ ÎòêÎäî ÎßàÏßÄÎßâ ÏûÑÏã† Í≤ΩÍ≥º Ïó∞Ïàò',  # 96.3%
]

MISS_FLAG_COLS = ["Î∞∞ÏïÑ Ïù¥Ïãù Í≤ΩÍ≥ºÏùº", "ÎÇúÏûê Ï±ÑÏ∑® Í≤ΩÍ≥ºÏùº", "Î∞∞ÏïÑ Ìï¥Îèô Í≤ΩÍ≥ºÏùº"]

def safe_div(a, b):
    return np.where(b == 0, 0.0, a / b)

def add_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    
    # üÜï Í≥†Í≤∞Ï∏° Î≥ÄÏàò Ï†úÍ±∞
    df = df.drop(columns=HIGH_MISSING_COLS, errors='ignore')
    print(f"‚úÖ Í≥†Í≤∞Ï∏° Î≥ÄÏàò {len(HIGH_MISSING_COLS)}Í∞ú Ï†úÍ±∞")

    # ÏãúÏà†_ÎåÄÎ∂ÑÎ•ò
    def major_procedure(x):
        if pd.isna(x):
            return "Unknown"
        x = str(x)
        if "IUI" in x:  return "IUI"
        if "ICSI" in x: return "ICSI"
        if "IVF" in x:  return "IVF"
        if "DI" in x:   return "DI"
        return "Other"

    df["ÏãúÏà†_ÎåÄÎ∂ÑÎ•ò"] = df["ÌäπÏ†ï ÏãúÏà† Ïú†Ìòï"].apply(major_procedure)

    # Î≥¥Ï°∞Í∏∞Ïà† ÌÜ†ÌÅ∞
    s = df["ÌäπÏ†ï ÏãúÏà† Ïú†Ìòï"].astype("object").fillna("Unknown").astype(str)
    df["BLASTOCYST_Ìè¨Ìï®"] = s.str.contains("BLASTOCYST", na=False).astype(int)
    df["AH_Ìè¨Ìï®"]         = s.str.contains("AH", na=False).astype(int)

    # Î∞∞ÏïÑ Îã®Í≥Ñ Í≤∞Ï∏° Ìå®ÌÑ¥
    df["Î∞∞ÏïÑ_stage_missing_count"] = df[EMBRYO_STAGE_COLS].isna().sum(axis=1)
    df["Î∞∞ÏïÑ_stage_all_missing"]   = (df["Î∞∞ÏïÑ_stage_missing_count"] == len(EMBRYO_STAGE_COLS)).astype(int)
    df["Î∞∞ÏïÑ_Ïù¥Ïãù_Ïó¨Î∂Ä"]           = 1 - df["Î∞∞ÏïÑ_stage_all_missing"]

    # Ï¥ùÏãúÏà†_bin3
    def collapse_trials(x):
        if pd.isna(x):
            return "Unknown"
        if x == "0Ìöå":
            return "0Ìöå"
        if x in ["1Ìöå", "2Ìöå"]:
            return "1-2Ìöå"
        return "3Ìöå Ïù¥ÏÉÅ"
    df["Ï¥ùÏãúÏà†_bin3"] = df["Ï¥ù ÏãúÏà† ÌöüÏàò"].apply(collapse_trials)

    # ÎÇòÏù¥_3Íµ¨Í∞Ñ
    def age_group_simple(age):
        if pd.isna(age) or age == "Ïïå Ïàò ÏóÜÏùå":
            return "Unknown"
        if age == "Îßå18-34ÏÑ∏":
            return "34ÏÑ∏ Ïù¥Ìïò"
        if age in ["Îßå35-37ÏÑ∏", "Îßå38-39ÏÑ∏"]:
            return "35-39ÏÑ∏"
        return "40ÏÑ∏ Ïù¥ÏÉÅ"
    df["ÎÇòÏù¥_3Íµ¨Í∞Ñ"] = df["ÏãúÏà† ÎãπÏãú ÎÇòÏù¥"].apply(age_group_simple)

    # Day5 Ïù¥Ïãù Ïó¨Î∂Ä
    d = pd.to_numeric(df["Î∞∞ÏïÑ Ïù¥Ïãù Í≤ΩÍ≥ºÏùº"], errors="coerce")
    df["Day5_Ïù¥Ïãù_Ïó¨Î∂Ä"] = (d == 5).astype(int)

    # Î∂àÏûÑ ÏõêÏù∏ Í∞úÏàò
    tmp = df[INFERTILITY_COLS].apply(pd.to_numeric, errors="coerce").fillna(0)
    df["Î∂àÏûÑ_ÏõêÏù∏_Í∞úÏàò"] = tmp.sum(axis=1)

    # Ï†ÑÏ≤¥ Í≤∞Ï∏° Í∞úÏàò
    df["Ï†ÑÏ≤¥_missing_count"] = df.isna().sum(axis=1)

    # Í≤∞Ï∏° flag
    for c in MISS_FLAG_COLS:
        if c in df.columns:
            df[f"{c}_isna"] = df[c].isna().astype(int)

    # Ïà´ÏûêÌòï ÏïàÏ†Ñ Î≥ÄÌôò
    num_candidates = [
        "Ï¥ù ÏÉùÏÑ± Î∞∞ÏïÑ Ïàò", "Ïù¥ÏãùÎêú Î∞∞ÏïÑ Ïàò", "Ï†ÄÏû•Îêú Î∞∞ÏïÑ Ïàò", "Ìï¥ÎèôÎêú Î∞∞ÏïÑ Ïàò",
        "ÎØ∏ÏÑ∏Ï£ºÏûÖÎêú ÎÇúÏûê Ïàò", "ÎØ∏ÏÑ∏Ï£ºÏûÖÏóêÏÑú ÏÉùÏÑ±Îêú Î∞∞ÏïÑ Ïàò", "ÎØ∏ÏÑ∏Ï£ºÏûÖ Î∞∞ÏïÑ Ïù¥Ïãù Ïàò",
        "ÎØ∏ÏÑ∏Ï£ºÏûÖ ÌõÑ Ï†ÄÏû•Îêú Î∞∞ÏïÑ Ïàò", "Ìï¥Îèô ÎÇúÏûê Ïàò", "ÏàòÏßëÎêú Ïã†ÏÑ† ÎÇúÏûê Ïàò", "ÌòºÌï©Îêú ÎÇúÏûê Ïàò"
    ]
    for c in num_candidates:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")

    # ÎπÑÏú® Î≥ÄÏàò
    if "Ï¥ù ÏÉùÏÑ± Î∞∞ÏïÑ Ïàò" in df.columns and "Ïù¥ÏãùÎêú Î∞∞ÏïÑ Ïàò" in df.columns:
        df["Ïù¥Ïãù/ÏÉùÏÑ±"] = safe_div(df["Ïù¥ÏãùÎêú Î∞∞ÏïÑ Ïàò"].fillna(0), df["Ï¥ù ÏÉùÏÑ± Î∞∞ÏïÑ Ïàò"].fillna(0))

    if "Ï¥ù ÏÉùÏÑ± Î∞∞ÏïÑ Ïàò" in df.columns and "Ï†ÄÏû•Îêú Î∞∞ÏïÑ Ïàò" in df.columns:
        df["Ï†ÄÏû•/ÏÉùÏÑ±"] = safe_div(df["Ï†ÄÏû•Îêú Î∞∞ÏïÑ Ïàò"].fillna(0), df["Ï¥ù ÏÉùÏÑ± Î∞∞ÏïÑ Ïàò"].fillna(0))

    if "Ï†ÄÏû•Îêú Î∞∞ÏïÑ Ïàò" in df.columns and "Ìï¥ÎèôÎêú Î∞∞ÏïÑ Ïàò" in df.columns:
        df["Ìï¥Îèô/Ï†ÄÏû•"] = safe_div(df["Ìï¥ÎèôÎêú Î∞∞ÏïÑ Ïàò"].fillna(0), df["Ï†ÄÏû•Îêú Î∞∞ÏïÑ Ïàò"].fillna(0))

    if "ÎØ∏ÏÑ∏Ï£ºÏûÖÎêú ÎÇúÏûê Ïàò" in df.columns and "ÎØ∏ÏÑ∏Ï£ºÏûÖÏóêÏÑú ÏÉùÏÑ±Îêú Î∞∞ÏïÑ Ïàò" in df.columns:
        df["ÎØ∏ÏÑ∏Ï£ºÏûÖ_Î∞∞ÏïÑ/ÎÇúÏûê"] = safe_div(df["ÎØ∏ÏÑ∏Ï£ºÏûÖÏóêÏÑú ÏÉùÏÑ±Îêú Î∞∞ÏïÑ Ïàò"].fillna(0), df["ÎØ∏ÏÑ∏Ï£ºÏûÖÎêú ÎÇúÏûê Ïàò"].fillna(0))

    if "Ïù¥ÏãùÎêú Î∞∞ÏïÑ Ïàò" in df.columns and "ÎØ∏ÏÑ∏Ï£ºÏûÖ Î∞∞ÏïÑ Ïù¥Ïãù Ïàò" in df.columns:
        df["ÎØ∏ÏÑ∏Ï£ºÏûÖ_Ïù¥Ïãù/Ïù¥Ïãù"] = safe_div(df["ÎØ∏ÏÑ∏Ï£ºÏûÖ Î∞∞ÏïÑ Ïù¥Ïãù Ïàò"].fillna(0), df["Ïù¥ÏãùÎêú Î∞∞ÏïÑ Ïàò"].fillna(0))

    # ===== üÜï Ï∂îÍ∞Ä Feature 5Í∞ú =====
    
    # 1. Î∞∞ÏïÑ_ÌíàÏßà_Ï†êÏàò
    df['Î∞∞ÏïÑ_ÌíàÏßà_Ï†êÏàò'] = (
        (df['Day5_Ïù¥Ïãù_Ïó¨Î∂Ä'] * 3) +
        ((df['Ïù¥ÏãùÎêú Î∞∞ÏïÑ Ïàò'].fillna(0).between(1, 2)).astype(int) * 2) +
        ((df['Ïù¥Ïãù/ÏÉùÏÑ±'].fillna(0) > 0.5).astype(int))
    )
    
    # 2. ÏµúÏ†Å_Ïù¥Ïãù_Ï°∞Í±¥
    df['ÏµúÏ†Å_Ïù¥Ïãù_Ï°∞Í±¥'] = (
        (df['Day5_Ïù¥Ïãù_Ïó¨Î∂Ä'] == 1) & 
        (df['Ïù¥ÏãùÎêú Î∞∞ÏïÑ Ïàò'].fillna(0).between(1, 2))
    ).astype(int)
    
    # 3. Í≥†Î†π_Ïó¨Î∂Ä
    age_map = {
        'Îßå18-34ÏÑ∏': 0, 'Îßå35-37ÏÑ∏': 0, 'Îßå38-39ÏÑ∏': 0,
        'Îßå40-42ÏÑ∏': 1, 'Îßå43-44ÏÑ∏': 1, 'Îßå45-50ÏÑ∏': 1,
        'Ïïå Ïàò ÏóÜÏùå': 0
    }
    df['Í≥†Î†π_Ïó¨Î∂Ä'] = df['ÏãúÏà† ÎãπÏãú ÎÇòÏù¥'].map(age_map).fillna(0).astype(int)
    
    # 4. IVF_Í≥ºÍ±∞_ÏÑ±Í≥µÎ•†
    trials_map = {'0Ìöå': 0, '1Ìöå': 1, '2Ìöå': 2, '3Ìöå': 3, '4Ìöå': 4, '5Ìöå': 5, '6Ìöå Ïù¥ÏÉÅ': 7}
    ivf_trials = df['IVF ÏãúÏà† ÌöüÏàò'].map(trials_map).fillna(0)
    ivf_success = df['IVF ÏûÑÏã† ÌöüÏàò'].map(trials_map).fillna(0)
    df['IVF_Í≥ºÍ±∞_ÏÑ±Í≥µÎ•†'] = safe_div(ivf_success, ivf_trials)
    
    # 5. Î∞∞ÏïÑ_ÏÑ†Î≥ÑÎ•†
    df['Î∞∞ÏïÑ_ÏÑ†Î≥ÑÎ•†'] = safe_div(
        (df['Ï¥ù ÏÉùÏÑ± Î∞∞ÏïÑ Ïàò'].fillna(0) - df['Ïù¥ÏãùÎêú Î∞∞ÏïÑ Ïàò'].fillna(0)),
        df['Ï¥ù ÏÉùÏÑ± Î∞∞ÏïÑ Ïàò'].fillna(0)
    )
    
    # ===== üÜï ÌôïÏ†ï Ïã§Ìå® ÏºÄÏù¥Ïä§ (ÌåÄÏõê Ïù∏ÏÇ¨Ïù¥Ìä∏) =====
    storage_keywords = ['ÎÇúÏûê Ï†ÄÏû•Ïö©', 'Í∏∞Ï¶ùÏö©', 'Î∞∞ÏïÑ Ï†ÄÏû•Ïö©', 'Ïó∞Íµ¨Ïö©']
    df['ÌôïÏ†ï_Ïã§Ìå®_ÏºÄÏù¥Ïä§'] = 0
    
    if 'Î∞∞ÏïÑ ÏÉùÏÑ± Ï£ºÏöî Ïù¥Ïú†' in df.columns:
        df['ÌôïÏ†ï_Ïã§Ìå®_ÏºÄÏù¥Ïä§'] = df['Î∞∞ÏïÑ ÏÉùÏÑ± Ï£ºÏöî Ïù¥Ïú†'].isin(storage_keywords).astype(int)
    
    if 'ÏãúÏà† ÎãπÏãú ÎÇòÏù¥' in df.columns:
        df['ÌôïÏ†ï_Ïã§Ìå®_ÏºÄÏù¥Ïä§'] = df['ÌôïÏ†ï_Ïã§Ìå®_ÏºÄÏù¥Ïä§'] | (df['ÏãúÏà† ÎãπÏãú ÎÇòÏù¥'] == 'Ïïå Ïàò ÏóÜÏùå').astype(int)

    return df


train_fe = add_features(train)
test_fe  = add_features(test)


# ============================================================
# 2) Feature/Column split
# ============================================================
drop_cols = [ID_COL, TARGET_COL, "Î∞∞ÏïÑ_stage_all_missing"]
feature_cols = [c for c in train_fe.columns if c not in drop_cols]

X = train_fe[feature_cols].copy()
y = train_fe[TARGET_COL].copy()
X_test = test_fe[feature_cols].copy()

print(f"\n‚úÖ Total Features: {len(feature_cols)}")

explicit_cat = [
    'ÏãúÏà† ÏãúÍ∏∞ ÏΩîÎìú','ÏãúÏà† ÎãπÏãú ÎÇòÏù¥','ÏãúÏà† Ïú†Ìòï','ÌäπÏ†ï ÏãúÏà† Ïú†Ìòï','Î∞∞ÎûÄ ÏûêÍ∑π Ïó¨Î∂Ä','Î∞∞ÎûÄ Ïú†ÎèÑ Ïú†Ìòï',
    'Î∞∞ÏïÑ ÏÉùÏÑ± Ï£ºÏöî Ïù¥Ïú†','Ï¥ù ÏãúÏà† ÌöüÏàò','ÌÅ¥Î¶¨Îãâ ÎÇ¥ Ï¥ù ÏãúÏà† ÌöüÏàò','IVF ÏãúÏà† ÌöüÏàò','DI ÏãúÏà† ÌöüÏàò',
    'Ï¥ù ÏûÑÏã† ÌöüÏàò','IVF ÏûÑÏã† ÌöüÏàò','DI ÏûÑÏã† ÌöüÏàò','Ï¥ù Ï∂úÏÇ∞ ÌöüÏàò','IVF Ï∂úÏÇ∞ ÌöüÏàò','DI Ï∂úÏÇ∞ ÌöüÏàò',
    'ÎÇúÏûê Ï∂úÏ≤ò','Ï†ïÏûê Ï∂úÏ≤ò','ÎÇúÏûê Í∏∞Ï¶ùÏûê ÎÇòÏù¥','Ï†ïÏûê Í∏∞Ï¶ùÏûê ÎÇòÏù¥',
    'ÏãúÏà†_ÎåÄÎ∂ÑÎ•ò','Ï¥ùÏãúÏà†_bin3','ÎÇòÏù¥_3Íµ¨Í∞Ñ',
]
cat_cols = [c for c in explicit_cat if c in X.columns]
cat_cols = sorted(list(set(cat_cols + X.select_dtypes(include=["object"]).columns.tolist())))

num_cols = [c for c in X.columns if c not in cat_cols]

print(f"   Î≤îÏ£ºÌòï: {len(cat_cols)}Í∞ú")
print(f"   ÏàòÏπòÌòï: {len(num_cols)}Í∞ú")


# ============================================================
# 3) Preprocessing
# ============================================================
def prep_fold(X_tr, X_va, X_te):
    X_tr = X_tr.copy()
    X_va = X_va.copy()
    X_te = X_te.copy()

    for c in cat_cols:
        X_tr[c] = X_tr[c].astype("object").fillna("Unknown").astype(str)
        X_va[c] = X_va[c].astype("object").fillna("Unknown").astype(str)
        X_te[c] = X_te[c].astype("object").fillna("Unknown").astype(str)

    med = X_tr[num_cols].median(numeric_only=True)
    X_tr[num_cols] = X_tr[num_cols].fillna(med)
    X_va[num_cols] = X_va[num_cols].fillna(med)
    X_te[num_cols] = X_te[num_cols].fillna(med)

    return X_tr, X_va, X_te


# ============================================================
# 4) 3-Model CV: CatBoost + LightGBM + TabNet
# ============================================================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof_cb = np.zeros(len(X))
oof_lgb = np.zeros(len(X))
oof_tn = np.zeros(len(X))

pred_cb = np.zeros(len(X_test))
pred_lgb = np.zeros(len(X_test))
pred_tn = np.zeros(len(X_test))

print("\n" + "="*80)
print("3-MODEL ENSEMBLE: CatBoost + LightGBM + TabNet")
print("="*80)

for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), 1):
    print(f"\n{'='*80}")
    print(f"FOLD {fold}/5")
    print(f"{'='*80}")
    
    X_tr_raw, X_va_raw = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

    X_tr, X_va, X_te = prep_fold(X_tr_raw, X_va_raw, X_test)

    pos = (y_tr == 1).sum()
    neg = (y_tr == 0).sum()

    # ---------
    # CatBoost
    # ---------
    print("\n[CatBoost Training...]")
    cb_class_weights = [1.0, (neg / max(pos, 1))]

    train_pool = Pool(X_tr, y_tr, cat_features=cat_cols)
    valid_pool = Pool(X_va, y_va, cat_features=cat_cols)
    test_pool  = Pool(X_te, cat_features=cat_cols)

    cb = CatBoostClassifier(
        loss_function="Logloss",
        eval_metric="AUC",
        iterations=8000,
        learning_rate=0.03,
        depth=8,
        l2_leaf_reg=6.0,
        random_strength=1.0,
        subsample=0.8,
        rsm=0.8,
        class_weights=cb_class_weights,
        random_seed=42,
        verbose=0,
        allow_writing_files=False
    )
    cb.fit(train_pool, eval_set=valid_pool, use_best_model=True, early_stopping_rounds=300)

    oof_cb[va_idx] = cb.predict_proba(X_va)[:, 1]
    pred_cb += cb.predict_proba(X_te)[:, 1] / skf.n_splits

    auc_cb = roc_auc_score(y_va, oof_cb[va_idx])

    # ---------
    # LightGBM
    # ---------
    print("[LightGBM Training...]")
    X_tr_lgb = X_tr.copy()
    X_va_lgb = X_va.copy()
    X_te_lgb = X_te.copy()
    for c in cat_cols:
        X_tr_lgb[c] = X_tr_lgb[c].astype("category")
        X_va_lgb[c] = X_va_lgb[c].astype("category")
        X_te_lgb[c] = X_te_lgb[c].astype("category")

    lgb_params = {
        "objective": "binary",
        "metric": "auc",
        "learning_rate": 0.03,
        "num_leaves": 128,
        "min_data_in_leaf": 80,
        "feature_fraction": 0.85,
        "bagging_fraction": 0.85,
        "bagging_freq": 1,
        "lambda_l1": 0.0,
        "lambda_l2": 0.0,
        "verbosity": -1,
        "seed": 42,
        "scale_pos_weight": (neg / max(pos, 1)),
    }

    dtr = lgb.Dataset(X_tr_lgb, label=y_tr, categorical_feature=cat_cols, free_raw_data=False)
    dva = lgb.Dataset(X_va_lgb, label=y_va, categorical_feature=cat_cols, free_raw_data=False)

    lgbm = lgb.train(
        lgb_params,
        dtr,
        num_boost_round=10000,
        valid_sets=[dva],
        callbacks=[lgb.early_stopping(300, verbose=False)]
    )

    oof_lgb[va_idx] = lgbm.predict(X_va_lgb)
    pred_lgb += lgbm.predict(X_te_lgb) / skf.n_splits

    auc_lgb = roc_auc_score(y_va, oof_lgb[va_idx])

    # ---------
    # TabNet
    # ---------
    if TABNET_AVAILABLE:
        print("[TabNet Training...]")
        
        # Ïà´ÏûêÌòïÎßå ÏÇ¨Ïö© (TabNetÏùÄ Î≤îÏ£ºÌòï ÏßÅÏ†ë Ï≤òÎ¶¨ Ïñ¥Î†§ÏõÄ)
        X_tr_tn = X_tr[num_cols].values.astype(np.float32)
        X_va_tn = X_va[num_cols].values.astype(np.float32)
        X_te_tn = X_te[num_cols].values.astype(np.float32)
        
        y_tr_arr = y_tr.values
        y_va_arr = y_va.values
        
        tabnet = TabNetClassifier(
            n_d=64,
            n_a=64,
            n_steps=5,
            gamma=1.5,
            n_independent=2,
            n_shared=2,
            lambda_sparse=1e-4,
            optimizer_fn=torch.optim.Adam,
            optimizer_params=dict(lr=2e-2),
            scheduler_params={"step_size": 50, "gamma": 0.9},
            scheduler_fn=torch.optim.lr_scheduler.StepLR,
            mask_type='entmax',
            seed=42,
            verbose=0
        )
        
        tabnet.fit(
            X_tr_tn, y_tr_arr,
            eval_set=[(X_va_tn, y_va_arr)],
            max_epochs=200,
            patience=30,
            batch_size=1024,
            virtual_batch_size=128,
            eval_metric=['auc']
        )
        
        oof_tn[va_idx] = tabnet.predict_proba(X_va_tn)[:, 1]
        pred_tn += tabnet.predict_proba(X_te_tn)[:, 1] / skf.n_splits
        
        auc_tn = roc_auc_score(y_va, oof_tn[va_idx])
    else:
        auc_tn = 0.0

    # ---------
    # Fold summary
    # ---------
    if TABNET_AVAILABLE:
        oof_ens = (oof_cb[va_idx] + oof_lgb[va_idx] + oof_tn[va_idx]) / 3
        auc_ens = roc_auc_score(y_va, oof_ens)
        print(f"\n[Fold {fold}] CB: {auc_cb:.6f} | LGB: {auc_lgb:.6f} | TN: {auc_tn:.6f} | ENS: {auc_ens:.6f}")
    else:
        oof_ens = (oof_cb[va_idx] + oof_lgb[va_idx]) / 2
        auc_ens = roc_auc_score(y_va, oof_ens)
        print(f"\n[Fold {fold}] CB: {auc_cb:.6f} | LGB: {auc_lgb:.6f} | ENS: {auc_ens:.6f}")

# OOF Ï†ÑÏ≤¥
auc_cb_all  = roc_auc_score(y, oof_cb)
auc_lgb_all = roc_auc_score(y, oof_lgb)

if TABNET_AVAILABLE:
    auc_tn_all = roc_auc_score(y, oof_tn)
    auc_ens_all = roc_auc_score(y, (oof_cb + oof_lgb + oof_tn) / 3)
else:
    auc_tn_all = 0.0
    auc_ens_all = roc_auc_score(y, (oof_cb + oof_lgb) / 2)

print("\n" + "="*80)
print("FINAL OOF SCORES")
print("="*80)
print(f"CatBoost:  {auc_cb_all:.6f}")
print(f"LightGBM:  {auc_lgb_all:.6f}")
if TABNET_AVAILABLE:
    print(f"TabNet:    {auc_tn_all:.6f}")
    print(f"Ensemble:  {auc_ens_all:.6f} (3-model average)")
else:
    print(f"Ensemble:  {auc_ens_all:.6f} (2-model average)")

print(f"\nüéØ Previous Best: 0.740363")
print(f"üöÄ Improvement:   +{auc_ens_all - 0.740363:.6f}")


# ============================================================
# 5) Post-processing + Submission
# ============================================================
if TABNET_AVAILABLE:
    pred_test = (pred_cb + pred_lgb + pred_tn) / 3
else:
    pred_test = (pred_cb + pred_lgb) / 2

# üÜï ÌôïÏ†ï Ïã§Ìå® ÏºÄÏù¥Ïä§ Í∞ïÏ†ú 0
mask = test_fe['ÌôïÏ†ï_Ïã§Ìå®_ÏºÄÏù¥Ïä§'] == 1
if mask.sum() > 0:
    print(f"\n‚ö†Ô∏è  ÌôïÏ†ï Ïã§Ìå® ÏºÄÏù¥Ïä§ {mask.sum()}Í∞ú Î∞úÍ≤¨ ‚Üí Í∞ïÏ†úÎ°ú 0.0 ÏÑ§Ï†ï")
    pred_test[mask] = 0.0

out = sub.copy()
out[SUB_ID_COL] = test_fe[ID_COL].values
out[SUB_PRED_COL] = pred_test

os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)
out.to_csv(OUT_PATH, index=False)

print(f"\n‚úÖ Submission saved: {OUT_PATH}")
print(out.head())
print("\n" + "="*80)
print("ALL-IN-ONE COMPLETE! üéâ")
print("="*80)

‚úÖ Í≥†Í≤∞Ï∏° Î≥ÄÏàò 5Í∞ú Ï†úÍ±∞
‚úÖ Í≥†Í≤∞Ï∏° Î≥ÄÏàò 5Í∞ú Ï†úÍ±∞

‚úÖ Total Features: 86
   Î≤îÏ£ºÌòï: 24Í∞ú
   ÏàòÏπòÌòï: 62Í∞ú

3-MODEL ENSEMBLE: CatBoost + LightGBM + TabNet

FOLD 1/5


  X_tr[c] = X_tr[c].astype("object").fillna("Unknown").astype(str)
  X_va[c] = X_va[c].astype("object").fillna("Unknown").astype(str)
  X_te[c] = X_te[c].astype("object").fillna("Unknown").astype(str)



[CatBoost Training...]
[LightGBM Training...]
[TabNet Training...]


  from .autonotebook import tqdm as notebook_tqdm



Early stopping occurred at epoch 59 with best_epoch = 29 and best_val_0_auc = 0.72883





[Fold 1] CB: 0.738081 | LGB: 0.737187 | TN: 0.728828 | ENS: 0.737813

FOLD 2/5


  X_tr[c] = X_tr[c].astype("object").fillna("Unknown").astype(str)
  X_va[c] = X_va[c].astype("object").fillna("Unknown").astype(str)
  X_te[c] = X_te[c].astype("object").fillna("Unknown").astype(str)



[CatBoost Training...]
[LightGBM Training...]
[TabNet Training...]

Early stopping occurred at epoch 66 with best_epoch = 36 and best_val_0_auc = 0.73062





[Fold 2] CB: 0.743136 | LGB: 0.741501 | TN: 0.730623 | ENS: 0.741703

FOLD 3/5


  X_tr[c] = X_tr[c].astype("object").fillna("Unknown").astype(str)
  X_va[c] = X_va[c].astype("object").fillna("Unknown").astype(str)
  X_te[c] = X_te[c].astype("object").fillna("Unknown").astype(str)



[CatBoost Training...]
[LightGBM Training...]
[TabNet Training...]

Early stopping occurred at epoch 53 with best_epoch = 23 and best_val_0_auc = 0.73066





[Fold 3] CB: 0.740221 | LGB: 0.739427 | TN: 0.730657 | ENS: 0.740231

FOLD 4/5


  X_tr[c] = X_tr[c].astype("object").fillna("Unknown").astype(str)
  X_va[c] = X_va[c].astype("object").fillna("Unknown").astype(str)
  X_te[c] = X_te[c].astype("object").fillna("Unknown").astype(str)



[CatBoost Training...]
[LightGBM Training...]
[TabNet Training...]

Early stopping occurred at epoch 49 with best_epoch = 19 and best_val_0_auc = 0.72864





[Fold 4] CB: 0.738340 | LGB: 0.737460 | TN: 0.728641 | ENS: 0.738149

FOLD 5/5


  X_tr[c] = X_tr[c].astype("object").fillna("Unknown").astype(str)
  X_va[c] = X_va[c].astype("object").fillna("Unknown").astype(str)
  X_te[c] = X_te[c].astype("object").fillna("Unknown").astype(str)



[CatBoost Training...]
[LightGBM Training...]
[TabNet Training...]

Early stopping occurred at epoch 86 with best_epoch = 56 and best_val_0_auc = 0.72926





[Fold 5] CB: 0.740367 | LGB: 0.739580 | TN: 0.729257 | ENS: 0.739604

FINAL OOF SCORES
CatBoost:  0.740026
LightGBM:  0.739016
TabNet:    0.728803
Ensemble:  0.739417 (3-model average)

üéØ Previous Best: 0.740363
üöÄ Improvement:   +-0.000946

‚ö†Ô∏è  ÌôïÏ†ï Ïã§Ìå® ÏºÄÏù¥Ïä§ 4205Í∞ú Î∞úÍ≤¨ ‚Üí Í∞ïÏ†úÎ°ú 0.0 ÏÑ§Ï†ï

‚úÖ Submission saved: ../outputs/06_ML&DL_ensemble_all.csv
           ID  probability
0  TEST_00000     0.003823
1  TEST_00001     0.006125
2  TEST_00002     0.254410
3  TEST_00003     0.183719
4  TEST_00004     0.674107

ALL-IN-ONE COMPLETE! üéâ
