입력 구성:

범주형 컬럼 → train-fold 기준 정수 인코딩(UNK=0) → embedding

연속형 컬럼 → train-fold 기준 StandardScaler

(중요) 모든 인코딩/스케일링은 fold의 train만으로 fit → OOF 누수 방지

모델 3개:

FT-Transformer(1순위)

MLP(LayerNorm)

MLP(StrongReg: 높은 dropout + numeric noise)

평가: StratifiedKFold 5fold로 OOF AUC 계산

Seed Ensemble: SEEDS 여러 개로 돌린 뒤 OOF/TEST 평균으로 최종 성능 산출

In [None]:
# ============================================================
# 10번: FT-Transformer + MLP(2종) 딥러닝 파이프라인 (OOF-safe)
# - categorical embedding + numeric
# - StratifiedKFold OOF AUC
# - seed ensemble 지원
# - outputs에 10_ 접두사 저장
# ============================================================

import os, gc, re, random, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

# -------------------------
# Config
# -------------------------
# Input CSVs are read from DATA_DIR; every artifact is written under OUT_DIR.
DATA_DIR = "../data"
OUT_DIR  = "../outputs"
os.makedirs(OUT_DIR, exist_ok=True)  # create the output directory up front

# Tag embedded in every output file name produced by save_pack().
EXP_PREFIX = "10_dl_ft_mlp"
# Train on the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Name of the binary label column and of the row-identifier column.
TARGET_COL = "임신 성공 여부"
ID_COL = "ID"

N_FOLDS = 5            # number of StratifiedKFold splits
EPOCHS = 40            # maximum number of epochs per fold
BATCH_SIZE = 512       # batch size for train/valid/test loaders
LR = 3e-4              # AdamW learning rate
WEIGHT_DECAY = 1e-4    # AdamW weight decay
PATIENCE = 6           # epochs without validation-AUC improvement before stopping
NUM_WORKERS = 0        # DataLoader worker processes

# Seed ensemble: one full CV run per seed, predictions are averaged afterwards.
SEEDS = [42, 123, 777]   # extend the list (e.g. [42,202,777,1024]) for a larger ensemble

# -------------------------
# Utils
# -------------------------
def set_seed(seed: int):
    """Seed every RNG used by the pipeline and pin cuDNN to deterministic mode.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU and
    all CUDA devices), and disables cuDNN autotuning.
    """
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

def sigmoid_np(x):
    """Return the logistic sigmoid of ``x`` without overflowing.

    The naive ``1 / (1 + exp(-x))`` overflows in ``exp`` for large negative
    inputs (already around -88 for float32 logits). Here the exponential is
    only ever taken of a non-positive number, so it stays within [0, 1].

    Args:
        x: scalar or array-like of logits. Floating dtypes are preserved;
            anything else is converted to float64.

    Returns:
        Sigmoid values with the same shape as ``x`` (a NumPy scalar for
        scalar input). NaN inputs propagate as NaN.
    """
    x = np.asarray(x)
    if not np.issubdtype(x.dtype, np.floating):
        x = x.astype(float)

    # exp(-|x|) is in (0, 1], so neither branch below can overflow.
    e = np.exp(-np.abs(x))
    out = np.where(x >= 0, 1 / (1 + e), e / (1 + e))
    # Indexing with () turns a 0-d result back into a scalar and is a no-op
    # for real arrays.
    return out[()]

# -------------------------
# 1) Feature Engineering (AutoGluon 03 기반 그대로)
# -------------------------
def preprocess(df):
    """Return a copy of ``df`` with the engineered feature columns added.

    The input frame is not modified. All derived columns are row-wise
    functions of existing columns (no statistics are fitted), so the same
    function is applied to both the train and the test frame.
    """
    df_copy = df.copy()

    # Coarse procedure category derived from the specific procedure type.
    def major_procedure(x):
        if pd.isna(x):
            return "Unknown"
        # Substring checks are order-sensitive: the first match wins.
        if "IUI" in x: return "IUI"
        if "DI" in x:  return "Other"
        if "ICSI" in x:return "ICSI"
        if "IVF" in x: return "IVF"
        return "Other"
    df_copy["시술_대분류"] = df_copy["특정 시술 유형"].apply(major_procedure)

    # Flag: the procedure type string mentions BLASTOCYST.
    df_copy["BLASTOCYST_포함"] = df_copy["특정 시술 유형"].astype(str).str.contains("BLASTOCYST", na=False).astype(int)

    # Embryo-transfer indicator, based on structural missingness: a row in
    # which every column below is NaN is treated as never reaching the
    # embryo stage.
    embryo_stage_cols = [
        "단일 배아 이식 여부", "착상 전 유전 진단 사용 여부", "배아 생성 주요 이유",
        "총 생성 배아 수", "미세주입된 난자 수", "미세주입에서 생성된 배아 수",
        "이식된 배아 수", "미세주입 배아 이식 수", "저장된 배아 수",
        "미세주입 후 저장된 배아 수", "해동된 배아 수", "해동 난자 수",
        "수집된 신선 난자 수", "저장된 신선 난자 수", "혼합된 난자 수",
        "파트너 정자와 혼합된 난자 수", "기증자 정자와 혼합된 난자 수",
        "동결 배아 사용 여부", "신선 배아 사용 여부", "기증 배아 사용 여부", "대리모 여부",
    ]
    df_copy["배아_이식_미도달"] = df_copy[embryo_stage_cols].isna().all(axis=1).astype(int)
    df_copy["배아_이식_여부"] = 1 - df_copy["배아_이식_미도달"]

    # Four-level progress stage; the checks are ordered from earliest to
    # latest stage and the first failing one determines the label.
    def embryo_stage(row):
        if row["배아_이식_여부"] == 0:
            return "배아단계_미도달"
        elif pd.isna(row["총 생성 배아 수"]) or row["총 생성 배아 수"] == 0:
            return "배아생성_실패"
        elif pd.isna(row["이식된 배아 수"]) or row["이식된 배아 수"] == 0:
            return "이식_미실시"
        else:
            return "이식_완료"
    df_copy["배아_진행_단계"] = df_copy.apply(embryo_stage, axis=1)

    # Total procedure count collapsed into three bins. Any value other than
    # "0회", "1회" or "2회" (including missing values) lands in the last bin.
    def collapse_trials(x):
        if x == "0회": return "0회"
        elif x in ["1회","2회"]: return "1–2회"
        else: return "3회 이상"
    df_copy["총시술_bin3"] = df_copy["총 시술 횟수"].apply(collapse_trials)

    # Age collapsed into three groups plus "Unknown". Any value not listed
    # explicitly (including missing values) falls into the oldest group.
    def age_group_simple(age):
        if age == "알 수 없음": return "Unknown"
        elif age == "만18-34세": return "34세 이하"
        elif age in ["만35-37세","만38-39세"]: return "35-39세"
        else: return "40세 이상"
    df_copy["나이_3구간"] = df_copy["시술 당시 나이"].apply(age_group_simple)

    # Number of transferred embryos binned into 0 / 1-2 / 3+; NaN counts as 0.
    def embryo_count_bin(count):
        if pd.isna(count) or count == 0: return "0개"
        elif count <= 2: return "1-2개"
        else: return "3개 이상"
    df_copy["이식배아_구간"] = df_copy["이식된 배아 수"].apply(embryo_count_bin)

    # Flag: transfer happened on day 5 (NaN compares unequal, so it maps to 0).
    df_copy["Day5_이식_여부"] = (df_copy["배아 이식 경과일"] == 5.0).astype(int)

    # Infertility-cause complexity: row-wise sum over the cause columns.
    # NOTE(review): presumably these are 0/1 indicator columns - confirm.
    infertility_cols = [
        "남성 주 불임 원인", "남성 부 불임 원인", "여성 주 불임 원인", "여성 부 불임 원인",
        "부부 주 불임 원인", "부부 부 불임 원인", "불명확 불임 원인",
        "불임 원인 - 난관 질환", "불임 원인 - 남성 요인", "불임 원인 - 배란 장애",
        "불임 원인 - 여성 요인", "불임 원인 - 자궁경부 문제", "불임 원인 - 자궁내막증",
        "불임 원인 - 정자 농도", "불임 원인 - 정자 면역학적 요인", "불임 원인 - 정자 운동성",
        "불임 원인 - 정자 형태"
    ]
    df_copy["불임_원인_개수"] = df_copy[infertility_cols].sum(axis=1)

    def infertility_complexity(count):
        if count == 0: return "None"
        elif count == 1: return "Single"
        elif count == 2: return "Double"
        else: return "Multiple"
    df_copy["불임원인_복잡도"] = df_copy["불임_원인_개수"].apply(infertility_complexity)

    # Flag: an embryo-thaw day is recorded (i.e. the column is not NaN).
    df_copy["배아_해동_실시_여부"] = df_copy["배아 해동 경과일"].notna().astype(int)

    # Ratio features; the +1 in each denominator avoids division by zero.
    # NaN operands propagate, so these columns can contain NaN.
    df_copy["배아_생성_효율"] = df_copy["총 생성 배아 수"] / (df_copy["수집된 신선 난자 수"] + 1)
    df_copy["배아_이식_비율"] = df_copy["이식된 배아 수"] / (df_copy["총 생성 배아 수"] + 1)
    df_copy["배아_저장_비율"] = df_copy["저장된 배아 수"] / (df_copy["총 생성 배아 수"] + 1)

    # Interaction features: string concatenation of two categorical values.
    df_copy["나이×Day5"] = df_copy["시술 당시 나이"].astype(str) + "_" + df_copy["Day5_이식_여부"].astype(str)
    df_copy["시술횟수×나이"] = df_copy["총시술_bin3"].astype(str) + "_" + df_copy["나이_3구간"].astype(str)

    return df_copy

# -------------------------
# 2) Build categorical/numeric lists
# -------------------------
def split_cols(df: pd.DataFrame, target_col: str):
    """Split the feature columns of ``df`` into categorical and numeric lists.

    Object, categorical and pandas string dtypes are treated as categorical;
    every other dtype (bool/int/float/...) is treated as numeric. The dtype
    is inspected through the pandas type API rather than by its printed
    name, so string-typed columns (``"string"`` / ``"str"`` dtypes) are not
    mistaken for numeric ones.

    Args:
        df: frame holding the features and, optionally, the target.
        target_col: column name to exclude from both lists.

    Returns:
        ``(cat_cols, num_cols)``, each in the original column order.
    """
    def _is_categorical(dtype) -> bool:
        return (
            isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        )

    cat_cols, num_cols = [], []
    for c in df.columns:
        if c == target_col:
            continue
        bucket = cat_cols if _is_categorical(df[c].dtype) else num_cols
        bucket.append(c)
    return cat_cols, num_cols

# -------------------------
# 3) Encode categorical (TRAIN-only, fold-safe) + Scale numeric (TRAIN-only)
# -------------------------
def _scale_numeric_fold(train_df, valid_df, test_df, num_cols):
    """Standardise numeric columns with train-fold statistics and impute NaN."""
    frames = (train_df, valid_df, test_df)
    if not len(num_cols):
        # StandardScaler cannot be fitted on zero features; hand back
        # zero-width matrices so downstream shapes stay consistent.
        return tuple(np.zeros((len(d), 0), dtype=np.float64) for d in frames)

    scaler = StandardScaler()
    scaled = [scaler.fit_transform(train_df[num_cols].astype(float))]
    scaled += [scaler.transform(d[num_cols].astype(float)) for d in (valid_df, test_df)]

    # StandardScaler ignores NaN when fitting and keeps NaN when transforming.
    # A single NaN reaching the network turns the loss and logits into NaN,
    # so replace it with 0, i.e. the train-fold mean after standardisation.
    for mat in scaled:
        mat[np.isnan(mat)] = 0.0
    return tuple(scaled)


def _encode_with_vocab(values, vocab):
    """Map raw category values to integer codes; values not in ``vocab`` -> 0."""
    return np.fromiter((vocab.get(v, 0) for v in values), dtype=np.int64, count=len(values))


def fit_transform_fold(train_df, valid_df, test_df, cat_cols, num_cols):
    """Encode categoricals and scale numerics using the train fold only.

    Nothing is fitted on the validation or test rows, which keeps the
    out-of-fold predictions free of leakage.

    Numeric columns are standardised with a ``StandardScaler`` fitted on
    ``train_df``; missing values are then set to 0 (the train-fold mean in
    scaled space). Categorical columns are integer-encoded from the values
    seen in ``train_df``: missing values become the literal category "NA",
    code 0 is reserved for values unseen in the train fold, and known values
    are numbered from 1 in order of first appearance.

    Args:
        train_df, valid_df, test_df: frames containing ``cat_cols`` and
            ``num_cols``.
        cat_cols: names of the categorical columns (may be empty).
        num_cols: names of the numeric columns (may be empty).

    Returns:
        ``(Xtr_cat, Xtr_num, Xva_cat, Xva_num, Xte_cat, Xte_num,
        cat_cardinalities)`` where the ``*_cat`` arrays are int64 of shape
        ``(n_rows, len(cat_cols))``, the ``*_num`` arrays are float of shape
        ``(n_rows, len(num_cols))`` and ``cat_cardinalities[i]`` is the
        embedding table size for ``cat_cols[i]`` (known values + 1 for UNK).
    """
    Xtr_num, Xva_num, Xte_num = _scale_numeric_fold(train_df, valid_df, test_df, num_cols)

    frames = (train_df, valid_df, test_df)
    encoded = ([], [], [])  # per-split lists of encoded columns
    cat_cardinalities = []

    for c in cat_cols:
        raw = [d[c].astype("object").fillna("NA").values for d in frames]

        # Vocabulary comes from the train fold only; index 0 is kept for UNK.
        vocab = {v: i + 1 for i, v in enumerate(pd.unique(raw[0]))}
        cat_cardinalities.append(len(vocab) + 1)

        for bucket, values in zip(encoded, raw):
            bucket.append(_encode_with_vocab(values, vocab))

    if len(cat_cols):
        Xtr_cat, Xva_cat, Xte_cat = (np.stack(cols, axis=1) for cols in encoded)
    else:
        Xtr_cat, Xva_cat, Xte_cat = (
            np.zeros((len(d), 0), dtype=np.int64) for d in frames
        )

    return (Xtr_cat, Xtr_num, Xva_cat, Xva_num, Xte_cat, Xte_num, cat_cardinalities)

# -------------------------
# 4) Dataset
# -------------------------
class TabDataset(Dataset):
    """In-memory tabular dataset of (categorical codes, numeric features[, label]).

    Inputs are copied into tensors once at construction: categorical codes as
    ``long``, numeric features and labels as ``float32``. When ``y`` is
    omitted, items are 2-tuples without a label.
    """

    def __init__(self, X_cat, X_num, y=None):
        self.X_cat = torch.tensor(X_cat, dtype=torch.long)
        self.X_num = torch.tensor(X_num, dtype=torch.float32)
        if y is None:
            self.y = None
        else:
            self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        # The numeric matrix always has one row per sample, even when it has
        # zero columns.
        return len(self.X_num)

    def __getitem__(self, idx):
        features = (self.X_cat[idx], self.X_num[idx])
        if self.y is not None:
            return features + (self.y[idx],)
        return features

# -------------------------
# 5) Models
# -------------------------
class MLPWithEmbeddings(nn.Module):
    """MLP over concatenated categorical embeddings and numeric features.

    Each categorical column gets its own embedding table whose width is
    ``min(emb_dim, max(2, round(2 * cardinality ** 0.25)))``. The embeddings
    and the numeric vector are concatenated and fed through a stack of
    ``Linear -> [LayerNorm] -> ReLU -> Dropout`` blocks followed by a single
    output unit.

    Args:
        cat_cardinalities: embedding table size per categorical column.
        num_dim: number of numeric features.
        emb_dim: upper bound on the per-column embedding width.
        hidden: sizes of the hidden layers (any iterable of ints; the default
            is an immutable tuple so it cannot be shared and mutated between
            instances).
        dropout: dropout probability after every hidden layer.
        layernorm: insert a ``LayerNorm`` after every hidden ``Linear``.
        noise_std: standard deviation of Gaussian noise added to the numeric
            inputs in training mode only; 0 disables it.
    """

    def __init__(self, cat_cardinalities, num_dim, emb_dim=16,
                 hidden=(256, 128), dropout=0.2, layernorm=False, noise_std=0.0):
        super().__init__()
        self.noise_std = noise_std

        self.embs = nn.ModuleList()
        emb_out = 0
        for card in cat_cardinalities:
            # Width grows slowly with cardinality and is capped by emb_dim.
            d = min(emb_dim, max(2, int(round(card**0.25 * 2))))
            self.embs.append(nn.Embedding(card, d))
            emb_out += d

        in_dim = emb_out + num_dim
        layers = []
        for h in hidden:
            layers.append(nn.Linear(in_dim, h))
            if layernorm:
                layers.append(nn.LayerNorm(h))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            in_dim = h
        layers.append(nn.Linear(in_dim, 1))
        self.net = nn.Sequential(*layers)

    def forward(self, x_cat, x_num):
        """Return one logit per row as a tensor of shape ``(batch,)``."""
        # Input-noise regularisation is applied while training only.
        if self.noise_std > 0 and self.training:
            x_num = x_num + torch.randn_like(x_num) * self.noise_std

        if x_cat.shape[1] > 0:
            embs = [emb(x_cat[:, i]) for i, emb in enumerate(self.embs)]
            x = torch.cat(embs + [x_num], dim=1)
        else:
            # No categorical columns: the numeric features are the whole input.
            x = x_num
        return self.net(x).squeeze(1)

# FT-Transformer (compact implementation: feature tokenization + transformer encoder)
class FTTransformer(nn.Module):
    """Transformer encoder over feature tokens with a [CLS] read-out.

    Every categorical column contributes one token (its embedding). All
    numeric features together are projected into a single additional token.
    A learned [CLS] token is prepended and its encoded state is mapped to one
    logit per row.
    """

    def __init__(self, cat_cardinalities, num_dim,
                 d_token=192, n_heads=8, n_layers=3, dropout=0.1):
        super().__init__()
        self.num_dim = num_dim
        self.n_cat = len(cat_cardinalities)

        # One d_token-wide embedding table per categorical column.
        self.cat_embs = nn.ModuleList(
            [nn.Embedding(card, d_token) for card in cat_cardinalities]
        )

        # The numeric features are not tokenized one by one; the whole
        # numeric vector is projected into a single token.
        self.num_proj = nn.Linear(num_dim, d_token)

        # Learned [CLS] token, initialised to zeros.
        self.cls = nn.Parameter(torch.zeros(1, 1, d_token))

        block = nn.TransformerEncoderLayer(
            d_model=d_token,
            nhead=n_heads,
            dim_feedforward=d_token*4,
            dropout=dropout,
            activation="gelu",
            batch_first=True,
            norm_first=True,
        )
        self.encoder = nn.TransformerEncoder(block, num_layers=n_layers)
        self.head = nn.Sequential(
            nn.LayerNorm(d_token),
            nn.Linear(d_token, 1),
        )

    def forward(self, x_cat, x_num):
        """Return one logit per row as a tensor of shape ``(batch,)``."""
        batch = x_num.size(0)

        # Sequence layout: [CLS], one token per categorical column, numeric token.
        pieces = [self.cls.expand(batch, 1, -1)]
        if self.n_cat > 0:
            pieces += [
                table(x_cat[:, j]).unsqueeze(1)
                for j, table in enumerate(self.cat_embs)
            ]
        pieces.append(self.num_proj(x_num).unsqueeze(1))

        seq = torch.cat(pieces, dim=1)      # (B, 1 + n_cat + 1, d)
        encoded = self.encoder(seq)
        return self.head(encoded[:, 0]).squeeze(1)

# -------------------------
# 6) Train/Eval loop (OOF)
# -------------------------
def train_one_model(model, train_loader, valid_loader, seed, lr=LR, wd=WEIGHT_DECAY):
    """Train ``model`` with AdamW + BCE and early stopping on validation AUC.

    Runs for at most ``EPOCHS`` epochs. After every epoch the validation AUC
    is computed; training stops once it has failed to improve by more than
    1e-5 for ``PATIENCE`` consecutive epochs. The weights of the best epoch
    are restored before returning.

    Args:
        model: module called as ``model(x_cat, x_num)`` and returning logits.
        train_loader: yields ``(x_cat, x_num, y)`` batches.
        valid_loader: yields ``(x_cat, x_num, y)`` batches.
        seed: accepted for interface compatibility; not used inside this
            function (seeding is done by the caller).
        lr: AdamW learning rate.
        wd: AdamW weight decay.

    Returns:
        ``(model, best_auc)`` with the best-epoch weights loaded.
    """
    model.to(DEVICE)
    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)
    best_auc = -1
    best_state = None
    patience = 0

    for epoch in range(1, EPOCHS+1):
        model.train()
        for xcat, xnum, y in train_loader:
            xcat, xnum, y = xcat.to(DEVICE), xnum.to(DEVICE), y.to(DEVICE)

            opt.zero_grad()
            logit = model(xcat, xnum)
            loss = F.binary_cross_entropy_with_logits(logit, y)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()
            # The per-batch loss is deliberately not read back with .item():
            # nothing consumed it, and the call forces a host/device sync on
            # every step.

        # Validation pass: collect logits and labels on the CPU.
        model.eval()
        va_logits = []
        va_y = []
        with torch.no_grad():
            for xcat, xnum, y in valid_loader:
                xcat, xnum = xcat.to(DEVICE), xnum.to(DEVICE)
                logit = model(xcat, xnum)
                va_logits.append(logit.detach().cpu().numpy())
                va_y.append(y.numpy())
        va_logits = np.concatenate(va_logits)
        va_y = np.concatenate(va_y)
        va_proba = sigmoid_np(va_logits)
        auc = roc_auc_score(va_y, va_proba)

        if auc > best_auc + 1e-5:
            best_auc = auc
            # Snapshot on the CPU so the best weights survive further updates
            # without holding a second copy on the training device.
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            patience = 0
        else:
            patience += 1
            if patience >= PATIENCE:
                break

    if best_state is not None:
        model.load_state_dict(best_state)
    return model, best_auc

@torch.no_grad()
def predict_proba(model, loader):
    """Return sigmoid probabilities for every row served by ``loader``.

    Batches may be ``(x_cat, x_num)`` or ``(x_cat, x_num, y)``; a label, if
    present, is ignored. The model is switched to eval mode.
    """
    model.eval()
    collected = []
    for batch in loader:
        if len(batch) != 2:
            xcat, xnum, _unused_label = batch
        else:
            xcat, xnum = batch
        batch_logits = model(xcat.to(DEVICE), xnum.to(DEVICE))
        collected.append(batch_logits.detach().cpu().numpy())
    return sigmoid_np(np.concatenate(collected))

def run_cv(model_name: str, build_model_fn, df_train, df_test, cat_cols, num_cols, seeds=SEEDS):
    """Run stratified K-fold CV once per seed and average the predictions.

    The fold assignment is fixed (``random_state=42``) so every seed scores
    the same validation rows. For each fold the encoders/scaler are fitted on
    the training part only, a fresh model is built with
    ``build_model_fn(cat_cardinalities, num_dim)``, trained, and used to fill
    the out-of-fold vector and to add ``1 / N_FOLDS`` of its test prediction.

    Returns:
        dict with keys ``model_name``, ``oof_by_seed``, ``test_by_seed``,
        ``auc_by_seed``, ``oof_ens``, ``test_ens`` and ``auc_ens``.
    """
    def _loader(dataset, shuffle):
        return DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=shuffle, num_workers=NUM_WORKERS)

    def _labels(frame):
        return frame[TARGET_COL].astype(int).values

    y = _labels(df_train)
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

    per_seed_oof, per_seed_test, per_seed_auc = [], [], []

    for seed in seeds:
        set_seed(seed)
        oof = np.zeros(len(df_train), dtype=float)
        test_pred_seed = np.zeros(len(df_test), dtype=float)

        for tr_idx, va_idx in skf.split(df_train, y):
            tr_df = df_train.iloc[tr_idx].reset_index(drop=True)
            va_df = df_train.iloc[va_idx].reset_index(drop=True)

            # Encoding and scaling are fitted on this fold's training rows only.
            fold_arrays = fit_transform_fold(tr_df, va_df, df_test, cat_cols, num_cols)
            Xtr_cat, Xtr_num, Xva_cat, Xva_num, Xte_cat, Xte_num, cat_cards = fold_arrays

            tr_ds = TabDataset(Xtr_cat, Xtr_num, _labels(tr_df))
            va_ds = TabDataset(Xva_cat, Xva_num, _labels(va_df))
            te_ds = TabDataset(Xte_cat, Xte_num, None)

            tr_loader = _loader(tr_ds, True)
            va_loader = _loader(va_ds, False)
            te_loader = _loader(te_ds, False)

            model = build_model_fn(cat_cards, Xtr_num.shape[1])
            model, best_auc = train_one_model(model, tr_loader, va_loader, seed)

            # Out-of-fold predictions for this fold's validation rows.
            oof[va_idx] = predict_proba(model, va_loader)

            # Test prediction is the mean over folds.
            te_proba = predict_proba(model, te_loader)
            test_pred_seed += te_proba / N_FOLDS

            # Release the fold's model and data before building the next one.
            del model, tr_loader, va_loader, te_loader, tr_ds, va_ds, te_ds
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        seed_auc = roc_auc_score(y, oof)
        print(f"[{model_name}] seed={seed} OOF AUC = {seed_auc:.6f}")
        per_seed_auc.append(seed_auc)
        per_seed_oof.append(oof)
        per_seed_test.append(test_pred_seed)

    # Seed ensemble: plain mean over the per-seed prediction vectors.
    oof_ens = np.stack(per_seed_oof, axis=0).mean(axis=0)
    test_ens = np.stack(per_seed_test, axis=0).mean(axis=0)
    ens_auc = roc_auc_score(y, oof_ens)
    print(f"\n[{model_name}] SEED-ENSEMBLE OOF AUC = {ens_auc:.6f} (seeds={seeds})\n")

    return dict(
        model_name=model_name,
        oof_by_seed=per_seed_oof,
        test_by_seed=per_seed_test,
        auc_by_seed=per_seed_auc,
        oof_ens=oof_ens,
        test_ens=test_ens,
        auc_ens=ens_auc,
    )

# -------------------------
# 7) Main
# -------------------------
print("="*80)
print("Load & Preprocess")
print("="*80)

# Raw competition files: train/test features and the submission template.
train_raw = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test_raw  = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
sub       = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))

# The same row-wise feature engineering is applied to both frames.
train_df = preprocess(train_raw)
test_df  = preprocess(test_raw)

# Drop the ID column and 배아_이식_미도달 (which was also removed in experiment 03).
drop_cols = [ID_COL, "배아_이식_미도달"]
train_df = train_df.drop(columns=[c for c in drop_cols if c in train_df.columns])
test_df  = test_df.drop(columns=[c for c in drop_cols if c in test_df.columns])

# Column typing is decided on the train frame; the target is excluded.
cat_cols, num_cols = split_cols(train_df, TARGET_COL)
print(f"cat_cols={len(cat_cols)}, num_cols={len(num_cols)}")
print("target rate:", train_df[TARGET_COL].mean())

# -------------------------
# (B) FT-Transformer
# -------------------------
def build_ft(cat_cards, num_dim):
    """Build the FT-Transformer configuration used in this experiment."""
    ft_kwargs = dict(d_token=192, n_heads=8, n_layers=3, dropout=0.1)
    return FTTransformer(cat_cardinalities=cat_cards, num_dim=num_dim, **ft_kwargs)

res_ft = run_cv("FTTransformer", build_ft, train_df, test_df, cat_cols, num_cols, seeds=SEEDS)

# -------------------------
# (C-1) MLP LayerNorm형
# -------------------------
def build_mlp_ln(cat_cards, num_dim):
    """Build the LayerNorm MLP variant: moderate dropout, no input noise."""
    mlp_kwargs = dict(
        emb_dim=16,
        hidden=[512, 256, 128],
        dropout=0.25,
        layernorm=True,
        noise_std=0.0,
    )
    return MLPWithEmbeddings(cat_cardinalities=cat_cards, num_dim=num_dim, **mlp_kwargs)

res_mlp_ln = run_cv("MLP_LayerNorm", build_mlp_ln, train_df, test_df, cat_cols, num_cols, seeds=SEEDS)

# -------------------------
# (C-2) MLP StrongReg형
# -------------------------
def build_mlp_reg(cat_cards, num_dim):
    """Build the strongly regularised MLP variant: heavy dropout plus input noise."""
    mlp_kwargs = dict(
        emb_dim=16,
        hidden=[512, 256, 128],
        dropout=0.45,      # strong dropout
        layernorm=False,
        noise_std=0.03,    # Gaussian noise on numeric inputs (training only)
    )
    return MLPWithEmbeddings(cat_cardinalities=cat_cards, num_dim=num_dim, **mlp_kwargs)

res_mlp_reg = run_cv("MLP_StrongReg", build_mlp_reg, train_df, test_df, cat_cols, num_cols, seeds=SEEDS)

# -------------------------
# 8) Save outputs (10_ prefix)
# -------------------------
def save_pack(res):
    """Write the ensemble OOF/test arrays, a submission CSV and a text summary.

    ``res`` is the dict returned by ``run_cv``. All four files go to
    ``OUT_DIR`` with a ``10_`` prefix; their paths are printed at the end.
    """
    name = res["model_name"]

    def _out(filename):
        return os.path.join(OUT_DIR, filename)

    # Ensemble OOF / test predictions as .npy
    oof_path = _out(f"10_oof_{EXP_PREFIX}_{name}.npy")
    test_path = _out(f"10_test_{EXP_PREFIX}_{name}.npy")
    np.save(oof_path, res["oof_ens"])
    np.save(test_path, res["test_ens"])

    # Submission file: the template with the probability column filled in.
    sub2 = sub.copy()
    sub2["probability"] = res["test_ens"]
    sub_path = _out(f"10_submission_{EXP_PREFIX}_{name}_AUC{res['auc_ens']:.6f}.csv")
    sub2.to_csv(sub_path, index=False)

    # Plain-text run summary.
    summ_path = _out(f"10_summary_{EXP_PREFIX}_{name}.txt")
    with open(summ_path, "w", encoding="utf-8") as f:
        f.writelines([
            f"model={name}\n",
            f"seeds={SEEDS}\n",
            f"folds={N_FOLDS}\n",
            f"ensemble_oof_auc={res['auc_ens']:.6f}\n",
            f"auc_by_seed={res['auc_by_seed']}\n",
            f"cat_cols={len(cat_cols)}, num_cols={len(num_cols)}\n",
        ])

    print(f"[saved] {name}:")
    for written in (oof_path, test_path, sub_path, summ_path):
        print(" -", written)

# Persist the artifacts of all three models, one file set per model.
save_pack(res_ft)
save_pack(res_mlp_ln)
save_pack(res_mlp_reg)

print("\nDONE. outputs 폴더에 10_ 파일들 생성 완료.")
