In [106]:
import warnings
warnings.filterwarnings("ignore")

In [85]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

In [86]:
# =====================================================
# 1. 데이터 로드
# =====================================================
train_df = pd.read_csv("./data/train.csv")
test_df  = pd.read_csv("./data/test.csv")
submission = pd.read_csv("./data/sample_submission.csv")

In [87]:
train_df.head(3)

Unnamed: 0,ID,시술 시기 코드,시술 당시 나이,임신 시도 또는 마지막 임신 경과 연수,시술 유형,특정 시술 유형,배란 자극 여부,배란 유도 유형,단일 배아 이식 여부,착상 전 유전 검사 사용 여부,...,기증 배아 사용 여부,대리모 여부,PGD 시술 여부,PGS 시술 여부,난자 채취 경과일,난자 해동 경과일,난자 혼합 경과일,배아 이식 경과일,배아 해동 경과일,임신 성공 여부
0,TRAIN_000000,TRZKPL,만18-34세,,IVF,ICSI,1,기록되지 않은 시행,0.0,,...,0.0,0.0,,,0.0,,0.0,3.0,,0
1,TRAIN_000001,TRYBLT,만45-50세,,IVF,ICSI,0,알 수 없음,0.0,,...,0.0,0.0,,,0.0,,0.0,,,0
2,TRAIN_000002,TRVNRY,만18-34세,,IVF,IVF,1,기록되지 않은 시행,0.0,,...,0.0,0.0,,,0.0,,0.0,2.0,,0


In [88]:
train_df.isna().mean()

ID                       0.000000
시술 시기 코드                 0.000000
시술 당시 나이                 0.000000
임신 시도 또는 마지막 임신 경과 연수    0.963449
시술 유형                    0.000000
                           ...   
난자 해동 경과일                0.994398
난자 혼합 경과일                0.209615
배아 이식 경과일                0.169947
배아 해동 경과일                0.842525
임신 성공 여부                 0.000000
Length: 69, dtype: float64

In [98]:
# 1. 시술 대비 임신/출산 비율
def feature_success_rate(df):
    df = df.copy()
    df['IVF_success_rate'] = df['IVF 임신 횟수'].astype(float) / (df['IVF 시술 횟수'].astype(float) + 1e-5)
    df['DI_success_rate'] = df['DI 임신 횟수'].astype(float) / (df['DI 시술 횟수'].astype(float) + 1e-5)
    df['total_birth_rate'] = df['총 출산 횟수'].astype(float) / (df['총 시술 횟수'].astype(float) + 1e-5)
    return df

# 2. 난자/배아 관련 상호작용
def feature_embryo_interaction(df):
    df = df.copy()
    df['embryo_transfer_rate'] = df['이식된 배아 수'].astype(float) / (df['총 생성 배아 수'].astype(float) + 1e-5)
    df['micro_inj_transfer_rate'] = df['미세주입 배아 이식 수'].astype(float) / (df['미세주입에서 생성된 배아 수'].astype(float) + 1e-5)
    df['frozen_use_rate'] = df['동결 배아 사용 여부'] / (df['저장된 배아 수'].astype(float) + 1e-5)
    return df

# 3. 연령 관련 숫자화
def feature_age_numeric(df):
    df = df.copy()
    def age_str_to_mid(age_str):
        if age_str in ['알 수 없음', 'Unknown']:
            return np.nan
        try:
            parts = age_str.replace('만','').replace('세','').split('-')
            return (int(parts[0]) + int(parts[1])) / 2
        except:
            return np.nan

    df['시술 당시 나이_num'] = df['시술 당시 나이'].apply(age_str_to_mid)
    df['난자 기증자 나이_num'] = df['난자 기증자 나이'].apply(age_str_to_mid)
    df['정자 기증자 나이_num'] = df['정자 기증자 나이'].apply(age_str_to_mid)
    return df

# 4. 불임 원인 합계
def feature_total_infertility(df):
    df = df.copy()
    infertility_cols = [
        '남성 주 불임 원인','남성 부 불임 원인','여성 주 불임 원인','여성 부 불임 원인',
        '부부 주 불임 원인','부부 부 불임 원인','불명확 불임 원인','불임 원인 - 난관 질환',
        '불임 원인 - 남성 요인','불임 원인 - 배란 장애','불임 원인 - 자궁경부 문제',
        '불임 원인 - 자궁내막증','불임 원인 - 정자 농도','불임 원인 - 정자 면역학적 요인',
        '불임 원인 - 정자 운동성','불임 원인 - 정자 형태'
    ]
    df['total_infertility_factors'] = df[infertility_cols].sum(axis=1)
    return df

# 5. 시술 타입 + 배란 자극 여부 결합
def feature_type_stim(df):
    df = df.copy()
    df['IVF_stim'] = ((df['시술 유형']=='IVF') & (df['배란 자극 여부']==1)).astype(int)
    df['DI_stim'] = ((df['시술 유형']=='DI') & (df['배란 자극 여부']==1)).astype(int)
    return df


In [90]:
cnt_cols = ['총 시술 횟수','클리닉 내 총 시술 횟수','IVF 시술 횟수','DI 시술 횟수',
            '총 임신 횟수','IVF 임신 횟수','DI 임신 횟수','총 출산 횟수','IVF 출산 횟수','DI 출산 횟수']

In [91]:
import re
def extract_number(x):
    if pd.isna(x):
        return 0
    num = re.findall(r'\d+', str(x))
    if num:
        return int(num[0])
    else:
        return 0   

In [92]:
# =====================================================
# 2. Train-independent 전처리 (Feature Engineering)
# =====================================================
def preprocess_train_independent(df):
    for col in cnt_cols:
        if col in df.columns:
            df[col] = df[col].apply(extract_number)
    df = feature_success_rate(df)
    df = feature_embryo_interaction(df)
    df = feature_age_numeric(df)
    df = feature_total_infertility(df)
    df = feature_type_stim(df)
    return df

In [93]:
# =====================================================
# 3. Train-dependent 전처리 (결측치, 타입)
# =====================================================
def preprocess_train_dependent(X_train, X_test):
    X_train = X_train.copy()
    X_test  = X_test.copy()

    num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns
    cat_cols = X_train.select_dtypes(include=["object", "category"]).columns
    
    # 숫자형 결측치 → train median
    for col in num_cols:
        med = X_train[col].median()
        X_train[col] = X_train[col].fillna(med)
        X_test[col]  = X_test[col].fillna(med) 

    # 범주형 결측치 → train mode
    for col in cat_cols:
        mode = X_train[col].mode()[0]
        X_train[col] = X_train[col].fillna(mode).astype("category")
        X_test[col]  = X_test[col].fillna(mode).astype("category")

    return X_train, X_test

In [99]:
# =====================================================
# 4. 전처리 적용
# =====================================================
y = train_df["임신 성공 여부"]
feature_cols = [c for c in train_df.columns if c not in ["임신 성공 여부", 'ID', '시술 시기 코드', '임신 시도 또는 마지막 임신 경과 연수', '난자 해동 경과일', '배아 해동 경과일']]
train_df = train_df[feature_cols]
test_df = test_df[feature_cols]

train_df, test_df = preprocess_train_dependent(train_df, test_df)

X = preprocess_train_independent(train_df)
X_test  = preprocess_train_independent(test_df)


In [None]:
# =====================================================
# 5. KFold + LGBM + CatBoost 앙상블
# =====================================================

In [100]:
lgb_params = {
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    "learning_rate": 0.03,
    "num_leaves": 64,
    "max_depth": 8,
    "min_data_in_leaf": 50,
    "feature_fraction": 0.7,
    "bagging_fraction": 0.7,
    "bagging_freq": 1,
    "lambda_l1": 0.1,
    "lambda_l2": 0.2,
    "verbosity": -1,
    "seed": 42,
}

cat_params = {
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "iterations": 5000,
    "learning_rate": 0.03,
    "depth": 8,
    "l2_leaf_reg": 5,
    "min_data_in_leaf": 50,
    "random_strength": 1.0,
    "bagging_temperature": 0.5,
    "border_count": 128,
    "verbose": 200,
    "random_seed": 42,
    "allow_writing_files": False,
}


In [101]:
def train_lgb_kfold(X, y, X_test, n_splits=5):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    oof_pred = np.zeros(len(X))
    test_pred = np.zeros(len(X_test))

    for fold, (tr_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"\n[LGBM] Fold {fold+1}")

        X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]

        lgb_train = lgb.Dataset(X_tr, y_tr)
        lgb_valid = lgb.Dataset(X_val, y_val)

        model = lgb.train(
            lgb_params,
            lgb_train,
            num_boost_round=3000,
            valid_sets=[lgb_train, lgb_valid],
            valid_names=["train", "valid"],
            callbacks=[lgb.early_stopping(100)]
        )

        oof_pred[val_idx] = model.predict(
            X_val, num_iteration=model.best_iteration
        )
        test_pred += model.predict(
            X_test, num_iteration=model.best_iteration
        ) / n_splits

        auc = roc_auc_score(y_val, oof_pred[val_idx])
        print(f"[LGBM] Fold {fold+1} AUC: {auc:.5f}")

    print(f"\n[LGBM] OOF AUC: {roc_auc_score(y, oof_pred):.5f}")
    return oof_pred, test_pred


In [102]:
def train_cat_kfold(X, y, X_test, cat_features, n_splits=5):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    oof_pred = np.zeros(len(X))
    test_pred = np.zeros(len(X_test))

    for fold, (tr_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"\n[CAT] Fold {fold+1}")

        X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]

        model = CatBoostClassifier(**cat_params)

        model.fit(
            X_tr,
            y_tr,
            eval_set=(X_val, y_val),
            cat_features=cat_features,
            early_stopping_rounds=200,
            use_best_model=True
        )

        oof_pred[val_idx] = model.predict_proba(X_val)[:, 1]
        test_pred += model.predict_proba(X_test)[:, 1] / n_splits

        auc = roc_auc_score(y_val, oof_pred[val_idx])
        print(f"[CAT] Fold {fold+1} AUC: {auc:.5f}")

    print(f"\n[CAT] OOF AUC: {roc_auc_score(y, oof_pred):.5f}")
    return oof_pred, test_pred


In [105]:
# CatBoost용 categorical index
# categorical_features = X.select_dtypes(include="category").columns.tolist()
# cat_feature_idx = [X.columns.get_loc(c) for c in categorical_features]

cat_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

# LGBM 학습 (LabelEncoding 된 데이터 사용)
lgb_oof, lgb_test = train_lgb_kfold(
    X, y, X_test
)

# CatBoost 학습 (object 그대로)
cat_oof, cat_test = train_cat_kfold(
    X, y, X_test, cat_features
)



[LGBM] Fold 1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[156]	train's auc: 0.751509	valid's auc: 0.736723
[LGBM] Fold 1 AUC: 0.73672

[LGBM] Fold 2
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[197]	train's auc: 0.752278	valid's auc: 0.741269
[LGBM] Fold 2 AUC: 0.74127

[LGBM] Fold 3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[205]	train's auc: 0.753575	valid's auc: 0.73927
[LGBM] Fold 3 AUC: 0.73927

[LGBM] Fold 4
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[133]	train's auc: 0.749922	valid's auc: 0.73709
[LGBM] Fold 4 AUC: 0.73709

[LGBM] Fold 5
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[197]	train's auc: 0.753145	valid's auc: 0.739371
[LGBM] Fold 5 AUC: 0.73937

[LGBM] OOF AUC: 0.73875

[CAT] Fold 1
0:	test: 0.7215340

In [108]:
# =====================================================
# 6. 앙상블 & 평가
# =====================================================
oof_ensemble = 0.55 * oof_lgb + 0.45 * oof_cat
test_ensemble = 0.55 * test_lgb + 0.45 * test_cat

print("\n=== CV AUC ===")
print("LGBM:", roc_auc_score(y, oof_lgb))
print("CatBoost:", roc_auc_score(y, oof_cat))
print("Ensemble:", roc_auc_score(y, oof_ensemble))


=== CV AUC ===
LGBM: 0.7388565238146467
CatBoost: 0.7391669945720898
Ensemble: 0.7393724607180461


In [13]:
# =====================================================
# 7. Submission
# =====================================================

import ipynbname
from datetime import datetime

notebook_name = ipynbname.name()
timestamp = datetime.now().strftime("%Y%m%d_%H%M")

submission.to_csv(
    f"{notebook_name}_{timestamp}.csv",
    index=False
)

print("\nSubmission saved!")


Submission saved!
