In [14]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings("ignore")

In [15]:
# =====================================================
# 1. Load data
# =====================================================
# Read the training table, the test table and the sample-submission template.
# The tuple order below matches the order of the unpacking targets.
train_df, test_df, submission = map(
    pd.read_csv,
    (
        "./data/train.csv",
        "./data/test.csv",
        "./data/sample_submission.csv",
    ),
)

In [16]:
def preprocess_train_independent(X: pd.DataFrame,
                                 num_fillna=0, cat_fillna='Unknown',
                                 num_cols=None, cat_cols=None) -> pd.DataFrame:
    """
    Stateless cleaning that can be applied to train and test independently.

    Nothing is learned from the data, so calling this separately on each
    frame cannot leak information between them.

    Steps:
    - numeric columns: missing values are replaced with ``num_fillna``
    - categorical columns: missing values are replaced with ``cat_fillna``,
      then every value is converted to ``str``, lower-cased and stripped
      (the fill label is normalized too, e.g. 'Unknown' -> 'unknown')

    Categorical columns are returned as plain strings, not ``category``
    dtype; this function does not perform any categorical encoding.

    Parameters
    ----------
    X : pd.DataFrame
        Input frame. It is copied; the caller's object is never modified.
    num_fillna : scalar, default 0
        Replacement for missing numeric values.
    cat_fillna : str, default 'Unknown'
        Replacement for missing categorical values.
    num_cols : list-like of column labels, optional
        Columns treated as numeric. ``None`` auto-detects int64/float64
        columns. An explicit empty list means "no numeric columns".
    cat_cols : list-like of column labels, optional
        Columns treated as categorical. ``None`` auto-detects
        object/category columns. An explicit empty list means "none".

    Returns
    -------
    pd.DataFrame
        Cleaned copy of ``X``.
    """
    X = X.copy()

    # `is None` instead of `or`: an explicit empty list must not fall back to
    # auto-detection, and a pandas Index must not be evaluated for truthiness.
    if num_cols is None:
        num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    if cat_cols is None:
        cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

    # Numeric missing values
    for col in num_cols:
        X[col] = X[col].fillna(num_fillna)

    # Categorical missing values + string normalization
    for col in cat_cols:
        X[col] = (
            X[col]
            # Go through object first: fillna() on a `category` column raises
            # when the fill label is not already one of its categories.
            .astype(object)
            .fillna(cat_fillna)
            .astype(str)
            .str.lower()
            .str.strip()
        )

    return X


In [17]:
# =====================================================
# 3. Train-dependent 전처리 (결측치, 타입)
# =====================================================
def preprocess_train_dependent(X_train, X_test):
    """
    Impute missing values using statistics learned from the training frame
    and give categorical columns one shared ``category`` dtype.

    - Numeric columns (int64/float64 in ``X_train``): missing values in both
      frames are filled with the *train* median.
    - Categorical columns (object/category in ``X_train``): missing values in
      both frames are filled with the *train* mode. If the train column has
      no observed value at all, there is no mode and the column is left
      unfilled instead of raising.
    - Each categorical column is then cast to a single ``CategoricalDtype``
      shared by both frames. Its categories are rebuilt from the labels
      observed in train and test together, so a given label always maps to
      the same integer code and no test label is turned into NaN.

    Parameters
    ----------
    X_train, X_test : pd.DataFrame
        Feature frames. ``X_test`` must contain every numeric/categorical
        column found in ``X_train``. Both are copied, never modified.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        The processed train and test frames, in that order.
    """
    X_train = X_train.copy()
    X_test  = X_test.copy()

    num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns
    cat_cols = X_train.select_dtypes(include=["object", "category"]).columns

    # Numeric missing values -> train median
    for col in num_cols:
        med = X_train[col].median()
        X_train[col] = X_train[col].fillna(med)
        X_test[col]  = X_test[col].fillna(med)

    # Categorical missing values -> train mode, then a shared category dtype
    for col in cat_cols:
        # Work on object values: fillna() on a `category` column raises when
        # the fill label is not one of that column's categories.
        train_vals = X_train[col].astype(object)
        test_vals = X_test[col].astype(object)

        # mode() is empty when every train value is missing.
        modes = train_vals.mode()
        if not modes.empty:
            fill_value = modes.iloc[0]
            train_vals = train_vals.fillna(fill_value)
            test_vals = test_vals.fillna(fill_value)

        # One dtype for both frames keeps label -> code mapping consistent.
        observed = pd.concat([train_vals, test_vals], ignore_index=True)
        shared_dtype = pd.CategoricalDtype(pd.Categorical(observed).categories)

        X_train[col] = train_vals.astype(shared_dtype)
        X_test[col] = test_vals.astype(shared_dtype)

    return X_train, X_test

In [18]:
# =====================================================
# 4. Apply preprocessing
# =====================================================
# Stateless cleaning, run on each frame separately.
train_df = train_df.pipe(preprocess_train_independent)
test_df = test_df.pipe(preprocess_train_independent)

# Every column except the row identifier and the target is a model feature.
feature_cols = [
    name for name in train_df.columns
    if name not in ("ID", "임신 성공 여부")
]
y = train_df["임신 성공 여부"]

# Train-dependent step: imputation values come from the training features.
X, X_test = preprocess_train_dependent(
    train_df[feature_cols],
    test_df[feature_cols],
)

# Categorical columns by name and by positional index within X.
categorical_features = list(X.select_dtypes(include="category").columns)
cat_feature_idx = list(map(X.columns.get_loc, categorical_features))

In [19]:
# =====================================================
# 5. Stratified K-fold: LightGBM + CatBoost ensemble
# =====================================================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Hyper-parameters do not depend on the fold, so they are declared once.
lgb_params = {
    "objective": "binary",
    "metric": "auc",
    "learning_rate": 0.05,
    "num_leaves": 31,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "seed": 42,
    "verbose": -1,
}
cat_params = dict(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    eval_metric="AUC",
    random_seed=42,
    verbose=False,
)

# oof_*: one slot per training row, written when that row is in the
# validation fold. test_*: running mean of the per-fold test predictions.
n_train, n_test = len(X), len(X_test)
oof_lgb, oof_cat = np.zeros(n_train), np.zeros(n_train)
test_lgb, test_cat = np.zeros(n_test), np.zeros(n_test)

for fold, (tr_idx, val_idx) in enumerate(skf.split(X, y), start=1):
    print(f"\n===== Fold {fold} =====")

    X_tr, y_tr = X.iloc[tr_idx], y.iloc[tr_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # --- LightGBM ---
    lgb_train = lgb.Dataset(X_tr, y_tr, categorical_feature=categorical_features)
    lgb_valid = lgb.Dataset(X_val, y_val, categorical_feature=categorical_features)

    lgb_model = lgb.train(
        dict(lgb_params),  # fresh copy per fold
        lgb_train,
        num_boost_round=1000,
        valid_sets=[lgb_valid],
        callbacks=[lgb.early_stopping(50)],
    )

    oof_lgb[val_idx] = lgb_model.predict(X_val)
    # Each fold contributes 1/n_splits of the final test prediction.
    test_lgb += lgb_model.predict(X_test) / skf.n_splits

    # --- CatBoost ---
    cat_model = CatBoostClassifier(**cat_params)
    cat_model.fit(
        X_tr, y_tr,
        cat_features=cat_feature_idx,
        eval_set=(X_val, y_val),
        early_stopping_rounds=50,
    )

    # Column 1 of predict_proba is taken as the positive-class probability.
    oof_cat[val_idx] = cat_model.predict_proba(X_val)[:, 1]
    test_cat += cat_model.predict_proba(X_test)[:, 1] / skf.n_splits


===== Fold 1 =====
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[198]	valid_0's auc: 0.737864

===== Fold 2 =====
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[245]	valid_0's auc: 0.742856

===== Fold 3 =====
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[216]	valid_0's auc: 0.740226

===== Fold 4 =====
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[181]	valid_0's auc: 0.737768

===== Fold 5 =====
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[247]	valid_0's auc: 0.740911


In [20]:
# =====================================================
# 6. Ensemble & evaluation
# =====================================================
# Equal-weight blend of the two models' predicted probabilities.
oof_ensemble = 0.5 * oof_lgb + 0.5 * oof_cat
test_ensemble = 0.5 * test_lgb + 0.5 * test_cat

# Out-of-fold ROC AUC for each model and for the blend.
print("\n=== CV AUC ===")
for label, oof_pred in (
    ("LGBM:", oof_lgb),
    ("CatBoost:", oof_cat),
    ("Ensemble:", oof_ensemble),
):
    print(label, roc_auc_score(y, oof_pred))


=== CV AUC ===
LGBM: 0.7399141236567512
CatBoost: 0.739984231100707
Ensemble: 0.7403779200470678


In [21]:
# =====================================================
# 6. Ensemble & evaluation (0.4 LightGBM / 0.6 CatBoost blend)
# =====================================================
# NOTE(review): this cell reassigns oof_ensemble / test_ensemble, so it
# replaces the 0.5/0.5 blend computed in the preceding cell; whichever blend
# cell ran last is what the submission cell writes out. The recorded outputs
# show a marginally higher ensemble AUC for the 0.5/0.5 blend -- confirm
# which weighting is intended for the final submission.
oof_ensemble = 0.4 * oof_lgb + 0.6 * oof_cat
test_ensemble = 0.4 * test_lgb + 0.6 * test_cat

# Out-of-fold ROC AUC for each model and for the blend.
print("\n=== CV AUC ===")
for label, oof_pred in (
    ("LGBM:", oof_lgb),
    ("CatBoost:", oof_cat),
    ("Ensemble:", oof_ensemble),
):
    print(label, roc_auc_score(y, oof_pred))


=== CV AUC ===
LGBM: 0.7399141236567512
CatBoost: 0.739984231100707
Ensemble: 0.7403679208142889


In [13]:
# =====================================================
# 7. Submission
# =====================================================
# NOTE(review): the recorded execution count of this cell (13) is lower than
# that of the training/ensemble cells (14-21), so the saved file may have been
# written from an earlier kernel state -- re-run this cell after training.
from datetime import datetime

submission["probability"] = test_ensemble

# File name suffix: current month+day followed by the literal text "_02".
run_tag = datetime.now().strftime('%m%d_02')
submission_path = './submission_jekim_lgbm_cat_ensemble_{}.csv'.format(run_tag)

submission.to_csv(submission_path, index=False)
print("\nSubmission saved!")



Submission saved!
