In [67]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings("ignore")

In [68]:
# =====================================================
# 1. 데이터 로드
# =====================================================
train_df = pd.read_csv("./data/train.csv")
test_df  = pd.read_csv("./data/test.csv")
submission = pd.read_csv("./data/sample_submission.csv")

In [69]:
cnt_cols = ['총 시술 횟수','클리닉 내 총 시술 횟수','IVF 시술 횟수','DI 시술 횟수',
            '총 임신 횟수','IVF 임신 횟수','DI 임신 횟수','총 출산 횟수','IVF 출산 횟수','DI 출산 횟수']

In [70]:
# =====================================================
# 2. Train-independent 전처리 (Feature Engineering)
# =====================================================
def preprocess_train_independent(df):
    for col in cnt_cols:
        if col in df.columns:
            df[col] = df[col].apply(extract_number)
    df = feature_success_rate(df)
    df = feature_embryo_interaction(df)
    df = feature_age_numeric(df)
    df = feature_total_infertility(df)
    df = feature_type_stim(df)
    return df

In [76]:
# 1. 시술 대비 임신/출산 비율
def feature_success_rate(df):
    df = df.copy()
    df['IVF_success_rate'] = df['IVF 임신 횟수'].astype(float) / (df['IVF 시술 횟수'].astype(float) + 1e-5)
    df['DI_success_rate'] = df['DI 임신 횟수'].astype(float) / (df['DI 시술 횟수'].astype(float) + 1e-5)
    df['total_birth_rate'] = df['총 출산 횟수'].astype(float) / (df['총 시술 횟수'].astype(float) + 1e-5)
    return df

# 2. 난자/배아 관련 상호작용
def feature_embryo_interaction(df):
    df = df.copy()
    df['embryo_transfer_rate'] = df['이식된 배아 수'].astype(float) / (df['총 생성 배아 수'].astype(float) + 1e-5)
    df['micro_inj_transfer_rate'] = df['미세주입 배아 이식 수'].astype(float) / (df['미세주입에서 생성된 배아 수'].astype(float) + 1e-5)
    df['frozen_use_rate'] = df['동결 배아 사용 여부'] / (df['저장된 배아 수'].astype(float) + 1e-5)
    return df

# 3. 연령 관련 숫자화
def feature_age_numeric(df):
    df = df.copy()
    def age_str_to_mid(age_str):
        if age_str in ['알 수 없음', 'Unknown']:
            return np.nan
        try:
            parts = age_str.replace('만','').replace('세','').split('-')
            return (int(parts[0]) + int(parts[1])) / 2
        except:
            return np.nan

    df['시술 당시 나이_num'] = df['시술 당시 나이'].apply(age_str_to_mid)
    df['난자 기증자 나이_num'] = df['난자 기증자 나이'].apply(age_str_to_mid)
    df['정자 기증자 나이_num'] = df['정자 기증자 나이'].apply(age_str_to_mid)
    return df

# 4. 불임 원인 합계
def feature_total_infertility(df):
    df = df.copy()
    infertility_cols = [
        '남성 주 불임 원인','남성 부 불임 원인','여성 주 불임 원인','여성 부 불임 원인',
        '부부 주 불임 원인','부부 부 불임 원인','불명확 불임 원인','불임 원인 - 난관 질환',
        '불임 원인 - 남성 요인','불임 원인 - 배란 장애','불임 원인 - 자궁경부 문제',
        '불임 원인 - 자궁내막증','불임 원인 - 정자 농도','불임 원인 - 정자 면역학적 요인',
        '불임 원인 - 정자 운동성','불임 원인 - 정자 형태'
    ]
    df['total_infertility_factors'] = df[infertility_cols].sum(axis=1)
    return df

# 5. 시술 타입 + 배란 자극 여부 결합
def feature_type_stim(df):
    df = df.copy()
    print(df['시술 유형'])
    df['IVF_stim'] = ((df['시술 유형']=='IVF') & (df['배란 자극 여부']==1)).astype(int)
    df['DI_stim'] = ((df['시술 유형']=='DI') & (df['배란 자극 여부']==1)).astype(int)
    return df


In [72]:
import re
def extract_number(x):
    if pd.isna(x):
        return 0
    num = re.findall(r'\d+', str(x))
    if num:
        return int(num[0])
    else:
        return 0   

In [77]:
# =====================================================
# 3. Train-dependent 전처리 (결측치, 타입)
# =====================================================
def preprocess_train_dependent(X_train, X_test):
    X_train = X_train.copy()
    X_test  = X_test.copy()

    num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns
    cat_cols = X_train.select_dtypes(include=["object", "category"]).columns
    
    # 숫자형 결측치 → train median
    for col in num_cols:
        med = X_train[col].median()
        X_train[col] = X_train[col].fillna(med)
        X_test[col]  = X_test[col].fillna(med) 

    # 범주형 결측치 → train mode
    for col in cat_cols:
        mode = X_train[col].mode()[0]
        X_train[col] = X_train[col].fillna(mode).astype("category")
        X_test[col]  = X_test[col].fillna(mode).astype("category")

    return X_train, X_test

In [74]:
# =====================================================
# 4. 전처리 적용
# =====================================================
y = train_df["임신 성공 여부"]
feature_cols = [c for c in train_df.columns if c not in ["임신 성공 여부", 'ID', '시술 시기 코드', '임신 시도 또는 마지막 임신 경과 연수', '난자 해동 경과일', '배아 해동 경과일']]
train_df = train_df[feature_cols]
test_df = test_df[feature_cols]

train_df, test_df = preprocess_train_dependent(train_df, test_df)

X = preprocess_train_independent(train_df)
X_test  = preprocess_train_independent(test_df)


categorical_features = X.select_dtypes(include="category").columns.tolist()
cat_feature_idx = [X.columns.get_loc(c) for c in categorical_features]

preprocess_train_independent
feature_success_rate
feature_embryo_interaction
feature_age_numeric
feature_total_infertility
feature_type_stim
0         IVF
1         IVF
2         IVF
3         IVF
4         IVF
         ... 
256346    IVF
256347    IVF
256348    IVF
256349    IVF
256350    IVF
Name: 시술 유형, Length: 256351, dtype: category
Categories (2, object): ['DI', 'IVF']
preprocess_train_independent
feature_success_rate
feature_embryo_interaction
feature_age_numeric
feature_total_infertility
feature_type_stim
0        IVF
1        IVF
2        IVF
3        IVF
4        IVF
        ... 
90062    IVF
90063    IVF
90064    IVF
90065    IVF
90066    IVF
Name: 시술 유형, Length: 90067, dtype: category
Categories (2, object): ['DI', 'IVF']


In [75]:
# =====================================================
# 5. KFold + LGBM + CatBoost 앙상블
# =====================================================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof_lgb = np.zeros(len(X))
oof_cat = np.zeros(len(X))
test_lgb = np.zeros(len(X_test))
test_cat = np.zeros(len(X_test))

for fold, (tr_idx, val_idx) in enumerate(skf.split(X, y), 1):
    print(f"\n===== Fold {fold} =====")

    X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]

    # --- LightGBM ---
    lgb_train = lgb.Dataset(X_tr, y_tr, categorical_feature=categorical_features)
    lgb_valid = lgb.Dataset(X_val, y_val, categorical_feature=categorical_features)

    lgb_model = lgb.train(
        {
            "objective": "binary",
            "metric": "auc",
            "learning_rate": 0.05,
            "num_leaves": 31,
            "feature_fraction": 0.8,
            "bagging_fraction": 0.8,
            "bagging_freq": 5,
            "seed": 42,
            "verbose": -1,
        },
        lgb_train,
        num_boost_round=1000,
        valid_sets=[lgb_valid],
        callbacks=[lgb.early_stopping(50)]
    )

    oof_lgb[val_idx] = lgb_model.predict(X_val)
    test_lgb += lgb_model.predict(X_test) / skf.n_splits

    # --- CatBoost ---
    cat_model = CatBoostClassifier(
#         iterations=1000,
#         learning_rate=0.05,
#         depth=6,
#         eval_metric="AUC",
#         random_seed=42,
#         verbose=False
        
    "loss_function": "Logloss",
    "eval_metric": "AUC",

    "iterations": 5000,
    "learning_rate": 0.03,
    "depth": 8,

    # 과적합 제어
    "l2_leaf_reg": 5,
    "min_data_in_leaf": 50,

    # 랜덤성
    "random_strength": 1.0,
    "bagging_temperature": 0.5,

    # 기타
    "border_count": 128,
    "verbose": 200,
    "random_seed": 42,
    "allow_writing_files": False,
    )

    cat_model.fit(
        X_tr, y_tr,
        cat_features=cat_feature_idx,
        eval_set=(X_val, y_val),
        early_stopping_rounds=50
    )

    oof_cat[val_idx] = cat_model.predict_proba(X_val)[:, 1]
    test_cat += cat_model.predict_proba(X_test)[:, 1] / skf.n_splits


===== Fold 1 =====
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[120]	valid_0's auc: 0.73663

===== Fold 2 =====
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[176]	valid_0's auc: 0.7419

===== Fold 3 =====
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[214]	valid_0's auc: 0.738885

===== Fold 4 =====
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[136]	valid_0's auc: 0.737337

===== Fold 5 =====
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[174]	valid_0's auc: 0.739579


In [78]:
# =====================================================
# 6. 앙상블 & 평가
# =====================================================
oof_ensemble = 0.5 * oof_lgb + 0.5 * oof_cat
test_ensemble = 0.5 * test_lgb + 0.5 * test_cat

print("\n=== CV AUC ===")
print("LGBM:", roc_auc_score(y, oof_lgb))
print("CatBoost:", roc_auc_score(y, oof_cat))
print("Ensemble:", roc_auc_score(y, oof_ensemble))


=== CV AUC ===
LGBM: 0.7388565238146467
CatBoost: 0.7391669945720898
Ensemble: 0.7393911849078405


In [79]:
# =====================================================
# 6. 앙상블 & 평가
# =====================================================
oof_ensemble = 0.4 * oof_lgb + 0.6 * oof_cat
test_ensemble = 0.4 * test_lgb + 0.6 * test_cat

print("\n=== CV AUC ===")
print("LGBM:", roc_auc_score(y, oof_lgb))
print("CatBoost:", roc_auc_score(y, oof_cat))
print("Ensemble:", roc_auc_score(y, oof_ensemble))


=== CV AUC ===
LGBM: 0.7388565238146467
CatBoost: 0.7391669945720898
Ensemble: 0.7394061464698495


In [13]:
# =====================================================
# 7. Submission
# =====================================================
from datetime import datetime
submission["probability"] = test_ensemble
submission.to_csv(    
    './submission_jekim_lgbm_cat_ensemble_{}.csv'.format(datetime.now().strftime('%m%d_01')),
    index=False)
print("\nSubmission saved!")



Submission saved!
