### Import

In [None]:
!pip install catboost

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OrdinalEncoder

import optuna
from sklearn.model_selection import train_test_split

### Data Load

In [2]:
train = pd.read_csv('./Data/train.csv').drop(columns=['ID'])
test = pd.read_csv('./Data/test.csv').drop(columns=['ID'])

In [3]:
X = train.drop('임신 성공 여부', axis=1)
y = train['임신 성공 여부']

### Data Pre-processing

In [4]:
categorical_columns = [
    "시술 시기 코드",
    "시술 당시 나이",
    "시술 유형",
    "특정 시술 유형",
    "배란 자극 여부",
    "배란 유도 유형",
    "단일 배아 이식 여부",
    "착상 전 유전 검사 사용 여부",
    "착상 전 유전 진단 사용 여부",
    "남성 주 불임 원인",
    "남성 부 불임 원인",
    "여성 주 불임 원인",
    "여성 부 불임 원인",
    "부부 주 불임 원인",
    "부부 부 불임 원인",
    "불명확 불임 원인",
    "불임 원인 - 난관 질환",
    "불임 원인 - 남성 요인",
    "불임 원인 - 배란 장애",
    "불임 원인 - 여성 요인",
    "불임 원인 - 자궁경부 문제",
    "불임 원인 - 자궁내막증",
    "불임 원인 - 정자 농도",
    "불임 원인 - 정자 면역학적 요인",
    "불임 원인 - 정자 운동성",
    "불임 원인 - 정자 형태",
    "배아 생성 주요 이유",
    "총 시술 횟수",
    "클리닉 내 총 시술 횟수",
    "IVF 시술 횟수",
    "DI 시술 횟수",
    "총 임신 횟수",
    "IVF 임신 횟수",
    "DI 임신 횟수",
    "총 출산 횟수",
    "IVF 출산 횟수",
    "DI 출산 횟수",
    "난자 출처",
    "정자 출처",
    "난자 기증자 나이",
    "정자 기증자 나이",
    "동결 배아 사용 여부",
    "신선 배아 사용 여부",
    "기증 배아 사용 여부",
    "대리모 여부",
    "PGD 시술 여부",
    "PGS 시술 여부"
]

In [5]:
# 카테고리형 컬럼들을 문자열로 변환
for col in categorical_columns:
    X[col] = X[col].astype(str)
    test[col] = test[col].astype(str)

In [6]:
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

X_train_encoded = X.copy()
X_train_encoded[categorical_columns] = ordinal_encoder.fit_transform(X[categorical_columns])

X_test_encoded = test.copy()
X_test_encoded[categorical_columns] = ordinal_encoder.transform(test[categorical_columns])

In [7]:
numeric_columns = [
    "임신 시도 또는 마지막 임신 경과 연수",
    "총 생성 배아 수",
    "미세주입된 난자 수",
    "미세주입에서 생성된 배아 수",
    "이식된 배아 수",
    "미세주입 배아 이식 수",
    "저장된 배아 수",
    "미세주입 후 저장된 배아 수",
    "해동된 배아 수",
    "해동 난자 수",
    "수집된 신선 난자 수",
    "저장된 신선 난자 수",
    "혼합된 난자 수",
    "파트너 정자와 혼합된 난자 수",
    "기증자 정자와 혼합된 난자 수",
    "난자 채취 경과일",
    "난자 해동 경과일",
    "난자 혼합 경과일",
    "배아 이식 경과일",
    "배아 해동 경과일"
]

In [8]:
numeric_columns = [col for col in numeric_columns if col in X.columns]

X_train_encoded[numeric_columns] = X_train_encoded[numeric_columns].fillna(0)
X_test_encoded[numeric_columns] = X_test_encoded[numeric_columns].fillna(0)

In [41]:
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif

# 1️⃣ 상수 Feature 제거
var_thresh = VarianceThreshold(threshold=0.0)
X_train_var_filtered = var_thresh.fit_transform(X_train_encoded)

# 2️⃣ Feature Selection 적용
selector = SelectKBest(f_classif, k=20)
X_train_selected = selector.fit_transform(X_train_var_filtered, y)

# 2️⃣ SMOTE 적용 (Feature Selection 후)
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_selected, y)

In [44]:
print("X_train_encoded:", X_train_encoded.shape)
print("y:", y.shape)
print("X_train_selected:", X_train_selected.shape)
print("X_train_resampled:", X_resampled.shape)
print("y_resampled:", y_resampled.shape)

X_train_encoded: (256351, 67)
y: (256351,)
X_train_selected: (256351, 20)
X_train_resampled: (380246, 20)
y_resampled: (380246,)


In [46]:
X_train, X_valid, y_train, y_valid = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)

In [9]:
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_encoded, y, test_size=0.1, random_state=42, stratify=y
)

### Train

XGBoost

In [11]:
import catboost as cb
from catboost import CatBoostClassifier
import warnings
from sklearn.metrics import roc_auc_score

warnings.simplefilter(action='ignore', category=FutureWarning)

def objective(trial):
    try:
        grow_policy = trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise'])
        
        min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 20, 100) if grow_policy == 'Depthwise' else trial.suggest_int('min_data_in_leaf', 10, 100)# '시술 당시 나이' 가중치를 Optuna에서 탐색하도록 설정
        age_weight = trial.suggest_float("age_weight", 1.0, 3.0, step=0.1)


        params = {
            'objective': 'Logloss',
            'eval_metric': 'AUC',
            'depth': trial.suggest_int('depth', 4, 8),
            'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
            'iterations': trial.suggest_int('iterations', 1000, 10000),
            'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-6, 10.0),
            'bagging_temperature': trial.suggest_uniform('bagging_temperature', 0.1, 1.0),
            'random_strength': trial.suggest_loguniform('random_strength', 1e-9, 10.0),
            'border_count': trial.suggest_int('border_count', 32, 255),
            'one_hot_max_size': trial.suggest_int('one_hot_max_size', 2, 20),
            'feature_weights': [age_weight if col == '시술 당시 나이' else 1 for col in X_train.columns],
            'min_data_in_leaf': min_data_in_leaf,
            'grow_policy': grow_policy,
            'random_seed': 42,
            'verbose': 0
        }

        model = CatBoostClassifier(**params)
        
        model.fit(
            X_train, y_train,
            eval_set=(X_valid, y_valid),
            early_stopping_rounds=100,
            use_best_model=True
        )
        
        preds = model.predict_proba(X_valid)[:, 1]
        auc = roc_auc_score(y_valid, preds)
        return auc

    except cb.CatBoostError as e:
        print(f"Skipping trial due to CatBoost error: {e}")
        return None  # 실패한 trial을 무시하고 다음으로 넘어감


In [12]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, show_progress_bar=True)

[I 2025-02-22 23:29:09,275] A new study created in memory with name: no-name-93e590e4-60e7-47d8-90ff-37c58e1542fe


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-02-22 23:29:50,825] Trial 0 finished with value: 0.740887289471807 and parameters: {'grow_policy': 'Depthwise', 'min_data_in_leaf': 95, 'age_weight': 2.0, 'depth': 7, 'learning_rate': 0.05449567775495605, 'iterations': 5196, 'l2_leaf_reg': 8.787178146232184e-05, 'bagging_temperature': 0.644370344322472, 'random_strength': 3.3574020720612724e-09, 'border_count': 155, 'one_hot_max_size': 13}. Best is trial 0 with value: 0.740887289471807.
[I 2025-02-22 23:30:57,635] Trial 1 finished with value: 0.7421092296974044 and parameters: {'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 36, 'age_weight': 3.0, 'depth': 4, 'learning_rate': 0.06878035710558862, 'iterations': 1533, 'l2_leaf_reg': 0.00025476328244655115, 'bagging_temperature': 0.4179642768615548, 'random_strength': 1.723324048721379e-05, 'border_count': 55, 'one_hot_max_size': 8}. Best is trial 1 with value: 0.7421092296974044.
[I 2025-02-22 23:32:26,833] Trial 2 finished with value: 0.7415987594142676 and parameters: {'gro

Training has stopped (degenerate solution on iteration 626, probably too small l2-regularization, try to increase it)


[I 2025-02-23 00:03:50,915] Trial 54 finished with value: 0.742248056490414 and parameters: {'grow_policy': 'Depthwise', 'min_data_in_leaf': 86, 'age_weight': 1.1, 'depth': 7, 'learning_rate': 0.022177765581950334, 'iterations': 7994, 'l2_leaf_reg': 0.0012511726616989559, 'bagging_temperature': 0.43728664474960177, 'random_strength': 1.552739978789365, 'border_count': 83, 'one_hot_max_size': 14}. Best is trial 39 with value: 0.7427128361890141.
[I 2025-02-23 00:04:00,033] Trial 55 finished with value: 0.7411240212568149 and parameters: {'grow_policy': 'Depthwise', 'min_data_in_leaf': 90, 'age_weight': 1.2, 'depth': 7, 'learning_rate': 0.02908242292586446, 'iterations': 8736, 'l2_leaf_reg': 0.015374554312292428, 'bagging_temperature': 0.5303308087395507, 'random_strength': 4.892068762931397e-05, 'border_count': 69, 'one_hot_max_size': 15}. Best is trial 39 with value: 0.7427128361890141.
[I 2025-02-23 00:04:29,447] Trial 56 finished with value: 0.7423079065104647 and parameters: {'grow_

Training has stopped (degenerate solution on iteration 637, probably too small l2-regularization, try to increase it)


[I 2025-02-23 00:07:46,921] Trial 63 finished with value: 0.7423998515157255 and parameters: {'grow_policy': 'Depthwise', 'min_data_in_leaf': 64, 'age_weight': 2.0, 'depth': 7, 'learning_rate': 0.02448217259495994, 'iterations': 5413, 'l2_leaf_reg': 0.0009817342102747555, 'bagging_temperature': 0.655831604868356, 'random_strength': 1.9486163922021358, 'border_count': 97, 'one_hot_max_size': 11}. Best is trial 39 with value: 0.7427128361890141.
[I 2025-02-23 00:08:01,301] Trial 64 finished with value: 0.7422648087782528 and parameters: {'grow_policy': 'Depthwise', 'min_data_in_leaf': 57, 'age_weight': 2.4000000000000004, 'depth': 7, 'learning_rate': 0.02511516606879724, 'iterations': 5708, 'l2_leaf_reg': 0.001118937964239168, 'bagging_temperature': 0.7578688163807692, 'random_strength': 0.8371961636473323, 'border_count': 99, 'one_hot_max_size': 13}. Best is trial 39 with value: 0.7427128361890141.
[I 2025-02-23 00:08:35,147] Trial 65 finished with value: 0.7420568088147195 and paramete

In [14]:
# Optuna에서 찾은 최적의 하이퍼파라미터 적용
best_params = study.best_params

# CatBoost에 맞게 변환 (필요 없는 파라미터 제거)
filtered_params = {k: v for k, v in best_params.items() if k != 'max_leaves'}  # max_leaves는 일부 경우 필요 없을 수 있음

# filtered_params에서 'age_weight' 값을 분리하고 제거합니다.
if 'age_weight' in filtered_params:
    age_weight = filtered_params.pop('age_weight')
    # '시술 당시 나이'에 대해서만 age_weight 값을 할당하고, 나머지는 1로 설정합니다.
    filtered_params['feature_weights'] = [age_weight if col == '시술 당시 나이' else 1 for col in X_train.columns]

# 모델 학습
best_model = CatBoostClassifier(
    **filtered_params,
    random_seed=42,
    verbose=100  # 로그 출력 간격 조정
)

best_model.fit(
    X_train, y_train,
    eval_set=(X_valid, y_valid),
    early_stopping_rounds=100,
    use_best_model=True
)

0:	learn: 0.6819313	test: 0.6818620	best: 0.6818620 (0)	total: 26.7ms	remaining: 4m 18s
100:	learn: 0.4993665	test: 0.4981781	best: 0.4981781 (100)	total: 2.17s	remaining: 3m 26s
200:	learn: 0.4918473	test: 0.4909244	best: 0.4909244 (200)	total: 4.29s	remaining: 3m 22s
300:	learn: 0.4897234	test: 0.4891183	best: 0.4891183 (300)	total: 6.35s	remaining: 3m 17s
400:	learn: 0.4885727	test: 0.4882552	best: 0.4882552 (400)	total: 8.36s	remaining: 3m 13s
500:	learn: 0.4879197	test: 0.4878547	best: 0.4878547 (500)	total: 16s	remaining: 4m 52s
600:	learn: 0.4871309	test: 0.4874124	best: 0.4874124 (600)	total: 26.9s	remaining: 6m 46s
700:	learn: 0.4857569	test: 0.4869989	best: 0.4869989 (700)	total: 37.5s	remaining: 8m
800:	learn: 0.4846259	test: 0.4868341	best: 0.4868341 (800)	total: 48.3s	remaining: 8m 56s
900:	learn: 0.4837336	test: 0.4868433	best: 0.4868189 (803)	total: 59.3s	remaining: 9m 38s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.4868189202
bestIteration = 803

<catboost.core.CatBoostClassifier at 0x22191284450>

In [15]:
from sklearn.metrics import accuracy_score, roc_auc_score

# 예측
y_train_pred = best_model.predict(X_train_encoded)
y_train_proba = best_model.predict_proba(X_train_encoded)[:, 1]  # ROC-AUC Score 계산용

# 평가
accuracy = accuracy_score(y, y_train_pred)
roc_auc = roc_auc_score(y, y_train_proba)

# 출력
print(f"Accuracy: {accuracy:.4f}")
print(f"ROC-AUC Score: {roc_auc:.4f}")

Accuracy: 0.7489
ROC-AUC Score: 0.7458


RandomForest (Hyperparameter tune)

In [16]:
best_model.save_model("Catboost_weighted_model.json")

### Predict

In [17]:
pred_proba = best_model.predict_proba(X_test_encoded)[:, 1]

### Submission

In [18]:
sample_submission = pd.read_csv('./Data/sample_submission.csv')
sample_submission['probability'] = pred_proba

In [19]:
sample_submission.to_csv('./Catboost_Weighted_optimization_submit.csv', index=False)