### Import

In [1]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 1.5/1.5 MB 41.8 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0


In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OrdinalEncoder

import joblib
import optuna

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier, ExtraTreesClassifier
import lightgbm as lgb
import xgboost as xgb

### Data Load

In [3]:
train = pd.read_csv('./Data/train.csv').drop(columns=['ID'])
test = pd.read_csv('./Data/test.csv').drop(columns=['ID'])

In [4]:
X = train.drop('임신 성공 여부', axis=1)
y = train['임신 성공 여부']

### Data Pre-processing

In [5]:
categorical_columns = [
    "시술 시기 코드",
    "시술 당시 나이",
    "시술 유형",
    "특정 시술 유형",
    "배란 자극 여부",
    "배란 유도 유형",
    "단일 배아 이식 여부",
    "착상 전 유전 검사 사용 여부",
    "착상 전 유전 진단 사용 여부",
    "남성 주 불임 원인",
    "남성 부 불임 원인",
    "여성 주 불임 원인",
    "여성 부 불임 원인",
    "부부 주 불임 원인",
    "부부 부 불임 원인",
    "불명확 불임 원인",
    "불임 원인 - 난관 질환",
    "불임 원인 - 남성 요인",
    "불임 원인 - 배란 장애",
    "불임 원인 - 여성 요인",
    "불임 원인 - 자궁경부 문제",
    "불임 원인 - 자궁내막증",
    "불임 원인 - 정자 농도",
    "불임 원인 - 정자 면역학적 요인",
    "불임 원인 - 정자 운동성",
    "불임 원인 - 정자 형태",
    "배아 생성 주요 이유",
    "총 시술 횟수",
    "클리닉 내 총 시술 횟수",
    "IVF 시술 횟수",
    "DI 시술 횟수",
    "총 임신 횟수",
    "IVF 임신 횟수",
    "DI 임신 횟수",
    "총 출산 횟수",
    "IVF 출산 횟수",
    "DI 출산 횟수",
    "난자 출처",
    "정자 출처",
    "난자 기증자 나이",
    "정자 기증자 나이",
    "동결 배아 사용 여부",
    "신선 배아 사용 여부",
    "기증 배아 사용 여부",
    "대리모 여부",
    "PGD 시술 여부",
    "PGS 시술 여부"
]

In [6]:
# 카테고리형 컬럼들을 문자열로 변환
for col in categorical_columns:
    X[col] = X[col].astype(str)
    test[col] = test[col].astype(str)

In [7]:
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

X_train_encoded = X.copy()
X_train_encoded[categorical_columns] = ordinal_encoder.fit_transform(X[categorical_columns])

X_test_encoded = test.copy()
X_test_encoded[categorical_columns] = ordinal_encoder.transform(test[categorical_columns])

In [8]:
numeric_columns = [
    "임신 시도 또는 마지막 임신 경과 연수",
    "총 생성 배아 수",
    "미세주입된 난자 수",
    "미세주입에서 생성된 배아 수",
    "이식된 배아 수",
    "미세주입 배아 이식 수",
    "저장된 배아 수",
    "미세주입 후 저장된 배아 수",
    "해동된 배아 수",
    "해동 난자 수",
    "수집된 신선 난자 수",
    "저장된 신선 난자 수",
    "혼합된 난자 수",
    "파트너 정자와 혼합된 난자 수",
    "기증자 정자와 혼합된 난자 수",
    "난자 채취 경과일",
    "난자 해동 경과일",
    "난자 혼합 경과일",
    "배아 이식 경과일",
    "배아 해동 경과일"
]

In [9]:
numeric_columns = [col for col in numeric_columns if col in X.columns]

X_train_encoded[numeric_columns] = X_train_encoded[numeric_columns].fillna(0)
X_test_encoded[numeric_columns] = X_test_encoded[numeric_columns].fillna(0)

In [9]:
from collections import Counter

# 클래스별 샘플 개수 출력
class_counts = Counter(y)
print(class_counts)


Counter({0: 190123, 1: 66228})


In [10]:
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_encoded, y, test_size=0.2, random_state=42, stratify=y
)

### Train

Lightgbm

In [None]:
import joblib

opti_trial = 10

# 저장된 모델 불러오기
lgb_model = joblib.load('./LightGBM_boosting2.pkl')

# 3. Optuna로 XGBoost 최적화
def optimize_xgb(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10)
    }
    model = xgb.XGBClassifier(**param, random_state=42, eval_metric='logloss')
    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='roc_auc')
    return scores.mean()

study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(optimize_xgb, n_trials=opti_trial, show_progress_bar=True)
xgb_best_params = study_xgb.best_params

# 4. Optuna로 ExtraTreesClassifier 최적화
def optimize_extra(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10)
    }
    model = ExtraTreesClassifier(**param, random_state=42)
    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='roc_auc')
    return scores.mean()

study_extra = optuna.create_study(direction='maximize')
study_extra.optimize(optimize_extra, n_trials=opti_trial, show_progress_bar=True)
extra_best_params = study_extra.best_params

# 5. Optuna로 Meta Model (LogisticRegression) 최적화
def optimize_meta(trial):
    param = {
        'C': trial.suggest_loguniform('C', 0.01, 10.0),
        'penalty': trial.suggest_categorical('penalty', ['l2'])
    }
    model = LogisticRegression(**param, random_state=42)
    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='roc_auc')
    return scores.mean()

study_meta = optuna.create_study(direction='maximize')
study_meta.optimize(optimize_meta, n_trials=opti_trial, show_progress_bar=True)
meta_best_params = study_meta.best_params

# 6. 최적화된 하이퍼파라미터로 모델 구성
xgb_model = xgb.XGBClassifier(**xgb_best_params, random_state=42, eval_metric='logloss')
extra_model = ExtraTreesClassifier(**extra_best_params, random_state=42)
meta_model = LogisticRegression(**meta_best_params, random_state=42)

# 7. Stacking Classifier 구성
stacking_clf = StackingClassifier(
    estimators=[
        ('lgb', lgb_model),
        ('xgb', xgb_model),
        ('extra', extra_model)
    ],
    final_estimator=meta_model,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1
)

[I 2025-02-26 23:30:17,428] A new study created in memory with name: no-name-07a20711-3feb-44fc-ac61-264289af742b


  0%|          | 0/10 [00:00<?, ?it/s]

  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[I 2025-02-26 23:30:26,436] Trial 0 finished with value: 0.7390667822973718 and parameters: {'n_estimators': 423, 'learning_rate': 0.11273811648754437, 'max_depth': 4}. Best is trial 0 with value: 0.7390667822973718.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[I 2025-02-26 23:30:37,405] Trial 1 finished with value: 0.7370211010948933 and parameters: {'n_estimators': 268, 'learning_rate': 0.02970296633225808, 'max_depth': 9}. Best is trial 0 with value: 0.7390667822973718.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[I 2025-02-26 23:30:54,880] Trial 2 finished with value: 0.7367991230992451 and parameters: {'n_estimators': 486, 'learning_rate': 0.020144729075373843, 'max_depth': 9}. Best is trial 0 with value: 0.7390667822973718.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[I 2025-02-26 23:31:04,433] Trial 3 finished with value: 0.7333723119263811 and parameters: {'n_estimators': 327, 'learning_rate': 0.09755277405677476, 'max_depth': 8}. Best is trial 0 with value: 0.7390667822973718.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[I 2025-02-26 23:31:16,754] Trial 4 finished with value: 0.729973587723521 and parameters: {'n_estimators': 374, 'learning_rate': 0.06940268576555354, 'max_depth': 10}. Best is trial 0 with value: 0.7390667822973718.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[I 2025-02-26 23:31:24,264] Trial 5 finished with value: 0.7397867655367976 and parameters: {'n_estimators': 353, 'learning_rate': 0.07999021163010435, 'max_depth': 3}. Best is trial 5 with value: 0.7397867655367976.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[I 2025-02-26 23:31:31,980] Trial 6 finished with value: 0.7362454825306536 and parameters: {'n_estimators': 327, 'learning_rate': 0.19122922334825088, 'max_depth': 5}. Best is trial 5 with value: 0.7397867655367976.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[I 2025-02-26 23:31:35,769] Trial 7 finished with value: 0.7393436059719092 and parameters: {'n_estimators': 152, 'learning_rate': 0.28110022763062825, 'max_depth': 3}. Best is trial 5 with value: 0.7397867655367976.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[I 2025-02-26 23:31:47,898] Trial 8 finished with value: 0.7309110122745192 and parameters: {'n_estimators': 444, 'learning_rate': 0.10364405919139609, 'max_depth': 8}. Best is trial 5 with value: 0.7397867655367976.


  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

[I 2025-02-26 23:31:52,521] A new study created in memory with name: no-name-64d8d6e4-c635-4c98-b7d7-92c300df57ef


[I 2025-02-26 23:31:52,517] Trial 9 finished with value: 0.7390253919969443 and parameters: {'n_estimators': 162, 'learning_rate': 0.05370202699833404, 'max_depth': 4}. Best is trial 5 with value: 0.7397867655367976.


  0%|          | 0/10 [00:00<?, ?it/s]

[I 2025-02-26 23:32:56,088] Trial 0 finished with value: 0.7216428961584442 and parameters: {'n_estimators': 151, 'max_depth': 8, 'min_samples_split': 9}. Best is trial 0 with value: 0.7216428961584442.


In [11]:
stacking_clf.fit(X_train_encoded, y)

In [None]:
# 피처 이름 공백 제거
X.columns = X.columns.str.replace(" ", "_")

def objective(trial):
    # 사전에 저장된 LightGBM 모델 불러오기 (이미 학습된 모델)
    pre_trained_lgb = joblib.load('./LightGBM_boosting2.pkl')
    
    age_weight = trial.suggest_float("age_weight", 1.0, 3.0, step=0.1)
    
    # XGBoost 하이퍼파라미터 최적화
    xgb_params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 200),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-3, 0.3),
        "feature_weights": [age_weight if col == '시술 당시 나이' else 1 for col in X_train.columns]
    }
    xgb_model = XGBClassifier(**xgb_params,
                              random_state=42,
                              use_label_encoder=False,
                              eval_metric='logloss')
    
    # 메타 모델 (로지스틱 회귀) 하이퍼파라미터 최적화
    lr_C = trial.suggest_loguniform('lr_C', 1e-3, 1e2)
    meta_model = LogisticRegression(C=lr_C, random_state=42, max_iter=1000)
    
    # 스태킹 모델 구성: 사전에 학습된 LightGBM 모델과 최적화 대상 XGBoost 모델 사용
    estimators = [
        ('lgb', pre_trained_lgb),
        ('xgb', xgb_model)
    ]
    
    stack_model = StackingClassifier(estimators=estimators,
                                     final_estimator=meta_model,
                                     cv=5,          # 내부 교차 검증 사용
                                     passthrough=False,
                                     n_jobs=-1)
    
    # 5-fold 교차 검증을 통해 평균 정확도 측정
    score = cross_val_score(stack_model, X_train, y_train, cv=5, scoring='roc_auc').mean()
    return score


NameError: name 'X' is not defined

In [64]:
import warnings

# 불필요한 경고 제거
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.filterwarnings("ignore", category=UserWarning, module="lightbgm")

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, show_progress_bar=True)

[I 2025-02-23 11:58:34,197] A new study created in memory with name: no-name-de154bb2-cbdb-4efa-b252-7584e8362012


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-02-23 12:01:00,983] Trial 0 finished with value: 0.7441242441973863 and parameters: {'age_weight': 1.8, 'n_estimators': 86, 'max_depth': 10, 'learning_rate': 0.004049289475108182, 'lr_C': 0.4357079289600887}. Best is trial 0 with value: 0.7441242441973863.
[I 2025-02-23 12:03:35,021] Trial 1 finished with value: 0.7436707626292179 and parameters: {'age_weight': 3.0, 'n_estimators': 181, 'max_depth': 9, 'learning_rate': 0.009563974126100076, 'lr_C': 83.81433121876083}. Best is trial 0 with value: 0.7441242441973863.
[I 2025-02-23 12:05:41,879] Trial 2 finished with value: 0.7440169689877121 and parameters: {'age_weight': 1.1, 'n_estimators': 177, 'max_depth': 5, 'learning_rate': 0.07844044464156676, 'lr_C': 0.5942985777569691}. Best is trial 0 with value: 0.7441242441973863.
[I 2025-02-23 12:09:12,419] Trial 3 finished with value: 0.7448166569143748 and parameters: {'age_weight': 2.7, 'n_estimators': 112, 'max_depth': 5, 'learning_rate': 0.04889844125649485, 'lr_C': 0.0018938565

In [None]:
import joblib
# 저장된 모델 불러오기
final_model = joblib.load('./LightGBM_boosting2.pkl')

# 추가 학습을 위해 warm_start 활성화
final_model.warm_start = True
final_model.n_estimators += 50  # 추가 부스팅 라운드 개수 설정

# 추가 학습 진행 (적절한 X_train, y_train 데이터를 사용)
final_model.fit(X_train_encoded, y)

In [65]:
# 최적의 하이퍼파라미터로 최종 모델 구성 및 평가
best_params = study.best_trial.params

# 사전에 저장된 LightGBM 모델 불러오기 (변경 없음)
pre_trained_lgb = joblib.load('./LightGBM_boosting2.pkl')

# 최적화된 XGBoost 모델
best_xgb = XGBClassifier(**best_params,
                         random_state=42,
                         use_label_encoder=False,
                         eval_metric='logloss')

# 최적화된 메타 모델 (로지스틱 회귀)
best_meta = LogisticRegression(C=best_params['lr_C'], random_state=42, max_iter=1000)

# 최종 스태킹 모델 구성
final_stack_model = StackingClassifier(estimators=[('lgb', pre_trained_lgb), ('xgb', best_xgb)],
                                         final_estimator=best_meta,
                                         cv=5)

In [66]:
# 최종 모델 학습 및 평가
final_stack_model.fit(X_train_encoded, y)

[LightGBM] [Info] Number of positive: 66228, number of negative: 190123
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011710 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 716
[LightGBM] [Info] Number of data points in the train set: 256351, number of used features: 62
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258349 -> initscore=-1.054568
[LightGBM] [Info] Start training from score -1.054568
[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008558 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 715
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 62
[LightGBM] [Info

In [12]:
from sklearn.metrics import accuracy_score, roc_auc_score

# 예측
y_train_pred = final_stack_model.predict(X_train_encoded)
y_train_proba = final_stack_model.predict_proba(X_train_encoded)[:, 1]  # ROC-AUC Score 계산용

# 평가
accuracy = accuracy_score(y, y_train_pred)
roc_auc = roc_auc_score(y, y_train_proba)

# 출력
print(f"Accuracy: {accuracy:.4f}")
print(f"ROC-AUC Score: {roc_auc:.4f}")

NameError: name 'final_stack_model' is not defined

In [13]:
from sklearn.metrics import accuracy_score, roc_auc_score

# 예측
y_train_pred = stacking_clf.predict(X_train_encoded)
y_train_proba = stacking_clf.predict_proba(X_train_encoded)[:, 1]  # ROC-AUC Score 계산용

# 평가
accuracy = accuracy_score(y, y_train_pred)
roc_auc = roc_auc_score(y, y_train_proba)

# 출력
print(f"Accuracy: {accuracy:.4f}")
print(f"ROC-AUC Score: {roc_auc:.4f}")

[WinError 2] 지정된 파일을 찾을 수 없습니다
  File "c:\Users\vhehr\.conda\envs\LGAimers6\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "c:\Users\vhehr\.conda\envs\LGAimers6\Lib\subprocess.py", line 556, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vhehr\.conda\envs\LGAimers6\Lib\subprocess.py", line 1038, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                        start_new_session, process_group)
                        ^^^^^^^^^^^^^^^

Accuracy: 0.7636
ROC-AUC Score: 0.7783


In [15]:
import joblib

# Optuna 최적화 실행 후 최적의 study 저장
joblib.dump(stacking_clf, "stacking(LightGBM+xgboost+ExtraTrees).pkl")

['stacking(LightGBM+xgboost+ExtraTrees).pkl']

### Predict

In [16]:
pred_proba = stacking_clf.predict_proba(X_test_encoded)[:, 1]

### Submission

In [17]:
sample_submission = pd.read_csv('./Data/sample_submission.csv')
sample_submission['probability'] = pred_proba

In [18]:
sample_submission.to_csv('./stacking(LightGBM+xgboost+ExtraTrees)_submit.csv', index=False)