### Import

In [None]:
!conda install lightgbm

Retrieving notices: ...working... done
Channels:
 - conda-forge
 - defaults
Platform: win-64
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... failed

Note: you may need to restart the kernel to use updated packages.



PackagesNotFoundError: The following packages are not available from current channels:

  - lightgbm-gpu

Current channels:

  - https://conda.anaconda.org/conda-forge
  - https://repo.anaconda.com/pkgs/main
  - https://repo.anaconda.com/pkgs/r
  - https://repo.anaconda.com/pkgs/msys2

To search for alternate channels that may provide the conda package you're
looking for, navigate to

    https://anaconda.org

and use the search bar at the top of the page.




In [23]:
import torch
import torch.nn as nn
import torch.optim as optim

import pandas as pd
import numpy as np

from sklearn.preprocessing import OrdinalEncoder

import optuna
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import roc_auc_score, make_scorer

In [None]:
# Select the compute device: CUDA when PyTorch reports it as available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("Using device:", device)


False
Using device: cpu


### Data Load

In [3]:
# Read the raw train and test tables, then discard the 'ID' column from each.
train = pd.read_csv('./Data/train.csv')
train = train.drop('ID', axis=1)
test = pd.read_csv('./Data/test.csv')
test = test.drop('ID', axis=1)

In [4]:
# Separate the target column from the feature columns of the training table.
y = train['임신 성공 여부']
X = train.drop(columns=['임신 성공 여부'])

### Data Pre-processing

In [5]:
# Columns handled as categorical features: the preprocessing cells that follow cast each
# of them to str and then ordinal-encode them.
categorical_columns = [
    "시술 시기 코드",
    "시술 당시 나이",
    "시술 유형",
    "특정 시술 유형",
    "배란 자극 여부",
    "배란 유도 유형",
    "단일 배아 이식 여부",
    "착상 전 유전 검사 사용 여부",
    "착상 전 유전 진단 사용 여부",
    "남성 주 불임 원인",
    "남성 부 불임 원인",
    "여성 주 불임 원인",
    "여성 부 불임 원인",
    "부부 주 불임 원인",
    "부부 부 불임 원인",
    "불명확 불임 원인",
    "불임 원인 - 난관 질환",
    "불임 원인 - 남성 요인",
    "불임 원인 - 배란 장애",
    "불임 원인 - 여성 요인",
    "불임 원인 - 자궁경부 문제",
    "불임 원인 - 자궁내막증",
    "불임 원인 - 정자 농도",
    "불임 원인 - 정자 면역학적 요인",
    "불임 원인 - 정자 운동성",
    "불임 원인 - 정자 형태",
    "배아 생성 주요 이유",
    "총 시술 횟수",
    "클리닉 내 총 시술 횟수",
    "IVF 시술 횟수",
    "DI 시술 횟수",
    "총 임신 횟수",
    "IVF 임신 횟수",
    "DI 임신 횟수",
    "총 출산 횟수",
    "IVF 출산 횟수",
    "DI 출산 횟수",
    "난자 출처",
    "정자 출처",
    "난자 기증자 나이",
    "정자 기증자 나이",
    "동결 배아 사용 여부",
    "신선 배아 사용 여부",
    "기증 배아 사용 여부",
    "대리모 여부",
    "PGD 시술 여부",
    "PGS 시술 여부"
]

In [6]:
# Convert the categorical columns to strings in both the training features and the test set.
str_dtypes = {name: str for name in categorical_columns}
X = X.astype(str_dtypes)
test = test.astype(str_dtypes)

In [7]:
# Ordinal-encode the categorical columns. The encoder is fitted on the training features
# only; categories not seen during fitting are encoded as -1.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
ordinal_encoder.fit(X[categorical_columns])

# Work on copies so that X and test keep their string-typed columns.
X_train_encoded = X.copy()
X_train_encoded[categorical_columns] = ordinal_encoder.transform(X[categorical_columns])

X_test_encoded = test.copy()
X_test_encoded[categorical_columns] = ordinal_encoder.transform(test[categorical_columns])

In [8]:
# Candidate numeric feature columns. A later cell keeps only the names present in X and
# fills their missing values with 0 in the encoded train/test frames.
numeric_columns = [
    "임신 시도 또는 마지막 임신 경과 연수",
    "총 생성 배아 수",
    "미세주입된 난자 수",
    "미세주입에서 생성된 배아 수",
    "이식된 배아 수",
    "미세주입 배아 이식 수",
    "저장된 배아 수",
    "미세주입 후 저장된 배아 수",
    "해동된 배아 수",
    "해동 난자 수",
    "수집된 신선 난자 수",
    "저장된 신선 난자 수",
    "혼합된 난자 수",
    "파트너 정자와 혼합된 난자 수",
    "기증자 정자와 혼합된 난자 수",
    "난자 채취 경과일",
    "난자 해동 경과일",
    "난자 혼합 경과일",
    "배아 이식 경과일",
    "배아 해동 경과일"
]

In [9]:
# Restrict the numeric column list to names that exist in X, then replace missing values
# in those columns with 0 in both encoded frames.
numeric_columns = [name for name in numeric_columns if name in X.columns]

for encoded_frame in (X_train_encoded, X_test_encoded):
    encoded_frame[numeric_columns] = encoded_frame[numeric_columns].fillna(0)

In [10]:
from collections import Counter

# Tally how many samples carry each target label and print the tally.
class_counts = Counter()
class_counts.update(y)
print(class_counts)


Counter({0: 190123, 1: 66228})


In [13]:
# Stratified 80/20 hold-out split of the encoded training data with a fixed seed.
# NOTE(review): the cells visible in this notebook tune, fit and score on
# X_train_encoded / y rather than on these splits — confirm whether the hold-out set
# is meant to be used.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X_train_encoded, y, **split_options)

### Train

Stacking (LightGBM + MLP)

In [None]:
import joblib

def objective(trial):
    """Optuna objective: mean 3-fold ROC-AUC of a LightGBM + MLP stacking ensemble.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``age_weight``, the MLP hyperparameters
        (``mlp_hidden_layer_sizes``, ``mlp_alpha``, ``mlp_learning_rate_init``)
        and the meta model's regularisation strength ``lr_C``.

    Returns
    -------
    float
        Mean ``roc_auc`` over a 3-fold stratified cross-validation run on the
        encoded training data (``X_train_encoded`` / ``y``) with the age column scaled.
    """
    # Load the saved LightGBM estimator from disk.
    # NOTE(review): StackingClassifier / cross_val_score refit their estimators, so the
    # loaded model presumably only contributes its hyperparameters, not its fitted
    # trees — confirm this is the intent.
    pre_trained_lgb = joblib.load('./LightGBM_boosting2.pkl')

    # Scale only the '시술 당시 나이' column by the sampled weight, on a copy so the
    # shared X_train_encoded frame is never modified by a trial.
    age_weight = trial.suggest_float("age_weight", 1.0, 3.0, step=0.1)
    X_train_weighted = X_train_encoded.copy()
    if '시술 당시 나이' in X_train_encoded.columns:
        X_train_weighted['시술 당시 나이'] *= age_weight

    # MLP hyperparameters to tune (alpha and learning rate on a log scale).
    mlp_params = {
        'hidden_layer_sizes': trial.suggest_int("mlp_hidden_layer_sizes", 50, 300),
        'alpha': trial.suggest_float("mlp_alpha", 0.0001, 0.1, log=True),
        'learning_rate_init': trial.suggest_float("mlp_learning_rate_init", 0.001, 0.1, log=True)
    }
    mlp_model = MLPClassifier(**mlp_params, max_iter=1000, random_state=42)

    # Base estimators of the stack.
    base_models = [
        ('lgbm', pre_trained_lgb),
        ('mlp', mlp_model)
    ]

    # Meta model (logistic regression). suggest_float(..., log=True) replaces the
    # deprecated suggest_loguniform and samples the same log-uniform range.
    lr_C = trial.suggest_float('lr_C', 1e-3, 1e2, log=True)
    meta_model = LogisticRegression(C=lr_C, random_state=42, max_iter=1000)

    # Stacking ensemble; the inner 3-fold CV produces the meta model's training features.
    stacking_clf = StackingClassifier(
        estimators=base_models,
        final_estimator=meta_model,
        cv=StratifiedKFold(n_splits=3),
    )

    # Outer 3-fold stratified CV scored by ROC-AUC, run on all available cores.
    auc_scores = cross_val_score(
        stacking_clf,
        X_train_weighted,
        y,
        cv=StratifiedKFold(n_splits=3),
        scoring='roc_auc',
        n_jobs=-1,
    )

    return np.mean(auc_scores)

In [14]:
import warnings

# Silence noisy warnings emitted while the trials run.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
# Module-specific filter for UserWarnings raised from the lightgbm package.
warnings.filterwarnings("ignore", category=UserWarning, module="lightgbm")

# Maximise the value returned by `objective` over 150 trials. The TPE sampler (Optuna's
# default algorithm) is seeded so the hyperparameter search is repeatable.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=150, show_progress_bar=True)

[I 2025-02-25 18:19:57,711] A new study created in memory with name: no-name-d889adc0-180d-4ef0-84a4-010dcddac43f


  0%|          | 0/150 [00:00<?, ?it/s]

[I 2025-02-25 18:22:42,360] Trial 0 finished with value: 0.7395775104128545 and parameters: {'age_weight': 1.7000000000000002, 'mlp_hidden_layer_sizes': 51, 'mlp_alpha': 0.00031775895543037063, 'mlp_learning_rate_init': 0.0012982924503154708, 'lr_C': 0.5407452339042454}. Best is trial 0 with value: 0.7395775104128545.
[I 2025-02-25 18:23:51,391] Trial 1 finished with value: 0.7384386921779754 and parameters: {'age_weight': 2.8, 'mlp_hidden_layer_sizes': 240, 'mlp_alpha': 0.000389820683265132, 'mlp_learning_rate_init': 0.04834413867331109, 'lr_C': 0.060718264385518264}. Best is trial 0 with value: 0.7395775104128545.
[I 2025-02-25 18:25:03,966] Trial 2 finished with value: 0.7388750809681669 and parameters: {'age_weight': 2.3, 'mlp_hidden_layer_sizes': 123, 'mlp_alpha': 0.016058586162497283, 'mlp_learning_rate_init': 0.020257025123775283, 'lr_C': 0.09383578112539094}. Best is trial 0 with value: 0.7395775104128545.
[I 2025-02-25 18:26:07,216] Trial 3 finished with value: 0.7381498011224

In [15]:
# Rebuild the stacking ensemble from the best trial's hyperparameters.
best_params = study.best_trial.params
age_weight = best_params["age_weight"]

# MLP settings: the tuned values plus a fixed iteration cap and seed.
mlp_params = dict(
    hidden_layer_sizes=best_params["mlp_hidden_layer_sizes"],
    alpha=best_params["mlp_alpha"],
    learning_rate_init=best_params["mlp_learning_rate_init"],
    max_iter=1000,
    random_state=42,
)

# Copy of the encoded training data with '시술 당시 나이' scaled by the tuned weight.
# NOTE(review): the later cells shown in this notebook fit and predict with
# X_train_encoded / X_test_encoded rather than this weighted copy — confirm that is intended.
X_train_weighted = X_train_encoded.copy()
if '시술 당시 나이' in X_train_encoded.columns:
    X_train_weighted['시술 당시 나이'] = X_train_weighted['시술 당시 나이'] * age_weight

# Base estimators: a fresh MLP and the LightGBM model loaded from disk (left unchanged).
mlp_model = MLPClassifier(**mlp_params)
pre_trained_lgb = joblib.load('./LightGBM_boosting2.pkl')
base_models = [('lgbm', pre_trained_lgb), ('mlp', mlp_model)]

# Meta model: logistic regression using the tuned C.
best_meta = LogisticRegression(C=best_params['lr_C'], random_state=42, max_iter=1000)

# Stack the base estimators under the meta model with a 3-fold stratified CV.
stacking_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=best_meta,
    cv=StratifiedKFold(n_splits=3),
)

In [16]:
# Fit the final stacking ensemble on the full encoded training data.
# NOTE(review): this fits on X_train_encoded, whereas age_weight was tuned on a copy with
# '시술 당시 나이' scaled (X_train_weighted) — confirm which input is intended.
stacking_clf.fit(X_train_encoded, y)

  File "c:\Users\vhehr\.conda\envs\LGAimers6\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "c:\Users\vhehr\.conda\envs\LGAimers6\Lib\subprocess.py", line 556, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vhehr\.conda\envs\LGAimers6\Lib\subprocess.py", line 1038, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                        start_new_session, process_group)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\U

[LightGBM] [Info] Number of positive: 66228, number of negative: 190123
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012872 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 716
[LightGBM] [Info] Number of data points in the train set: 256351, number of used features: 62
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258349 -> initscore=-1.054568
[LightGBM] [Info] Start training from score -1.054568
[LightGBM] [Info] Number of positive: 44152, number of negative: 126748
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006217 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 708
[LightGBM] [Info] Number of data points in the train set: 170900, number of used features: 62
[LightGBM] [Info

In [26]:
import joblib
# Load a previously saved model from disk.
final_stack_model = joblib.load('./stacking(LightGBM+xgboost)_optimization2.pkl')

# Set warm_start on the loaded object, with the intent of continuing its training.
# NOTE(review): scikit-learn's StackingClassifier does not list warm_start among its
# parameters, so if the pickle holds one this attribute is presumably ignored and fit()
# retrains from scratch — confirm.
final_stack_model.warm_start = True

# Fit the loaded model on the full encoded training data.
final_stack_model.fit(X_train_encoded, y)

[LightGBM] [Info] Number of positive: 66228, number of negative: 190123
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011182 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 716
[LightGBM] [Info] Number of data points in the train set: 256351, number of used features: 62
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258349 -> initscore=-1.054568
[LightGBM] [Info] Start training from score -1.054568
[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007628 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 715
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 62
[LightGBM] [Info

In [17]:
from sklearn.metrics import accuracy_score, roc_auc_score

# Score the fitted ensemble on the same data it was fitted on (in-sample metrics).
y_train_proba = stacking_clf.predict_proba(X_train_encoded)[:, 1]  # second probability column, used for ROC-AUC
y_train_pred = stacking_clf.predict(X_train_encoded)

# Metrics
roc_auc = roc_auc_score(y, y_train_proba)
accuracy = accuracy_score(y, y_train_pred)

# Report
print(f"Accuracy: {accuracy:.4f}")
print(f"ROC-AUC Score: {roc_auc:.4f}")

Accuracy: 0.7486
ROC-AUC Score: 0.7482


In [18]:
import joblib

# Persist the stacking classifier (not the Optuna study) to disk.
joblib.dump(stacking_clf, "stacking(LightGBM+MLP)_optimization2.pkl")

['stacking(LightGBM+MLP)_optimization2.pkl']

### Predict

In [19]:
# Predict on the encoded test set and keep the second probability column (index 1).
pred_proba = stacking_clf.predict_proba(X_test_encoded)[:, 1]

### Submission

In [20]:
# Read the submission template and set its 'probability' column to the test predictions.
sample_submission = pd.read_csv('./Data/sample_submission.csv').assign(probability=pred_proba)

In [21]:
# Write the filled-in submission table to CSV without the DataFrame index.
sample_submission.to_csv('./stacking(LightGBM+MLP)_optimization_submit2.csv', index=False)