In [1]:
import pandas as pd
import numpy as np
import random
import os
from autogluon.tabular import TabularPredictor
import warnings

# 경고 무시
warnings.filterwarnings('ignore')

# ==========================================
# 0. 시드 고정
# ==========================================
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42)

# ==========================================
# 1. 데이터 로드
# ==========================================
train_df = pd.read_csv('../Data/train.csv')
test_df = pd.read_csv('../Data/test.csv')
target = '임신 성공 여부'

# ==========================================
# 2. 파생변수 생성 (Feature Engineering)
# ==========================================

def feature_engineering(df):
    df = df.copy()

    # A. 나이 
    age_map = {
        '만18-34세': 2, '만35-37세': 3, '만38-39세': 4, 
        '만40-42세': 5, '만43-44세': 6, '만45-50세': 7, 
        '알 수 없음': 1 # 결측은 별도 범주로
    }
    df['시술 당시 나이'] = df['시술 당시 나이'].map(age_map).fillna(0)
    
    # B. 배아 퀄리티 추정 지표
    if '총 생성 배아 수' in df.columns and '이식된 배아 수' in df.columns:
        df['배아_이식_효율'] = df['이식된 배아 수'] / (df['총 생성 배아 수'] + 1e-6)
        
    # C. 미세주입 효율 (ICSI)
    if '미세주입된 난자 수' in df.columns and '미세주입에서 생성된 배아 수' in df.columns:
        df['미세주입_수정률'] = df['미세주입에서 생성된 배아 수'] / (df['미세주입된 난자 수'] + 1e-6)
        
    # D. 나이 보정 이식 수 (나이가 많을수록 이식을 많이 시도하는 경향 보정)
    df['나이_대비_이식수'] = df['시술 당시 나이'] * df['이식된 배아 수']

    # E. 불필요한 컬럼 삭제 
    drop_cols = ['ID', '시술 시기 코드'] 
    df = df.drop(drop_cols, axis=1)

    return df

# 전처리 적용
train_df = feature_engineering(train_df)
test_df = feature_engineering(test_df)

# ==========================================
# 4. 모델 학습
# ==========================================

predictor = TabularPredictor(
    label=target,
    eval_metric='roc_auc', 
    path='ag_models_final',
    problem_type='binary'
).fit(
    train_data=train_df,
    presets='best_quality', 
    num_stack_levels=1, 
    num_bag_folds=5,
    time_limit=3600 * 2,
    included_model_types=['GBM', 'CAT', 'XGB', 'RF', 'XT'],
)


# ==========================================
# 5. 추론 및 저장
# ==========================================
pred_probs = predictor.predict_proba(test_df)
final_probs = pred_probs.iloc[:, 1]

# ==========================================
# 5. 리더보드 (학습 데이터 내에서의 Validation Score 확인)
# ==========================================
lb = predictor.leaderboard(train_df, silent=True)
display(lb.head())

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.5.0
Python Version:     3.11.14
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 25.2.0: Tue Nov 18 21:09:40 PST 2025; root:xnu-12377.61.12~1/RELEASE_ARM64_T6000
CPU Count:          10
Pytorch Version:    2.9.1
CUDA Version:       CUDA is not available
Memory Avail:       7.04 GB / 16.00 GB (44.0%)
Disk Space Avail:   179.41 GB / 460.43 GB (39.0%)
Presets specified: ['best_quality']
Using hyperparameters preset: hyperparameters='zeroshot'
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=5, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stac

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,RandomForestEntr_BAG_L1,0.915482,0.729245,roc_auc,1.58997,6.869208,12.072125,1.58997,6.869208,12.072125,1,True,4
1,RandomForestGini_BAG_L1,0.914643,0.728353,roc_auc,1.54196,6.923969,10.554633,1.54196,6.923969,10.554633,1,True,3
2,ExtraTreesEntr_BAG_L2,0.803931,0.74555,roc_auc,9.377821,24.230652,3134.14192,1.499437,6.937764,9.713758,2,True,13
3,ExtraTreesGini_BAG_L2,0.803658,0.746231,roc_auc,9.332817,24.069599,3133.628072,1.454433,6.776711,9.19991,2,True,12
4,LightGBM_r96_BAG_L2,0.779919,0.739012,roc_auc,12.831973,21.06396,3141.334113,4.953589,3.771072,16.905951,2,True,19


In [2]:
# # --- 제출 파일 생성 ---
from datetime import datetime
submission = pd.read_csv('../Data/sample_submission.csv')
submission['probability'] = final_probs.values

# 현재 시간 가져오기 (예: 0206_1031)
now = datetime.now().strftime('%m%d_%H%M')
file_name = f"{now}_submission.csv"
submission.to_csv(file_name, index=False)

print(f"학습 및 예측이 완료되었습니다. 결과가 {file_name}에 저장되었습니다.")

학습 및 예측이 완료되었습니다. 결과가 0208_0147_submission.csv에 저장되었습니다.


In [5]:
# --- 피처 중요도 ---
fi = predictor.feature_importance(data=train_df.sample(n=min(5000, len(train_df)), random_state=42))
display(fi.head(30))

These features in provided data are not utilized by the predictor and will be ignored: ['불임 원인 - 여성 요인', '불임 원인 - 정자 면역학적 요인']
Computing feature importance via permutation shuffling for 67 features using 5000 rows with 5 shuffle sets...
	846.7s	= Expected runtime (169.34s per shuffle set)
	88.07s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
시술 당시 나이,0.065305,0.002255,1.703696e-07,5,0.069949,0.060662
이식된 배아 수,0.052677,0.004514,6.405187e-06,5,0.061971,0.043384
배아 이식 경과일,0.042413,0.001841,4.247209e-07,5,0.046203,0.038623
저장된 배아 수,0.033359,0.002772,5.67027e-06,5,0.039067,0.027651
IVF 시술 횟수,0.012512,0.001119,7.595362e-06,5,0.014816,0.010208
총 생성 배아 수,0.012202,0.000642,9.162058e-07,5,0.013523,0.01088
배아_이식_효율,0.008072,0.002636,0.001189874,5,0.013499,0.002645
클리닉 내 총 시술 횟수,0.008045,0.001327,8.581371e-05,5,0.010778,0.005311
총 임신 횟수,0.006967,0.001144,8.409184e-05,5,0.009322,0.004612
불임 원인 - 남성 요인,0.005435,0.001488,0.0006117968,5,0.008498,0.002371
