In [1]:
import random
import os
import numpy as np
import pandas as pd
from datetime import datetime
from autogluon.tabular import TabularPredictor

# Pin every global source of randomness to one seed so repeated runs are comparable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
# NOTE: PYTHONHASHSEED is read when an interpreter starts, so this assignment is
# inherited by processes launched from here on; it does not change string hashing
# in the kernel that is already running.
os.environ['PYTHONHASHSEED'] = str(SEED)

# ==========================================
# 1. Load data
# ==========================================

# Name of the label column that the predictor is trained to predict.
target = '임신 성공 여부'

# Read the training CSV first, then the test CSV, from the sibling Data directory.
train_df, test_df = [
    pd.read_csv(csv_path)
    for csv_path in ('../Data/train.csv', '../Data/test.csv')
]

# ==========================================
# 2. Derived features
# ==========================================

def _clean_treatment(text):
    """Collapse a raw treatment-type value into 'ICSI', 'IVF', 'IUI' or 'Other'.

    The value is stringified and upper-cased first, so a missing value falls
    through to 'Other'. Keywords are checked in the order ICSI, IVF, IUI, so a
    value that names several techniques is labelled with the first match.
    """
    text = str(text).upper()
    for keyword in ('ICSI', 'IVF', 'IUI'):
        if keyword in text:
            return keyword
    return 'Other'


def derive_features(df):
    """Return a copy of ``df`` with engineered columns added.

    The input frame is left untouched: all work happens on a copy, so the
    function can be called again on the same raw frame without a KeyError.

    Columns read: '특정 시술 유형', '시술 당시 나이', '이식된 배아 수',
    '배아 이식 경과일', '난자 혼합 경과일', '저장된 배아 수', '저장된 신선 난자 수'.

    Columns added (in this order): 'is_blastocyst', 'is_ah', '고령 여부',
    '나이_순서', '나이x배아', '배아 발달 기간', '배아 생성 효율', '이식 비중',
    '이식배아_구간', '시술유형_정제'.

    Column removed: '특정 시술 유형' (replaced by the flags and cleaned label).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns listed under "Columns read".

    Returns
    -------
    pandas.DataFrame
        A new frame with the derived columns and without '특정 시술 유형'.

    Raises
    ------
    KeyError
        If any of the columns read is missing from ``df``.
    """
    # Work on a copy so the caller's frame keeps its original columns.
    df = df.copy()

    # Case-insensitive substring flags taken from the raw treatment type;
    # missing values count as "not present".
    treatment = df['특정 시술 유형']
    df['is_blastocyst'] = treatment.str.contains('BLASTOCYST', case=False, na=False)
    df['is_ah'] = treatment.str.contains('AH', case=False, na=False)

    # 1 when the age bracket is one of the 35-and-over categories, else 0.
    elderly_categories = ['만35-37세', '만38-39세', '만40-42세', '만43-44세', '만45-50세']
    df['고령 여부'] = df['시술 당시 나이'].isin(elderly_categories).astype(int)

    # Ordinal encoding of the age bracket; the "unknown" bracket maps to 0 and any
    # bracket not listed here becomes NaN.
    age_order = {'만18-34세': 1, '만35-37세': 2, '만38-39세': 3, '만40-42세': 4, '만43-44세': 5, '만45-50세': 6, '알 수 없음': 0}
    df['나이_순서'] = df['시술 당시 나이'].map(age_order)
    df['나이x배아'] = df['나이_순서'] * df['이식된 배아 수']

    # A small constant is added to each denominator so that a zero denominator gives
    # a very large (or zero) ratio instead of inf/NaN.
    eps = 1e-6
    df['배아 발달 기간'] = df['배아 이식 경과일'] - df['난자 혼합 경과일']
    df['배아 생성 효율'] = df['저장된 배아 수'] / (df['저장된 신선 난자 수'] + eps)
    df['이식 비중'] = df['이식된 배아 수'] / (df['이식된 배아 수'] + df['저장된 배아 수'] + eps)

    # Bucket the transferred-embryo count into <=0, 1-2 and >2; missing counts are
    # treated as 0 for the bucketing only.
    df['이식배아_구간'] = pd.cut(
        df['이식된 배아 수'].fillna(0),
        bins=[-float('inf'), 0, 2, float('inf')],
        labels=['0개', '1-2개', '3개 이상'],
    )

    df['시술유형_정제'] = treatment.apply(_clean_treatment)

    # Non-inplace drop: returns a new frame and leaves nothing half-modified.
    return df.drop(columns='특정 시술 유형')

# Run the same feature derivation on the training frame first, then on the test frame.
train_df, test_df = [derive_features(frame) for frame in (train_df, test_df)]

# ==========================================
# 3. Model training configuration
# ==========================================

# Request one GPU per model only when CUDA is actually usable on this machine;
# otherwise fall back to CPU training instead of asking for hardware that is absent.
# Only CUDA is probed here.
try:
    import torch
    num_gpus_per_model = 1 if torch.cuda.is_available() else 0
except ImportError:
    num_gpus_per_model = 0

predictor = TabularPredictor(
    label=target,
    eval_metric='roc_auc',      # models are ranked by ROC AUC
    path='ag_models_out',       # directory where trained models are saved
).fit(
    train_data=train_df,
    time_limit=3600,            # overall training budget, in seconds
    presets='best_quality',
    ag_args_fit={'num_gpus': num_gpus_per_model},
    num_stack_levels=3,
    num_bag_folds=5,
    refit_full=True
)

# ==========================================
# 4. Prediction on the test data - keep only the probability of the positive class
# ==========================================

# Frame of predicted probabilities with one column per class label.
pred_probs = predictor.predict_proba(test_df)
# Pick the positive-class column by its label rather than by position, so the
# result does not depend on the column order of the returned frame.
final_probs = pred_probs[predictor.positive_class]


Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.5.0
Python Version:     3.11.14
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 25.2.0: Tue Nov 18 21:09:40 PST 2025; root:xnu-12377.61.12~1/RELEASE_ARM64_T6000
CPU Count:          10
Pytorch Version:    2.9.1
CUDA Version:       CUDA is not available
Memory Avail:       4.38 GB / 16.00 GB (27.4%)
Disk Space Avail:   193.55 GB / 460.43 GB (42.0%)
Presets specified: ['best_quality']
Using hyperparameters preset: hyperparameters='zeroshot'
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=5, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stac

In [2]:
# --- 1. Leaderboard (sorted by validation score, descending: best model first) ---
lb = predictor.leaderboard(silent=True)
lb_by_score = lb.sort_values(by='score_val', ascending=False)
display(lb_by_score)

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.740187,roc_auc,16.987759,2329.772244,0.030427,6.704284,2,True,10
1,XGBoost_BAG_L1,0.738915,roc_auc,1.477447,118.253129,1.477447,118.253129,1,True,6
2,NeuralNetTorch_BAG_L1,0.737321,roc_auc,3.753271,1115.854576,3.753271,1115.854576,1,True,7
3,NeuralNetFastAI_BAG_L1,0.736483,roc_auc,1.998977,1001.462465,1.998977,1001.462465,1,True,5
4,NeuralNetTorch_r79_BAG_L1,0.735102,roc_auc,4.045673,210.800935,4.045673,210.800935,1,True,8
5,ExtraTreesEntr_BAG_L1,0.732316,roc_auc,6.073806,9.08141,6.073806,9.08141,1,True,4
6,ExtraTreesGini_BAG_L1,0.73226,roc_auc,6.307771,8.722309,6.307771,8.722309,1,True,3
7,RandomForestEntr_BAG_L1,0.731049,roc_auc,6.005398,9.848547,6.005398,9.848547,1,True,2
8,RandomForestGini_BAG_L1,0.730288,roc_auc,6.243716,10.821787,6.243716,10.821787,1,True,1
9,NeuralNetFastAI_r191_BAG_L1,0.724671,roc_auc,3.722238,77.649243,3.722238,77.649243,1,True,9


In [3]:
# --- 2. Feature importance ---
# Importance is estimated on at most 5,000 rows, drawn with a fixed seed for repeatability.
# NOTE(review): the rows are sampled from train_df, the same frame the predictor was
# fitted on, so these scores may be optimistic - consider using rows held out from training.
fi_rows = train_df.sample(n=min(5000, len(train_df)), random_state=42)
fi = predictor.feature_importance(data=fi_rows)
# Show the 30 features with the smallest p-values.
display(fi.sort_values(by='p_value', ascending=True).head(30))

These features in provided data are not utilized by the predictor and will be ignored: ['ID', '불임 원인 - 여성 요인', '불임 원인 - 정자 면역학적 요인']
Computing feature importance via permutation shuffling for 74 features using 5000 rows with 5 shuffle sets...
	479.31s	= Expected runtime (95.86s per shuffle set)
	320.08s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
혼합된 난자 수,0.007939,0.000188,3.748945e-08,5,0.008325,0.007552
파트너 정자와 혼합된 난자 수,0.006745,0.000192,7.814105e-08,5,0.00714,0.006351
총 시술 횟수,0.004947,0.000181,2.15667e-07,5,0.00532,0.004574
미세주입 후 저장된 배아 수,0.002102,0.000108,8.448503e-07,5,0.002325,0.001879
수집된 신선 난자 수,0.009586,0.000506,9.254406e-07,5,0.010627,0.008545
불임 원인 - 배란 장애,0.001691,9.6e-05,1.235072e-06,5,0.001888,0.001494
배아 생성 효율,0.002973,0.000169,1.249304e-06,5,0.003321,0.002625
고령 여부,0.003539,0.000204,1.325257e-06,5,0.003959,0.003118
시술 당시 나이,0.007863,0.000472,1.549788e-06,5,0.008834,0.006891
저장된 배아 수,0.00323,0.000195,1.581407e-06,5,0.003631,0.002829


In [11]:
# Re-display the 30 features with the smallest p-values.
# NOTE(review): this repeats the table shown by the feature-importance cell and carries
# execution count 11, i.e. it was run out of order - consider removing the cell.
top_by_p_value = fi.sort_values(by='p_value').iloc[:30]
display(top_by_p_value)

Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
혼합된 난자 수,0.007939,0.000188,3.748945e-08,5,0.008325,0.007552
파트너 정자와 혼합된 난자 수,0.006745,0.000192,7.814105e-08,5,0.00714,0.006351
총 시술 횟수,0.004947,0.000181,2.15667e-07,5,0.00532,0.004574
미세주입 후 저장된 배아 수,0.002102,0.000108,8.448503e-07,5,0.002325,0.001879
수집된 신선 난자 수,0.009586,0.000506,9.254406e-07,5,0.010627,0.008545
불임 원인 - 배란 장애,0.001691,9.6e-05,1.235072e-06,5,0.001888,0.001494
배아 생성 효율,0.002973,0.000169,1.249304e-06,5,0.003321,0.002625
고령 여부,0.003539,0.000204,1.325257e-06,5,0.003959,0.003118
시술 당시 나이,0.007863,0.000472,1.549788e-06,5,0.008834,0.006891
저장된 배아 수,0.00323,0.000195,1.581407e-06,5,0.003631,0.002829


In [4]:
# --- 3. Create the submission file (currently disabled: every line below is commented out) ---
# submission = pd.read_csv('../Data/sample_submission.csv')
# submission['probability'] = final_probs.values

# # Get the current time (e.g. 0206_1031)
# now = datetime.now().strftime('%m%d_%H%M')
# file_name = f"{now}_submission.csv"
# submission.to_csv(file_name, index=False)

# print(f"학습 및 예측이 완료되었습니다. 결과가 {file_name}에 저장되었습니다.")