In [1]:
import pandas as pd
import numpy as np
import random
import os
from autogluon.tabular import TabularPredictor
import warnings

# 경고 무시
warnings.filterwarnings('ignore')

# ==========================================
# 0. 시드 고정 (재현성 확보)
# ==========================================
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42)

# ==========================================
# 1. 데이터 로드
# ==========================================

train_df = pd.read_csv('../Data/train.csv')
test_df = pd.read_csv('../Data/test.csv')
target = '임신 성공 여부'

# ==========================================
# 2. 데이터 전처리
# ==========================================

def preprocess_data(df):
    df = df.copy()
    
    # 1. 횟수 관련 컬럼 매핑 (0회:0 ~ 6회 이상:6)
    count_cols = [
        '총 시술 횟수', '클리닉 내 총 시술 횟수', 
        '총 임신 횟수', '총 출산 횟수',
        'IVF 시술 횟수', 'DI 시술 횟수',
        'IVF 임신 횟수', 'DI 임신 횟수',
        'IVF 출산 횟수', 'DI 출산 횟수'
    ]
    
    def map_count_str(x):
        if pd.isna(x): return np.nan
        x = str(x)
        if '6회 이상' in x: return 6
        try:
            return int(x.replace('회', ''))
        except:
            return np.nan

    for col in count_cols:
        df[col] = df[col].apply(map_count_str)
        
    # 2. 나이 범주형 -> 수치형 매핑
    age_map = {
        '만18-34세': 2, '만35-37세': 3, '만38-39세': 4, 
        '만40-42세': 5, '만43-44세': 6, '만45-50세': 7, 
        '알 수 없음': 1
    }
    df['나이_코드'] = df['시술 당시 나이'].map(age_map)
    
    # 3. 시술 유형 그룹화
    def clean_type(x):
        x = str(x).upper()
        if 'ICSI' in x: return 'ICSI'
        if 'IVF' in x: return 'IVF'
        if 'IUI' in x: return 'IUI'
        return 'Other'
    
    df['시술_유형_그룹'] = df['특정 시술 유형'].apply(clean_type)

    # 4. 배아 생성 주요 이유 그룹화
    def binning_reason(reason):
        if reason == '현재 시술용':
            return 'Direct_Treatment'
        elif '현재 시술용' in reason:
            return 'Mixed_Purpose'
        elif any(keyword in reason for keyword in ['기증용', '저장용']):
            return 'Storage_Donation'
        else:
            return 'Others'
        
    # 결측치를 미리 '미상'으로 변경
    df['배아 생성 주요 이유'] = df['배아 생성 주요 이유'].fillna('Unknown')
    df['배아 생성 주요 이유'] = df['배아 생성 주요 이유'].apply(binning_reason)
    
    return df

# ==========================================
# 3. 파생변수 생성
# ==========================================

def create_features(df):
    df = df.copy()
    
    # --- A. 상호작용 지표 ---
    # 기존 코드의 '나이_순서'를 '나이_코드'로 수정 (문법 오류 방지)
    df['나이x배아'] = df['나이_코드'] * df['이식된 배아 수'].fillna(0)
    
    # --- B. 불임 원인 복합 지표 ---
    infertility_cols = [col for col in df.columns if '불임 원인' in col]
    df['총_불임_원인_수'] = df[infertility_cols].sum(axis=1)
    
    # --- C. 난자/배아 효율성 지표 (Divide by Zero 방지: 1e-6) ---
    oocyte_cols = ['수집된 신선 난자 수', '혼합된 난자 수', '기증자 정자와 혼합된 난자 수', '해동 난자 수']
    df['총_난자_수'] = df[oocyte_cols].fillna(0).sum(axis=1)
    
    df['배아_생성_효율'] = df['총 생성 배아 수'] / (df['총_난자_수'] + 1e-6)
    df['이식_효율'] = df['이식된 배아 수'] / (df['총 생성 배아 수'] + 1e-6)
    df['저장_비율'] = df['저장된 배아 수'] / (df['총 생성 배아 수'] + 1e-6)
    df['미세주입_성공률'] = df['미세주입에서 생성된 배아 수'] / (df['미세주입된 난자 수'] + 1e-6)

    # --- D. 과거 성공 이력 (Row-wise이므로 누수 없음) ---
    df['과거_임신_성공률'] = df['총 임신 횟수'] / (df['총 시술 횟수'] + 1e-6)
    df['과거_출산_성공률'] = df['총 출산 횟수'] / (df['총 시술 횟수'] + 1e-6)
    
    # --- E. 로그 변환 (Skewed Data) ---
    skewed_cols = ['총 생성 배아 수', '총_난자_수', '총 시술 횟수']
    for col in skewed_cols:
        df[col] = np.log1p(df[col].fillna(0))
        
    # --- F. 불필요 컬럼 제거 ---
    drop_cols = ['ID', '시술 시기 코드', '특정 시술 유형', '시술 유형']
    df = df.drop(drop_cols, axis=1)
    
    return df

# --- 실행부 ---
# 1. 전처리 적용
train_df = preprocess_data(train_df)
test_df = preprocess_data(test_df)

# 2. 파생변수 생성 적용
train_df = create_features(train_df)
test_df = create_features(test_df)

# ==========================================
# 4. 모델 학습 설정
# ==========================================

ag_args_fit = {
    'num_gpus': 0, 
    'num_cpus': os.cpu_count()  # 가용 CPU 전체 사용
}

predictor = TabularPredictor(
    label=target,
    eval_metric='roc_auc', 
    path='ag_models_final',
    problem_type='binary'
).fit(
    train_data=train_df,
    presets='best_quality',
    time_limit=7200,
    num_stack_levels=2,
    num_bag_folds=8,
    ag_args_fit=ag_args_fit,
)

# ==========================================
# 5. 예측 (Test Data 활용) - 최종 결과를 확률로 출력 (Positive 클래스에 대한 확률만 추출)
# ==========================================

pred_probs = predictor.predict_proba(test_df)
final_probs = pred_probs.iloc[:, 1]


Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.5.0
Python Version:     3.11.14
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 25.2.0: Tue Nov 18 21:09:40 PST 2025; root:xnu-12377.61.12~1/RELEASE_ARM64_T6000
CPU Count:          10
Pytorch Version:    2.9.1
CUDA Version:       CUDA is not available
Memory Avail:       9.20 GB / 16.00 GB (57.5%)
Disk Space Avail:   181.74 GB / 460.43 GB (39.5%)
Presets specified: ['best_quality']
Using hyperparameters preset: hyperparameters='zeroshot'
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=2, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stac

In [2]:
# --- 1. 리더보드 (오름차순 정렬) ---
# 학습 데이터 내에서의 Validation Score 확인
lb = predictor.leaderboard(train_df, silent=True)
display(lb.head())

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBMXT_BAG_L1,0.747116,0.721273,roc_auc,2.595732,0.390352,3634.965492,2.595732,0.390352,3634.965492,1,True,1
1,WeightedEnsemble_L4,0.747116,0.721273,roc_auc,2.597698,0.422097,3635.003601,0.001966,0.031745,0.038109,4,True,3
2,WeightedEnsemble_L2,0.747116,0.721273,roc_auc,2.59875,0.424042,3634.99739,0.003018,0.03369,0.031898,2,True,2


In [5]:
fi.to_excel("fi7.xlsx")

In [3]:
# --- 2. 피처 중요도 ---
fi = predictor.feature_importance(data=train_df.sample(n=min(5000, len(train_df)), random_state=42))
display(fi)

These features in provided data are not utilized by the predictor and will be ignored: ['불임 원인 - 여성 요인']
Computing feature importance via permutation shuffling for 52 features using 5000 rows with 5 shuffle sets...
	30.74s	= Expected runtime (6.15s per shuffle set)
	1103.67s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
이식된 배아 수,0.042639,0.003388,5e-06,5,0.049615,0.035664
배아 이식 경과일,0.025925,0.001805,3e-06,5,0.029641,0.022209
시술 당시 나이,0.01446,0.001282,7e-06,5,0.017101,0.011819
나이_코드,0.010468,0.001987,0.000149,5,0.014559,0.006377
배아 생성 주요 이유,0.010299,0.002654,0.000485,5,0.015763,0.004834
저장_비율,0.00694,0.001028,5.6e-05,5,0.009057,0.004822
이식_효율,0.004557,0.000513,1.9e-05,5,0.005613,0.003502
클리닉 내 총 시술 횟수,0.002812,0.000567,0.000187,5,0.003979,0.001646
IVF 시술 횟수,0.002696,0.000472,0.000108,5,0.003668,0.001725
배아_생성_효율,0.00253,0.000846,0.001302,5,0.004273,0.000788


In [7]:
# --- 3. 제출 파일 생성 ---
from datetime import datetime
submission = pd.read_csv('../Data/sample_submission.csv')
submission['probability'] = final_probs.values

# 현재 시간 가져오기 (예: 0206_1031)
now = datetime.now().strftime('%m%d_%H%M')
file_name = f"{now}_submission.csv"
submission.to_csv(file_name, index=False)

print(f"학습 및 예측이 완료되었습니다. 결과가 {file_name}에 저장되었습니다.")

학습 및 예측이 완료되었습니다. 결과가 0207_1729_submission.csv에 저장되었습니다.
