In [5]:
import pandas as pd
import numpy as np
import warnings
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# 경고 무시
warnings.filterwarnings('ignore')

# 랜덤 시드 고정
def seed_everything(seed=42):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.backends.mps.is_available():
        torch.mps.manual_seed(seed)

seed_everything()

# MPS 장치 설정 (Apple Silicon 가속)
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

# ==========================================
# 1. 데이터 로드
# ==========================================
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

# ==========================================
# 2. 사용자 정의 전처리 함수
# ==========================================
def preprocess_data(df):
    df = df.copy()
    
    # 1. 횟수 관련 컬럼 매핑
    count_cols = [
        '총 시술 횟수', '클리닉 내 총 시술 횟수', 
        '총 임신 횟수', '총 출산 횟수',
        'IVF 시술 횟수', 'DI 시술 횟수',
        'IVF 임신 횟수', 'DI 임신 횟수',
        'IVF 출산 횟수', 'DI 출산 횟수'
    ]
    
    def map_count_str(x):
        if pd.isna(x): return np.nan
        x = str(x)
        if '6회 이상' in x: return 6
        try:
            return int(x.replace('회', ''))
        except:
            return np.nan

    for col in count_cols:
        if col in df.columns:
            df[col] = df[col].apply(map_count_str)
        
    # 2. 나이 범주형 -> 수치형 매핑
    age_map = {
        '만18-34세': 2, '만35-37세': 3, '만38-39세': 4, 
        '만40-42세': 5, '만43-44세': 6, '만45-50세': 7, 
        '알 수 없음': 1
    }
    if '시술 당시 나이' in df.columns:
        df['나이_코드'] = df['시술 당시 나이'].map(age_map)
    
    # 3. 시술 유형 그룹화
    def clean_type(x):
        x = str(x).upper()
        if 'ICSI' in x: return 'ICSI'
        if 'IVF' in x: return 'IVF'
        if 'IUI' in x: return 'IUI'
        return 'Other'
    
    if '특정 시술 유형' in df.columns:
        df['시술_유형_그룹'] = df['특정 시술 유형'].apply(clean_type)

    # 4. 배아 생성 주요 이유 그룹화
    def binning_reason(reason):
        if reason == '현재 시술용':
            return 'Direct_Treatment'
        elif '현재 시술용' in reason:
            return 'Mixed_Purpose'
        elif any(keyword in reason for keyword in ['기증용', '저장용']):
            return 'Storage_Donation'
        else:
            return 'Others'
    
    if '배아 생성 주요 이유' in df.columns:
        df['배아 생성 주요 이유'] = df['배아 생성 주요 이유'].fillna('Unknown')
        df['배아 생성 주요 이유'] = df['배아 생성 주요 이유'].apply(binning_reason)
    
    return df

# ==========================================
# 3. 파생변수 생성
# ==========================================
def create_features(df):
    df = df.copy()
    
    # 컬럼 존재 여부 확인 후 계산
    def get_col(col_name):
        return df[col_name] if col_name in df.columns else pd.Series(0, index=df.index)

    # --- A. 상호작용 지표 ---
    if '나이_코드' in df.columns and '이식된 배아 수' in df.columns:
        df['나이x배아'] = df['나이_코드'] * df['이식된 배아 수'].fillna(0)
    
    # --- B. 불임 원인 복합 지표 ---
    infertility_cols = [col for col in df.columns if '불임 원인' in col]
    if infertility_cols:
        df['총_불임_원인_수'] = df[infertility_cols].sum(axis=1)
    
    # --- C. 난자/배아 효율성 지표 ---
    oocyte_cols = ['수집된 신선 난자 수', '혼합된 난자 수', '기증자 정자와 혼합된 난자 수', '해동 난자 수']
    valid_oocyte_cols = [c for c in oocyte_cols if c in df.columns]
    
    df['총_난자_수'] = df[valid_oocyte_cols].fillna(0).sum(axis=1)
    
    df['배아_생성_효율'] = get_col('총 생성 배아 수') / (df['총_난자_수'] + 1e-6)
    df['이식_효율'] = get_col('이식된 배아 수') / (get_col('총 생성 배아 수') + 1e-6)
    df['저장_비율'] = get_col('저장된 배아 수') / (get_col('총 생성 배아 수') + 1e-6)
    df['미세주입_성공률'] = get_col('미세주입에서 생성된 배아 수') / (get_col('미세주입된 난자 수') + 1e-6)

    # --- D. 과거 성공 이력 ---
    df['과거_임신_성공률'] = get_col('총 임신 횟수') / (get_col('총 시술 횟수') + 1e-6)
    df['과거_출산_성공률'] = get_col('총 출산 횟수') / (get_col('총 시술 횟수') + 1e-6)
    
    # --- E. 로그 변환 ---
    skewed_cols = ['총 생성 배아 수', '총_난자_수', '총 시술 횟수']
    for col in skewed_cols:
        if col in df.columns:
            df[col] = np.log1p(df[col].fillna(0))
            
    # --- F. 불필요 컬럼 제거 ---
    drop_cols = ['ID', '시술 시기 코드', '특정 시술 유형', '시술 유형']
    df = df.drop([c for c in drop_cols if c in df.columns], axis=1)
    
    return df

# 전처리 적용
print("Preprocessing Data...")
train_prep = create_features(preprocess_data(train))
test_prep = create_features(preprocess_data(test))

# Target 설정
target_col = '임신 성공 여부' 
if target_col not in train_prep.columns:
    target_col = train_prep.columns[-1]

X = train_prep.drop(columns=[target_col])
y = train_prep[target_col]
X_test = test_prep.copy()

# 컬럼 순서 및 일치 확인
X_test = X_test[X.columns]

# ==========================================
# 4. 데이터 누수 방지 파이프라인 구축
# ==========================================

# 수치형/범주형 컬럼 분류
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

# 전처리기 정의 (Train 통계만 활용)
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # 결측치는 Train 중앙값으로
    ('scaler', StandardScaler())                    # 스케일링은 Train Mean/Std로
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)) # Test에 없는 범주는 무시
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# ==========================================
# 5. 모델 정의 (Neural Network & Boosters)
# ==========================================

class SimpleNN(nn.Module):
    def __init__(self, input_dim):
        super(SimpleNN, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        self.layer2 = nn.Sequential(
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        self.layer3 = nn.Sequential(
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        self.output = nn.Linear(128, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.output(x)
        return self.sigmoid(x)

def train_nn_model(X_train, y_train, X_val, y_val, input_dim):
    model = SimpleNN(input_dim).to(device)
    criterion = nn.BCELoss()
    optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=5)
    
    # 데이터셋 생성 시 .values를 명시하여 numpy array로 변환 후 텐서화
    train_tensor = TensorDataset(torch.FloatTensor(X_train).to(device), 
                                 torch.FloatTensor(y_train).unsqueeze(1).to(device))
    
    val_tensor = TensorDataset(torch.FloatTensor(X_val).to(device), 
                               torch.FloatTensor(y_val.values).unsqueeze(1).to(device)) 
    
    train_loader = DataLoader(train_tensor, batch_size=1024, shuffle=True)
    
    best_auc = 0
    best_model_state = None
    early_stopping = 10
    counter = 0
    
    for epoch in range(50):
        model.train()
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
        
        # Validation
        model.eval()
        with torch.no_grad():
            val_inputs = torch.FloatTensor(X_val).to(device)
            val_preds = model(val_inputs).cpu().numpy().flatten()
            val_auc = roc_auc_score(y_val, val_preds)
        
        scheduler.step(val_auc)
        
        if val_auc > best_auc:
            best_auc = val_auc
            best_model_state = model.state_dict()
            counter = 0
        else:
            counter += 1
            if counter >= early_stopping:
                break
    
    model.load_state_dict(best_model_state)
    return model

# ==========================================
# 6. 학습 및 앙상블 (Stratified K-Fold)
# ==========================================

n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# 결과 저장
oof_preds_lgbm = np.zeros(X.shape[0])
oof_preds_xgb = np.zeros(X.shape[0])
oof_preds_nn = np.zeros(X.shape[0])

test_preds_lgbm = np.zeros(X_test.shape[0])
test_preds_xgb = np.zeros(X_test.shape[0])
test_preds_nn = np.zeros(X_test.shape[0])

# 불균형 비율 계산
scale_pos_weight = 190123 / 66228

print("Starting Cross Validation...")

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"=== Fold {fold+1} / {n_splits} ===")
    
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # 1. 데이터 누수 방지: Fit on Train -> Transform Val & Test
    X_train_trans = preprocessor.fit_transform(X_train_fold)
    X_val_trans = preprocessor.transform(X_val_fold)
    X_test_trans = preprocessor.transform(X_test)
    
    # --- LGBM ---
    lgbm = LGBMClassifier(
        n_estimators=2000,
        learning_rate=0.03,
        num_leaves=31,
        max_depth=-1,
        scale_pos_weight=scale_pos_weight,
        random_state=42,
        verbosity=-1,
        n_jobs=-1
    )
    lgbm.fit(
        X_train_trans, y_train,
        eval_set=[(X_val_trans, y_val)],
        eval_metric='auc'
    )
    oof_preds_lgbm[val_idx] = lgbm.predict_proba(X_val_trans)[:, 1]
    test_preds_lgbm += lgbm.predict_proba(X_test_trans)[:, 1] / n_splits
    print(f"LGBM AUC: {roc_auc_score(y_val, oof_preds_lgbm[val_idx]):.5f}")

    # --- XGB ---
    xgb = XGBClassifier(
        n_estimators=2000,
        learning_rate=0.03,
        max_depth=6,
        scale_pos_weight=scale_pos_weight,
        tree_method='hist',
        random_state=42,
        n_jobs=-1,
        eval_metric='auc',
        early_stopping_rounds=100
    )
    xgb.fit(
        X_train_trans, y_train,
        eval_set=[(X_val_trans, y_val)],
        verbose=False
    )
    oof_preds_xgb[val_idx] = xgb.predict_proba(X_val_trans)[:, 1]
    test_preds_xgb += xgb.predict_proba(X_test_trans)[:, 1] / n_splits
    print(f"XGB AUC: {roc_auc_score(y_val, oof_preds_xgb[val_idx]):.5f}")

    # --- NN (PyTorch MPS) ---
    input_dim = X_train_trans.shape[1]
    nn_model = train_nn_model(X_train_trans, y_train.values, X_val_trans, y_val, input_dim)
    
    nn_model.eval()
    with torch.no_grad():
        val_tensor = torch.FloatTensor(X_val_trans).to(device)
        test_tensor = torch.FloatTensor(X_test_trans).to(device)
        
        oof_preds_nn[val_idx] = nn_model(val_tensor).cpu().numpy().flatten()
        test_preds_nn += nn_model(test_tensor).cpu().numpy().flatten() / n_splits
    print(f"NN AUC: {roc_auc_score(y_val, oof_preds_nn[val_idx]):.5f}")

# ==========================================
# 7. 앙상블 및 제출 파일 생성
# ==========================================

# 모델별 전체 OOF Score 확인
auc_lgbm = roc_auc_score(y, oof_preds_lgbm)
auc_xgb = roc_auc_score(y, oof_preds_xgb)
auc_nn = roc_auc_score(y, oof_preds_nn)

print(f"\nFinal OOF AUC Scores -> LGBM: {auc_lgbm:.5f}, XGB: {auc_xgb:.5f}, NN: {auc_nn:.5f}")

# 가중 평균 앙상블 (성능 기반 가중치 부여)
# NN이 정규화된 데이터에서 좋은 성능을 보일 경우 가중치를 높임
total_auc = auc_lgbm + auc_xgb + auc_nn
w_lgbm = auc_lgbm / total_auc
w_xgb = auc_xgb / total_auc
w_nn = auc_nn / total_auc

print(f"Weights -> LGBM: {w_lgbm:.2f}, XGB: {w_xgb:.2f}, NN: {w_nn:.2f}")

final_preds = (test_preds_lgbm * w_lgbm) + (test_preds_xgb * w_xgb) + (test_preds_nn * w_nn)

# 제출 파일 저장
submission['probability'] = final_preds
submission.to_csv('submission_ensemble.csv', index=False)

print("Submission file 'submission_ensemble.csv' created successfully.")

Using device: mps
Preprocessing Data...
Starting Cross Validation...
=== Fold 1 / 5 ===
LGBM AUC: 0.73121
XGB AUC: 0.73636
NN AUC: 0.62289
=== Fold 2 / 5 ===
LGBM AUC: 0.73633
XGB AUC: 0.74132
NN AUC: 0.62738
=== Fold 3 / 5 ===
LGBM AUC: 0.73229
XGB AUC: 0.73840
NN AUC: 0.63131
=== Fold 4 / 5 ===
LGBM AUC: 0.73200
XGB AUC: 0.73740
NN AUC: 0.62247
=== Fold 5 / 5 ===
LGBM AUC: 0.73397
XGB AUC: 0.73927
NN AUC: 0.62033

Final OOF AUC Scores -> LGBM: 0.73315, XGB: 0.73852, NN: 0.53154
Weights -> LGBM: 0.37, XGB: 0.37, NN: 0.27
