In [None]:
import random
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from pytorch_tabnet.tab_model import TabNetClassifier
import os

def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value passed unchanged to ``random.seed``, ``np.random.seed``
            and ``torch.manual_seed``. Defaults to 42.

    Returns:
        None.

    When the MPS backend is available, ``torch.mps.empty_cache()`` is also
    called; that call is memory housekeeping and does not seed anything.
    """
    # Same order as always: stdlib first, then NumPy, then PyTorch.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.backends.mps.is_available():
        return
    torch.mps.empty_cache()

# Seed every RNG once, before any data shuffling or model initialisation.
set_seed(42)

# Run on the MPS backend when PyTorch reports it as available; otherwise CPU.
DEVICE = "cpu"
if torch.backends.mps.is_available():
    DEVICE = "mps"

# Raw tables, read relative to the notebook's working directory.
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test_x.csv")

# Remove training rows whose familysize exceeds 20, then renumber the rows.
# The test table is intentionally left unfiltered.
oversized_family = train_data.familysize > 20
train_data = train_data.loc[~oversized_family].reset_index(drop=True)

# Mirror these answer columns (x -> 6 - x) in both tables.
# NOTE(review): presumably reverse-scoring of 1-5 scale items — confirm
# against the dataset description.
flip_cols = ["QaA", "QdA", "QgA", "QiA", "QnA", "QeA", "QfA", "QkA", "QqA", "QrA"]
for frame in (train_data, test_data):
    frame[flip_cols] = 6 - frame[flip_cols]

# Label is 2 - voted, so voted == 1 maps to class 1 and voted == 2 to class 0.
# NOTE(review): assumes 'voted' only takes the values 1 and 2 — confirm.
train_y = np.subtract(2, train_data['voted'].to_numpy()).astype(np.int64)

# Columns that get one-hot encoded.
cat_cols = ['education', 'engnat', 'married', 'urban', 'age_group', 'gender', 'race', 'religion']

# Everything else is standardised, except the target ('voted') and the
# 'index' and 'hand' columns, which are left out of the features entirely.
excluded = set(cat_cols) | {'voted', 'index', 'hand'}
num_cols = [name for name in train_data.columns if name not in excluded]

numeric_step = ('num', StandardScaler(), num_cols)
# handle_unknown='ignore': categories unseen during fit encode as all zeros.
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
preprocessor = ColumnTransformer([numeric_step, categorical_step])

# Same column order for both tables: numeric block first, then categorical.
feature_cols = num_cols + cat_cols
train_x_raw = train_data[feature_cols]
test_x_raw = test_data[feature_cols]

# Fit on the training table only; the test table reuses the fitted statistics.
train_x_scaled = preprocessor.fit_transform(train_x_raw).astype(np.float32)
test_x_scaled = preprocessor.transform(test_x_raw).astype(np.float32)

# Number of cross-validation folds. One constant drives both the splitter and
# the averaging of the per-fold test predictions, so changing the fold count
# can no longer leave the ensemble average mis-scaled.
N_SPLITS = 7

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Accumulates the mean class-1 probability on the test set across fold models.
tabnet_preds = np.zeros(len(test_x_scaled))

# NOTE(review): train_x_scaled was produced by a preprocessor fitted on all
# training rows, so each validation fold has been scaled with statistics that
# include itself; the validation AUC used for early stopping is therefore
# slightly optimistic.
for fold, (t_idx, v_idx) in enumerate(skf.split(train_x_scaled, train_y)):
    X_train, y_train = train_x_scaled[t_idx], train_y[t_idx]
    X_val, y_val = train_x_scaled[v_idx], train_y[v_idx]

    # A fresh model per fold so no weights carry over between folds.
    model = TabNetClassifier(
        n_d=32, n_a=32, n_steps=4,
        gamma=1.3, lambda_sparse=1e-3,
        optimizer_fn=torch.optim.AdamW,
        optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
        mask_type='entmax',
        device_name=DEVICE
    )

    # The held-out fold is monitored with AUC; `patience` bounds how many
    # epochs training continues without improvement on it.
    model.fit(
        X_train=X_train, y_train=y_train,
        eval_set=[(X_val, y_val)],
        eval_name=['valid'],
        eval_metric=['auc'],
        max_epochs=100, patience=20,
        batch_size=1024, virtual_batch_size=128,
        num_workers=0, pin_memory=False
    )

    # Column 1 of predict_proba is the probability of class 1; each fold
    # contributes an equal 1/N_SPLITS share to the final average.
    tabnet_preds += model.predict_proba(test_x_scaled)[:, 1] / N_SPLITS
    print(f"✅ Fold {fold+1} 완료")

In [None]:
# Start from the sample submission so the row order and id column match.
sub = pd.read_csv("./sample_submission.csv")

# tabnet_preds is the fold-averaged probability of class 1; the submitted
# score is its complement.
sub = sub.assign(voted=1.0 - tabnet_preds)
sub.to_csv('Model7.csv', index=False)