In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.pretraining import TabNetPretrainer
import warnings

# Silence only deprecation-style noise. A blanket 'ignore' would also hide
# UserWarnings raised by the modelling libraries and pandas assignment
# warnings, both of which can signal real problems in this notebook.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# Relax float32 matmul precision to PyTorch's 'high' setting (see the PyTorch
# docs for the exact speed/precision trade-off on the current hardware).
torch.set_float32_matmul_precision('high')

def set_seed(seed=42):
    """Seed every random number generator this notebook can draw from.

    Seeds Python's built-in ``random`` module, NumPy's global generator and
    PyTorch (CPU plus all CUDA devices), then puts cuDNN into deterministic
    mode with autotuning disabled.

    Args:
        seed: Integer seed applied to each generator. Defaults to 42.
    """
    # Local import: `random` is not part of the notebook's import cell.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Trade some speed for repeatable cuDNN kernel selection.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Fix all RNGs before any data is loaded or any model is built.
set_seed(42)

# Raw tables are read from the notebook's working directory.
train_data, test_data = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test_x.csv'))

# Keep every training row whose familysize is NOT above 50 (rows with a
# missing familysize are kept as well) and renumber the index from zero.
train_data = train_data.loc[~(train_data['familysize'] > 50)].reset_index(drop=True)

def manual_preprocess(df):
    """Return a cleaned, rescaled copy of a raw survey frame.

    The input frame is left untouched. On the copy:

    * columns ``QaE`` .. ``QtE``, ``index`` and ``hand`` are removed when
      they are present;
    * columns ``QaA`` .. ``QtA`` are transformed with ``(x - 3) / 2``;
    * columns ``tp01`` .. ``tp10`` are transformed with ``(x - 3.5) / 3.5``.

    Args:
        df: DataFrame that contains all ``Q?A`` and ``tpNN`` columns.

    Returns:
        A new DataFrame with the columns above dropped / rescaled.

    Raises:
        KeyError: if one of the ``Q?A`` or ``tpNN`` columns is missing.
    """
    letters = [chr(ord('a') + offset) for offset in range(20)]

    # Only drop what is actually there: not every input carries all of these.
    unwanted = [f'Q{letter}E' for letter in letters] + ['index', 'hand']
    present = [name for name in unwanted if name in df.columns]
    cleaned = df.copy().drop(columns=present)

    # Centre the answer columns on 3 and halve them
    # (maps 1..5 onto [-1, 1] if answers use a 1-5 scale -- TODO confirm).
    for letter in letters:
        answer_col = f'Q{letter}A'
        cleaned[answer_col] = (cleaned[answer_col] - 3.0) / 2.0

    # Centre the tp columns on 3.5 and divide by 3.5
    # (maps 0..7 onto [-1, 1] if they use a 0-7 scale -- TODO confirm).
    for number in range(1, 11):
        trait_col = f'tp{number:02d}'
        cleaned[trait_col] = (cleaned[trait_col] - 3.5) / 3.5

    return cleaned

train_df = manual_preprocess(train_data)
test_df = manual_preprocess(test_data)

# Target: 2 - voted, so voted == 1 becomes class 1 and voted == 2 becomes
# class 0 (assumes `voted` only takes the values 1 and 2 -- TODO confirm).
y = (2 - train_df['voted']).values
X = train_df.drop(columns=['voted'])
# Explicit copy: X_test is edited column by column below, and assigning into a
# bare selection of test_df would be a chained assignment.
X_test = test_df[X.columns].copy()

# Columns handed to TabNet as categorical (embedded) features.
cat_cols = ['education', 'engnat', 'married', 'urban', 'gender', 'race', 'religion', 'age_group']
cat_idxs, cat_dims = [], []

for col in cat_cols:
    # Encode on the string form so numeric and text categories are treated alike.
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

    # One dict lookup per row instead of one `le.transform([s])` call per row.
    # Values that never occurred in the training data fall back to code 0,
    # i.e. they are merged with the first entry of `le.classes_`.
    code_of = {label: code for code, label in enumerate(le.classes_)}
    X_test[col] = X_test[col].astype(str).map(code_of).fillna(0).astype(np.int64)

    # Position and cardinality of the column, in the format TabNet expects.
    cat_idxs.append(X.columns.get_loc(col))
    cat_dims.append(len(le.classes_))

# Dense arrays for pytorch-tabnet: float32 features, int64 class labels.
X_train_np = X.values.astype(np.float32)
X_test_np = X_test.values.astype(np.float32)
y_train_np = y.astype(np.int64)

# Hyperparameters of the supervised TabNet classifiers.
best_params = {
    'n_d': 128, 'n_a': 128,
    'n_steps': 7,
    'gamma': 1.5, 'lambda_sparse': 1e-3,
    'optimizer_params': dict(lr=1e-2, weight_decay=1e-5),
    'mask_type': 'entmax', 'device_name': DEVICE,
    'cat_idxs': cat_idxs, 'cat_dims': cat_dims, 'verbose': 0
}

# Self-supervised pretrainer. Its architecture is taken from `best_params` on
# purpose: when a classifier is fitted with `from_unsupervised=pretrainer`,
# pytorch-tabnet overwrites the classifier's n_d / n_a / n_steps / mask_type
# (and categorical settings) with the pretrainer's values. A pretrainer left
# at library defaults would therefore silently replace the sizes requested in
# `best_params`.
pretrainer = TabNetPretrainer(
    cat_idxs=cat_idxs, cat_dims=cat_dims,
    n_d=best_params['n_d'], n_a=best_params['n_a'],
    n_steps=best_params['n_steps'],
    mask_type=best_params['mask_type'],
    optimizer_params=dict(lr=2e-2),
    device_name=DEVICE,
)

# NOTE(review): the eval set is the training matrix itself, so the monitored
# reconstruction loss is not a held-out estimate -- confirm this is intended.
pretrainer.fit(
    X_train=X_train_np, eval_set=[X_train_np],
    max_epochs=50, batch_size=1024, virtual_batch_size=128, pretraining_ratio=0.8
)


# Repeated stratified K-fold ensemble: every (repeat, fold) pair trains a
# fresh classifier initialised from the pretrainer, and its test-set column-1
# probabilities are raised to `power` and averaged into `final_preds`.
N_REPEAT, N_SKFOLD = 5, 7
power = 1.5
n_models = N_REPEAT * N_SKFOLD
final_preds = np.zeros(X_test_np.shape[0])

# Fit options that are identical for every model in the ensemble.
fit_settings = dict(
    max_epochs=100, batch_size=2048, virtual_batch_size=256,
    patience=15, from_unsupervised=pretrainer,
)

for repeat in range(N_REPEAT):
    # A different shuffle per repeat: seeds 42, 43, ...
    splitter = StratifiedKFold(n_splits=N_SKFOLD, shuffle=True, random_state=42 + repeat)
    fold_splits = splitter.split(X_train_np, y_train_np)

    for fold, (fit_rows, val_rows) in enumerate(fold_splits):
        clf = TabNetClassifier(**best_params)
        clf.fit(
            X_train=X_train_np[fit_rows], y_train=y_train_np[fit_rows],
            # The held-out fold drives AUC-based early stopping.
            eval_set=[(X_train_np[val_rows], y_train_np[val_rows])],
            eval_metric=['auc'],
            **fit_settings,
        )

        # Second probability column, i.e. the score for class label 1.
        test_proba = clf.predict_proba(X_test_np)[:, 1]
        final_preds += np.power(test_proba, power) / n_models
        print(f"✅ R{repeat+1} F{fold+1} 완료 | Best AUC: {clf.best_cost:.4f}")

In [None]:
# Fill the submission template: the `voted` column becomes 1.0 plus the
# ensembled score, then the frame is written without its index.
# NOTE(review): the target was encoded as `2 - voted`, so `final_preds` grows
# with P(voted == 1) while `1.0 + final_preds` grows towards 2. Confirm that
# this orientation is what the scorer expects (`2.0 - final_preds` would be
# the opposite one).
sub_df = pd.read_csv("sample_submission.csv").assign(voted=final_preds + 1.0)
sub_df.to_csv("Model4.csv", index=False)