In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedKFold
import os
import random

# Global random seed shared by every RNG and by the CV splitter.
SEED = 42

# Compute device: Apple MPS takes precedence when present, then CUDA, then CPU.
if torch.backends.mps.is_available():
    DEVICE = 'mps'
elif torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch (plus CUDA when available).

    Args:
        seed: integer seed applied to every generator.
    """
    # Same order as before: stdlib RNG, NumPy legacy global RNG, torch RNG.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

# Make every RNG deterministic before any data handling or model creation.
set_seed(SEED)

# --- Load raw data -----------------------------------------------------------
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test_x.csv')

# Discard training rows whose familysize exceeds 20, then renumber the rows.
is_outlier = train_data.familysize > 20
train_data = train_data.loc[~is_outlier].reset_index(drop=True)

# Binary target as float32: 2 - voted (so voted == 1 maps to 1.0 and voted == 2 to 0.0).
train_y = (2 - train_data['voted']).to_numpy().astype(np.float32)

# --- Column groups -----------------------------------------------------------
# Answer columns QaA ... QtA (letters 'a' through 't').
answers = [f'Q{chr(code)}A' for code in range(ord('a'), ord('t') + 1)]
cat_cols = ['education', 'engnat', 'married', 'urban', 'age_group', 'gender', 'race', 'religion']
# Everything that is neither an answer, a categorical, nor an explicitly skipped column.
excluded_cols = set(answers) | set(cat_cols) | {'voted', 'index', 'hand'}
num_cols = [col for col in train_data.columns if col not in excluded_cols]

# --- Feature pipeline: standardise numeric columns, one-hot encode categoricals
numeric_step = ('num', StandardScaler(), answers + num_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
preprocessor = ColumnTransformer([numeric_step, categorical_step])

# Fit on the training frame only; the test frame reuses the fitted statistics.
train_x_scaled = preprocessor.fit_transform(train_data).astype(np.float32)
test_x_scaled = preprocessor.transform(test_data).astype(np.float32)

# --- Tensors -----------------------------------------------------------------
# Insert a channel axis at position 1 (as Conv1d expects) and move to DEVICE.
X_t = torch.tensor(train_x_scaled)[:, None, :].to(DEVICE)
y_t = torch.tensor(train_y).to(DEVICE)
X_test_t = torch.tensor(test_x_scaled)[:, None, :].to(DEVICE)


class TabularCNN(nn.Module):
    """1-D CNN binary classifier that treats a feature vector as a one-channel sequence.

    Input:  float tensor of shape (batch, 1, input_len).
    Output: raw logits of shape (batch,); apply a sigmoid to obtain probabilities.
    """

    def __init__(self, input_len):
        """Build the convolutional feature extractor and the dense head.

        Args:
            input_len: number of features per sample (size of the last input axis).
        """
        super().__init__()
        # Two conv -> batch-norm -> SiLU -> max-pool stages; each pool halves the length.
        self.features = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=32, kernel_size=3, padding=1),
            nn.BatchNorm1d(32),
            nn.SiLU(),
            nn.MaxPool1d(2),
            nn.Conv1d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm1d(64),
            nn.SiLU(),
            nn.MaxPool1d(2)
        )

        # Infer the flattened feature size with a dummy pass. Run it in eval mode
        # under no_grad so the all-zero input does not leak into the BatchNorm
        # running statistics before real training starts.
        self.features.eval()
        with torch.no_grad():
            out_size = self.features(torch.zeros(1, 1, input_len)).flatten(1).shape[1]
        self.features.train()

        self.classifier = nn.Sequential(
            nn.Linear(out_size, 128),
            nn.SiLU(),
            nn.Dropout(0.4),
            nn.Linear(128, 1)
        )

    def forward(self, x):
        """Return one logit per sample.

        Args:
            x: tensor of shape (batch, 1, input_len).

        Returns:
            Tensor of shape (batch,) holding raw (pre-sigmoid) logits.
        """
        x = self.features(x)
        x = x.flatten(1)
        # Squeeze only the trailing unit axis: a bare squeeze() would also drop
        # the batch axis for a single-sample batch and break the loss shape check.
        return self.classifier(x).squeeze(-1)


# Stratified K-fold training. For each fold a fresh TabularCNN is trained, the
# test predictions from the epoch with the lowest validation loss are kept, and
# the per-fold predictions are averaged into `model_preds`.
N_SPLITS = 7   # single source of truth for both the splitter and the averaging
N_EPOCHS = 40
BATCH_SIZE = 256

model_preds = np.zeros(len(test_data))
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

for fold, (t_idx, v_idx) in enumerate(skf.split(train_x_scaled, train_y)):
    train_dl = DataLoader(TensorDataset(X_t[t_idx], y_t[t_idx]), batch_size=BATCH_SIZE, shuffle=True)
    valid_dl = DataLoader(TensorDataset(X_t[v_idx], y_t[v_idx]), batch_size=BATCH_SIZE)

    model = TabularCNN(input_len=train_x_scaled.shape[1]).to(DEVICE)
    opt = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
    crit = nn.BCEWithLogitsLoss()

    best_loss = float('inf')
    best_pred = None
    for ep in range(N_EPOCHS):
        model.train()
        for xx, yy in train_dl:
            opt.zero_grad()
            # reshape(-1) keeps logits 1-D so a final batch of size one still
            # matches the target shape required by BCEWithLogitsLoss.
            loss = crit(model(xx).reshape(-1), yy)
            loss.backward()
            opt.step()

        model.eval()
        with torch.no_grad():
            # Sample-weighted mean validation loss over the whole fold.
            vl = sum(crit(model(xx).reshape(-1), yy).item() * len(yy) for xx, yy in valid_dl) / len(v_idx)
            if vl < best_loss:
                best_loss = vl
                # Snapshot test probabilities at the best-validation epoch.
                best_pred = torch.sigmoid(model(X_test_t)).cpu().numpy().flatten()

    # best_pred stays None only if no epoch produced a finite validation loss.
    if best_pred is None:
        raise RuntimeError(f"Fold {fold+1}: validation loss was never finite; no predictions to average.")

    model_preds += best_pred / N_SPLITS
    print(f"   Fold {fold+1} 완료. Best Val Loss: {best_loss:.4f}")

In [None]:
# Fill the sample submission's `voted` column with the fold-averaged test
# predictions and write the result to disk.
# NOTE(review): labels were encoded as 2 - voted, so these scores rank
# voted == 1 highest — confirm this is the orientation the submission expects.
sub = pd.read_csv('sample_submission.csv').assign(voted=model_preds)
sub.to_csv("Model5.csv", index=False)