In [None]:
import random
from datetime import datetime
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (through both ``torch.manual_seed`` and ``torch.cuda.manual_seed``), then
    puts cuDNN into deterministic mode with benchmark autotuning disabled.

    Args:
        seed: Integer seed passed to each generator. Defaults to 42.
    """
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Fix all RNG seeds before anything stochastic runs.
set_seed(42)
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

train_data = pd.read_csv('./train.csv')
test_data = pd.read_csv('./test_x.csv')

# Remove training rows whose reported family size exceeds 20, then renumber
# the index so later positional fold indices line up with the rows.
oversized_family = train_data['familysize'] > 20
train_data = train_data.loc[~oversized_family].reset_index(drop=True)

# These answer columns are mirrored around 3 (x -> 6 - x) in both splits.
# NOTE(review): presumably reverse-keyed items on a 1-5 scale -- confirm.
flip_cols = ["QaA", "QdA", "QgA", "QiA", "QnA", "QeA", "QfA", "QkA", "QqA", "QrA"]
train_data[flip_cols] = 6 - train_data[flip_cols]
test_data[flip_cols] = 6 - test_data[flip_cols]

# Column names of the 20 answer (Q?A) and elapsed-time (Q?E) fields.
answers = [f'Q{c}A' for c in 'abcdefghijklmnopqrst']
times = [f'Q{c}E' for c in 'abcdefghijklmnopqrst']

# Add the same three per-row aggregates to both splits:
#   mach_score - mean of the answer columns
#   ans_var    - variance of the answer columns
#   total_time - log1p of the summed elapsed-time columns
for frame in (train_data, test_data):
    answer_block = frame[answers]
    frame['mach_score'] = answer_block.mean(axis=1)
    frame['ans_var'] = answer_block.var(axis=1)
    frame['total_time'] = np.log1p(frame[times].sum(axis=1))

# Columns excluded from the model inputs: raw per-question times plus 'index' and 'hand'.
drop_list = [*times, 'index', 'hand']

# Target is encoded as 2 - voted and stored as float32 for BCE-style losses.
encoded_vote = 2 - train_data['voted']
train_y = encoded_vote.to_numpy().astype(np.float32)
train_x_raw = train_data.drop(columns=[*drop_list, 'voted'])
test_x_raw = test_data.drop(columns=drop_list)

cat_cols = ['education', 'engnat', 'married', 'urban', 'age_group', 'gender', 'race', 'religion']
# Every remaining column is treated as numeric; original column order is kept.
categorical_names = set(cat_cols)
num_cols = [name for name in train_x_raw.columns if name not in categorical_names]

# Standardise numeric columns and one-hot encode categorical ones; categories
# unseen during fit are ignored at transform time.
numeric_step = ('num', StandardScaler(), num_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
preprocessor = ColumnTransformer([numeric_step, categorical_step])

# Fit on the training split only, then apply the fitted transform to the test split.
train_x_scaled = preprocessor.fit_transform(train_x_raw).astype(np.float32)
test_x_scaled = preprocessor.transform(test_x_raw).astype(np.float32)

# Keep full copies of the data on the compute device for the training loop.
train_y_t = torch.tensor(train_y, device=DEVICE)
train_x_t = torch.tensor(train_x_scaled, device=DEVICE)
test_x_t = torch.tensor(test_x_scaled, device=DEVICE)

class CNN1D_Voter(nn.Module):
    """1-D convolutional binary classifier for flat (tabular) feature vectors.

    The input vector is first projected to 256 units, reinterpreted as a
    single-channel sequence of length 256, passed through two Conv1d stages,
    globally average-pooled, and finally mapped to one logit per sample.

    Shape flow (B = batch size):
        (B, input_dim) -> expansion -> (B, 256) -> unflatten -> (B, 1, 256)
        -> conv(1->64, k=5) -> (B, 64, 256) -> maxpool(2) -> (B, 64, 128)
        -> conv(64->128, k=3) -> (B, 128, 128) -> avgpool -> (B, 128, 1)
        -> squeeze -> (B, 128) -> fc -> (B, 1) -> squeeze -> (B,)

    Args:
        input_dim: Number of features in each input row.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Dense projection that gives the convolution a fixed-length "signal".
        self.expansion = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.BatchNorm1d(256),
            nn.SiLU()
        )
        self.conv_layers = nn.Sequential(
            nn.Unflatten(1, (1, 256)),
            nn.Conv1d(1, 64, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm1d(64),
            nn.SiLU(),
            nn.MaxPool1d(2),
            nn.Conv1d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm1d(128),
            nn.SiLU(),
            nn.AdaptiveAvgPool1d(1)
        )
        self.fc = nn.Sequential(
            nn.Linear(128, 64),
            nn.SiLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        """Return one raw logit per row of ``x``.

        Args:
            x: Float tensor of shape (B, input_dim).

        Returns:
            Tensor of shape (B,) holding un-normalised logits (no sigmoid is
            applied here).
        """
        x = self.expansion(x)
        x = self.conv_layers(x).squeeze(-1)
        # Squeeze only the trailing feature axis. A bare squeeze() would also
        # drop the batch axis when B == 1 and return a 0-d tensor, which no
        # longer matches a (1,)-shaped target in the loss.
        return self.fc(x).squeeze(-1)

# Repeated stratified K-fold training; the test predictions of every
# (repeat, fold) model are averaged into `prediction`.
N_REPEAT, N_SKFOLD, N_EPOCH = 3, 5, 60
BATCH_SIZE = 512
prediction = np.zeros((len(test_x_t), 1), dtype=np.float32)

for repeat in range(N_REPEAT):
    # A different shuffle seed per repeat gives a different fold assignment.
    skf = StratifiedKFold(n_splits=N_SKFOLD, random_state=repeat+2026, shuffle=True)
    for skfold, (train_idx, valid_idx) in enumerate(skf.split(train_x_scaled, train_y)):
        # BatchNorm1d raises in train mode when a batch holds a single sample,
        # so drop the trailing batch only when it would contain exactly one row.
        drop_singleton = len(train_idx) % BATCH_SIZE == 1
        train_loader = DataLoader(
            TensorDataset(train_x_t[train_idx], train_y_t[train_idx]),
            batch_size=BATCH_SIZE, shuffle=True, drop_last=drop_singleton,
        )
        valid_loader = DataLoader(
            TensorDataset(train_x_t[valid_idx], train_y_t[valid_idx]),
            batch_size=BATCH_SIZE,
        )

        # Fresh model, loss, optimiser and LR schedule for every fold.
        model = CNN1D_Voter(train_x_t.shape[1]).to(DEVICE)
        criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([1.1], device=DEVICE))
        optimizer = optim.AdamW(model.parameters(), lr=2e-3, weight_decay=0.05)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=N_EPOCH)

        best_loss = float('inf')
        prediction_fold = np.zeros((len(test_x_t), 1), dtype=np.float32)

        pbar = tqdm(range(N_EPOCH), desc=f'R{repeat+1} S{skfold+1}')
        for epoch in pbar:
            model.train()
            for xx, yy in train_loader:
                optimizer.zero_grad()
                # Smoothed targets: y * 0.95 + 0.025 maps 0 -> 0.025 and 1 -> 0.975.
                # reshape(-1) keeps the logits 1-D so they always match `yy`.
                loss = criterion(model(xx).reshape(-1), yy * 0.95 + 0.025)
                loss.backward()
                optimizer.step()
            scheduler.step()

            model.eval()
            with torch.no_grad():
                # Sample-weighted mean validation loss on the un-smoothed labels.
                # reshape(-1) guards against a one-row final batch whose logit
                # would otherwise come back as a 0-d tensor and mismatch `yy`.
                val_loss = sum(
                    criterion(model(xx).reshape(-1), yy).item() * len(yy)
                    for xx, yy in valid_loader
                ) / len(valid_idx)
                if val_loss < best_loss:
                    # Keep the test probabilities from the best-validation epoch.
                    best_loss = val_loss
                    prediction_fold = torch.sigmoid(model(test_x_t)).detach().cpu().numpy().reshape(-1, 1)
            pbar.set_postfix({'val_loss': f'{best_loss:.4f}'})

        # Equal-weight average over all repeats and folds.
        prediction += prediction_fold / (N_REPEAT * N_SKFOLD)

In [None]:
# Write the ensemble-averaged test predictions in the sample-submission layout.
# NOTE(review): the training target is encoded as `2 - voted`, so `prediction`
# scores the `voted == 1` class, yet `1.0 + prediction` moves toward 2 as that
# score rises. Confirm against the competition metric whether this should be
# `2.0 - prediction` instead.
submission = pd.read_csv('./sample_submission.csv').assign(
    voted=1.0 + prediction.ravel()
)
submission.to_csv('Model8.csv', index=False)