In [None]:
import random
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import copy

def set_seed(seed=42):
    """Seed every random number generator used here and make cuDNN repeatable.

    Args:
        seed: Integer handed to Python's ``random``, NumPy's global RNG,
            ``torch.manual_seed`` and ``torch.cuda.manual_seed``.
    """
    # Same four seeding calls, applied in the same order.
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)

    # Force deterministic cuDNN kernels and switch off the autotuner.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Fix all RNG seeds before anything stochastic runs, then pick the compute device.
set_seed(42)
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# ---- Load raw tables ----
train_data = pd.read_csv('./train.csv')
test_data = pd.read_csv('./test_x.csv')

# Remove training rows with familysize > 20 and renumber the index from 0.
oversized_family = train_data.familysize > 20
train_data = train_data[~oversized_family].reset_index(drop=True)

# Mirror these answer columns (value -> 6 - value) in both tables.
# NOTE(review): presumably reverse-keyed items on a 1-5 scale - confirm.
flip_cols = ["QaA", "QdA", "QgA", "QiA", "QnA", "QeA", "QfA", "QkA", "QqA", "QrA"]
for frame in (train_data, test_data):
    frame[flip_cols] = 6 - frame[flip_cols]

# ---- Aggregate features over the 20 question columns (a..t) ----
item_ids = 'abcdefghijklmnopqrst'
answers = ['Q' + c + 'A' for c in item_ids]
times = ['Q' + c + 'E' for c in item_ids]
for frame in (train_data, test_data):
    # Row-wise mean of the (partly mirrored) answers.
    frame['mach_score'] = frame[answers].mean(axis=1)
    # log(1 + sum) of the per-question "E" columns.
    frame['total_time'] = np.log1p(frame[times].sum(axis=1))

# ---- Split target / features ----
# The raw "E" columns are dropped once total_time has been derived from them.
drop_list = times + ['index', 'hand']
# Target is 2 - voted (so voted == 1 -> 1.0, voted == 2 -> 0.0).
train_y = (2 - train_data['voted']).to_numpy(dtype=np.float32)
train_x_raw = train_data.drop(columns=drop_list + ['voted'])
test_x_raw = test_data.drop(columns=drop_list)

# Columns listed here are one-hot encoded; every other column is standardised.
cat_cols = ['education', 'engnat', 'married', 'urban', 'age_group', 'gender', 'race', 'religion']
num_cols = [name for name in train_x_raw.columns if name not in cat_cols]

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols),
])

# Fit on the training table only; the test table reuses the fitted statistics.
train_x_scaled = preprocessor.fit_transform(train_x_raw).astype(np.float32)
test_x_scaled = preprocessor.transform(test_x_raw).astype(np.float32)

# ---- Materialise everything as tensors on the chosen device ----
train_x_t = torch.tensor(train_x_scaled, device=DEVICE)
train_y_t = torch.tensor(train_y, device=DEVICE)
test_x_t = torch.tensor(test_x_scaled, device=DEVICE)

class SwapNoise(nn.Module):
    """Swap-noise corruption layer.

    In training mode each element is, with probability ``prob``, replaced by
    the value at the same column of a randomly permuted copy of the batch.
    In eval mode the input is returned untouched.

    Args:
        prob: Per-element replacement probability.
    """

    def __init__(self, prob=0.15):
        super().__init__()
        self.prob = prob

    def forward(self, x):
        """Return ``x`` itself (eval) or a corrupted copy of ``x`` (training)."""
        if self.training:
            # Draw the element-wise mask first, then the row permutation, so the
            # random stream is consumed in a fixed order.
            swap_mask = torch.rand(x.shape, device=x.device) < self.prob
            row_order = torch.randperm(x.shape[0], device=x.device)
            donors = x[row_order]
            # Take the donor value where the mask is set, the original elsewhere.
            return torch.where(swap_mask, donors, x)
        return x

class DAE(nn.Module):
    """Denoising autoencoder: swap noise -> encoder -> decoder.

    Args:
        input_dim: Width of the input rows (and of the reconstruction).
        hidden_dim: Width of every hidden layer, including the encoding.
    """

    def __init__(self, input_dim, hidden_dim=1024):
        super().__init__()
        self.noise = SwapNoise(prob=0.15)

        # Layers are created encoder-first, decoder-second, one per line.
        encoder_layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.SiLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.SiLU(),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        decoder_layers = [
            nn.Linear(hidden_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.SiLU(),
            nn.Linear(hidden_dim, input_dim),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return ``(reconstruction, encoding)`` for the batch ``x``.

        The noise layer is applied to ``x`` before encoding.
        """
        corrupted = self.noise(x)
        encoded = self.encoder(corrupted)
        return self.decoder(encoded), encoded

class DAE_Classifier(nn.Module):
    """Binary classifier head on top of a frozen DAE encoder.

    The head receives the raw input concatenated with the encoder output and
    emits one logit per row.

    Args:
        dae_encoder: Pre-trained encoder module. Its parameters are frozen in
            place and the module is switched to eval mode.
        input_dim: Width of the raw input rows.
        dae_dim: Width of the encoder output.
    """

    def __init__(self, dae_encoder, input_dim, dae_dim=1024):
        super().__init__()
        self.encoder = dae_encoder
        for param in self.encoder.parameters():
            param.requires_grad = False
        # requires_grad=False only stops gradient updates; BatchNorm layers would
        # still overwrite their running statistics in training mode. Eval mode
        # is what actually freezes the encoder.
        self.encoder.eval()
        combined_dim = input_dim + dae_dim
        self.net = nn.Sequential(
            nn.Linear(combined_dim, 512), nn.BatchNorm1d(512), nn.SiLU(), nn.Dropout(0.4),
            nn.Linear(512, 256), nn.BatchNorm1d(256), nn.SiLU(), nn.Dropout(0.3),
            nn.Linear(256, 1)
        )

    def train(self, mode=True):
        """Set the head's mode while keeping the frozen encoder in eval mode.

        Args:
            mode: ``True`` for training mode, ``False`` for eval mode.

        Returns:
            ``self``, matching ``nn.Module.train``.
        """
        super().train(mode)
        # super().train() flips every submodule, so pin the encoder back to eval.
        self.encoder.eval()
        return self

    def forward(self, x):
        """Return logits of shape ``(batch,)`` for input ``x`` of shape ``(batch, input_dim)``."""
        with torch.no_grad():
            encoded = self.encoder(x)
        combined = torch.cat([x, encoded], dim=1)
        # Squeeze only the trailing logit axis: a bare squeeze() would also drop
        # the batch axis for a single-row batch and break the loss shape check.
        return self.net(combined).squeeze(-1)


# ---- Stage 1: unsupervised pre-training of the denoising autoencoder ----
DAE_EPOCHS = 50
DAE_BATCH_SIZE = 512

dae_model = DAE(train_x_t.shape[1]).to(DEVICE)
dae_opt = optim.AdamW(dae_model.parameters(), lr=1e-3)
dae_crit = nn.MSELoss()

# BatchNorm1d raises in training mode when a batch holds a single row, so the
# trailing batch is dropped only in the one case where it would have exactly
# one row. In every other case batching is unchanged.
dae_loader = DataLoader(
    TensorDataset(train_x_t),
    batch_size=DAE_BATCH_SIZE,
    shuffle=True,
    drop_last=(len(train_x_t) % DAE_BATCH_SIZE == 1),
)
for _ in tqdm(range(DAE_EPOCHS)):
    dae_model.train()
    for (batch_x,) in dae_loader:
        dae_opt.zero_grad()
        decoded, _ = dae_model(batch_x)
        # The model noises its input internally; the target is the clean batch.
        loss = dae_crit(decoded, batch_x)
        loss.backward()
        dae_opt.step()

# ---- Stage 2: stratified K-fold training of the classifier head ----
# Test predictions from each fold's best epoch (lowest validation loss) are
# averaged into dae_preds.
N_SPLITS = 7
CLF_EPOCHS = 60
CLF_BATCH_SIZE = 512

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
dae_preds = np.zeros((len(test_x_t), 1), dtype=np.float32)

for fold, (t_idx, v_idx) in enumerate(skf.split(train_x_scaled, train_y)):
    # BatchNorm1d raises in training mode on a single-row batch, so the trailing
    # training batch is dropped only when it would contain exactly one row.
    train_loader = DataLoader(
        TensorDataset(train_x_t[t_idx], train_y_t[t_idx]),
        batch_size=CLF_BATCH_SIZE,
        shuffle=True,
        drop_last=(len(t_idx) % CLF_BATCH_SIZE == 1),
    )
    valid_loader = DataLoader(
        TensorDataset(train_x_t[v_idx], train_y_t[v_idx]),
        batch_size=CLF_BATCH_SIZE,
    )

    # Each fold gets its own copy of the pre-trained encoder.
    model = DAE_Classifier(copy.deepcopy(dae_model.encoder), train_x_t.shape[1]).to(DEVICE)
    optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=0.05)
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([1.1], device=DEVICE))

    best_loss = float('inf')
    # Placeholder; overwritten the first time the validation loss improves.
    fold_pred = np.zeros((len(test_x_t), 1), dtype=np.float32)

    pbar = tqdm(range(CLF_EPOCHS), desc=f'Fold {fold+1}')
    for epoch in pbar:
        model.train()
        for xx, yy in train_loader:
            optimizer.zero_grad()
            loss = criterion(model(xx), yy)
            loss.backward()
            optimizer.step()

        model.eval()
        with torch.no_grad():
            # Sample-weighted mean of the per-batch losses over the validation fold.
            val_loss = sum(
                criterion(model(xx), yy).item() * len(yy) for xx, yy in valid_loader
            ) / len(v_idx)
            if val_loss < best_loss:
                best_loss = val_loss
                # Snapshot test-set probabilities at the best epoch so far.
                fold_pred = torch.sigmoid(model(test_x_t)).cpu().numpy().reshape(-1, 1)
        pbar.set_postfix({'val_loss': f'{best_loss:.4f}'})

    # Divide by the same constant that configures the splitter so the average
    # stays correct if the fold count changes.
    dae_preds += fold_pred / N_SPLITS

In [None]:
# Write the submission file. dae_preds holds the fold-averaged sigmoid output for
# the target 2 - voted, so the submitted score is its complement.
sub_df = pd.read_csv("./sample_submission.csv")
if len(sub_df) != len(dae_preds):
    raise ValueError(
        f"sample_submission.csv has {len(sub_df)} rows but there are {len(dae_preds)} predictions"
    )

# Replace each prediction column outright instead of writing floats into the
# existing columns through .iloc: if the sample file stores integers there,
# pandas would have to upcast silently, which is deprecated.
submission_scores = (1.0 - dae_preds).ravel().astype(np.float64)
for pred_col in sub_df.columns[1:]:
    sub_df[pred_col] = submission_scores
sub_df.to_csv("Model6.csv", index=False)