## Import

In [1]:
import pandas as pd
import numpy as np
import os
import random

from sklearn.model_selection import train_test_split
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

## Setting

In [2]:
# Global hyper-parameters shared by the training and inference cells.
CFG = dict(
    BATCH_SIZE=2098,
    EPOCHS=40,
    LEARNING_RATE=1e-5,
    SEED=42,
)

# Run on the GPU whenever PyTorch can see one, otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

In [3]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    every CUDA device), and switches cuDNN to its deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # PYTHONHASHSEED is read at interpreter start-up, so this only affects
    # Python processes launched after this point, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all seeds every GPU rather than only the current device;
    # the call is a no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # Trade cuDNN autotuning for run-to-run reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED']) # Fix the random seed for reproducibility

## Data Load

In [4]:
# Load the raw train / test tables from Parquet.
all_train = pd.read_parquet("./train.parquet", engine="pyarrow")

# The test table is read first and its ID column is dropped right afterwards.
test = pd.read_parquet("./test.parquet", engine="pyarrow")
test = test.drop(columns=['ID'])

print(f"Train shape: {all_train.shape}")
print(f"Test shape: {test.shape}")

Train shape: (10704179, 119)
Test shape: (1527298, 118)


## Data Down-Sampling

In [5]:
# Down-sample the majority class: keep every clicked == 1 row and draw
# NEG_PER_POS times as many clicked == 0 rows at random.
NEG_PER_POS = 2

# Boolean masks are cheap; the filtered frames are only materialised once each.
is_clicked = all_train['clicked'] == 1
is_not_clicked = all_train['clicked'] == 0

# All clicked == 1 rows
clicked_1 = all_train[is_clicked]

# DataFrame.sample raises ValueError when n exceeds the number of available
# rows (sampling without replacement), so cap the request at what exists.
n_negatives = min(len(clicked_1) * NEG_PER_POS, int(is_not_clicked.sum()))
clicked_0 = all_train[is_not_clicked].sample(n=n_negatives, random_state=CFG['SEED'])

# Concatenate both classes, shuffle the rows and rebuild a clean 0..n-1 index.
train = (
    pd.concat([clicked_1, clicked_0], axis=0)
    .sample(frac=1, random_state=CFG['SEED'])
    .reset_index(drop=True)
)

In [6]:
# Report the overall shape and the per-class shapes of the down-sampled frame.
print("Train shape:", train.shape)
for clicked_value in (0, 1):
    print(f"Train clicked:{clicked_value}:", train[train['clicked'] == clicked_value].shape)

Train shape: (612537, 119)
Train clicked:0: (408358, 119)
Train clicked:1: (204179, 119)


## Data Column Setting

In [7]:
# Column roles: the label column and the raw sequence-string column.
target_col = "clicked"
seq_col = "seq"

# Model features: every column except ID / seq / target, in frame order.
FEATURE_EXCLUDE = {"ID", seq_col, target_col}
feature_cols = list(filter(lambda name: name not in FEATURE_EXCLUDE, train.columns))

print(f"Num features: {len(feature_cols)}")
print(f"Sequence: {seq_col}")
print(f"Target: {target_col}")

Num features: 117
Sequence: seq
Target: clicked


## Define Custom Dataset

In [8]:
class ClickDataset(Dataset):
    """Map-style dataset pairing tabular features with a variable-length numeric sequence.

    Each item is ``(x, seq, y)`` when ``has_target`` is true, otherwise ``(x, seq)``:

    * ``x``   -- float32 tensor with the row's ``feature_cols`` values
      (missing values replaced by 0).
    * ``seq`` -- 1-D float32 tensor parsed from the comma-separated string in
      ``seq_col``; an empty or missing sequence becomes a single ``0.0``.
    * ``y``   -- scalar float32 tensor with the row's ``target_col`` value.

    Args:
        df: Source DataFrame; a copy with a fresh 0..n-1 index is kept on ``self.df``.
        feature_cols: Names of the non-sequence feature columns.
        seq_col: Name of the column holding the comma-separated sequence string.
        target_col: Name of the label column (required when ``has_target`` is true).
        has_target: Whether labels are read and returned.
    """

    def __init__(self, df, feature_cols, seq_col, target_col=None, has_target=True):
        self.df = df.reset_index(drop=True)
        self.feature_cols = feature_cols
        self.seq_col = seq_col
        self.target_col = target_col
        self.has_target = has_target

        # Non-sequence features: all treated as continuous values.
        # Stored as float32 because __getitem__ emits float32 tensors anyway;
        # keeping float64 here would only double the memory footprint.
        self.X = self.df[self.feature_cols].astype(np.float32).fillna(0).values

        # Sequences are kept as raw strings and parsed lazily per item.
        # A plain astype(str) turns a missing value into the text "nan"
        # (parsed as a NaN sequence) or "None", so missing entries are mapped
        # to "" here and handled by the empty-sequence fallback instead.
        seq_series = self.df[self.seq_col]
        self.seq_strings = np.where(
            seq_series.isna().to_numpy(),
            "",
            seq_series.astype(str).to_numpy(dtype=object),
        )

        if self.has_target:
            self.y = self.df[self.target_col].astype(np.float32).values

    def __len__(self):
        """Return the number of rows."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the feature tensor, the parsed sequence and (optionally) the label of row ``idx``."""
        x = torch.tensor(self.X[idx], dtype=torch.float)

        # Use the full sequence; only guard against empty input.
        s = self.seq_strings[idx].strip()
        if s:
            arr = np.fromstring(s, sep=",", dtype=np.float32)
        else:
            arr = np.array([], dtype=np.float32)

        if arr.size == 0:
            # Placeholder so downstream padding / packing never sees length 0.
            arr = np.array([0.0], dtype=np.float32)

        seq = torch.from_numpy(arr)  # shape (seq_len,)

        if self.has_target:
            y = torch.tensor(self.y[idx], dtype=torch.float)
            return x, seq, y
        else:
            return x, seq

In [9]:
def collate_fn_train(batch):
    """Collate ``(x, seq, y)`` samples into padded batch tensors.

    Returns:
        ``(xs, seqs_padded, seq_lengths, ys)`` where ``seqs_padded`` is
        zero-padded to the longest sequence in the batch (batch first) and
        ``seq_lengths`` holds the unpadded lengths as int64, floored at 1.
    """
    feats, seq_list, labels = zip(*batch)

    # True lengths before padding; the floor of 1 guards against empty sequences.
    lengths = torch.tensor([len(one_seq) for one_seq in seq_list], dtype=torch.long).clamp(min=1)
    padded = nn.utils.rnn.pad_sequence(seq_list, batch_first=True, padding_value=0.0)

    return torch.stack(feats), padded, lengths, torch.stack(labels)

def collate_fn_infer(batch):
    """Collate label-free ``(x, seq)`` samples into padded batch tensors.

    Returns:
        ``(xs, seqs_padded, seq_lengths)`` where ``seqs_padded`` is zero-padded
        to the longest sequence in the batch (batch first) and ``seq_lengths``
        holds the unpadded lengths as int64, floored at 1.
    """
    feats, seq_list = zip(*batch)

    # True lengths before padding; the floor of 1 guards against empty sequences.
    lengths = torch.tensor([len(one_seq) for one_seq in seq_list], dtype=torch.long).clamp(min=1)
    padded = nn.utils.rnn.pad_sequence(seq_list, batch_first=True, padding_value=0.0)

    return torch.stack(feats), padded, lengths

## Define Model Architecture

In [10]:
class TabularSeqModel(nn.Module):
    """Click model that fuses batch-normalised tabular features with an LSTM summary of a numeric sequence.

    The final LSTM hidden state is concatenated with the normalised features
    and passed through an MLP that emits one logit per sample.

    Args:
        d_features: Number of non-sequence input features.
        lstm_hidden: Hidden size of the single-layer LSTM.
        hidden_units: Widths of the MLP hidden layers, in order.
        dropout: Dropout probability applied after every hidden layer.
    """

    def __init__(self, d_features, lstm_hidden=32, hidden_units=[1024, 512, 256, 128], dropout=0.2):
        super().__init__()
        # Batch norm over all non-sequence features
        self.bn_x = nn.BatchNorm1d(d_features)
        # The sequence is a stream of scalars, hence input_size=1
        self.lstm = nn.LSTM(input_size=1, hidden_size=lstm_hidden, batch_first=True)

        # MLP head: (Linear -> ReLU -> Dropout) per hidden width, then a 1-unit output layer
        widths = [d_features + lstm_hidden, *hidden_units]
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.extend((nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(dropout)))
        stack.append(nn.Linear(widths[-1], 1))
        self.mlp = nn.Sequential(*stack)

    def forward(self, x_feats, x_seq, seq_lengths):
        """Return one logit per sample, shape ``(B,)``.

        ``x_seq`` is the padded ``(B, L)`` sequence batch and ``seq_lengths``
        holds the unpadded length of each row.
        """
        feats_norm = self.bn_x(x_feats)

        # Pack so the LSTM stops at each row's real length; the lengths are
        # moved to the CPU before being handed to pack_padded_sequence.
        packed = nn.utils.rnn.pack_padded_sequence(
            x_seq.unsqueeze(-1),  # (B, L, 1)
            seq_lengths.cpu(),
            batch_first=True,
            enforce_sorted=False,
        )
        _, (last_hidden, _) = self.lstm(packed)
        seq_summary = last_hidden[-1]  # (B, lstm_hidden)

        fused = torch.cat([feats_norm, seq_summary], dim=1)
        return self.mlp(fused).squeeze(1)  # logits

## Train / Validation

In [11]:
def _run_epoch(model, loader, criterion, device, desc, optimizer=None):
    """Run one pass over ``loader`` and return the mean per-sample loss.

    Trains (with gradient updates) when ``optimizer`` is given; otherwise the
    model is put in eval mode and gradients are disabled.
    """
    is_train = optimizer is not None
    model.train(is_train)

    total_loss, n_seen = 0.0, 0
    with torch.set_grad_enabled(is_train):
        for xs, seqs, seq_lens, ys in tqdm(loader, desc=desc):
            # seq_lens deliberately stays on the CPU: the model hands it to
            # pack_padded_sequence as a CPU tensor, so copying it to the
            # device first would only add a round trip per batch.
            xs, seqs, ys = xs.to(device), seqs.to(device), ys.to(device)
            if is_train:
                optimizer.zero_grad()
            logits = model(xs, seqs, seq_lens)
            loss = criterion(logits, ys)
            if is_train:
                loss.backward()
                optimizer.step()
            # The criterion returns a batch mean; re-weight by batch size so
            # the epoch value is a true per-sample mean.
            total_loss += loss.item() * ys.size(0)
            n_seen += ys.size(0)
    return total_loss / max(n_seen, 1)


def train_model(train_df, feature_cols, seq_col, target_col,
                batch_size=512, epochs=3, lr=1e-3, device="cuda",
                val_size=0.2, seed=42):
    """Train a ``TabularSeqModel`` on ``train_df`` with a held-out validation split.

    Args:
        train_df: DataFrame containing the features, sequence and target columns.
        feature_cols: Names of the non-sequence feature columns.
        seq_col: Name of the comma-separated sequence column.
        target_col: Name of the binary label column.
        batch_size: Mini-batch size for both loaders.
        epochs: Number of passes over the training split.
        lr: Adam learning rate.
        device: Device string the model and batches are moved to.
        val_size: Fraction of rows held out for validation.
        seed: Random state of the train / validation split.

    Returns:
        The model as it stands after the final epoch.
    """
    # 1) Train / validation split
    tr_df, va_df = train_test_split(train_df, test_size=val_size, random_state=seed, shuffle=True)

    # 2) Datasets / loaders (the l_max argument was removed)
    train_dataset = ClickDataset(tr_df, feature_cols, seq_col, target_col, has_target=True)
    val_dataset   = ClickDataset(va_df, feature_cols, seq_col, target_col, has_target=True)

    # BatchNorm1d cannot compute batch statistics from one sample in training
    # mode, so drop the trailing batch in the rare case it would hold exactly one row.
    drop_singleton = len(train_dataset) % batch_size == 1
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              collate_fn=collate_fn_train, drop_last=drop_singleton)
    val_loader   = DataLoader(val_dataset,   batch_size=batch_size, shuffle=False, collate_fn=collate_fn_train)

    # 3) Model, loss and optimizer
    d_features = len(feature_cols)
    model = TabularSeqModel(d_features=d_features, lstm_hidden=64, hidden_units=[256,128], dropout=0.2).to(device)

    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # 4) Epoch loop: one training pass followed by one validation pass
    for epoch in range(1, epochs+1):
        train_loss = _run_epoch(model, train_loader, criterion, device,
                                f"Train Epoch {epoch}", optimizer)
        val_loss = _run_epoch(model, val_loader, criterion, device,
                              f"Val Epoch {epoch}")

        print(f"[Epoch {epoch}] Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

    return model

## Run!!

In [12]:
# Train on the down-sampled frame with the hyper-parameters from CFG.
train_kwargs = dict(
    batch_size=CFG['BATCH_SIZE'],
    epochs=CFG['EPOCHS'],
    lr=CFG['LEARNING_RATE'],
    device=device,
)
model = train_model(train, feature_cols, seq_col, target_col, **train_kwargs)

Train Epoch 1: 100%|██████████| 234/234 [01:38<00:00,  2.37it/s]
Val Epoch 1: 100%|██████████| 59/59 [00:15<00:00,  3.84it/s]


[Epoch 1] Train Loss: 0.6694 | Val Loss: 0.6437


Train Epoch 2: 100%|██████████| 234/234 [01:40<00:00,  2.33it/s]
Val Epoch 2: 100%|██████████| 59/59 [00:15<00:00,  3.91it/s]


[Epoch 2] Train Loss: 0.6306 | Val Loss: 0.6217


Train Epoch 3: 100%|██████████| 234/234 [01:36<00:00,  2.42it/s]
Val Epoch 3: 100%|██████████| 59/59 [00:15<00:00,  3.88it/s]


[Epoch 3] Train Loss: 0.6179 | Val Loss: 0.6148


Train Epoch 4: 100%|██████████| 234/234 [01:40<00:00,  2.33it/s]
Val Epoch 4: 100%|██████████| 59/59 [00:16<00:00,  3.67it/s]


[Epoch 4] Train Loss: 0.6118 | Val Loss: 0.6093


Train Epoch 5: 100%|██████████| 234/234 [01:40<00:00,  2.34it/s]
Val Epoch 5: 100%|██████████| 59/59 [00:16<00:00,  3.68it/s]


[Epoch 5] Train Loss: 0.6065 | Val Loss: 0.6036


Train Epoch 6: 100%|██████████| 234/234 [01:37<00:00,  2.40it/s]
Val Epoch 6: 100%|██████████| 59/59 [00:15<00:00,  3.71it/s]


[Epoch 6] Train Loss: 0.6013 | Val Loss: 0.6013


Train Epoch 7: 100%|██████████| 234/234 [01:40<00:00,  2.32it/s]
Val Epoch 7: 100%|██████████| 59/59 [00:15<00:00,  3.69it/s]


[Epoch 7] Train Loss: 0.5969 | Val Loss: 0.5970


Train Epoch 8: 100%|██████████| 234/234 [01:41<00:00,  2.30it/s]
Val Epoch 8: 100%|██████████| 59/59 [00:16<00:00,  3.64it/s]


[Epoch 8] Train Loss: 0.5931 | Val Loss: 0.5928


Train Epoch 9: 100%|██████████| 234/234 [01:36<00:00,  2.44it/s]
Val Epoch 9: 100%|██████████| 59/59 [00:15<00:00,  3.89it/s]


[Epoch 9] Train Loss: 0.5906 | Val Loss: 0.5876


Train Epoch 10: 100%|██████████| 234/234 [01:36<00:00,  2.43it/s]
Val Epoch 10: 100%|██████████| 59/59 [00:15<00:00,  3.86it/s]


[Epoch 10] Train Loss: 0.5885 | Val Loss: 0.5862


Train Epoch 11: 100%|██████████| 234/234 [01:37<00:00,  2.39it/s]
Val Epoch 11: 100%|██████████| 59/59 [00:15<00:00,  3.91it/s]


[Epoch 11] Train Loss: 0.5869 | Val Loss: 0.5855


Train Epoch 12: 100%|██████████| 234/234 [01:39<00:00,  2.36it/s]
Val Epoch 12: 100%|██████████| 59/59 [00:15<00:00,  3.90it/s]


[Epoch 12] Train Loss: 0.5857 | Val Loss: 0.5819


Train Epoch 13: 100%|██████████| 234/234 [01:43<00:00,  2.27it/s]
Val Epoch 13: 100%|██████████| 59/59 [00:15<00:00,  3.71it/s]


[Epoch 13] Train Loss: 0.5852 | Val Loss: 0.5842


Train Epoch 14: 100%|██████████| 234/234 [01:38<00:00,  2.38it/s]
Val Epoch 14: 100%|██████████| 59/59 [00:16<00:00,  3.67it/s]


[Epoch 14] Train Loss: 0.5839 | Val Loss: 0.5829


Train Epoch 15: 100%|██████████| 234/234 [01:43<00:00,  2.27it/s]
Val Epoch 15: 100%|██████████| 59/59 [00:15<00:00,  3.70it/s]


[Epoch 15] Train Loss: 0.5836 | Val Loss: 0.5813


Train Epoch 16: 100%|██████████| 234/234 [01:40<00:00,  2.32it/s]
Val Epoch 16: 100%|██████████| 59/59 [00:16<00:00,  3.64it/s]


[Epoch 16] Train Loss: 0.5830 | Val Loss: 0.5820


Train Epoch 17: 100%|██████████| 234/234 [01:40<00:00,  2.32it/s]
Val Epoch 17: 100%|██████████| 59/59 [00:14<00:00,  3.94it/s]


[Epoch 17] Train Loss: 0.5825 | Val Loss: 0.5812


Train Epoch 18: 100%|██████████| 234/234 [01:32<00:00,  2.52it/s]
Val Epoch 18: 100%|██████████| 59/59 [00:15<00:00,  3.86it/s]


[Epoch 18] Train Loss: 0.5818 | Val Loss: 0.5800


Train Epoch 19: 100%|██████████| 234/234 [01:41<00:00,  2.31it/s]
Val Epoch 19: 100%|██████████| 59/59 [00:15<00:00,  3.90it/s]


[Epoch 19] Train Loss: 0.5816 | Val Loss: 0.5781


Train Epoch 20: 100%|██████████| 234/234 [01:44<00:00,  2.23it/s]
Val Epoch 20: 100%|██████████| 59/59 [00:17<00:00,  3.36it/s]


[Epoch 20] Train Loss: 0.5809 | Val Loss: 0.5803


Train Epoch 21: 100%|██████████| 234/234 [01:43<00:00,  2.27it/s]
Val Epoch 21: 100%|██████████| 59/59 [00:16<00:00,  3.56it/s]


[Epoch 21] Train Loss: 0.5808 | Val Loss: 0.5787


Train Epoch 22: 100%|██████████| 234/234 [01:39<00:00,  2.36it/s]
Val Epoch 22: 100%|██████████| 59/59 [00:15<00:00,  3.89it/s]


[Epoch 22] Train Loss: 0.5803 | Val Loss: 0.5791


Train Epoch 23: 100%|██████████| 234/234 [01:37<00:00,  2.40it/s]
Val Epoch 23: 100%|██████████| 59/59 [00:15<00:00,  3.90it/s]


[Epoch 23] Train Loss: 0.5799 | Val Loss: 0.5779


Train Epoch 24: 100%|██████████| 234/234 [01:37<00:00,  2.41it/s]
Val Epoch 24: 100%|██████████| 59/59 [00:16<00:00,  3.68it/s]


[Epoch 24] Train Loss: 0.5795 | Val Loss: 0.5780


Train Epoch 25: 100%|██████████| 234/234 [01:43<00:00,  2.27it/s]
Val Epoch 25: 100%|██████████| 59/59 [00:15<00:00,  3.70it/s]


[Epoch 25] Train Loss: 0.5795 | Val Loss: 0.5804


Train Epoch 26: 100%|██████████| 234/234 [01:38<00:00,  2.37it/s]
Val Epoch 26: 100%|██████████| 59/59 [00:15<00:00,  3.69it/s]


[Epoch 26] Train Loss: 0.5788 | Val Loss: 0.5772


Train Epoch 27: 100%|██████████| 234/234 [01:41<00:00,  2.30it/s]
Val Epoch 27: 100%|██████████| 59/59 [00:16<00:00,  3.68it/s]


[Epoch 27] Train Loss: 0.5785 | Val Loss: 0.5795


Train Epoch 28: 100%|██████████| 234/234 [01:39<00:00,  2.36it/s]
Val Epoch 28: 100%|██████████| 59/59 [00:15<00:00,  3.71it/s]


[Epoch 28] Train Loss: 0.5783 | Val Loss: 0.5797


Train Epoch 29: 100%|██████████| 234/234 [01:40<00:00,  2.34it/s]
Val Epoch 29: 100%|██████████| 59/59 [00:15<00:00,  3.71it/s]


[Epoch 29] Train Loss: 0.5781 | Val Loss: 0.5770


Train Epoch 30: 100%|██████████| 234/234 [01:40<00:00,  2.33it/s]
Val Epoch 30: 100%|██████████| 59/59 [00:15<00:00,  3.92it/s]


[Epoch 30] Train Loss: 0.5780 | Val Loss: 0.5767


Train Epoch 31: 100%|██████████| 234/234 [01:37<00:00,  2.39it/s]
Val Epoch 31: 100%|██████████| 59/59 [00:15<00:00,  3.89it/s]


[Epoch 31] Train Loss: 0.5777 | Val Loss: 0.5756


Train Epoch 32: 100%|██████████| 234/234 [01:36<00:00,  2.43it/s]
Val Epoch 32: 100%|██████████| 59/59 [00:15<00:00,  3.70it/s]


[Epoch 32] Train Loss: 0.5773 | Val Loss: 0.5745


Train Epoch 33: 100%|██████████| 234/234 [01:40<00:00,  2.33it/s]
Val Epoch 33: 100%|██████████| 59/59 [00:16<00:00,  3.69it/s]


[Epoch 33] Train Loss: 0.5773 | Val Loss: 0.5747


Train Epoch 34: 100%|██████████| 234/234 [01:39<00:00,  2.36it/s]
Val Epoch 34: 100%|██████████| 59/59 [00:15<00:00,  3.69it/s]


[Epoch 34] Train Loss: 0.5770 | Val Loss: 0.5769


Train Epoch 35: 100%|██████████| 234/234 [01:41<00:00,  2.30it/s]
Val Epoch 35: 100%|██████████| 59/59 [00:15<00:00,  3.70it/s]


[Epoch 35] Train Loss: 0.5766 | Val Loss: 0.5756


Train Epoch 36: 100%|██████████| 234/234 [01:41<00:00,  2.32it/s]
Val Epoch 36: 100%|██████████| 59/59 [00:15<00:00,  3.71it/s]


[Epoch 36] Train Loss: 0.5765 | Val Loss: 0.5744


Train Epoch 37: 100%|██████████| 234/234 [01:39<00:00,  2.36it/s]
Val Epoch 37: 100%|██████████| 59/59 [00:15<00:00,  3.73it/s]


[Epoch 37] Train Loss: 0.5765 | Val Loss: 0.5743


Train Epoch 38: 100%|██████████| 234/234 [01:40<00:00,  2.33it/s]
Val Epoch 38: 100%|██████████| 59/59 [00:15<00:00,  3.73it/s]


[Epoch 38] Train Loss: 0.5764 | Val Loss: 0.5762


Train Epoch 39: 100%|██████████| 234/234 [01:40<00:00,  2.34it/s]
Val Epoch 39: 100%|██████████| 59/59 [00:15<00:00,  3.75it/s]


[Epoch 39] Train Loss: 0.5763 | Val Loss: 0.5744


Train Epoch 40: 100%|██████████| 234/234 [01:39<00:00,  2.36it/s]
Val Epoch 40: 100%|██████████| 59/59 [00:15<00:00,  3.69it/s]


[Epoch 40] Train Loss: 0.5760 | Val Loss: 0.5741


## Inference

In [13]:
# 1) Wrap the test frame in a label-free dataset / loader (row order preserved)
test_ds = ClickDataset(df=test, feature_cols=feature_cols, seq_col=seq_col, has_target=False)
test_ld = DataLoader(
    test_ds,
    batch_size=CFG['BATCH_SIZE'],
    shuffle=False,
    collate_fn=collate_fn_infer,
)

# 2) Predict: sigmoid turns the model's logits into click probabilities
model.eval()
outs = []
with torch.no_grad():
    for xs, seqs, lens in tqdm(test_ld, desc="Inference"):
        logits = model(xs.to(device), seqs.to(device), lens.to(device))
        outs.append(logits.sigmoid().cpu())

test_preds = torch.cat(outs).numpy()

Inference: 100%|██████████| 728/728 [03:20<00:00,  3.63it/s]


## Submission

In [14]:
# Load the sample submission and set its `clicked` column to the predicted probabilities.
submit = pd.read_csv('./sample_submission.csv').assign(clicked=test_preds)

In [15]:
# Write the submission CSV without the DataFrame index column.
submit.to_csv('./baseline_submit.csv', index=False)