In [1]:
import os
import random
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import QuantileTransformer

import torch
from pytorch_tabnet.tab_model import TabNetClassifier
import lightgbm as lgb
import warnings
import re
import os
warnings.filterwarnings('ignore')

SEED = 42

def seed_everything(seed=SEED):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

seed_everything()
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("DEVICE:", DEVICE)


DEVICE: cpu


In [2]:
DATA_PATH = "data"
DATA2_PATH = "data2"

try:
    train_data = pd.read_csv(os.path.join(DATA_PATH, "train.csv"))
    test_data = pd.read_csv(os.path.join(DATA_PATH, "test.csv"))

    train_data2 = pd.read_csv(os.path.join(DATA2_PATH, "train_dataset.csv"))
    test_data2 = pd.read_csv(os.path.join(DATA2_PATH, "test_dataset.csv"))

    test_ids = test_data['id']
    
    train_data = train_data.drop('id', axis=1)
    test_data = test_data.drop('id', axis=1)

    def clean_col_names(df):
        cols = df.columns
        new_cols = []
        for col in cols:
            new_col = re.sub(r'[^A-Za-z0-9_]+', '', col)
            new_cols.append(new_col.lower())
        df.columns = new_cols
        return df

    train_data = clean_col_names(train_data)
    test_data = clean_col_names(test_data)
    train_data2 = clean_col_names(train_data2)
    test_data2 = clean_col_names(test_data2)


    df_train = pd.concat([train_data, train_data2, test_data2], ignore_index=True)
    df_test = test_data
    
    df_train = df_train.drop_duplicates()

    df_train.dropna(subset=['smoking'], inplace=True)
    
    df_train['smoking'] = df_train['smoking'].astype(int)

    print(f"Original training data had {train_data.shape[0]} rows.")
    print(f"Our new combined training data has {df_train.shape[0]} rows.")
    print(f"Our final test data has {test_data.shape[0]} rows to predict.\n")

except FileNotFoundError:
    print(f"Couldn't find the files. Check this path: {DATA_PATH} and {DATA2_PATH}")


TARGET = "smoking"
DROP_COLS = ["id"]

df_train[TARGET] = df_train[TARGET].astype(int)


Original training data had 15000 rows.
Our new combined training data has 48467 rows.
Our final test data has 10000 rows to predict.



In [3]:
def add_features(df):
    df = df.copy()

    df["bmi"] = df["weightkg"] / (df["heightcm"] / 100) ** 2

    df["vai"] = (
        (df["waistcm"] / 100) *
        df["bmi"] *
        (df["triglyceride"] / 150) *
        (40 / (df["hdl"] + 1e-6))
    )
    return df


LOG_COLS = ["triglyceride", "gtp", "ast", "alt", "ldl"]

def add_log_features(df):
    df = df.copy()
    for col in LOG_COLS:
        df[f"log_{col}"] = np.log1p(df[col])
    return df


In [4]:
df_train_fe = add_features(df_train)
df_test_fe  = add_features(df_test)

df_train_fe = add_log_features(df_train_fe)
df_test_fe  = add_log_features(df_test_fe)


In [5]:
# --- TabNet 用（logのみ使う） ---
TABNET_LOG_COLS = [f"log_{c}" for c in LOG_COLS]

FEATURE_COLS_TABNET = [
    c for c in df_train_fe.columns
    if c not in DROP_COLS + [TARGET] + LOG_COLS
] + TABNET_LOG_COLS


# --- LightGBM 用（生 + log） ---
FEATURE_COLS_LGB = [
    c for c in df_train_fe.columns
    if c not in DROP_COLS + [TARGET]
]


In [6]:
def apply_rankgauss_fold(train_df, valid_df, test_df, cols):
    qt = QuantileTransformer(
        n_quantiles=1000,
        output_distribution="normal",
        random_state=SEED
    )
    qt.fit(train_df[cols].values)

    train_df[cols] = qt.transform(train_df[cols].values)
    valid_df[cols] = qt.transform(valid_df[cols].values)
    test_df[cols]  = qt.transform(test_df[cols].values)

    return train_df, valid_df, test_df


In [7]:
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)


In [8]:
oof_tabnet = np.zeros(len(df_train_fe))
test_tabnet = np.zeros(len(df_test_fe))

for fold, (tr_idx, va_idx) in enumerate(skf.split(df_train_fe, df_train_fe[TARGET])):
    print(f"\n[TabNet] Fold {fold+1}")

    tr_df = df_train_fe.iloc[tr_idx].copy()
    va_df = df_train_fe.iloc[va_idx].copy()
    te_df = df_test_fe.copy()

    tr_df, va_df, te_df = apply_rankgauss_fold(
        tr_df, va_df, te_df, TABNET_LOG_COLS
    )

    X_tr = tr_df[FEATURE_COLS_TABNET].apply(pd.to_numeric, errors="coerce")
    X_va = va_df[FEATURE_COLS_TABNET].apply(pd.to_numeric, errors="coerce")
    X_te = te_df[FEATURE_COLS_TABNET].apply(pd.to_numeric, errors="coerce")

    med = X_tr.median()
    X_tr = X_tr.fillna(med).values.astype(np.float32)
    X_va = X_va.fillna(med).values.astype(np.float32)
    X_te = X_te.fillna(med).values.astype(np.float32)

    y_tr = tr_df[TARGET].values
    y_va = va_df[TARGET].values

    model = TabNetClassifier(
        n_d=32, n_a=32,
        n_steps=5,
        gamma=1.5,
        lambda_sparse=1e-4,
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=2e-2),
        mask_type="entmax",
        seed=SEED,
        device_name=DEVICE,
        verbose=0
    )

    model.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        eval_metric=["auc"],
        max_epochs=150,
        patience=20,
        batch_size=1024,
        virtual_batch_size=128,
        num_workers=0
    )

    oof_tabnet[va_idx] = model.predict_proba(X_va)[:, 1]
    test_tabnet += model.predict_proba(X_te)[:, 1] / N_SPLITS

print("TabNet CV AUC:", roc_auc_score(df_train_fe[TARGET], oof_tabnet))



[TabNet] Fold 1

Early stopping occurred at epoch 87 with best_epoch = 67 and best_val_0_auc = 0.84931

[TabNet] Fold 2

Early stopping occurred at epoch 56 with best_epoch = 36 and best_val_0_auc = 0.84603

[TabNet] Fold 3

Early stopping occurred at epoch 84 with best_epoch = 64 and best_val_0_auc = 0.84697

[TabNet] Fold 4

Early stopping occurred at epoch 83 with best_epoch = 63 and best_val_0_auc = 0.85063

[TabNet] Fold 5

Early stopping occurred at epoch 57 with best_epoch = 37 and best_val_0_auc = 0.85555
TabNet CV AUC: 0.849041017810293


In [9]:
oof_lgb = np.zeros(len(df_train_fe))
test_lgb = np.zeros(len(df_test_fe))

X_all = df_train_fe[FEATURE_COLS_LGB].apply(pd.to_numeric, errors="coerce")
X_all = X_all.fillna(X_all.median()).values
y_all = df_train_fe[TARGET].values

X_test_lgb = df_test_fe[FEATURE_COLS_LGB].apply(pd.to_numeric, errors="coerce")
X_test_lgb = X_test_lgb.fillna(
    df_train_fe[FEATURE_COLS_LGB].median()
).values

lgb_params = {
    "objective": "binary",
    "metric": "auc",
    "learning_rate": 0.03,
    "num_leaves": 31,
    "min_data_in_leaf": 30,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "lambda_l2": 1.0,
    "verbosity": -1,
    "seed": SEED,
}

for fold, (tr_idx, va_idx) in enumerate(skf.split(X_all, y_all)):
    print(f"\n[LGBM] Fold {fold+1}")

    model = lgb.LGBMClassifier(**lgb_params, n_estimators=5000)

    model.fit(
        X_all[tr_idx], y_all[tr_idx],
        eval_set=[(X_all[va_idx], y_all[va_idx])],
        callbacks=[
            lgb.early_stopping(100),
            lgb.log_evaluation(0)
        ]
    )

    oof_lgb[va_idx] = model.predict_proba(X_all[va_idx])[:, 1]
    test_lgb += model.predict_proba(X_test_lgb)[:, 1] / N_SPLITS

print("LGBM CV AUC:", roc_auc_score(y_all, oof_lgb))



[LGBM] Fold 1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[729]	valid_0's auc: 0.854549

[LGBM] Fold 2
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[585]	valid_0's auc: 0.852865

[LGBM] Fold 3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[556]	valid_0's auc: 0.851317

[LGBM] Fold 4
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[397]	valid_0's auc: 0.855285

[LGBM] Fold 5
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[453]	valid_0's auc: 0.862287
LGBM CV AUC: 0.8551638507403657


In [10]:
oof_lgb2 = np.zeros(len(df_train_fe))
test_lgb2 = np.zeros(len(df_test_fe))

# ★ 保守型パラメータ（性格を変える）
lgb_params_safe = {
    "objective": "binary",
    "metric": "auc",
    "learning_rate": 0.02,
    "num_leaves": 16,          # 小さめ
    "min_data_in_leaf": 60,    # 大きめ
    "feature_fraction": 0.7,
    "bagging_fraction": 0.7,
    "bagging_freq": 1,
    "lambda_l1": 0.5,          # 正則化強め
    "lambda_l2": 2.0,
    "verbosity": -1,
    "seed": SEED,
}

for fold, (tr_idx, va_idx) in enumerate(skf.split(X_all, y_all)):
    print(f"\n[LGBM-safe] Fold {fold+1}")

    model = lgb.LGBMClassifier(**lgb_params_safe, n_estimators=5000)

    model.fit(
        X_all[tr_idx], y_all[tr_idx],
        eval_set=[(X_all[va_idx], y_all[va_idx])],
        callbacks=[
            lgb.early_stopping(100),
            lgb.log_evaluation(0)
        ]
    )

    oof_lgb2[va_idx] = model.predict_proba(X_all[va_idx])[:, 1]
    test_lgb2 += model.predict_proba(X_test_lgb)[:, 1] / N_SPLITS

print("LGBM-safe CV AUC:", roc_auc_score(y_all, oof_lgb2))



[LGBM-safe] Fold 1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[975]	valid_0's auc: 0.854535

[LGBM-safe] Fold 2
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[748]	valid_0's auc: 0.852122

[LGBM-safe] Fold 3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1298]	valid_0's auc: 0.852388

[LGBM-safe] Fold 4
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[847]	valid_0's auc: 0.855437

[LGBM-safe] Fold 5
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[744]	valid_0's auc: 0.86146
LGBM-safe CV AUC: 0.8551096832806264


In [11]:
import xgboost as xgb

oof_xgb = np.zeros(len(df_train_fe))
test_xgb = np.zeros(len(df_test_fe))

xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "learning_rate": 0.03,
    "max_depth": 6,
    "min_child_weight": 30,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_alpha": 0.0,
    "reg_lambda": 1.0,
    "tree_method": "hist",
    "random_state": SEED,
}

for fold, (tr_idx, va_idx) in enumerate(skf.split(X_all, y_all)):
    print(f"\n[XGB] Fold {fold+1}")

    model = xgb.XGBClassifier(
        **xgb_params,
        n_estimators=5000,
        early_stopping_rounds=100   # ★ ここに移動
    )

    model.fit(
        X_all[tr_idx], y_all[tr_idx],
        eval_set=[(X_all[va_idx], y_all[va_idx])],
        verbose=False
    )

    oof_xgb[va_idx] = model.predict_proba(X_all[va_idx])[:, 1]
    test_xgb += model.predict_proba(X_test_lgb)[:, 1] / N_SPLITS

print("XGB CV AUC:", roc_auc_score(y_all, oof_xgb))



[XGB] Fold 1

[XGB] Fold 2

[XGB] Fold 3

[XGB] Fold 4

[XGB] Fold 5
XGB CV AUC: 0.8553348278510315


In [14]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
# ===== FEATURE_COLS 再定義（保険） =====
TARGET = "smoking"
DROP_COLS = ["id"] if "id" in df_train_fe.columns else []

FEATURE_COLS = [
    c for c in df_train_fe.columns
    if c not in DROP_COLS + [TARGET]
]

print("FEATURE_COLS defined:", len(FEATURE_COLS))

# ===============================
# Dataset
# ===============================
class TabularDataset(Dataset):
    def __init__(self, X, y=None):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = None if y is None else torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        if self.y is None:
            return self.X[idx]
        return self.X[idx], self.y[idx]

# ===============================
# Model
# ===============================
class ResidualBlock(nn.Module):
    def __init__(self, dim, dropout=0.25):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, dim),
            nn.BatchNorm1d(dim),
            nn.SiLU(),
            nn.Dropout(dropout),
            nn.Linear(dim, dim),
            nn.BatchNorm1d(dim),
        )
        self.act = nn.SiLU()

    def forward(self, x):
        return self.act(x + self.net(x))


class TabularResMLP(nn.Module):
    def __init__(self, in_dim, hidden_dim=256, n_blocks=4, dropout=0.25):
        super().__init__()

        self.stem = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.SiLU(),
            nn.Dropout(dropout),
        )

        self.blocks = nn.Sequential(
            *[ResidualBlock(hidden_dim, dropout) for _ in range(n_blocks)]
        )

        self.head = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.BatchNorm1d(hidden_dim // 2),
            nn.SiLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim // 2, 1),
        )

    def forward(self, x):
        x = self.stem(x)
        x = self.blocks(x)
        return self.head(x).squeeze(1)

# ===============================
# Training / Evaluation
# ===============================
def train_one_epoch(model, loader, optimizer, criterion):
    model.train()
    total_loss = 0.0

    for xb, yb in loader:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)

        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        total_loss += loss.item() * xb.size(0)

    return total_loss / len(loader.dataset)


@torch.no_grad()
def predict(model, loader):
    model.eval()
    preds = []

    for batch in loader:
        # batch が tuple / list の場合に対応
        if isinstance(batch, (list, tuple)):
            xb = batch[0]
        else:
            xb = batch

        xb = xb.to(DEVICE)
        logits = model(xb)
        preds.append(torch.sigmoid(logits).cpu().numpy())

    return np.concatenate(preds)


# ===============================
# ResMLP K-Fold CV
# ===============================
oof_resmlp = np.zeros(len(df_train_fe))
test_resmlp = np.zeros(len(df_test_fe))

BATCH_SIZE = 512
EPOCHS = 50
PATIENCE = 10

X_all = df_train_fe[FEATURE_COLS].apply(pd.to_numeric, errors="coerce")
X_all = X_all.fillna(X_all.median()).values
y_all = df_train_fe[TARGET].values

X_test_all = df_test_fe[FEATURE_COLS].apply(pd.to_numeric, errors="coerce")
X_test_all = X_test_all.fillna(
    df_train_fe[FEATURE_COLS].median()
).values

for fold, (tr_idx, va_idx) in enumerate(skf.split(X_all, y_all)):
    print(f"\n[ResMLP] Fold {fold+1}")

    X_tr, X_va = X_all[tr_idx], X_all[va_idx]
    y_tr, y_va = y_all[tr_idx], y_all[va_idx]

    # 標準化（fold内fit）
    scaler = StandardScaler()
    X_tr = scaler.fit_transform(X_tr)
    X_va = scaler.transform(X_va)
    X_te = scaler.transform(X_test_all)

    train_ds = TabularDataset(X_tr, y_tr)
    val_ds   = TabularDataset(X_va, y_va)
    test_ds  = TabularDataset(X_te)

    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)
    test_loader  = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

    model = TabularResMLP(
        in_dim=X_tr.shape[1],
        hidden_dim=256,
        n_blocks=4,
        dropout=0.25,
    ).to(DEVICE)

    # クラス不均衡対応
    pos = (y_tr == 1).sum()
    neg = (y_tr == 0).sum()
    pos_weight = torch.tensor([neg / pos], device=DEVICE)

    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-3, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)

    best_auc = -1
    patience = 0
    best_state = None

    for epoch in range(EPOCHS):
        train_one_epoch(model, train_loader, optimizer, criterion)

        # validation AUC
        val_preds = predict(model, val_loader)
        val_auc = roc_auc_score(y_va, val_preds)
        scheduler.step()

        if val_auc > best_auc:
            best_auc = val_auc
            best_state = {k: v.cpu() for k, v in model.state_dict().items()}
            patience = 0
        else:
            patience += 1
            if patience >= PATIENCE:
                break

    model.load_state_dict(best_state)

    # OOF
    oof_resmlp[va_idx] = predict(model, val_loader)

    # test
    test_resmlp += predict(model, test_loader) / N_SPLITS

print("ResMLP CV AUC:", roc_auc_score(y_all, oof_resmlp))



FEATURE_COLS defined: 29

[ResMLP] Fold 1

[ResMLP] Fold 2

[ResMLP] Fold 3

[ResMLP] Fold 4

[ResMLP] Fold 5
ResMLP CV AUC: 0.8509225164422449


In [15]:

ensemble_oof_5 = (
    oof_tabnet +
    oof_lgb +
    oof_lgb2 +
    oof_xgb +
    oof_resmlp
) / 5

ensemble_test_5 = (
    test_tabnet +
    test_lgb +
    test_lgb2 +
    test_xgb +
    test_resmlp
) / 5

print("Ensemble (5 models) CV AUC:", roc_auc_score(y_all, ensemble_oof_5))

Ensemble (5 models) CV AUC: 0.8570729257337946


In [16]:
submission = pd.DataFrame({
    "id": test_ids,
    "smoking": ensemble_test_5
})

submission.to_csv("submission_ensemble_5models.csv", index=False)
submission.head()


Unnamed: 0,id,smoking
0,15000,0.018891
1,15001,0.564897
2,15002,0.111995
3,15003,0.298498
4,15004,0.14578
