In [1]:
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold

import itertools
from datetime import datetime
from sklearn.decomposition import PCA
from sklearn.metrics import log_loss

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning import LightningModule, LightningDataModule, Trainer, seed_everything

# preprocess

In [2]:
train = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv')
test = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')
sample = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv')
greeks = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/greeks.csv')

train['EJ'] = train['EJ'].map({'A': 0, 'B': 1})
test['EJ']  = test['EJ'].map({'A': 0, 'B': 1})

# process epsilon
train = pd.merge(train, greeks, on = "Id", how = "inner")
train_stratify = train[["Class", "Beta", "Delta", "Gamma"]] 
train["Epsilon_ordinal"] = train["Epsilon"].map(lambda x: datetime.strptime(x,'%m/%d/%Y').toordinal() if x != "Unknown" else np.nan)

org_features = [n for n in train.columns if n not in ['Class', 'Id', 'Alpha', "Beta", "Gamma", "Delta", "Epsilon"]]
test_times = pd.DataFrame([train.Epsilon_ordinal.max() + 1] * len(test), columns = ["Epsilon_ordinal"])
final_test = pd.concat((test, test_times), axis=1)

# fill missing value
train.fillna(-999, inplace=True)
final_test.fillna(-999, inplace=True)

# add pca columns
pca_feat_num = 15
pca_cols = ["pca"+str(i+1) for i in range(pca_feat_num)]
pca = PCA(n_components=pca_feat_num,random_state=42)
pca_train = pca.fit_transform(train[org_features])
pca_test = pca.transform(final_test[org_features])
pca_train = pd.DataFrame(pca_train, columns=pca_cols)
pca_test = pd.DataFrame(pca_test, columns=pca_cols)
train = pd.concat([train, pca_train],axis=1)
final_test = pd.concat([final_test, pca_test],axis=1)

scalar = MinMaxScaler()
cons_feats = org_features + pca_cols
normalize_train = scalar.fit_transform(train[cons_feats])
normalize_train = pd.DataFrame(normalize_train, columns = cons_feats)
normalize_test = scalar.transform(final_test[cons_feats])
normalize_test = pd.DataFrame(normalize_test, columns = cons_feats)

print(normalize_train.shape, normalize_test.shape)

(617, 72) (5, 72)


In [3]:
def balanced_log_loss(y_true, y_pred):
    nc = np.bincount(y_true)
    return log_loss(y_true, y_pred, sample_weight = 1/nc[y_true], eps=1e-15)

config = {
    "n_splits" : 5,
    "batch_size": 32,
    "epochs": 10,
    "metric_function" : balanced_log_loss,
}

In [4]:
# https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.trainer.trainer.Trainer.html#lightning.pytorch.trainer.trainer.Trainer

class MyModel(LightningModule):
    def __init__(self):
        super(MyModel, self).__init__()
        self.batch_norm1 = nn.BatchNorm1d(len(cons_feats))
        self.dropout1 = nn.Dropout(0.1)
        self.dense1 = nn.utils.weight_norm(nn.Linear(len(cons_feats), 512))
        self.relu1 = nn.LeakyReLU()
        
        self.batch_norm2 = nn.BatchNorm1d(512)
        self.dropout2 = nn.Dropout(0.1)
        self.dense2 = nn.utils.weight_norm(nn.Linear(512, 512))
        self.relu2 = nn.LeakyReLU()
        
        self.batch_norm3 = nn.BatchNorm1d(512)
        self.dropout3 = nn.Dropout(0.1)
        self.dense3 = nn.utils.weight_norm(nn.Linear(512, 1))
    
    def forward(self, x):
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = self.relu1(self.dense1(x))
        
        x = self.batch_norm2(x)
        x = self.dropout2(x)
        x = self.relu2(self.dense2(x))
        
        x = self.batch_norm3(x)
        x = self.dropout3(x)
        x = self.dense3(x)
        x = torch.squeeze(x)
        
        return x


class MyModule(LightningModule):
    def __init__(self):
        super(MyModule, self).__init__()
        self.loss_fn = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([4.0]))
        self.model = MyModel()
        self.log_outputs = {}
        self.validation_step_outputs = []
        self.train_step_outputs = []
        
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.model.parameters(), lr = 0.001)
        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma = 0.9)
        return [optimizer], [scheduler]
        
    def forward(self, x):
        x = self.model(x)
        return x
    
    def training_step(self, batch, batch_idx):
        inputs, targets = batch
        preds = self.forward(inputs)        
        loss = self.loss_fn(preds, targets)        
        self.train_step_outputs.append(loss)

        return {"loss": loss}
    
    def validation_step(self, batch, batch_idx):
        inputs, targets = batch
        preds = self.forward(inputs)
        loss = self.loss_fn(preds, targets)
        output = {"targets": targets.detach(), "preds": preds.detach(), "loss": loss.detach()}
        self.validation_step_outputs.append(output)
                
        return output
    
    def predict_step(self, batch, batch_idx):
        inputs, targets = batch
        preds = self.forward(inputs)
                
        return preds
    
    def on_train_start(self) -> None:
        self.print("Train start")
        return super().on_train_start()
    
    def on_train_end(self) -> None:
        self.print("Train end")
        return super().on_train_end()
    
    def on_train_epoch_end(self) -> None:
        train_loss = torch.stack([x for x in self.train_step_outputs]).mean()
        self.log_dict({"loss": train_loss})
        self.log_outputs["loss"] = train_loss
        
        train_loss     = self.log_outputs["loss"]
        valid_loss     = self.log_outputs["valid_loss"]
        self.print(f"loss: {train_loss:.3f} - val_loss: {valid_loss:.3f}")
        
        return super().on_train_epoch_end()
        
    def on_validation_epoch_end(self) -> None:
        valid_loss = torch.stack([x["loss"] for x in self.validation_step_outputs]).mean()
        
        self.log_dict({"valid_loss": valid_loss})
        self.log_outputs["valid_loss"] = valid_loss

        return super().on_validation_epoch_end()

In [5]:
class MyDataModule(LightningDataModule):
    def __init__(self, x, y = None, batch_size = 32):
        super(MyDataModule, self).__init__()
        self.batch_size = batch_size
        self.x = x
        self.y = y
        
    def get_ds(self, phase):
        return MyDataset(df = self.x, target = self.y, phase = phase)
        
    def get_loader(self, phase):
        assert phase in ["train", "valid", "test"]
        ds = self.get_ds(phase = phase)
        return DataLoader(ds, batch_size = self.batch_size, num_workers = 4,
                        shuffle = True if phase == "train" else False,
                        drop_last = True if phase == "train" else False)
        
    def train_dataloader(self):
        return self.get_loader("train")
    
    def val_dataloader(self):
        return self.get_loader("valid")
    
    def test_dataloader(self):
        return self.get_loader("test")


class CombinedDataModule(LightningDataModule):
    def __init__(self, train_module: MyDataModule, valid_module: MyDataModule):
        super().__init__()
        self.train_module = train_module
        self.valid_module = valid_module

    def train_dataloader(self):
        return self.train_module.train_dataloader()

    def val_dataloader(self):
        return self.valid_module.val_dataloader()


class MyDataset(Dataset):
    def __init__(self, df, target, phase = "train"):
        self.phase = phase 
        self.data = df
        self.target = target
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        if self.phase in ['train', "valid"]:
            return self.data.values[index].astype(float), self.target.values[index].astype(float)
        elif self.phase == 'test':
            return self.data.values[index].astype(float), 0

In [6]:
seed_everything(42, workers=True)
skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
test_module = MyDataModule(normalize_test[cons_feats], batch_size = config["batch_size"])

test_preds = []
valid_preds = []
valid_targets = []
for fold, (tr_index, val_index) in enumerate(skf.split(normalize_train[cons_feats], train.Class)):
    callbacks = [EarlyStopping(monitor='valid_loss', patience=3),]
    checkpoint_callback = ModelCheckpoint(
        monitor="valid_loss",
        dirpath=f"./checkpoints-{fold}",
        filename=f"model", save_top_k=1, mode="min",)
    callbacks.append(checkpoint_callback)

    # train
    trainer = Trainer(max_epochs = config["epochs"], callbacks=callbacks, enable_progress_bar = False, log_every_n_steps = 10)
    model = MyModule().to("cpu", dtype=float)
    train_module = MyDataModule(normalize_train[cons_feats].loc[tr_index], train.Class.loc[tr_index], batch_size = config["batch_size"])
    valid_module = MyDataModule(normalize_train[cons_feats].loc[val_index], train.Class.loc[val_index], batch_size = config["batch_size"])
    data_module = CombinedDataModule(train_module, valid_module)
    trainer.fit(model, datamodule = data_module)
    
    best_model = MyModule.load_from_checkpoint(checkpoint_callback.best_model_path).to("cpu", dtype=float)

    # validation
    valid_preds.append(torch.cat(trainer.predict(best_model, valid_module.get_loader("valid"))))
    valid_targets.append(valid_module.y.values)

    # test
    test_preds.append(torch.sigmoid(torch.cat(trainer.predict(best_model, test_module.get_loader("test")))))

valid_targets = np.hstack(valid_targets)
valid_preds = torch.cat(valid_preds)

test_preds = torch.mean(torch.vstack(test_preds), axis=0)

print("OOF metric: ", config["metric_function"](valid_targets, torch.sigmoid(valid_preds)))

  WeightNorm.apply(module, name, dim)
2025-04-17 01:03:15.803926: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744851796.092050      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744851796.177773      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Train start
loss: 0.835 - val_loss: 0.953
loss: 0.695 - val_loss: 0.834
loss: 0.613 - val_loss: 0.753
loss: 0.559 - val_loss: 0.722
loss: 0.513 - val_loss: 0.672
loss: 0.473 - val_loss: 0.657
loss: 0.447 - val_loss: 0.683
loss: 0.421 - val_loss: 0.762
loss: 0.396 - val_loss: 0.779
Train end


  WeightNorm.apply(module, name, dim)


Train start
loss: 0.762 - val_loss: 0.997
loss: 0.664 - val_loss: 0.901
loss: 0.579 - val_loss: 0.820
loss: 0.525 - val_loss: 0.775
loss: 0.475 - val_loss: 0.751
loss: 0.439 - val_loss: 0.720
loss: 0.410 - val_loss: 0.701
loss: 0.392 - val_loss: 0.685
loss: 0.372 - val_loss: 0.672
loss: 0.358 - val_loss: 0.664
Train end


  WeightNorm.apply(module, name, dim)


Train start
loss: 0.879 - val_loss: 0.949
loss: 0.706 - val_loss: 0.916
loss: 0.617 - val_loss: 0.872
loss: 0.553 - val_loss: 0.838
loss: 0.511 - val_loss: 0.817
loss: 0.476 - val_loss: 0.792
loss: 0.455 - val_loss: 0.764
loss: 0.431 - val_loss: 0.748
loss: 0.409 - val_loss: 0.734
loss: 0.387 - val_loss: 0.726
Train end


  WeightNorm.apply(module, name, dim)


Train start
loss: 0.809 - val_loss: 1.023
loss: 0.672 - val_loss: 0.948
loss: 0.579 - val_loss: 0.870
loss: 0.524 - val_loss: 0.818
loss: 0.483 - val_loss: 0.780
loss: 0.449 - val_loss: 0.753
loss: 0.421 - val_loss: 0.731
loss: 0.397 - val_loss: 0.717
loss: 0.374 - val_loss: 0.712
loss: 0.354 - val_loss: 0.700
Train end


  WeightNorm.apply(module, name, dim)


Train start
loss: 0.802 - val_loss: 0.932
loss: 0.640 - val_loss: 0.843
loss: 0.572 - val_loss: 0.750
loss: 0.523 - val_loss: 0.683
loss: 0.480 - val_loss: 0.631
loss: 0.447 - val_loss: 0.602
loss: 0.422 - val_loss: 0.570
loss: 0.395 - val_loss: 0.548
loss: 0.375 - val_loss: 0.527
loss: 0.359 - val_loss: 0.507
Train end


  WeightNorm.apply(module, name, dim)


OOF metric:  0.3595554945276908
