- focus on nn modelling
- save checkpoint files * 

# common class

In [1]:
import sys
import numpy as np
import pandas as pd
sys.path.append('../input/iterativestratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.model_selection import StratifiedKFold, KFold

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning import LightningModule, LightningDataModule, Trainer, seed_everything

# https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.trainer.trainer.Trainer.html#lightning.pytorch.trainer.trainer.Trainer

#####
class MyModel(nn.Module):
    def __init__(self, num_columns, last_num):
        super(MyModel, self).__init__()
        self.batch_norm1 = nn.BatchNorm1d(num_columns)
        self.dropout1 = nn.Dropout(0.1)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_columns, 512))
        self.relu1 = nn.LeakyReLU()
        
        self.batch_norm2 = nn.BatchNorm1d(512)
        self.dropout2 = nn.Dropout(0.1)
        self.dense2 = nn.utils.weight_norm(nn.Linear(512, 512))
        self.relu2 = nn.LeakyReLU()
        
        self.batch_norm3 = nn.BatchNorm1d(512)
        self.dropout3 = nn.Dropout(0.1)
        self.dense3 = nn.utils.weight_norm(nn.Linear(512, last_num))
    
    def forward(self, x):
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = self.relu1(self.dense1(x))
        
        x = self.batch_norm2(x)
        x = self.dropout2(x)
        x = self.relu2(self.dense2(x))
        
        x = self.batch_norm3(x)
        x = self.dropout3(x)
        x = self.dense3(x)
        x = torch.squeeze(x)
        
        return x
#####

        
class MyModule(LightningModule):
    def __init__(self, lr, num_columns, last_num = 1):
        super(MyModule, self).__init__()
        self.lr = lr
        self.num_columns = num_columns
        self.last_num = last_num
        self.loss_fn = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([4.0]))
        self.model = MyModel(num_columns = self.num_columns, last_num = self.last_num)
        self.log_outputs = {}
        self.validation_step_outputs = []
        self.train_step_outputs = []
        
    #####
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.model.parameters(), lr = self.lr)
        return optimizer
    #####
        
    def forward(self, x):
        x = self.model(x)
        return x
    
    def training_step(self, batch, batch_idx):
        inputs, targets = batch
        preds = self.forward(inputs)        
        loss = self.loss_fn(preds, targets)        
        self.train_step_outputs.append(loss)

        return {"loss": loss}
    
    def validation_step(self, batch, batch_idx):
        inputs, targets = batch
        preds = self.forward(inputs)
        loss = self.loss_fn(preds, targets)
        output = {"targets": targets.detach(), "preds": preds.detach(), "loss": loss.detach()}
        self.validation_step_outputs.append(output)
                
        return output
    
    def predict_step(self, batch, batch_idx):
        inputs, targets = batch
        preds = self.forward(inputs)
                
        return preds
    
    def on_train_start(self) -> None:
        self.print(f"Train start")
        return super().on_train_start()
    
    def on_train_end(self) -> None:
        self.print(" ")
        return super().on_train_end()
    
    def on_train_epoch_end(self) -> None:
        train_loss = torch.stack([x for x in self.train_step_outputs]).mean()
        self.log_dict({"loss": train_loss})
        self.log_outputs["loss"] = train_loss
        
        train_loss     = self.log_outputs["loss"]
        valid_loss     = self.log_outputs["valid_loss"]
        self.print(f"loss: {train_loss:.3f} - val_loss: {valid_loss:.3f}")
        
        return super().on_train_epoch_end()
        
    def on_validation_epoch_end(self) -> None:
        valid_loss = torch.stack([x["loss"] for x in self.validation_step_outputs]).mean()
        
        self.log_dict({"valid_loss": valid_loss})
        self.log_outputs["valid_loss"] = valid_loss

        return super().on_validation_epoch_end()
    
    
class MyDataModule(LightningModule):
    def __init__(self, train, test, target, feats, fold, batch_size = 32):
        super(MyDataModule, self).__init__()
        self.batch_size = batch_size
        self.train = train
        self.target = target
        self.input_test = test
        self.feats = feats
        self.fold = fold
        self.x_train = None
        self.x_valid = None
        self.y_train = None
        self.y_valid = None
        
    #####
    def split_train_valid_df(self):
        skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
        for n, (tr_index, val_index) in enumerate(skf.split(self.train, self.target)):
            if n == self.fold: 
                x_train = self.train.loc[tr_index].reset_index(drop=True)
                x_valid = self.train.loc[val_index].reset_index(drop=True)
                y_train = self.target.loc[tr_index].reset_index(drop=True)
                y_valid = self.target.loc[val_index].reset_index(drop=True)
        
        return x_train, x_valid, y_train, y_valid
    #####
    
    def setup(self, stage):
        x_tr, x_va, y_tr, y_va = self.split_train_valid_df()
        self.x_train = x_tr
        self.x_valid = x_va
        self.y_train = y_tr 
        self.y_valid = y_va
        self.test = self.input_test
        
    def get_dataframe(self, phase):
        assert phase in ["train", "valid", "test"]
        if phase == "train":
            return self.x_train, self.y_train
        elif phase == "valid":
            return self.x_valid, self.y_valid
        elif phase == "test":
            return self.test, None
        
    def get_ds(self, phase):
        x, y = self.get_dataframe(phase)
        return MyDataset(df = x, target = y, feats = self.feats, phase = phase)
        
    def get_loader(self, phase):
        assert phase in ["train", "valid", "test"]
        ds = self.get_ds(phase = phase)
        return DataLoader(ds, batch_size = self.batch_size, num_workers = 4,
                        shuffle = True if phase == "train" else False,
                        drop_last = True if phase == "train" else False)
        
    def train_dataloader(self):
        return self.get_loader("train")
    
    def val_dataloader(self):
        return self.get_loader("valid")
    
    
class MyDataset(Dataset):
    def __init__(self, df, target, feats, phase = "train"):
        self.phase = phase 
        self.feats = feats
        self.data = df[feats]
        self.target = target
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        if self.phase in ['train', "valid"]:
            return self.data.values[index].astype(float), self.target.values[index].astype(float)
        elif self.phase == 'test':
            return self.data.values[index].astype(float), 0



# preprocess

In [2]:
import itertools
from datetime import datetime
from sklearn.decomposition import PCA
from sklearn.metrics import log_loss

def balanced_log_loss(y_true, y_pred):
    nc = np.bincount(y_true)
    return log_loss(y_true, y_pred, sample_weight = 1/nc[y_true], eps=1e-15)

train = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv')
test = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')
sample = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv')
greeks = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/greeks.csv')

train['EJ'] = train['EJ'].map({'A': 0, 'B': 1})
test['EJ']  = test['EJ'].map({'A': 0, 'B': 1})

# process epsilon
train = pd.merge(train, greeks, on = "Id", how = "inner")
train_stratify = train[["Class", "Beta", "Delta", "Gamma"]] 
train["Epsilon_ordinal"] = train["Epsilon"].map(lambda x: datetime.strptime(x,'%m/%d/%Y').toordinal() if x != "Unknown" else np.nan)

org_features = [n for n in train.columns if n not in ['Class', 'Id', 'Alpha', "Beta", "Gamma", "Delta", "Epsilon"]]
test_times = pd.DataFrame([train.Epsilon_ordinal.max() + 1] * len(test), columns = ["Epsilon_ordinal"])
final_test = pd.concat((test, test_times), axis=1)

# fill missing value
train.fillna(-999, inplace=True)
final_test.fillna(-999, inplace=True)

# add pca columns
pca_feat_num = 15
pca_cols = ["pca"+str(i+1) for i in range(pca_feat_num)]
pca = PCA(n_components=pca_feat_num,random_state=42)
pca_train = pca.fit_transform(train[org_features])
pca_test = pca.transform(final_test[org_features])
pca_train = pd.DataFrame(pca_train, columns=pca_cols)
pca_test = pd.DataFrame(pca_test, columns=pca_cols)
train = pd.concat([train, pca_train],axis=1)
final_test = pd.concat([final_test, pca_test],axis=1)

scalar = MinMaxScaler()
cons_feats = org_features + pca_cols
normalize_train = scalar.fit_transform(train[cons_feats])
normalize_train = pd.DataFrame(normalize_train, columns = cons_feats)
normalize_test = scalar.transform(final_test[cons_feats])
normalize_test = pd.DataFrame(normalize_test, columns = cons_feats)

print(normalize_train.shape, normalize_test.shape)

(617, 72) (5, 72)


In [3]:
config = {
    "feats": cons_feats,
    "cv": MultilabelStratifiedKFold(5, shuffle=True, random_state=42),
    "n_splits" : 5,
    "train_data": normalize_train,
    "fold_y": train_stratify,
    "test_data": normalize_test,
    "target": train.Class,
    "batch_size": 16,
    "epochs": 10,
    "lr": 0.01,
    "metric_function" : balanced_log_loss,
}

# NN

In [4]:
seed_everything(42, workers=True)
    
for fold in range(config["n_splits"]):
    callbacks = []
    es_callback = EarlyStopping(monitor='valid_loss', patience=3)
    checkpoint_callback = ModelCheckpoint(monitor="valid_loss", dirpath=f"./checkpoints-{fold}", filename=f"model-{fold}", save_top_k=1, mode="min",)
    callbacks.append(es_callback)
    callbacks.append(checkpoint_callback)

    # train
    trainer = Trainer(max_epochs = config["epochs"], callbacks=callbacks, enable_progress_bar = False, log_every_n_steps = 10)
    model = MyModule(lr = config["lr"], num_columns = len(config["feats"]), last_num = 1).to("cpu", dtype=float)
    data_module = MyDataModule(train = config["train_data"], test = config["test_data"], feats = config["feats"], 
                           fold = fold, target = config["target"], batch_size = config["batch_size"])
    trainer.fit(model, datamodule = data_module)
    
    valid_loader = data_module.get_loader("valid")
    test_loader = data_module.get_loader("test")
    if fold == 0:
        valid_preds = trainer.predict(model, dataloaders = valid_loader, ckpt_path = "best")
        test_preds = trainer.predict(model, dataloaders = test_loader, ckpt_path = "best")

        valid_preds = torch.cat(valid_preds)
        test_preds = torch.cat(test_preds)
        valid_target = data_module.y_valid.values        
    else:
        tmp_preds = trainer.predict(model, dataloaders = valid_loader, ckpt_path = "best")
        tmp_test_preds = trainer.predict(model, dataloaders = test_loader, ckpt_path = "best")

        tmp_preds = torch.cat(tmp_preds)
        tmp_test_preds = torch.cat(tmp_test_preds)
        tmp_target = data_module.y_valid.values
        
        valid_target = np.hstack((valid_target, tmp_target))
        valid_preds = torch.cat((valid_preds, tmp_preds))        
        
        test_preds += tmp_test_preds

test_preds /= config["n_splits"]

Train start
loss: 0.775 - val_loss: 0.932
loss: 0.650 - val_loss: 0.777
loss: 0.570 - val_loss: 0.860
loss: 0.528 - val_loss: 1.045
loss: 0.491 - val_loss: 1.424
 
Train start
loss: 0.943 - val_loss: 0.751
loss: 0.722 - val_loss: 0.680
loss: 0.606 - val_loss: 0.661
loss: 0.553 - val_loss: 0.656
loss: 0.507 - val_loss: 0.704
loss: 0.464 - val_loss: 0.764
loss: 0.444 - val_loss: 0.746
 
Train start
loss: 0.850 - val_loss: 0.840
loss: 0.682 - val_loss: 0.781
loss: 0.575 - val_loss: 0.746
loss: 0.525 - val_loss: 0.696
loss: 0.489 - val_loss: 0.717
loss: 0.462 - val_loss: 0.706
loss: 0.430 - val_loss: 0.746
 
Train start
loss: 0.928 - val_loss: 0.888
loss: 0.710 - val_loss: 0.750
loss: 0.600 - val_loss: 0.796
loss: 0.539 - val_loss: 0.748
loss: 0.490 - val_loss: 0.751
loss: 0.461 - val_loss: 0.758
loss: 0.431 - val_loss: 0.761
 
Train start
loss: 0.837 - val_loss: 0.694
loss: 0.650 - val_loss: 0.624
loss: 0.563 - val_loss: 0.593
loss: 0.510 - val_loss: 0.600
loss: 0.480 - val_loss: 0.558
lo

In [5]:
print(config["metric_function"](valid_target, torch.sigmoid(valid_preds)))

0.37644220842761844
