In [2]:
import numpy as np
import pandas as pd
import torch
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.data import TensorDataset
import json
import pytorch_lightning as pl
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from pytorch_lightning.callbacks import LearningRateMonitor, TQDMProgressBar, EarlyStopping, StochasticWeightAveraging
from sklearn.model_selection import StratifiedKFold
import random
import sys
# sys.path.append("/kaggle/input/")

from functools import reduce
from ktools_utils import *
from functools import reduce
from copy import deepcopy

## Prepare data

Below are a few utility functions that load the data and prepare it for training with PyTorch.

In [3]:
# Seed constant; the (currently commented-out) cross-validation cell passes it to StratifiedKFold.
RANDOM_SEED = 32
# NOTE(review): not referenced anywhere in the visible code — confirm whether it is still needed.
ORIGINAL_DATA=False

# Local paths to the competition files, relative to the notebook's working directory.
train_csv_path = "../data/post_hct_survival/train.csv"
test_csv_path = "../data/post_hct_survival/test.csv"
sub_csv_path = "../data/post_hct_survival/sample_submission.csv"

# Kaggle equivalents of the paths above; uncomment when running on Kaggle.
# train_csv_path = "/kaggle/input/equity-post-HCT-survival-predictions/train.csv"
# test_csv_path = "/kaggle/input/equity-post-HCT-survival-predictions/test.csv"
# sub_csv_path = "/kaggle/input/equity-post-HCT-survival-predictions/sample_submission.csv"

# Target columns: event indicator (`efs`) and the associated time (`efs_time`).
target_col_name = ['efs', 'efs_time']

In [45]:
def get_X_cat(df, cat_cols, transformers=None):
    """
    Encode the categorical columns of `df` into an integer matrix.

    Parameters
    ----------
    df : DataFrame holding the columns listed in `cat_cols`.
    cat_cols : column names to encode, in output-column order.
    transformers : optional list of already-fitted encoders (one per column,
        same order as `cat_cols`). When None, a fresh LabelEncoder is fitted
        on each column of `df`.

    Returns
    -------
    (transformers, X_cat) where X_cat has one row per row of `df` and one
    column per entry of `cat_cols`.
    """
    if transformers is None:
        transformers = []
        for col in cat_cols:
            encoder = LabelEncoder()
            encoder.fit(df[col])
            transformers.append(encoder)

    encoded_columns = []
    for col, encoder in zip(cat_cols, transformers):
        encoded_columns.append(encoder.transform(df[col]))

    # Columns were collected row-wise; transpose to (n_rows, n_columns).
    return transformers, np.array(encoded_columns).T


def preprocess_data(train, val, numericals=None, categoricals=None):
    """
    Build model-ready arrays and dataloaders for one train/validation split.

    Steps: add engineered features, label-encode categoricals, mean-impute
    numericals (with missing-value indicator columns), standard-scale them,
    and wrap everything in dataloaders.

    Parameters
    ----------
    train, val : DataFrames containing features plus `efs` / `efs_time`.
    numericals : optional explicit list of numerical columns; when None the
        list inferred by `get_categoricals` is used.
    categoricals : accepted for interface compatibility; not used here.

    Returns
    -------
    X_cat_train, X_cat_val, X_num_train, X_num_val, dl_train, dl_val, transformers
    """
    train, val = add_features(train), add_features(val)

    X_cat_train, X_cat_val, inferred_numerical, _categorical_cols, transformers = get_categoricals(train, val)
    num_cols = inferred_numerical if numericals is None else numericals

    # Imputer and scaler are fitted on the training split only, then applied to validation.
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean', add_indicator=True)
    scaler = StandardScaler()
    X_num_train = scaler.fit_transform(imputer.fit_transform(train[num_cols]))
    X_num_val = scaler.transform(imputer.transform(val[num_cols]))

    dl_train = init_dl(X_cat_train, X_num_train, train, training=True)
    dl_val = init_dl(X_cat_val, X_num_val, val)

    return X_cat_train, X_cat_val, X_num_train, X_num_val, dl_train, dl_val, transformers


def get_categoricals(train, val):
    """
    Label-encode the categorical columns of a train/validation pair.

    Categorical columns that are constant in `train` are dropped. Values in
    `val` that never occur in the same column of `train` are overwritten with
    NaN (this modifies `val` in place) before encoding.

    Returns
    -------
    X_cat_train, X_cat_val, numerical column names, retained categorical
    column names, and the fitted per-column encoders.
    """
    categorical_cols, numerical = get_feature_types(train)

    constant_cols = set()
    for col in categorical_cols:
        if train[col].nunique() == 1:
            constant_cols.add(col)
        unseen = ~val[col].isin(train[col])
        if unseen.any():
            val.loc[unseen, col] = np.nan

    kept_cols = [col for col in categorical_cols if col not in constant_cols]

    # Encoders are fitted on train and reused unchanged for validation.
    transformers, X_cat_train = get_X_cat(train, kept_cols)
    X_cat_val = get_X_cat(val, kept_cols, transformers)[1]
    return X_cat_train, X_cat_val, numerical, kept_cols, transformers


def init_dl(X_cat, X_num, df, training=False):
    """
    Wrap encoded features and targets in a DataLoader.

    Each batch is a 4-tuple: categorical codes (long), numerical features
    (float32), log of `df.efs_time` (float32) and `df.efs` (long).
    The batch size is fixed at 2048; batches are shuffled only when
    `training` is True.
    """
    tensors = (
        torch.tensor(X_cat, dtype=torch.long),
        torch.tensor(X_num, dtype=torch.float32),
        torch.log(torch.tensor(df.efs_time.values, dtype=torch.float32)),
        torch.tensor(df.efs.values, dtype=torch.long),
    )
    return torch.utils.data.DataLoader(
        TensorDataset(*tensors),
        batch_size=2048,
        pin_memory=True,
        shuffle=training,
    )


def get_feature_types(train):
    """
    Split the feature columns of `train` into categorical and numerical names.

    Identifier/target columns ("ID", "efs", "efs_time", "y") are never treated
    as features. A feature is categorical when its dtype is object or when it
    has between 3 and 24 distinct values; every other feature is numerical.

    Returns
    -------
    (categorical_cols, numerical) : two lists of column names, both in the
    column order of `train`.
    """
    RMV = ["ID", "efs", "efs_time", "y"]
    FEATURES = [c for c in train.columns if c not in RMV]
    print(f"There are {len(FEATURES)} FEATURES: {FEATURES}")

    # Only inspect real features: scanning every column would let ID/target
    # columns leak into the categorical list and be label-encoded as inputs.
    categorical_cols = [
        col for col in FEATURES
        if train[col].dtype == "object" or 2 < train[col].nunique() < 25
    ]
    numerical = [col for col in FEATURES if col not in categorical_cols]
    return categorical_cols, numerical


def add_features(df):
    """
    Return a copy of `df` with the engineered features added.

    * `is_cyto_score_same`: 1 where `cyto_score` equals `cyto_score_detail`, else 0.
    * `year_hct`: shifted by -2000.

    The input frame is left untouched, so the function is safe to call on
    slices of a larger frame. It is NOT idempotent: calling it again on its
    own output shifts `year_hct` a second time.
    """
    # Work on a copy so the caller's frame (often a slice of the raw data) is not mutated.
    df = df.copy()
    df['is_cyto_score_same'] = (df['cyto_score'] == df['cyto_score_detail']).astype(int)
    df['year_hct'] = df['year_hct'] - 2000

    return df


def load_data():
    """
    Read the test and train CSVs, add the engineered features and print shapes.

    Returns
    -------
    (test, train) — note the order: the test frame comes first.
    """
    test = add_features(pd.read_csv(test_csv_path))
    print("Test shape:", test.shape)

    train = add_features(pd.read_csv(train_csv_path))
    print("Train shape:", train.shape)

    return test, train

In [5]:
from typing import List


def init_ktools_dl(X : pd.DataFrame, y : pd.DataFrame, categorical_idcs : List, numerical_idcs : List, training=False):
    """
    Build a DataLoader from a preprocessed feature frame and its targets.

    `categorical_idcs` / `numerical_idcs` are positional column indices into
    `X`. Each batch is a 4-tuple: categorical codes (long), numerical features
    (float32), log of `y.efs_time` (float32) and `y.efs` (long). The batch
    size is fixed at 2048; batches are shuffled only when `training` is True.

    Returns
    -------
    (dataloader, X_cat, X_num) where X_cat / X_num are the extracted arrays.
    """
    X_cat = X.iloc[:, categorical_idcs].values
    X_num = X.iloc[:, numerical_idcs].values

    log_time = torch.log(torch.tensor(y.efs_time.values, dtype=torch.float32))
    dataset = TensorDataset(
        torch.tensor(X_cat, dtype=torch.long),
        torch.tensor(X_num, dtype=torch.float32),
        log_time,
        torch.tensor(y.efs.values, dtype=torch.long),
    )
    loader = torch.utils.data.DataLoader(dataset, batch_size=2048, pin_memory=True, shuffle=training)
    return loader, X_cat, X_num

In [6]:
def get_cats():
    """
    Read the training CSV and list the columns treated as categorical:
    those with object dtype or with between 3 and 24 distinct values.
    """
    raw = pd.read_csv(train_csv_path)
    cats = []
    for col in raw.columns:
        few_levels = 2 < raw[col].nunique() < 25
        is_object = raw[col].dtype == 'object'
        if few_levels or is_object:
            cats.append(col)
    return cats
# categoricals = get_cats()

In [9]:
from lifelines.utils import concordance_index

def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """
    Stratified concordance index: mean of the per-`race_group` concordance
    indices minus their (population) standard deviation.

    `solution` must contain `efs`, `efs_time` and `race_group`; `submission`
    must contain `prediction`. Both must contain `row_id_column_name` and share
    the same index, because the frames are joined column-wise on it. Predictions
    are negated before scoring, i.e. a higher prediction is treated as a shorter
    expected time. Neither input frame is modified.

    Raises KeyError if `row_id_column_name` is missing from either frame.

    >>> import pandas as pd
    >>> row_id_column_name = "id"
    >>> y_pred = {'prediction': {0: 1.0, 1: 0.0, 2: 1.0}}
    >>> y_pred = pd.DataFrame(y_pred)
    >>> y_pred.insert(0, row_id_column_name, range(len(y_pred)))
    >>> y_true = { 'efs': {0: 1.0, 1: 0.0, 2: 0.0}, 'efs_time': {0: 25.1234,1: 250.1234,2: 2500.1234}, 'race_group': {0: 'race_group_1', 1: 'race_group_1', 2: 'race_group_1'}}
    >>> y_true = pd.DataFrame(y_true)
    >>> y_true.insert(0, row_id_column_name, range(len(y_true)))
    >>> score(y_true.copy(), y_pred.copy(), row_id_column_name)
    0.75
    """
    # drop() returns new frames, so the caller's DataFrames keep their id column.
    solution = solution.drop(columns=[row_id_column_name])
    submission = submission.drop(columns=[row_id_column_name])

    event_label = 'efs'
    interval_label = 'efs_time'
    prediction_label = 'prediction'

    # Column-wise join of truth and predictions (aligned on the shared index).
    merged_df = pd.concat([solution, submission], axis=1)

    metric_list = []
    for _, merged_df_race in merged_df.groupby('race_group'):
        # Concordance index restricted to one race group.
        c_index_race = concordance_index(
            merged_df_race[interval_label],
            -merged_df_race[prediction_label],
            merged_df_race[event_label],
        )
        metric_list.append(c_index_race)
    return float(np.mean(metric_list)-np.sqrt(np.var(metric_list)))


def scci_metric(y_test, y_pred, id_col_name : str = "ID",
               survived_col_name : str = "efs",
               survival_time_col_name : str = "efs_time",
               stratify_col_name : str = "race_group"):
    """
    Stratified concordance index of `y_pred` for the rows identified by
    `y_test.index`.

    The ground truth is re-read from the training CSV and selected with
    `.loc`, so `y_test.index` must hold row labels of that CSV as loaded by
    `pd.read_csv` with default options. `y_pred` is attached in the same row order.
    """
    row_labels = y_test.index
    og_train = pd.read_csv(train_csv_path)

    truth_columns = [id_col_name, survived_col_name, survival_time_col_name, stratify_col_name]
    y_true = og_train.loc[row_labels, truth_columns].copy()

    y_pred_df = og_train.loc[row_labels, [id_col_name]].copy()
    y_pred_df["prediction"] = y_pred

    # score() receives copies of both frames.
    return score(y_true.copy(), y_pred_df.copy(), id_col_name)

In [10]:
# # from sklearn.metrics import accuracy_score
# # from ktools.fitting.cross_validation_executor import CrossValidationExecutor
# # from ktools.modelling.ktools_models.xgb_model import XGBoostModel
# # from ktools.modelling.model_transform_wrappers.survival_model_wrapper import transform_quantile


# settings = DataSciencePipelineSettings(train_csv_path,
#                                         test_csv_path,
#                                         target_col_name,
#                                         categorical_col_names=categoricals
#                                         )

class AddHCTFeatures():
    """Pipeline step adding the HCT-specific engineered features."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """
        Return a deep copy of the settings whose combined frame has
        `is_cyto_score_same` added and `year_hct` shifted by -2000. The new
        column is registered as both a training and a numerical column.
        """
        settings = deepcopy(original_settings)
        frame = settings.combined_df

        frame['is_cyto_score_same'] = (frame['cyto_score'] == frame['cyto_score_detail']).astype(int)
        frame['year_hct'] -= 2000

        new_feature = ['is_cyto_score_same']
        settings.training_col_names += new_feature
        settings.numerical_col_names += new_feature
        return settings

class ImputeNumericalAddIndicator():
    """Pipeline step: impute each numerical column and optionally add a missing-value indicator."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings, imputation_strategy='mean', add_indicator=True):
        """
        Impute every column in `settings.numerical_col_names`, one column at a time.

        A separate SimpleImputer is fitted on the train rows of each column and
        then applied to the test rows. When the imputer output has more than one
        column, the extra column is stored as `<col>_nan` and registered as a
        training and numerical column.

        Parameters
        ----------
        original_settings : pipeline settings; deep-copied, never modified.
        imputation_strategy : forwarded to `SimpleImputer(strategy=...)`.
        add_indicator : forwarded to `SimpleImputer(add_indicator=...)`.

        Returns
        -------
        The updated copy of the settings.
        """
        settings = deepcopy(original_settings)
        train_df, test_df = settings.update()
        # Indicator columns are collected here and appended after the loop so the
        # list being iterated (numerical_col_names) is not modified mid-iteration.
        added_cols = []
        for col_name in settings.numerical_col_names:
            imputer = SimpleImputer(strategy=imputation_strategy, add_indicator=add_indicator)
            train_transformed = imputer.fit_transform(train_df[[col_name]])
            test_transformed = imputer.transform(test_df[[col_name]])
            # More than one output column means the imputer appended an indicator column.
            if train_transformed.shape[1] > 1:
                nan_col_name = col_name + "_nan"
                # NOTE(review): `nan_col_name` does not exist in combined_df yet; this relies on
                # `.loc` creating the column on assignment — confirm on the pandas version in use.
                settings.combined_df.loc['train', [col_name, nan_col_name]] = train_transformed
                settings.combined_df.loc['test', [col_name, nan_col_name]] = test_transformed
                settings.training_col_names += [nan_col_name]
                added_cols += [nan_col_name]
            else:
                settings.combined_df.loc['train', col_name] = train_transformed
                settings.combined_df.loc['test', col_name] = test_transformed
        settings.numerical_col_names.extend(added_cols)
        return settings


class NanUnknownCategoricals():
    """Pipeline step blanking out test-set categories that never occur in train."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """
        Return a deep copy of the settings in which, for every categorical
        column, test values absent from the train split are replaced by NaN.
        The combined frame is rebuilt from the updated splits.
        """
        settings = deepcopy(original_settings)
        train_df, test_df = settings.update()

        for col in settings.categorical_col_names:
            seen_in_train = train_df[col].unique()
            unseen_rows = ~test_df[col].isin(seen_in_train)
            test_df.loc[unseen_rows, col] = np.nan

        settings.combined_df = pd.concat([train_df, test_df], keys=['train', 'test'])
        return settings
    
# Ordered preprocessing pipeline: each entry takes a settings object and returns
# an updated one; `ktools_preprocess_data` applies them left to right with `reduce`.
# NOTE(review): StandardScaleNumerical, FillNullValues, OrdinalEncode and
# ConvertObjectToCategorical are not defined in this notebook — presumably they
# come from `from ktools_utils import *`; confirm.
transforms = [
            AddHCTFeatures.transform,
            ImputeNumericalAddIndicator.transform,
            StandardScaleNumerical.transform,
            NanUnknownCategoricals.transform,
            FillNullValues.transform,
            OrdinalEncode.transform,
            ConvertObjectToCategorical.transform,
            # AddOOFFeatures.transform
            ]

# settings = reduce(lambda acc, func: func(acc), transforms, settings)
# train, test_df = settings.update()

# test_df.drop(columns=target_col_name, inplace=True)
# X, y = train.drop(columns=settings.target_col_name), train[settings.target_col_name]

In [11]:
# cat_names = settings.categorical_col_names
# cat_sizes = [int(x) for x in X[cat_names].nunique().values]
# emb_sizes = [16] * len(cat_sizes)

# # emb_sizes = [min(3*n, 16) for n in cat_sizes]
# categorical_idcs = [X.columns.get_loc(col) for col in cat_names]
# numerical_idcs = list(set(range(X.shape[1])).difference(set(categorical_idcs)))
# race_idx = X[cat_names].columns.get_loc('race_group')

## Define models with pairwise ranking loss

The model is built in three steps:
* An embedding module for the categorical features
* An MLP that combines the numerical and the embedded categorical features
* A final model trained with a pairwise ranking loss computed only over valid pairs

In [25]:
import functools
import torch.nn.functional as F

@functools.lru_cache
def combinations(N):
    """
    Return all index pairs (i, j) with i < j drawn from range(N), one pair per row.

    Results are cached per N; the returned tensor is shared between calls and
    must not be modified in place.
    """
    with torch.no_grad():
        ind = torch.arange(N)
        comb = torch.combinations(ind, r=2)
    return comb


def _pairwise_margin_terms(event, event_time, risk, margin):
    """Return (sum of hinge losses over valid pairs, number of valid pairs)."""
    n = event.shape[0]
    if n < 2:
        # No pair can be formed. Multiply by zero instead of building a fresh
        # tensor so the result has the same dtype/device as a regular loss.
        return risk.sum().double() * 0.0, torch.zeros((), dtype=torch.long, device=event.device)

    first_of_pair, second_of_pair = combinations(n).to(event.device).T

    first_has_event = event[first_of_pair] == 1
    second_has_event = event[second_of_pair] == 1
    first_time = event_time[first_of_pair]
    second_time = event_time[second_of_pair]

    # A pair is comparable when both samples had the event, or when the sample
    # with the strictly earlier time had the event.
    valid_mask = (
        (first_has_event & second_has_event)
        | (first_has_event & (first_time < second_time))
        | (second_has_event & (second_time < first_time))
    )

    # +1 when the first sample has the larger time, -1 otherwise.
    direction = 2 * (first_time > second_time).int() - 1
    margin_loss = F.relu(-direction * (risk[first_of_pair] - risk[second_of_pair]) + margin)
    total = (margin_loss.double() * valid_mask.double()).sum()
    return total, valid_mask.sum()


def pairwise_loss(event :torch.Tensor, event_time:torch.Tensor, risk:torch.Tensor, margin=0.2):
    """
    Mean hinge ranking loss over all valid pairs of samples.

    Parameters
    ----------
    event : 1-D tensor, 1 where the event was observed.
    event_time : 1-D tensor of times, same length as `event`.
    risk : 1-D tensor of model scores, same length as `event`.
    margin : hinge margin.

    Returns
    -------
    Scalar double tensor. When there is no valid pair (fewer than two samples,
    or no observed event) the loss is 0 instead of NaN.
    """
    total, n_valid = _pairwise_margin_terms(event, event_time, risk, margin)
    # Clamp the denominator so an empty pair set yields 0/1 rather than 0/0.
    return total / n_valid.clamp(min=1)


def race_equality_loss(race, event, event_time, risk, margin=0.2):
    """
    Standard deviation of the pairwise loss across race groups.

    The pairwise loss is computed separately for every distinct value in
    `race`. Groups without any valid pair are skipped. When fewer than two
    groups remain the spread is undefined and 0 is returned instead of NaN.
    """
    group_losses = []
    for r in torch.unique(race):
        idcs = race == r
        total, n_valid = _pairwise_margin_terms(event[idcs], event_time[idcs], risk[idcs], margin)
        if n_valid > 0:
            group_losses.append(total / n_valid)

    if len(group_losses) < 2:
        return risk.sum().double() * 0.0
    return torch.std(torch.stack(group_losses))

In [13]:
from collections import defaultdict
from torch.nn.modules import Module
from lifelines.utils import concordance_index
import pytorch_lightning as pl
import torch.nn as nn
import torch
from typing import *
from abc import ABC, abstractmethod
from collections import defaultdict


class KtoolsBaseLightningmodel(pl.LightningModule):
    """
    Base Lightning module wrapping a network that takes (x_cat, x_cont).

    Subclasses provide `get_loss` (per-batch loss, values to log and values to
    accumulate) and `get_global_metrics` (epoch-level metrics computed from
    the accumulated values in `self.global_metrics`).
    """

    def __init__(
            self,
            model : nn.Module,
            learning_rate : float,
            weight_decay : float,
    ):
        super().__init__()
        self.model = model
        self._learning_rate = learning_rate
        self._weight_decay = weight_decay
        # name -> list of per-batch values collected during validation / test
        self.global_metrics = defaultdict(list)

    def forward(self, x_cat, x_cont):
        return self.model(x_cat, x_cont)

    @abstractmethod
    def get_loss(self, batch, mode : str):
        """Return (total_loss, dict of values to log, dict of per-batch lists to accumulate)."""
        assert mode in {'train', 'valid', 'test'}

    @abstractmethod
    def get_global_metrics(self):
        """Return a dict of epoch-level metrics built from `self.global_metrics`."""

    def _log_all(self, values):
        # Log every entry with the same settings used throughout this module.
        for name, value in values.items():
            self.log(name, value, on_epoch=True, prog_bar=True, logger=True)

    def _evaluation_step(self, batch, batch_idx, mode):
        # Shared body of validation_step / test_step.
        total_loss, loss_dict, batch_metrics = self.get_loss(batch, mode=mode)
        if batch_idx == 0:
            # The first batch replaces whatever was stored under these keys.
            self.global_metrics.update(batch_metrics)
        else:
            for name, values in batch_metrics.items():
                self.global_metrics[name] += values
        self._log_all(loss_dict)
        return total_loss

    def training_step(self, batch, batch_idx):
        total_loss, loss_dict, _ = self.get_loss(batch, mode='train')
        self._log_all(loss_dict)
        return total_loss

    def validation_step(self, batch, batch_idx):
        return self._evaluation_step(batch, batch_idx, 'valid')

    def test_step(self, batch, batch_idx):
        return self._evaluation_step(batch, batch_idx, 'test')

    def on_validation_epoch_end(self) -> None:
        self._log_all(self.get_global_metrics())

    def on_test_epoch_end(self) -> None:
        self._log_all(self.get_global_metrics())

    def configure_optimizers(self):
        """Adam with a cosine-annealing schedule stepped once per epoch."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self._learning_rate, weight_decay=self._weight_decay)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer,
            T_max=1,
            eta_min=6e-3
        )
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "interval": "epoch",
                "frequency": 1,
                "strict": False,
            },
        }

class NonLinearFeedForwardModule(nn.Module):
    """Two linear layers with one activation in between: Linear -> activation -> Linear."""

    def __init__(self,
                 input_dim : int,
                 hidden_dim : int,
                 output_dim : int,
                 activation : str = 'gelu'):
        super().__init__()
        # `activation` is a name resolved by get_activation.
        layers = [
            nn.Linear(input_dim, hidden_dim),
            get_activation(activation),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.ffm = nn.Sequential(*layers)

    def forward(self, x : torch.Tensor):
        out = self.ffm(x)
        return out

class PostHCTModel(KtoolsBaseLightningmodel):
    """
    Lightning module trained with three loss terms:

    * a pairwise margin ranking loss on the score returned by the network,
    * the spread of that loss across race groups (weighted),
    * an auxiliary MSE on the time target, computed from the network's second
      output and restricted to samples with `efs == 1` (weighted).

    Parameters
    ----------
    model : network returning `(risk, emb)` from `(x_cat, x_num)`.
    race_index : column of `x_cat` holding the race group code.
    learning_rate, weight_decay : Adam hyper-parameters.
    margin : margin of the pairwise ranking loss.
    hidden_size : width of `emb`; must match the network's second output.
    race_loss_weight : weight of the race-spread term.
    aux_loss_weight : weight of the auxiliary time-regression term.
    """

    def __init__(self,
                 model: Module,
                 race_index : int,
                 learning_rate: float = 0.06464861983337984,
                 weight_decay: float = 0.0002773544957610778,
                 margin : float = 0.2588153271003354,
                 hidden_size : int = 56,
                 race_loss_weight : float = 0.1,
                 aux_loss_weight : float = 0.26545778308743806):

        super(PostHCTModel, self).__init__(model, learning_rate, weight_decay)
        self._race_index = race_index
        self._margin = margin
        self._race_loss_weight = race_loss_weight
        self._aux_loss_weight = aux_loss_weight

        self.aux_predictor = NonLinearFeedForwardModule(hidden_size,
                                                        hidden_size//3,
                                                        1
                                                        )

    def get_loss(self, batch, mode = None):
        """
        Compute the total loss for one batch of `(x_cat, x_num, efs_time, efs)`.

        Returns `(total_loss, loss_dict, batch_dict)`: the values to log are
        keyed with the `mode` prefix, and `batch_dict` holds single-element
        lists that the base class concatenates for epoch-level metrics.
        """
        x_cat, x_num, efs_time, efs = batch
        risk, emb = self(x_cat, x_num)

        # squeeze(-1) keeps the batch dimension even when the batch has one row.
        risk = risk.squeeze(-1)
        aux_pred = self.aux_predictor(emb).squeeze(-1)
        races = x_cat[:, self._race_index]

        pwloss = pairwise_loss(efs, efs_time, risk, margin=self._margin)
        race_loss = self._race_loss_weight*race_equality_loss(races, efs, efs_time, risk, margin=self._margin)

        # Auxiliary regression only on samples where the event was observed.
        aux_mask = efs == 1
        aux_loss = F.mse_loss(aux_pred, efs_time, reduction='none')
        # Clamp so a batch without any event gives 0 instead of NaN.
        n_events = aux_mask.sum().clamp(min=1)
        efs_time_loss = self._aux_loss_weight*(aux_loss.double() * aux_mask.double()).sum()/n_events

        loss_dict = {f'{mode}_pairwise_loss' : pwloss,
                     f'{mode}_efs_time_loss' : efs_time_loss,
                     f'{mode}_race_std_loss' : race_loss}

        # Only used for metrics, so the scores are detached from the graph.
        batch_dict = {'efs_time' : [efs_time],
                      'efs' : [efs],
                      'risk_score' : [risk.detach()],
                      'races' : [races]}

        return pwloss + efs_time_loss + race_loss, loss_dict, batch_dict

    def get_global_metrics(self):
        """
        Compute epoch-level concordance metrics from the accumulated batches
        and reset the accumulator. The concatenated scores are kept in
        `self.eval_preds`. Falls back to 0.5 for both metrics if the
        computation raises.
        """
        efs = torch.cat(self.global_metrics['efs']).cpu().numpy()
        y_hat = torch.cat(self.global_metrics['risk_score']).cpu().numpy()
        efs_time = torch.cat(self.global_metrics['efs_time']).cpu().numpy()
        races = torch.cat(self.global_metrics['races']).cpu().numpy()
        self.eval_preds = y_hat.copy()
        self.global_metrics.clear()

        try:
            metric = self._metric(efs, races, efs_time, y_hat)
            cindex = concordance_index(efs_time, y_hat, efs)
        except Exception:
            # Best-effort fallback; `Exception` (not a bare except) so that
            # KeyboardInterrupt / SystemExit still propagate.
            metric = 0.5
            cindex = 0.5
        return {'stratified concordance index' : metric, 'basic_concordance_index' : cindex}

    def _metric(self, efs, races, y, y_hat):
        """Mean minus (population) standard deviation of the per-race concordance index."""
        metric_list = []
        for race in np.unique(races):
            in_group = races == race
            metric_list.append(concordance_index(y[in_group], y_hat[in_group], efs[in_group]))
        metric = float(np.mean(metric_list) - np.sqrt(np.var(metric_list)))
        return metric

    def configure_optimizers(self):
        """Adam with cosine annealing over 45 epochs, stepped once per epoch."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self._learning_rate, weight_decay=self._weight_decay)
        scheduler_config = {
            "scheduler": torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer,
                T_max=45,
                eta_min=6e-3
            ),
            "interval": "epoch",
            "frequency": 1,
            "strict": False,
        }

        return {"optimizer": optimizer, "lr_scheduler": scheduler_config}

    def predict_step(self, batch, batch_idx):
        """
        Return the network score for an inference batch.

        Only the first two tensors of the batch are used, so both
        `(x_cat, x_cont)` batches and 4-tensor batches are accepted.
        """
        x_cat, x_cont, *_ = batch
        risk, _ = self(x_cat, x_cont)
        return risk

In [14]:
def ktools_preprocess_data(train, test):
    """
    Run the `transforms` pipeline on an in-memory train/test pair.

    Returns
    -------
    ((X, y, X_test, y_test), settings) — the processed splits from
    `settings.get_data()` and the final settings object.
    """
    settings = DataSciencePipelineSettings(
        train_csv_path,
        test_csv_path,
        target_col_name,
        categorical_col_names=get_cats(),
        train_data=train,
        test_data=test,
    )
    # Apply each pipeline step in order, feeding its output to the next.
    for transform in transforms:
        settings = transform(settings)
    return settings.get_data(), settings

In [15]:
def eval_dl(X_cat, X_num):
    """
    Build an unshuffled DataLoader for inference on unlabeled data.

    Batches keep the 4-tuple layout used for training; the two target slots
    are placeholders (random float32 values and all-ones long values).
    The batch size is fixed at 2048.
    """
    n_rows = X_cat.shape[0]
    placeholder_time = torch.randn(n_rows, dtype=torch.float32)
    placeholder_event = torch.ones(n_rows, dtype=torch.long)

    dataset = TensorDataset(
        torch.tensor(X_cat, dtype=torch.long),
        torch.tensor(X_num, dtype=torch.float32),
        placeholder_time,
        placeholder_event,
    )
    return torch.utils.data.DataLoader(dataset, batch_size=2048, pin_memory=True)

In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import *
from pytorch_tabular.models.common.layers import ODST


class EmbeddingCategoricalModule(nn.Module):
    """
    Embed each categorical feature with its own embedding table and
    concatenate the results, optionally followed by a linear projection.

    Parameters
    ----------
    category_cardinalities : number of distinct codes per categorical feature.
    embedding_sizes : embedding width per categorical feature (same order).
    projection_dim : when given, the concatenated embeddings are projected to
        this width with a linear layer; otherwise they are returned as is.

    Raises
    ------
    ValueError if the two lists do not have the same length.
    """

    def __init__(self,
                 category_cardinalities : List[int],
                 embedding_sizes : List[int],
                 projection_dim : Union[None, int] = None) -> None:

        super(EmbeddingCategoricalModule, self).__init__()
        if len(category_cardinalities) != len(embedding_sizes):
            # Fail early: a mismatch would otherwise surface as an IndexError
            # or silently ignore the trailing cardinalities.
            raise ValueError(
                "category_cardinalities and embedding_sizes must have the same length, "
                f"got {len(category_cardinalities)} and {len(embedding_sizes)}"
            )
        self._category_cardinalities = category_cardinalities
        self._embedding_sizes = embedding_sizes
        self._embedding_layers = self._build_embedding_layers()

        if projection_dim is not None:
            self.mlp = nn.Linear(self.concatenated_len, projection_dim)
        else:
            self.mlp = nn.Identity()

    def forward(self, x_cat : torch.Tensor) -> torch.Tensor:
        """Embed column i of `x_cat` with table i, concatenate along dim 1, then project."""
        x = [embedder(x_cat[:, i]) for i, embedder in enumerate(self._embedding_layers)]
        x = torch.cat(x, dim=1)
        x = self.mlp(x)
        return x

    @property
    def num_features(self):
        """Number of categorical features (one embedding table each)."""
        return len(self._embedding_sizes)

    @property
    def concatenated_len(self):
        """Width of the concatenated embeddings before any projection."""
        return sum(self._embedding_sizes)

    def _build_embedding_layers(self):
        embedding_layers = nn.ModuleList([
            nn.Embedding(self._category_cardinalities[i], self._embedding_sizes[i]) for i in range(self.num_features)
            ])
        return embedding_layers

class IskanderPairwiseNetwork(nn.Module):
    """
    Network producing a score per sample from categorical and numerical inputs.

    Pipeline: per-feature embeddings -> feed-forward projection of the
    concatenated embeddings -> concatenation with the numerical features ->
    Dropout / ODST / BatchNorm / Dropout -> linear output head.
    """

    def __init__(self,
                category_cardinalities : List[int],
                numerical_size : int,
                embedding_sizes : List[int],
                embedding_projected_dim : int = 112,
                hidden_size : int = 56,
                output_size : int = 1,
                dropout : float = 0.05463240181423116
                ):
        """
        category_cardinalities / embedding_sizes: per categorical feature,
        passed to EmbeddingCategoricalModule. numerical_size: number of
        numerical input columns. embedding_projected_dim: width of the
        projected embeddings. hidden_size: output width of the ODST block.
        output_size: width of the output head. dropout: dropout probability
        applied before and after the ODST block.
        """
        
        super(IskanderPairwiseNetwork, self).__init__()
        self.embedding_module = EmbeddingCategoricalModule(category_cardinalities,
                                                           embedding_sizes)
        cat_dim = self.embedding_module.concatenated_len
        
        self.project_embeddings = NonLinearFeedForwardModule(cat_dim, 
                                                            embedding_projected_dim,
                                                            embedding_projected_dim)
        
        # self.aux_predictor = NonLinearFeedForwardModule(hidden_size,
        #                                                 hidden_size//3,
        #                                                 output_size
        #                                                 )

        self.odst = nn.Sequential(
            nn.Dropout(dropout),
            ODST(embedding_projected_dim + numerical_size, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.Dropout(dropout)
        )
        self.risk_out = nn.Linear(hidden_size, output_size)

        # Re-initialise every nn.Linear in the network: Xavier-normal weights, zero biases.
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_normal_(m.weight)
                nn.init.zeros_(m.bias)
    
    def forward(self, x_cat : torch.Tensor, x_num : torch.Tensor):
        """Return `(risk, x)`: the output-head score and the ODST-block features it was computed from."""
        emb = self.embedding_module(x_cat)
        emb = self.project_embeddings(emb)
        x = torch.cat([emb, x_num], dim=1)
        x = self.odst(x)
        risk = self.risk_out(x)
        # efs_time_pred = self.aux_predictor(x)
        return risk, x
    
    def data_aware_init(self, dataloader):
        """
        Run a single gradient-free forward pass over the whole dataloader.

        The first two tensors of every batch are concatenated and passed
        through the network once.
        NOTE(review): presumably this triggers the ODST layer's data-aware
        initialisation — confirm against the pytorch_tabular implementation.
        """
        
        cats,  nums = [], []
        for batch in dataloader:
            x_cat, x_num, *other = batch
            cats += [x_cat]
            nums += [x_num]
        all_cat = torch.cat(cats)
        all_num = torch.cat(nums)

        with torch.no_grad():
            self(all_cat, all_num)

In [17]:
from dataclasses import dataclass
from typing import *
import pandas as pd


@dataclass
class DataSciencePipelineSettings(object):
    """
    Container for the train/test data and column bookkeeping shared by the
    preprocessing pipeline steps.

    Data is taken from `train_data` / `test_data` when given, otherwise read
    from the corresponding CSV paths. After construction the instance exposes
    `train_df`, `test_df`, `combined_df` (train and test stacked under the
    keys 'train' / 'test'), `training_col_names`, `categorical_col_names` and
    `numerical_col_names`.
    """
    train_csv_path : str
    test_csv_path : str
    target_col_name : List[str]
    original_csv_path : Optional[str] = None

    train_data : Union[None, pd.DataFrame] = None
    test_data : Union[None, pd.DataFrame] = None
    original_data : Union[None, pd.DataFrame] = None

    # Applied to the extra "original" dataset before it is appended to train.
    original_csv_processing : Callable = lambda x : x
    sample_submission_path : Optional[str] = None
    # Overwritten in __post_init__ with all non-target columns.
    training_col_names : Optional[List[str]] = None
    # When None, every object-dtype column is treated as categorical.
    categorical_col_names : Optional[List[str]] = None
    training_data_percentage : float = 0.8
    category_occurrence_threshold : int = 300
    logged : bool = False

    def __post_init__(self):
        self.train_df, self.test_df = self._gather_data()
        self.training_col_names, self.categorical_col_names, self.numerical_col_names = self._get_column_info()
        self.combined_df = self._combine_datasets()

    def _gather_data(self):
        """Load train/test (and, if configured, the extra original dataset appended to train)."""
        train_df = self._smart_drop_index(pd.read_csv(self.train_csv_path) if self.train_data is None else self.train_data)
        test_df = self._smart_drop_index(pd.read_csv(self.test_csv_path) if self.test_data is None else self.test_data)

        if self.original_csv_path is not None:
            # `source` marks where a row came from: 0 = competition data, 1 = original data.
            train_df = train_df.assign(source=0)
            test_df = test_df.assign(source=0)
            original_df = self._smart_drop_index(pd.read_csv(self.original_csv_path) if self.original_data is None else self.original_data).assign(source=1)
            original_df = self.original_csv_processing(original_df)

            # The original dataset must match train exactly in columns and dtypes.
            pd.testing.assert_index_equal(train_df.columns.sort_values(), original_df.columns.sort_values(), check_exact=True)
            pd.testing.assert_series_equal(train_df.dtypes.sort_index(), original_df.dtypes.sort_index(), check_exact=True)
            train_df = pd.concat([train_df, original_df], axis=0).reset_index(drop=True)

        return train_df, test_df

    def _get_column_info(self):
        """Return (training features, categorical columns, numerical columns)."""
        cat_col_names = [col_name for col_name in self.train_df.columns if self.train_df[col_name].dtype == 'object']
        training_features = list(self.train_df.drop(columns=self.target_col_name).columns)
        cat_col_names = cat_col_names if self.categorical_col_names is None else self.categorical_col_names
        num_col_names = [f for f in training_features if f not in cat_col_names]
        return training_features, cat_col_names, num_col_names

    def _combine_datasets(self):
        combined_df = pd.concat([self.train_df, self.test_df], keys=['train', 'test'])
        return combined_df

    def update(self):
        """Refresh `train_df` / `test_df` as copies of the current combined frame and return them."""
        self.train_df = self.combined_df.loc['train'].copy()
        self.test_df = self.combined_df.loc['test'].copy()
        return self.train_df, self.test_df

    def get_data(self):
        """Return (X, y, X_test, y_test) split on the target columns."""
        self.update()
        X_test, y_test = self.test_df.drop(columns=self.target_col_name), self.test_df[self.target_col_name]
        X, y = self.train_df.drop(columns=self.target_col_name), self.train_df[self.target_col_name]
        return X, y, X_test, y_test

    @staticmethod
    def _smart_drop_index(df):
        """
        Drop the first column when it looks like a row index, i.e. when its
        consecutive differences are all identical. Frames whose first column
        cannot be differenced are returned unchanged.
        """
        try:
            differences = df.iloc[:, 0].diff().dropna()
            if differences.nunique() == 1:
                df = df.drop(columns=df.columns[0])
        except Exception:
            # Best-effort heuristic: leave the frame as is. `Exception` rather
            # than a bare except so KeyboardInterrupt / SystemExit propagate.
            pass
        return df

    @property
    def target_col(self):
        """target column name property."""
        return self.target_col_name

    @target_col.setter
    def target_col(self, value):
        self.target_col_name = value

In [18]:
def encode_in_order(array):
    """
    Replace each element of `array`, in place, by an integer code assigned in
    order of first appearance (first distinct value -> 0, second -> 1, ...).

    Returns the same (mutated) `array` object.
    """
    codes = {}
    for pos, item in enumerate(array):
        # A new value receives the next free code, i.e. the number of codes so far.
        codes.setdefault(item, len(codes))
        array[pos] = codes[array[pos]]
    return array

def get_activation(activation):
    """Build the torch activation module registered under ``activation``.

    Recognised names are ``'relu'``, ``'gelu'``, ``'sigmoid'`` and ``'none'``
    (identity). Any other value matches nothing and yields ``None``.
    """
    factories = (
        ('relu', nn.ReLU),
        ('gelu', nn.GELU),
        ('sigmoid', nn.Sigmoid),
        ('none', nn.Identity),
    )
    for name, factory in factories:
        if activation == name:
            return factory()
    return None

In [19]:
# Explicitly switch off PyTorch's deterministic-algorithms mode.
# NOTE(review): presumably because some op used during training has no
# deterministic implementation — confirm before enabling it.
torch.use_deterministic_algorithms(False)

In [18]:
# pl.seed_everything(42)

# kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
# train_dataframe = pd.read_csv(train_csv_path, index_col=0)
# test_dataframe = pd.read_csv(test_csv_path, index_col=0)

# # print(train_dataframe)

# folds = kf.split(
#                 train_dataframe, train_dataframe.race_group.astype(str)
#             )

# metrics_list = []
# oof_preds = np.zeros(train_dataframe.shape[0])
# test_pred = np.zeros(test_dataframe.shape[0])

# _, train_original = load_data()

# for i, (train_index, test_index) in enumerate(folds):

#     train_df = train_dataframe.iloc[train_index]
#     test_df = train_dataframe.iloc[test_index]

#     tt = train_original.copy()
#     train = tt.iloc[train_index]
#     val = tt.iloc[test_index]

#     (X_train, y_train, X_val, y_val), settings = ktools_preprocess_data(train_df, test_df)

#     cat_names = settings.categorical_col_names
#     cat_sizes = [int(x) for x in X_train[cat_names].nunique().values]
#     emb_sizes = [16] * len(cat_sizes)
#     categorical_idcs = [X_train.columns.get_loc(col) for col in cat_names]
#     numerical_idcs = list(set(range(X_train.shape[1])).difference(set(categorical_idcs)))
#     race_idx = X_train[cat_names].columns.get_loc('race_group')
    
#     dl_train, X_cat_train, X_num_train = init_ktools_dl(X_train, y_train, categorical_idcs, numerical_idcs, training=True)
#     dl_val, X_cat_val, X_num_val = init_ktools_dl(X_val, y_val, categorical_idcs, numerical_idcs)

#     num_cols = [x for x in X_val.iloc[:, numerical_idcs].columns.tolist() if 'nan' not in x]
#     X_cat_train_exp, X_cat_val_exp, X_num_train_exp, X_num_val_exp, dl_train, dl_val, transformers = preprocess_data(train, val, numericals=num_cols)
    
#     for i in range(X_cat_train_exp.shape[1]):
#         assert np.allclose(encode_in_order(X_cat_train_exp[:, i]), encode_in_order(X_cat_train[:, i]))
#         assert np.allclose(encode_in_order(X_cat_val_exp[:, i]), encode_in_order(X_cat_val[:, i]))

#     X_num_val_exp[:, [-2, -1]] = X_num_val_exp[:, [-1, -2]]
#     X_num_train_exp[:, [-2, -1]] = X_num_train_exp[:, [-1, -2]]
#     assert np.allclose(X_num_val, X_num_val_exp)
#     assert np.allclose(X_num_train, X_num_train_exp)

#     base_model = IskanderPairwiseNetwork(cat_sizes, len(numerical_idcs), emb_sizes)
#     model = PostHCTModel(base_model, race_index=race_idx)
    
#     checkpoint_callback = pl.callbacks.ModelCheckpoint(monitor=None,
#                                                        dirpath="checkpoints/",
#                                                        filename="last",
#                                                        save_top_k=1, 
#                                                        save_last=True)
#     trainer = pl.Trainer(
#         accelerator='cuda',
#         max_epochs=60,
#         callbacks=[
#             checkpoint_callback,
#             LearningRateMonitor(logging_interval='epoch'),
#             TQDMProgressBar(),
#             StochasticWeightAveraging(swa_lrs=1e-5, swa_epoch_start=45, annealing_epochs=15)
#         ],
#         # deterministic=True
#     )
#     trainer.fit(model, dl_train)
#     trainer.test(model, dl_val)
#     model.eval()

#     predictions = model.eval_preds
#     print(scci_metric(test_df, -predictions.squeeze()))

#     metrics_list += [scci_metric(test_df, -predictions.squeeze())]
#     print(metrics_list)
    
#     oof_preds[test_index] = predictions.squeeze()


#     (_, _, X_test, y_test), settings = ktools_preprocess_data(train_df, test_dataframe)
#     X_cat_val, X_num_val = X_test.iloc[:, categorical_idcs].values, X_test.iloc[:, numerical_idcs].values
    
#     # test_dl = eval_dl(X_cat_val, X_num_val)
#     # trainer.test(model, test_dl)
#     # test_pred += model.eval_preds.squeeze()

#     pred, _ = model.cuda().eval()(
#         torch.tensor(X_cat_val, dtype=torch.long).cuda(),
#         torch.tensor(X_num_val, dtype=torch.float32).cuda()
#     )
#     test_predictions_this_fold = pred.squeeze().detach().cpu().numpy()

#     test_predictions_this_fold = StandardScaler().fit_transform(test_predictions_this_fold[:, None])
#     test_pred += test_predictions_this_fold.squeeze()


# print(f"metric across folds: ", [f"{n:.3f}" for n in metrics_list])
# print("oof scci metric score: ", scci_metric(train_dataframe, -oof_preds))


In [56]:
# Load the train and test tables, using the first CSV column as the index.
train_dataframe, test_dataframe = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (train_csv_path, test_csv_path)
)

In [57]:
# Give the test frame constant placeholder target columns.
# NOTE(review): presumably so it can pass through the same preprocessing as the
# training frame; the values themselves look arbitrary — confirm.
test_dataframe['efs'] = test_dataframe['efs_time'] = 1

In [62]:
# Preprocess the full training frame together with the test frame (used here as
# the "validation" split).
(X_train, y_train, X_val, y_val), settings = ktools_preprocess_data(train_dataframe, test_dataframe)

cat_names = settings.categorical_col_names
cat_sizes = [int(x) for x in X_train[cat_names].nunique().values]
emb_sizes = [16] * len(cat_sizes)
categorical_idcs = [X_train.columns.get_loc(col) for col in cat_names]
# Sort explicitly: set iteration order is an implementation detail, and this
# list determines the order of the numerical columns selected below.
numerical_idcs = sorted(set(range(X_train.shape[1])).difference(categorical_idcs))
race_idx = X_train[cat_names].columns.get_loc('race_group')

dl_train, X_cat_train, X_num_train = init_ktools_dl(X_train, y_train, categorical_idcs, numerical_idcs, training=True)
dl_val, X_cat_val, X_num_val = init_ktools_dl(X_val, y_val, categorical_idcs, numerical_idcs)
num_cols = [x for x in X_val.iloc[:, numerical_idcs].columns.tolist() if 'nan' not in x]
# The reference pipeline's loaders replace the ones built just above; the ktools
# matrices are kept only for the consistency checks that follow.
X_cat_train_exp, X_cat_val_exp, X_num_train_exp, X_num_val_exp, dl_train, dl_val, transformers = preprocess_data(train_dataframe, test_dataframe, numericals=num_cols)

# Both pipelines must induce the same categorical grouping, column by column,
# even if their label values differ.
for i in range(X_cat_train_exp.shape[1]):
    print(X_cat_val_exp[:, i].min())
    # Compare on copies so the consistency check can never modify the encoded
    # matrices themselves.
    assert np.allclose(encode_in_order(np.array(X_cat_train_exp[:, i])), encode_in_order(np.array(X_cat_train[:, i])))
    assert np.allclose(encode_in_order(np.array(X_cat_val_exp[:, i])), encode_in_order(np.array(X_cat_val[:, i])))

# NOTE(review): the last two numerical columns appear to come out in opposite
# order in the two pipelines; swap them before comparing — confirm.
X_num_val_exp[:, [-2, -1]] = X_num_val_exp[:, [-1, -2]]
X_num_train_exp[:, [-2, -1]] = X_num_train_exp[:, [-1, -2]]
assert np.allclose(X_num_val, X_num_val_exp)
assert np.allclose(X_num_train, X_num_train_exp)

# display(X_val)

base_model = IskanderPairwiseNetwork(cat_sizes, len(numerical_idcs), emb_sizes)
model = PostHCTModel(base_model, race_index=race_idx)

checkpoint_callback = pl.callbacks.ModelCheckpoint(monitor=None,
                                                   dirpath="checkpoints/",
                                                   filename="last",
                                                   save_top_k=1,
                                                   save_last=True)
trainer = pl.Trainer(
    accelerator='cpu',
    max_epochs=60,
    callbacks=[
        checkpoint_callback,
        LearningRateMonitor(logging_interval='epoch'),
        TQDMProgressBar(),
        StochasticWeightAveraging(swa_lrs=1e-5, swa_epoch_start=45, annealing_epochs=15)
    ],
    # deterministic=True
)
trainer.fit(model, dl_train)
trainer.test(model, dl_val)

# NOTE(review): eval_preds is presumably filled in by the test loop above; it is
# defined on PostHCTModel outside this cell — confirm.
test_preds = model.eval_preds.squeeze()

There are 58 FEATURES: ['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'hla_match_c_high', 'hla_high_res_8', 'tbi_status', 'arrhythmia', 'hla_low_res_6', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'hla_high_res_6', 'cmv_status', 'hla_high_res_10', 'hla_match_dqb1_high', 'tce_imm_match', 'hla_nmdp_6', 'hla_match_c_low', 'rituximab', 'hla_match_drb1_low', 'hla_match_dqb1_low', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'year_hct', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hla_match_a_high', 'hepatic_severe', 'donor_age', 'prior_tumor', 'hla_match_b_low', 'peptic_ulcer', 'age_at_hct', 'hla_match_a_low', 'gvhd_proph', 'rheum_issue', 'sex_match', 'hla_match_b_high', 'race_group', 'comorbidity_score', 'karnofsky_score', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose', 'hla_low_res_8', 'cardiac', 'hla_match_drb1_high', 'pulm_moderate', 'hla_low_res_10', 'is_cyto_score_same']
2
0
1
0
2
6
0
0
4


GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/yuwei-1/anaconda3/envs/ktools/lib/python3.12/site-packages/pytorch_lightning/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
/Users/yuwei-1/anaconda3/envs/ktools/lib/python3.12/site-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
/Users/yuwei-1/anaconda3/envs/ktools/lib/python3.12/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /Users/yuwei-1/Documents/projects/Kaggle-tools/post_HCT_survival_notebooks/checkpoints exists and is not empty.

  | Name          | Type                       | Params | Mode 
---------------------------------------------------------------------
0 | model         | IskanderPairwiseNetwork    | 159 K  | train
1 | aux_predictor | NonLinearFeedForwardM

0
6


/Users/yuwei-1/anaconda3/envs/ktools/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
/Users/yuwei-1/anaconda3/envs/ktools/lib/python3.12/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (15) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]


Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined

In [None]:
# Start from the sample submission and fill `prediction` with the negated
# model outputs, then write the file to the working directory.
subm_data = pd.read_csv(sub_csv_path).assign(prediction=-test_preds)
subm_data.to_csv('submission.csv', index=False)