In [2]:
import sys
sys.path.append('..')
import pandas as pd
from functools import reduce
from lifelines.utils import concordance_index
from ktools.metrics.stratified_concordance_index import stratified_concordance_index

In [3]:
from dataclasses import dataclass
from typing import *
import pandas as pd


def func(x):
    """Identity hook: return ``x`` unchanged.

    Serves as the default ``original_csv_processing`` of
    ``DataSciencePipelineSettings``.
    """
    return x


@dataclass
class DataSciencePipelineSettings:
    """Load the train/test CSVs and expose them as one keyed frame.

    On construction the CSVs are read, column lists are derived and
    ``combined_df`` is built as the concatenation of train and test under the
    outer index keys ``'train'`` and ``'test'``.

    Fields:
        train_csv_path / test_csv_path: CSV files read with ``pd.read_csv``.
        target_col_name: column excluded from the inferred training features.
        original_csv_path: optional extra CSV appended to the training rows;
            when given, a ``source`` column is added (0 for train/test rows,
            1 for rows of the extra CSV).
        original_csv_processing: callable applied to the extra frame before it
            is validated and appended.
        training_col_names / categorical_col_names: column lists; inferred from
            the loaded training frame when left as ``None``.
        sample_submission_path, training_data_percentage,
        category_occurrence_threshold, logged: stored only; no method of this
            class reads them.
    """
    train_csv_path : str
    test_csv_path : str
    target_col_name : str
    original_csv_path : Optional[str] = None
    original_csv_processing : Callable[[pd.DataFrame], pd.DataFrame] = func
    sample_submission_path : Optional[str] = None
    training_col_names : Optional[List[str]] = None
    categorical_col_names : Optional[List[str]] = None
    training_data_percentage : float = 0.8
    category_occurrence_threshold : int = 300
    logged : bool = False

    def __post_init__(self):
        """Read the CSVs, resolve the column lists and build ``combined_df``."""
        self.train_df, self.test_df = self._load_csv_paths()
        inferred_features, inferred_categoricals = self._get_column_info()
        # Only fall back to inference when the caller did not supply a list;
        # previously explicit lists were silently overwritten.
        if self.training_col_names is None:
            self.training_col_names = inferred_features
        if self.categorical_col_names is None:
            self.categorical_col_names = inferred_categoricals
        self.combined_df = self._combine_datasets()

    def _load_csv_paths(self):
        """Return ``(train_df, test_df)`` read from the configured paths.

        Raises:
            AssertionError: if the extra CSV's columns or dtypes differ from
                those of the training frame.
        """
        train_df = self._smart_drop_index(pd.read_csv(self.train_csv_path))
        test_df = self._smart_drop_index(pd.read_csv(self.test_csv_path))
        if self.original_csv_path is not None:
            train_df = train_df.assign(source=0)
            test_df = test_df.assign(source=0)
            original_df = self._smart_drop_index(pd.read_csv(self.original_csv_path)).assign(source=1)
            original_df = self.original_csv_processing(original_df)

            # The extra data must line up with train (same columns, same dtypes)
            # before the two are stacked.
            pd.testing.assert_index_equal(train_df.columns.sort_values(), original_df.columns.sort_values(), check_exact=True)
            pd.testing.assert_series_equal(train_df.dtypes.sort_index(), original_df.dtypes.sort_index(), check_exact=True)
            train_df = pd.concat([train_df, original_df], axis=0).reset_index(drop=True)

        return train_df, test_df

    def _get_column_info(self):
        """Infer ``(training_features, categorical_columns)`` from ``train_df``.

        Training features are every column except the target; categorical
        columns are those whose dtype compares equal to ``'object'``.
        """
        cat_col_names = [col_name for col_name in self.train_df.columns if self.train_df[col_name].dtype == 'object']
        training_features = list(self.train_df.drop(columns=self.target_col_name).columns)
        return training_features, cat_col_names

    def _combine_datasets(self):
        """Stack train and test under the outer index keys 'train' / 'test'."""
        return pd.concat([self.train_df, self.test_df], keys=['train', 'test'])

    def update(self):
        """Refresh ``train_df`` / ``test_df`` from ``combined_df`` and return them."""
        self.train_df = self.combined_df.loc['train'].copy()
        self.test_df = self.combined_df.loc['test'].copy()
        return self.train_df, self.test_df

    @staticmethod
    def _smart_drop_index(df):
        """Drop the first column when it looks like a saved row index.

        The first column is removed when all of its consecutive differences are
        identical (e.g. 0, 1, 2, ...; note that a constant column also
        qualifies). The check is best-effort: if it cannot be evaluated (no
        columns, non-numeric data, ...) the frame is returned unchanged.
        """
        try:
            differences = df.iloc[:, 0].diff().dropna()
            if differences.nunique() == 1:
                df = df.drop(columns=df.columns[0])
        except Exception:
            # Deliberately tolerant, but no longer a bare ``except`` so that
            # KeyboardInterrupt / SystemExit still propagate.
            pass
        return df

In [4]:
import numpy as np
from copy import deepcopy
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, MinMaxScaler



class ConvertToLower():
    """Pipeline step that lower-cases the categorical columns."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings with categorical text lower-cased.

        Every column listed in ``categorical_col_names`` is replaced in
        ``combined_df`` by its ``.str.lower()`` form; the input is not modified.
        """
        lowered = deepcopy(original_settings)
        frame = lowered.combined_df
        for cat_col in lowered.categorical_col_names:
            frame[cat_col] = frame[cat_col].str.lower()
        return lowered
    

class FillNullValues():
    """Pipeline step that fills missing values in the training columns."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings, numeric_fill=-1, category_fill='missing'):
        """Return a deep copy of the settings with nulls filled.

        For each column in ``training_col_names``: numeric columns are filled
        with ``numeric_fill``, all other columns with ``category_fill``.
        Columns outside ``training_col_names`` are left as they are.
        """
        filled = deepcopy(original_settings)
        frame = filled.combined_df
        for feature in filled.training_col_names:
            column = frame[feature]
            fill_value = numeric_fill if pd.api.types.is_numeric_dtype(column) else category_fill
            frame[feature] = column.fillna(fill_value)
        return filled
    

class ConvertObjectToCategorical():
    """Pipeline step casting the categorical columns to pandas ``category``."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings whose ``categorical_col_names``
        columns in ``combined_df`` have been cast with ``astype('category')``.
        """
        converted = deepcopy(original_settings)
        frame = converted.combined_df
        target_cols = converted.categorical_col_names
        as_category = frame[target_cols].astype('category')
        frame[target_cols] = as_category
        return converted
    

class ConvertObjectToStrCategorical():
    """Pipeline step casting the categorical columns to string categories."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings whose ``categorical_col_names``
        columns are first cast to ``str`` and then to ``category``.
        """
        converted = deepcopy(original_settings)
        frame = converted.combined_df
        target_cols = converted.categorical_col_names
        as_text = frame[target_cols].astype(str)
        frame[target_cols] = as_text.astype('category')
        return converted


class ConvertAllToCategorical():
    """Pipeline step casting every training column to string categories."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings whose ``training_col_names``
        columns are cast to ``str`` and then to ``category``.
        """
        converted = deepcopy(original_settings)
        frame = converted.combined_df
        feature_cols = converted.training_col_names
        as_text = frame[feature_cols].astype(str)
        frame[feature_cols] = as_text.astype('category')
        return converted
    

class LogTransformTarget():
    """Pipeline step applying ``log1p`` to the target column."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings whose target column in
        ``combined_df`` has been replaced by ``np.log1p`` of itself.
        """
        logged = deepcopy(original_settings)
        target_name = logged.target_col_name
        raw_target = logged.combined_df[target_name]
        logged.combined_df[target_name] = np.log1p(raw_target)
        return logged
    

class OrdinalEncode():
    """Pipeline step that ordinal-encodes the categorical columns."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings with integer-coded categoricals.

        The encoder is fitted on the 'train' rows only and then applied to the
        'test' rows; missing and unseen values are both encoded as -1. The two
        parts are re-stacked under the 'train' / 'test' keys and the encoded
        columns are cast to ``int``.
        """
        encoded = deepcopy(original_settings)
        cat_cols = encoded.categorical_col_names
        train_part, test_part = encoded.update()

        encoder = OrdinalEncoder(encoded_missing_value=-1, handle_unknown="use_encoded_value", unknown_value=-1)
        train_part[cat_cols] = encoder.fit_transform(train_part[cat_cols])
        test_part[cat_cols] = encoder.transform(test_part[cat_cols])

        recombined = pd.concat([train_part, test_part], keys=['train', 'test'])
        recombined[cat_cols] = recombined[cat_cols].astype(int)
        encoded.combined_df = recombined
        return encoded

class StandardScaleNumerical():
    """Pipeline step that standard-scales every numeric column."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings with numeric columns scaled.

        A ``StandardScaler`` is fitted on the 'train' rows of all columns that
        ``select_dtypes(include=['number'])`` picks from ``combined_df`` and is
        then applied to the 'test' rows; the parts are re-stacked afterwards.
        """
        scaled = deepcopy(original_settings)
        standardizer = StandardScaler()
        train_part, test_part = scaled.update()
        numeric_cols = scaled.combined_df.select_dtypes(include=['number']).columns

        train_part[numeric_cols] = standardizer.fit_transform(train_part[numeric_cols])
        test_part[numeric_cols] = standardizer.transform(test_part[numeric_cols])

        scaled.combined_df = pd.concat([train_part, test_part], keys=['train', 'test'])
        return scaled

class MinMaxScalerNumerical():
    """Pipeline step that min-max-scales every numeric column."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings with numeric columns rescaled.

        A ``MinMaxScaler`` is fitted on the 'train' rows of all columns that
        ``select_dtypes(include=['number'])`` picks from ``combined_df`` and is
        then applied to the 'test' rows; the parts are re-stacked afterwards.
        """
        rescaled = deepcopy(original_settings)
        min_max = MinMaxScaler()
        train_part, test_part = rescaled.update()
        numeric_cols = rescaled.combined_df.select_dtypes(include=['number']).columns

        train_part[numeric_cols] = min_max.fit_transform(train_part[numeric_cols])
        test_part[numeric_cols] = min_max.transform(test_part[numeric_cols])

        rescaled.combined_df = pd.concat([train_part, test_part], keys=['train', 'test'])
        return rescaled

In [5]:
import pandas as pd
import pandas.api.types
import numpy as np
from typing import Union
from lifelines.utils import concordance_index


def stratified_concordance_index(solution : pd.DataFrame, 
                                 predictions : Union[pd.Series, np.ndarray], 
                                 event_binary_col_name : str,
                                 duration_col_name : str,
                                 group_col_name : str) -> float:
    """Mean minus standard deviation of the per-group concordance index.

    The rows of ``solution`` are split by ``group_col_name``; for each group
    ``lifelines.utils.concordance_index`` is evaluated on the durations, the
    negated predictions and the event indicator. The result is
    ``mean(scores) - std(scores)`` over the groups.

    Args:
        solution: frame holding the event, duration and group columns. It is
            not modified by this function.
        predictions: one score per row of ``solution``. A ``pd.Series`` is
            aligned on the index of ``solution``, an array is taken in order.
        event_binary_col_name: name of the event-indicator column.
        duration_col_name: name of the duration column.
        group_col_name: name of the column to stratify by.

    Returns:
        The stratified score as a Python ``float``.
    """
    # assign() builds a new frame, so the caller's `solution` no longer gains a
    # 'predictions' column or an extra 'index' column on every call.
    scored = solution.assign(predictions=predictions).reset_index()

    # Group by the bare column name: a length-1 list triggers a pandas
    # FutureWarning and is not needed here.
    group_scores = []
    for _, group_rows in scored.groupby(group_col_name):
        group_scores.append(
            concordance_index(
                group_rows[duration_col_name],
                # negated before being handed to lifelines, as in the original metric
                -group_rows['predictions'],
                group_rows[event_binary_col_name],
            )
        )
    return float(np.mean(group_scores) - np.std(group_scores))

In [6]:
# Input CSV locations (relative paths) handed to DataSciencePipelineSettings below.
train_csv_path = "../data/post_hct_survival/train.csv"
test_csv_path = "../data/post_hct_survival/test.csv"
# Column treated as the target by the settings object; it is dropped from test_df later in this cell.
target_col_name = "efs_time"

class CreateSurvivalTarget():
    """Pipeline step that derives a signed ``survival_target`` column."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings with ``survival_target`` added.

        The new column equals ``efs_time`` where ``efs`` cast to bool is true
        and ``-efs_time`` elsewhere.
        """
        settings = deepcopy(original_settings)
        frame = settings.combined_df
        event_observed = frame['efs'].astype(bool)
        durations = frame['efs_time']
        frame["survival_target"] = np.where(event_observed, durations, -durations)
        return settings

settings = DataSciencePipelineSettings(train_csv_path,
                                       test_csv_path,
                                       target_col_name,
                                       )
# Steps run in list order; each one takes a settings object and returns a new one.
transforms = [
             FillNullValues.transform,
             OrdinalEncode.transform,
             ConvertObjectToStrCategorical.transform,
             CreateSurvivalTarget.transform
             ]

# Thread the settings through every step (the lambda argument is named `step`
# so it does not shadow the module-level `func`).
settings = reduce(lambda acc, step: step(acc), transforms, settings)

# A single update() is enough: it re-derives the train/test frames from combined_df.
train, test_df = settings.update()
# Reassign instead of dropping in place so the frame held by `settings` is not mutated.
test_df = test_df.drop(columns=[target_col_name])
X, y = train.drop(columns=["survival_target"]), train[["survival_target"]]

In [7]:
# Inspect the feature frame built above (train without "survival_target"); efs and efs_time are still present here.
X

Unnamed: 0,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,graft_type,...,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,efs,efs_time
0,7,0,7,0,-1.0,-1.0,0,0,6.0,0,...,4,2,1,8.0,0,2.0,0,10.0,0.0,42.356
1,2,0,1,0,2.0,8.0,6,0,6.0,1,...,3,1,1,8.0,0,2.0,2,10.0,1.0,4.672
2,7,0,7,0,2.0,8.0,0,0,6.0,0,...,3,1,1,8.0,0,2.0,0,10.0,0.0,19.793
3,0,0,1,0,2.0,8.0,0,0,6.0,0,...,3,2,1,8.0,0,2.0,0,10.0,0.0,102.349
4,0,0,7,0,2.0,8.0,0,0,6.0,1,...,3,1,0,8.0,0,2.0,0,10.0,0.0,16.223
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28795,3,3,0,0,2.0,8.0,0,0,6.0,1,...,0,3,1,8.0,3,2.0,0,10.0,0.0,18.633
28796,0,0,5,2,1.0,4.0,0,0,5.0,1,...,1,1,1,6.0,2,1.0,2,8.0,1.0,4.892
28797,9,3,5,3,2.0,8.0,0,3,6.0,1,...,1,2,1,8.0,3,2.0,0,10.0,0.0,23.157
28798,7,0,5,0,1.0,4.0,0,0,3.0,1,...,3,1,0,4.0,0,1.0,0,5.0,0.0,52.351


In [8]:
# Binary-classification view: predict the `efs` flag from the remaining features.
# Both names are derived from `train` rather than from the previous `X`, so the
# cell can be re-run without raising KeyError on the already-removed columns.
# NOTE(review): this rebinds X and y; later cells that look up 'efs' / 'efs_time'
# in X.columns need the earlier X that still carries those columns.
y = train['efs']
X = train.drop(columns=['survival_target', 'efs_time', 'efs'])

In [15]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from ktools.fitting.cross_validation_executor import CrossValidationExecutor
from ktools.modelling.ktools_models.lgbm_model import LGBMModel


# Binary classifier on the `efs` flag, scored by accuracy.
lgbm = LGBMModel(objective='binary', metric='binary_logloss')

kf = KFold(n_splits=5, shuffle=True, random_state=42)
# The output transform thresholds element [1] of what the executor passes in at 0.5
# (presumably the predicted probabilities; confirm against CrossValidationExecutor).
score_tuple, oof_predictions, model_list, _ = CrossValidationExecutor(
    lgbm,
    accuracy_score,
    kf,
    verbose=2,
).run(
    X,
    y,
    output_transform_list=[lambda fold_output: (fold_output[1] > 0.5).astype(int)],
)



The CV results of the current fold is 0.6880208333333333
The CV results of the current fold is 0.6899305555555556
The CV results of the current fold is 0.6940972222222223
The CV results of the current fold is 0.6885416666666667
The CV results of the current fold is 0.6784722222222223
####################################################################################################
OOF prediction score :  0.6878125
Mean 5-cv results : 0.6878124999999999 +- 0.005134194552576257
####################################################################################################


In [4]:
# Every column of X except the target column and the event flag.
train_features = [col for col in X.columns.tolist() if col not in (target_col_name, "efs")]

In [5]:
from ktools.modelling.ktools_models.catboost_model import CatBoostModel


# CatBoost wrapper configured with the Cox loss and Cox evaluation metric.
cb_model = CatBoostModel(
    predict_type="else",
    loss_function='Cox',
    eval_metric='Cox',
    grow_policy='Lossguide',
    use_best_model=False,
    num_boost_round=100,
    learning_rate=0.1,
    early_stopping_rounds=10,
    verbose=False,
)

In [6]:
# Positional column indices of the metric inputs inside X; sci_metric uses them
# to slice array-shaped fold outputs. Raises KeyError if X lacks any of the three columns.
indices = np.array(list(map(X.columns.get_loc, ['efs', 'efs_time', 'race_group'])))


In [7]:
def output_transform(input):
    """Attach the fold predictions to the fold's feature frame.

    Args:
        input: a pair ``(X_test, y_pred)`` where ``X_test`` is a DataFrame and
            ``y_pred`` holds one prediction per row.

    Returns:
        A new DataFrame equal to ``X_test`` plus a ``predictions`` column.
        ``X_test`` itself is left untouched, so the caller's fold data is not
        modified as a side effect.
    """
    X_test, y_pred = input
    return X_test.assign(predictions=y_pred)

def sci_metric(y_test, y_processed):
    """Stratified concordance index of a processed fold output.

    Args:
        y_test: accepted for the metric call signature; not used here.
        y_processed: either a DataFrame that carries 'efs', 'efs_time',
            'race_group' and 'predictions' columns, or a 2-D array whose
            columns at the module-level ``indices`` hold those three fields
            and whose last column holds the predictions.

    Returns:
        The value returned by ``stratified_concordance_index``.
    """
    if not isinstance(y_processed, np.ndarray):
        solution, predicted = y_processed, y_processed['predictions']
    else:
        # Rebuild a labelled frame from the array columns selected by `indices`.
        solution = pd.DataFrame(columns=['efs', 'efs_time', 'race_group'], data=y_processed[:, indices])
        predicted = y_processed[:, -1]

    return stratified_concordance_index(solution, predicted, 'efs', 'efs_time', 'race_group')

In [8]:
from sklearn.model_selection import KFold
from ktools.fitting.cross_validation_executor import CrossValidationExecutor


kf = KFold(5, shuffle=True, random_state=42)
cv_results = CrossValidationExecutor(cb_model,
                              sci_metric,
                              kf,
                              train_features,
                              verbose=2).run(X, y, output_transform_list=[output_transform])
# The LGBM cell above unpacks four values from the same .run(...) call, so take
# the first three here; this works whether run() returns three or four items.
# NOTE(review): confirm the current return shape of CrossValidationExecutor.run.
score_tuple, oof_predictions, model_list = cv_results[:3]

  solution_group_dict = dict(solution.groupby([group_col_name]).groups)


The CV results of the current fold is 0.6588076178652762


  solution_group_dict = dict(solution.groupby([group_col_name]).groups)


The CV results of the current fold is 0.6613100704558396


  solution_group_dict = dict(solution.groupby([group_col_name]).groups)


The CV results of the current fold is 0.6620737003779054


  solution_group_dict = dict(solution.groupby([group_col_name]).groups)


The CV results of the current fold is 0.6555574046534938
The CV results of the current fold is 0.6560811964346313


  solution_group_dict = dict(solution.groupby([group_col_name]).groups)


####################################################################################################
OOF prediction score :  0.6601325635219006
Mean 5-cv results : 0.6587659979574292 +- 0.0026426563379750474
####################################################################################################


In [None]:
# Average the fold models' test predictions, each weighted by 1 / n_splits.
n_folds = kf.get_n_splits()
test_predictions = np.zeros(test_df.shape[0])
for fold_model in model_list:
    test_predictions += fold_model.predict(test_df) / n_folds

In [None]:
model_string = "naive_cat"

# `sample_submission_file` was not defined in any cell of this notebook, so a
# fresh kernel stopped here with NameError.
# NOTE(review): the path below assumes the sample submission sits next to
# train.csv / test.csv; confirm the actual file name and location.
sample_submission_file = "../data/post_hct_survival/sample_submission.csv"

sample_sub = pd.read_csv(sample_submission_file)
# Overwrite the second column of the sample submission with the averaged predictions.
sample_sub.iloc[:, 1] =  test_predictions
sample_sub.to_csv(f"{model_string}_submission.csv", index=False)
sample_sub.head()