In [19]:
import pandas as pd
import numpy as np
import lifelines
from lifelines import KaplanMeierFitter, NelsonAalenFitter, CoxPHFitter
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression
from lifelines.utils import concordance_index
# import seaborn as sns
# from sklearn.impute import SimpleImputer
from catboost import CatBoostRegressor
# import autogluon.tabular as agt
import lightgbm as lgb
from termcolor import colored
import warnings
from sksurv.linear_model import CoxnetSurvivalAnalysis
import tqdm
from openfe import OpenFE, transform
from scipy.stats import rankdata

warnings.filterwarnings('ignore')

In [3]:
def score_(y_true, y_pred, return_details = False):
    """Race-stratified concordance index: mean minus standard deviation over race groups.

    For every entry of the notebook-level ``race_groups`` list the rows of
    ``y_true`` belonging to that race are selected and lifelines'
    ``concordance_index`` is evaluated on them.

    Parameters
    ----------
    y_true : object exposing ``race_group``, ``efs_time`` and ``efs`` attributes
        (the notebook passes a DataFrame slice with exactly these columns).
    y_pred : array supporting boolean-mask indexing, aligned row-for-row with
        ``y_true``. It is negated before scoring, so a larger prediction is
        treated as a smaller predicted survival score.
    return_details : bool, default False
        When true, also return the mean and the standard deviation.

    Returns
    -------
    ``mean - std`` of the per-race C-indices, or the tuple
    ``(mean - std, mean, std)`` when ``return_details`` is true.
    """
    race_of_row = y_true.race_group.values
    per_race_cindex = [
        concordance_index(
            y_true.efs_time[race_of_row == race],
            - y_pred[race_of_row == race],
            y_true.efs[race_of_row == race],
        )
        for race in race_groups
    ]

    mean_cindex = np.mean(per_race_cindex)
    std_cindex = np.std(per_race_cindex)
    stratified = mean_cindex - std_cindex

    if not return_details:
        return stratified
    return stratified, mean_cindex, std_cindex

In [4]:
# Column dictionary shipped with the data: one row per variable, with its declared type.
data_description = pd.read_csv('HCT/data_dictionary.csv')
# Training table, indexed by the 'ID' column.
train = pd.read_csv('HCT/train.csv', index_col='ID')

In [5]:
# Distinct race_group values, in order of first appearance; score_ iterates over this list.
race_groups = [*train["race_group"].unique()]

In [6]:
# Split the variables listed in the data dictionary into categorical and
# numerical feature names; the targets 'efs' and 'efs_time' end up in neither list.
cat_cols = [
    variable
    for variable, var_type in data_description[['variable', 'type']].values
    if var_type == 'Categorical' and variable != 'efs'
]
num_cols = [
    variable
    for variable, var_type in data_description[['variable', 'type']].values
    if var_type != 'Categorical' and variable not in ('efs_time', 'efs')
]

In [7]:
def label_encoding(df: pd.DataFrame):
    """Add a signed-time ``label`` column to ``df`` in place and return ``df``.

    The label is ``+efs_time`` for rows with ``efs == 1``, ``-efs_time`` for
    rows with ``efs == 0`` and ``0.0`` for any other ``efs`` value.

    The column is built as float64 in one step. Creating it as the integer
    ``0`` and then assigning float times into it relies on an implicit
    int -> float upcast that pandas has deprecated (and rejects in newer
    releases).

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``efs`` and ``efs_time``. Modified in place.

    Returns
    -------
    pd.DataFrame
        The same object that was passed in, now with a float ``label`` column.
    """
    is_event = df['efs'].eq(1).to_numpy()
    is_censored = df['efs'].eq(0).to_numpy()
    times = df['efs_time'].to_numpy(dtype=float)

    # Positional (array-based) assignment: no index alignment is involved.
    df['label'] = np.where(is_event, times, np.where(is_censored, -times, 0.0))
    return df

In [8]:
# Attach the signed-time 'label' column; label_encoding modifies and returns the same frame.
train = label_encoding(train)

In [9]:
# Nelson-Aalen target: the negated cumulative hazard evaluated at each row's own
# efs_time, with rows where efs == 0 pushed a further 0.1 lower. The estimator
# is fitted on the whole training frame.
naf = NelsonAalenFitter()
naf.fit(durations=train['efs_time'], event_observed=train['efs'])
train['naf_label'] = (
    -naf.cumulative_hazard_at_times(train['efs_time']).values
    - 0.1 * (train['efs'] == 0).to_numpy()
)

In [10]:
# Kaplan-Meier target: the estimated survival function evaluated at each row's
# own efs_time, with rows where efs == 0 lowered by 0.1. The estimator is
# fitted on the whole training frame.
kmf = KaplanMeierFitter()
kmf.fit(durations=train['efs_time'], event_observed=train['efs'])
train['km_label'] = (
    kmf.survival_function_at_times(train['efs_time']).values
    - 0.1 * (train['efs'] == 0).to_numpy()
)

In [1]:
# Hyper-parameters of the ensemble members built inside cv_ensemble / cv_ensemble_race.
# The float in the trailing comment of each dict is presumably the CV score obtained
# with that configuration -- TODO confirm.
#
# NOTE(review): the CatBoost dicts capture the global `cat_cols` when this cell runs,
# so the cell that builds `cat_cols` must be executed first; otherwise this cell
# raises NameError.

# CatBoost regressors (the CV functions add loss_function='RMSE').
cat_naf0_params = {'depth': 7,
                   'random_state': 48, 
                   'iterations': 10_000, 
                   'cat_features': cat_cols, 
                   'verbose': 1000} # 0.6752753459266848
cat_naf1_params = {'random_state': 26, 
                   'iterations': 9889, 
                   'depth': 7,
                   'verbose': 1000,
                   'cat_features': cat_cols} # 0.6758788851732089
cat_km_params = {'random_state': 3, 
                 'iterations': 8234, 
                 'depth': 8, 
                 'cat_features': cat_cols, 
                 'verbose': 1000} # 0.6750865340917307
# XGBoost regressors; enable_categorical=True means pandas 'category' columns are used as-is.
xgb_naf_params = {'max_depth': 6,
                 'learning_rate': 0.0032301744306691763,
                 'n_estimators': 5225,
                 'min_child_weight': 51,
                 'colsample_bytree': 0.3646259272612244,
                 'subsample': 0.5286721292338277,
                 'random_state': 74,
                 'enable_categorical': True} # 0.6752506355127709
xgb_km_params = {'max_depth': 3,
                 'learning_rate': 0.003987965677070203,
                 'n_estimators': 8510,
                 'min_child_weight': 140,
                 'colsample_bytree': 0.8698994335997811,
                 'subsample': 0.583154256898487,
                 'random_state': 13,
                 'objective': 'reg:squarederror',
                 'enable_categorical': True} # 0.6742435014513065
# LightGBM boosters, trained through lgb.train with the n_rounds_lgb_* values below.
# NOTE(review): each dict also sets 'n_estimators', which LightGBM documents as an
# alias of num_iterations; check whether that value or the n_rounds_lgb_* argument
# is the one that actually takes effect in lgb.train.
lgb_naf_params = {'max_depth': 9,
                 'learning_rate': 0.00256889248188078,
                 'n_estimators': 8022,
                 'min_child_weight': 68,
                 'colsample_bytree': 0.29850229921441546,
                 'subsample': 0.31325940646236816,
                 'random_state': 91, 
                 'verbose': -1} # 0.6752369559836727
n_rounds_lgb_naf = 3580
lgb_km_params = {'max_depth': 7,
                 'learning_rate': 0.0025190259392153425,
                 'n_estimators': 9589,
                 'min_child_weight': 83,
                 'colsample_bytree': 0.24291593810960194,
                 'subsample': 0.39946192482546417,
                 'random_state': 17,
                 'verbose': -1} # 0.6747976782371884
n_rounds_lgb_km = 9656
# Not referenced by any other cell visible in this notebook.
cat_naf_race_int = {'random_state': 18, 'iterations': 8999, 'depth': 6} # 0.6762042400449366


NameError: name 'cat_cols' is not defined

In [31]:
def cv_ensemble(ranked: bool | str = 'both'):
    """Cross-validate the seven-model averaging ensemble and print its scores.

    A 3-fold ``KFold`` split is run over the notebook-level ``train`` frame.
    In every fold two LightGBM boosters, three CatBoost regressors and two
    XGBoost regressors are fitted on the ``naf_label`` / ``km_label`` targets,
    their validation predictions are averaged, and the average is scored with
    ``score_``. Per-fold scores, their mean and the score of the pooled
    out-of-fold predictions are printed; nothing is returned.

    Relies on notebook globals: ``train``, ``score_`` and the ``*_params`` /
    ``n_rounds_lgb_*`` hyper-parameter objects.

    Parameters
    ----------
    ranked : bool or str, default 'both'
        * falsy -- score only the plain average of the raw predictions;
        * ``'both'`` -- additionally score the average of the per-model ranks
          (``scipy.stats.rankdata``) and print both;
        * any other truthy value -- print only the rank-averaged scores.
    """
    target_cols = ['efs', 'efs_time', 'label', 'km_label', 'naf_label']
    eval_cols = ['efs', 'efs_time', 'race_group']
    cv = KFold(n_splits=3)

    # Out-of-fold predictions are stored by row position, so they stay aligned
    # with `train` regardless of the order in which the folds are produced.
    oof_preds = np.full(len(train), np.nan)
    oof_preds_r = np.full(len(train), np.nan)
    scores = []
    scores_r = []

    for i, (train_indexes, val_indexes) in enumerate(cv.split(train)):
        train_data = train.iloc[train_indexes]
        val_data = train.iloc[val_indexes]
        # Object-dtype feature columns are the ones handled as categorical in this fold.
        fold_cat_cols = train_data.drop(columns=target_cols).select_dtypes(include=object).columns.values.tolist()

        # CatBoost frames: categorical columns as strings. Missing values are
        # filled *before* the string cast; after it there is nothing left to fill.
        cat_train = train_data.drop(columns=target_cols)
        cat_test = val_data.drop(columns=target_cols)
        cat_train[fold_cat_cols] = cat_train[fold_cat_cols].fillna('NaN').astype('str')
        cat_test[fold_cat_cols] = cat_test[fold_cat_cols].fillna('NaN').astype('str')

        # LightGBM / XGBoost frames: pandas 'category' columns. The validation
        # fold reuses the training fold's category dtypes so that the integer
        # codes mean the same thing at fit and predict time; XGBoost with
        # enable_categorical can consume the codes as given. Values unseen in
        # the training fold become missing.
        tree_train = train_data.drop(columns=target_cols)
        tree_train[fold_cat_cols] = tree_train[fold_cat_cols].astype('category')
        fold_cat_dtypes = {col: tree_train[col].dtype for col in fold_cat_cols}
        tree_test = val_data.drop(columns=target_cols).astype(fold_cat_dtypes)

        train_lgb_naf = lgb.Dataset(tree_train, label=train_data['naf_label'], categorical_feature=fold_cat_cols)
        train_lgb_km = lgb.Dataset(tree_train, label=train_data['km_label'], categorical_feature=fold_cat_cols)

        # NOTE(review): the params dicts also carry 'n_estimators'; confirm whether
        # it or the n_rounds_lgb_* argument decides the number of boosting rounds.
        best_naf0 = lgb.train(lgb_naf_params, train_lgb_naf, n_rounds_lgb_naf, valid_sets=[train_lgb_naf])
        best_km = lgb.train(lgb_km_params, train_lgb_km, n_rounds_lgb_km, valid_sets=[train_lgb_km])

        cat_naf = CatBoostRegressor(**cat_naf0_params, loss_function='RMSE')
        cat_km = CatBoostRegressor(**cat_km_params, loss_function='RMSE')
        cat_naf1 = CatBoostRegressor(**cat_naf1_params, loss_function='RMSE')

        xgb_naf0 = xgb.XGBRegressor(**xgb_naf_params)
        xgb_km = xgb.XGBRegressor(**xgb_km_params)

        cat_naf.fit(cat_train, train_data['naf_label'])
        cat_km.fit(cat_train, train_data['km_label'])
        cat_naf1.fit(cat_train, train_data['naf_label'])

        xgb_naf0.fit(tree_train, train_data['naf_label'])
        xgb_km.fit(tree_train, train_data['km_label'])

        fold_preds = [
            cat_naf.predict(cat_test),
            cat_naf1.predict(cat_test),
            cat_km.predict(cat_test),
            best_naf0.predict(tree_test),
            best_km.predict(tree_test),
            xgb_naf0.predict(tree_test),
            xgb_km.predict(tree_test),
        ]

        # Plain blend: arithmetic mean of the raw model outputs.
        preds = sum(fold_preds) / len(fold_preds)
        score = score_(val_data[eval_cols], preds)
        scores.append(score)
        oof_preds[val_indexes] = preds

        if ranked:
            # Rank blend: every model's output is replaced by its rank within
            # the fold before averaging, which removes scale differences.
            preds_r = sum(rankdata(p) for p in fold_preds) / len(fold_preds)
            score_r = score_(val_data[eval_cols], preds_r)
            scores_r.append(score_r)
            oof_preds_r[val_indexes] = preds_r

            if ranked != 'both':
                print(colored(f'Fold #{i} score ranked: {score_r}', 'black', 'on_red'))
            else:
                print(colored(f'Fold #{i} score: {score}\tFold #{i} score ranked: {score_r}', 'black', 'on_red'))
        else:
            print(colored(f'Fold #{i} score: {score}', 'black', 'on_red'))

    print()
    print()
    # The mean is taken over the folds actually run rather than a literal 3.
    if not ranked or ranked == 'both':
        print(colored(f"Mean C-index: {sum(scores) / len(scores)}\tCV C-index: {score_(train[eval_cols], oof_preds)}", 'grey', 'on_red', attrs=['bold']))
    if ranked:
        print(colored(f"RANKED: Mean C-index: {sum(scores_r) / len(scores_r)}\tCV C-index: {score_(train[eval_cols], oof_preds_r)}", 'grey', 'on_red', attrs=['bold']))

In [37]:
# Display the age_at_hct column of the training frame.
train['age_at_hct']

ID
0         9.942
1        43.705
2        33.997
3        43.245
4        29.740
          ...  
28795    51.136
28796    18.075
28797    51.005
28798     0.044
28799     1.035
Name: age_at_hct, Length: 28800, dtype: float64

In [39]:
# Rank age_at_hct among the rows with efs == 0; the fractional ranks in the output come from ties.
rankdata(train.loc[train['efs'] == 0, 'age_at_hct'].values)

array([2884. , 6162. , 8136. , ..., 9425. ,  387.5, 1114.5])

In [41]:
def cv_ensemble_race(ranked: bool | str = 'both'):
    """Cross-validate the seven-model ensemble with predictions ranked inside each race group.

    A 3-fold ``KFold`` split is run over the notebook-level ``train`` frame.
    In every fold two LightGBM boosters, three CatBoost regressors and two
    XGBoost regressors are fitted on the ``naf_label`` / ``km_label`` targets
    and their validation predictions are averaged. The averaged predictions
    are then replaced, separately for every entry of the notebook-level
    ``race_groups`` list, by their ranks within that race
    (``scipy.stats.rankdata``) and scored with ``score_``.

    The per-fold scores, their mean and the score of the pooled out-of-fold
    predictions are printed; nothing is returned. The pooled score is computed
    on the same race-ranked predictions as the per-fold scores.
    NOTE(review): ranks from different folds are pooled unnormalised; confirm
    this is the intended pooled metric.

    Relies on notebook globals: ``train``, ``race_groups``, ``score_`` and the
    ``*_params`` / ``n_rounds_lgb_*`` hyper-parameter objects.

    Parameters
    ----------
    ranked : bool or str, default 'both'
        Accepted for signature parity with ``cv_ensemble``; not used here.
    """
    target_cols = ['efs', 'efs_time', 'label', 'km_label', 'naf_label']
    eval_cols = ['efs', 'efs_time', 'race_group']
    cv = KFold(n_splits=3)

    # Out-of-fold predictions are stored by row position, so they stay aligned
    # with `train` regardless of the order in which the folds are produced.
    oof_preds = np.full(len(train), np.nan)
    scores = []

    for i, (train_indexes, val_indexes) in enumerate(cv.split(train)):
        train_data = train.iloc[train_indexes]
        val_data = train.iloc[val_indexes]
        # Object-dtype feature columns are the ones handled as categorical in this fold.
        fold_cat_cols = train_data.drop(columns=target_cols).select_dtypes(include=object).columns.values.tolist()

        # CatBoost frames: categorical columns as strings. Missing values are
        # filled *before* the string cast; after it there is nothing left to fill.
        cat_train = train_data.drop(columns=target_cols)
        cat_test = val_data.drop(columns=target_cols)
        cat_train[fold_cat_cols] = cat_train[fold_cat_cols].fillna('NaN').astype('str')
        cat_test[fold_cat_cols] = cat_test[fold_cat_cols].fillna('NaN').astype('str')

        # LightGBM / XGBoost frames: pandas 'category' columns. The validation
        # fold reuses the training fold's category dtypes so that the integer
        # codes mean the same thing at fit and predict time; XGBoost with
        # enable_categorical can consume the codes as given. Values unseen in
        # the training fold become missing.
        tree_train = train_data.drop(columns=target_cols)
        tree_train[fold_cat_cols] = tree_train[fold_cat_cols].astype('category')
        fold_cat_dtypes = {col: tree_train[col].dtype for col in fold_cat_cols}
        tree_test = val_data.drop(columns=target_cols).astype(fold_cat_dtypes)

        train_lgb_naf = lgb.Dataset(tree_train, label=train_data['naf_label'], categorical_feature=fold_cat_cols)
        train_lgb_km = lgb.Dataset(tree_train, label=train_data['km_label'], categorical_feature=fold_cat_cols)

        # NOTE(review): the params dicts also carry 'n_estimators'; confirm whether
        # it or the n_rounds_lgb_* argument decides the number of boosting rounds.
        best_naf0 = lgb.train(lgb_naf_params, train_lgb_naf, n_rounds_lgb_naf, valid_sets=[train_lgb_naf])
        best_km = lgb.train(lgb_km_params, train_lgb_km, n_rounds_lgb_km, valid_sets=[train_lgb_km])

        cat_naf = CatBoostRegressor(**cat_naf0_params, loss_function='RMSE')
        cat_km = CatBoostRegressor(**cat_km_params, loss_function='RMSE')
        cat_naf1 = CatBoostRegressor(**cat_naf1_params, loss_function='RMSE')

        xgb_naf0 = xgb.XGBRegressor(**xgb_naf_params)
        xgb_km = xgb.XGBRegressor(**xgb_km_params)

        cat_naf.fit(cat_train, train_data['naf_label'])
        cat_km.fit(cat_train, train_data['km_label'])
        cat_naf1.fit(cat_train, train_data['naf_label'])

        xgb_naf0.fit(tree_train, train_data['naf_label'])
        xgb_km.fit(tree_train, train_data['km_label'])

        fold_preds = [
            cat_naf.predict(cat_test),
            cat_naf1.predict(cat_test),
            cat_km.predict(cat_test),
            best_naf0.predict(tree_test),
            best_km.predict(tree_test),
            xgb_naf0.predict(tree_test),
            xgb_km.predict(tree_test),
        ]
        preds = sum(fold_preds) / len(fold_preds)

        # Replace the blended predictions by their rank within each race group.
        # This is done on a private array so the slice of `train` is never written to.
        fold_races = val_data['race_group'].values
        race_ranked = preds.astype(float)
        for r in race_groups:
            in_race = fold_races == r
            race_ranked[in_race] = rankdata(preds[in_race])

        score = score_(val_data[eval_cols], race_ranked)
        scores.append(score)
        oof_preds[val_indexes] = race_ranked

        print(colored(f'Fold #{i} score: {score}', 'black', 'on_red'))

    # The mean is taken over the folds actually run rather than a literal 3.
    print(colored(f"Mean C-index: {sum(scores) / len(scores)}\tCV C-index: {score_(train[eval_cols], oof_preds)}", 'grey', 'on_red', attrs=['bold']))

In [30]:
# Run the 3-fold CV of the blended ensemble with the default ranked='both' (prints raw and rank-averaged scores).
cv_ensemble()

Learning rate set to 0.010045
0:	learn: 0.3135392	total: 27.7ms	remaining: 4m 37s
1000:	learn: 0.2718191	total: 25.3s	remaining: 3m 47s
2000:	learn: 0.2608548	total: 52.1s	remaining: 3m 28s
3000:	learn: 0.2533346	total: 1m 20s	remaining: 3m 6s
4000:	learn: 0.2471765	total: 1m 48s	remaining: 2m 42s
5000:	learn: 0.2416894	total: 2m 16s	remaining: 2m 16s
6000:	learn: 0.2365276	total: 2m 45s	remaining: 1m 50s
7000:	learn: 0.2318358	total: 3m 14s	remaining: 1m 23s
8000:	learn: 0.2271718	total: 3m 43s	remaining: 55.9s
9000:	learn: 0.2229196	total: 4m 13s	remaining: 28.1s
9999:	learn: 0.2188738	total: 4m 42s	remaining: 0us
Learning rate set to 0.011764
0:	learn: 0.2171138	total: 56.2ms	remaining: 7m 42s
1000:	learn: 0.1813706	total: 1m 6s	remaining: 8m 3s
2000:	learn: 0.1712642	total: 2m 15s	remaining: 7m 2s
3000:	learn: 0.1641920	total: 3m 24s	remaining: 5m 56s
4000:	learn: 0.1582740	total: 4m 33s	remaining: 4m 49s
5000:	learn: 0.1527425	total: 5m 42s	remaining: 3m 41s
6000:	learn: 0.1475692

In [42]:
# Run the 3-fold CV variant that ranks the blended predictions inside each race group.
cv_ensemble_race()

Learning rate set to 0.010045
0:	learn: 0.3135392	total: 36.1ms	remaining: 6m 1s
1000:	learn: 0.2718191	total: 24.5s	remaining: 3m 40s
2000:	learn: 0.2608548	total: 50s	remaining: 3m 19s
3000:	learn: 0.2533346	total: 1m 16s	remaining: 2m 58s
4000:	learn: 0.2471765	total: 1m 43s	remaining: 2m 35s
5000:	learn: 0.2416894	total: 2m 11s	remaining: 2m 11s
6000:	learn: 0.2365276	total: 2m 39s	remaining: 1m 46s
7000:	learn: 0.2318358	total: 3m 7s	remaining: 1m 20s
8000:	learn: 0.2271718	total: 3m 35s	remaining: 53.9s
9000:	learn: 0.2229196	total: 4m 4s	remaining: 27.1s
9999:	learn: 0.2188738	total: 4m 33s	remaining: 0us
Learning rate set to 0.011764
0:	learn: 0.2171138	total: 59.9ms	remaining: 8m 13s
1000:	learn: 0.1813706	total: 1m 5s	remaining: 7m 55s
2000:	learn: 0.1712642	total: 2m 13s	remaining: 6m 56s
3000:	learn: 0.1641920	total: 3m 21s	remaining: 5m 51s
4000:	learn: 0.1582740	total: 4m 29s	remaining: 4m 45s
5000:	learn: 0.1527425	total: 5m 37s	remaining: 3m 38s
6000:	learn: 0.1475692	t


KeyboardInterrupt



In [32]:
# Arithmetic mean of three scores typed in by hand -- presumably the per-fold results of an earlier run; TODO confirm.
(0.6724430787063874 + 0.6839399751459193 + 0.6757035136638331) / 3

0.6773621891720466