In [2]:
!pip install --upgrade lifelines

Collecting lifelines
  Downloading lifelines-0.30.0-py3-none-any.whl.metadata (3.2 kB)
Collecting autograd-gamma>=0.3 (from lifelines)
  Downloading autograd-gamma-0.5.0.tar.gz (4.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting formulaic>=0.2.2 (from lifelines)
  Downloading formulaic-1.1.1-py3-none-any.whl.metadata (6.9 kB)
Collecting interface-meta>=1.2.0 (from formulaic>=0.2.2->lifelines)
  Downloading interface_meta-1.3.0-py3-none-any.whl.metadata (6.7 kB)
Downloading lifelines-0.30.0-py3-none-any.whl (349 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m349.3/349.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hDownloading formulaic-1.1.1-py3-none-any.whl (115 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.7/115.7 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading interface_meta-1.3.0-py3-none-any.whl (14 kB)
Building wheels for collected packages: autograd-gamma
  Building wheel for autograd

In [1]:
import os
import joblib
import numpy as np
import pandas as pd
import polars as pl

import pandas.api.types
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

import lightgbm as lgb
import xgboost as xgb
import catboost as cb

import lifelines
from lifelines.utils import concordance_index

import sys

# Record the interpreter and library versions this notebook ran against.
print(sys.version)
for i in [np, pd, pl, mpl, sns, lifelines, sklearn, lgb, xgb, cb]:
    try:
        print(i.__name__, i.__version__)
    except Exception:
        # Best-effort: a package whose version lookup fails is still listed by
        # name. Catching Exception instead of using a bare `except:` lets
        # KeyboardInterrupt and SystemExit propagate.
        print(i.__name__)

3.12.6 (main, Sep 30 2024, 02:19:13) [GCC 9.4.0]
numpy 1.26.4
pandas 2.2.3
polars 1.12.0
matplotlib 3.8.4
seaborn 0.13.2
lifelines 0.30.0
sklearn 1.5.2
lightgbm
xgboost 2.1.2
catboost 1.2.5


In [32]:
import dproc, sgutil, sgpp, sgml, custpp
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate, KFold, ShuffleSplit, train_test_split
from sklearn.impute import SimpleImputer
from lifelines import NelsonAalenFitter, KaplanMeierFitter

In [3]:
# Project directories, relative to the notebook's working directory.
data_path = 'data'
img_path = 'img'
result_path = 'result'
model_path = 'model'

# Cache helper from sgutil, constructed with the image and result directories.
sc = sgutil.SGCache(img_path, result_path)

# NOTE: joblib.load unpickles arbitrary Python objects - only load artifacts
# that were produced by this project / come from a trusted source.
p3 = joblib.load(os.path.join(model_path, 'p3.joblib'))
# The loaded object is applied to the training CSV path to build df_train.
df_train = p3.transform([os.path.join(data_path, 'train.csv')])

In [4]:
def get_naf(df, time_col='efs_time', event_col='efs'):
    """Return the negated Nelson-Aalen cumulative hazard at each row's duration.

    A NelsonAalenFitter is fitted on ``df[time_col]`` (durations) and
    ``df[event_col]`` (event indicator); the fitted cumulative hazard is then
    evaluated at every value of ``df[time_col]`` and multiplied by -1.

    Parameters
    ----------
    df : frame-like object indexable by column name.
    time_col : name of the duration column (default ``'efs_time'``).
    event_col : name of the event-observed column (default ``'efs'``).

    Returns
    -------
    The ``.values`` of the evaluated cumulative hazard, negated; one entry per
    row of ``df``.
    """
    naf = NelsonAalenFitter()
    # Use the caller-supplied column names; they were previously ignored in
    # favour of hard-coded 'efs_time' / 'efs'.
    naf.fit(durations=df[time_col], event_observed=df[event_col])
    return naf.cumulative_hazard_at_times(df[time_col]).values * -1
# Add the 'naf' regression target (output of get_naf) as a column of df_train.
df_train['naf'] = get_naf(df_train, 'efs_time', 'efs')

In [5]:
# Feature-name groups combined into the model inputs in the cells below.
# NOTE(review): the group names suggest the value type of each column
# (boolean, three-valued, nominal, ...) - confirm against the preprocessing step.
X_bool = ['graft_type', 'prod_type']

X_tri = [
    'arrhythmia', 'cardiac', 'diabetes',
    'hepatic_mild', 'hepatic_severe', 'in_vivo_tcd',
    'melphalan_dose', 'mrd_hct', 'obesity',
    'peptic_ulcer', 'prior_tumor', 'psych_disturb',
    'pulm_moderate', 'pulm_severe', 'renal_issue',
    'rheum_issue', 'rituximab', 'vent_hist',
]

# Used as the categorical feature set ('X_cat') by the models below.
X_nom = [
    'cmv_status', 'conditioning_intensity', 'cyto_score',
    'cyto_score_detail', 'donor_related', 'dri_score',
    'ethnicity', 'gvhd_proph', 'prim_disease_hct',
    'race_group', 'sex_match', 'tbi_status',
    'tce_div_match', 'tce_imm_match', 'tce_match',
]

# '<name>_na' columns; each base name also appears in X_tri.
X_na = [
    'arrhythmia_na', 'cardiac_na', 'diabetes_na',
    'hepatic_mild_na', 'hepatic_severe_na', 'obesity_na',
    'peptic_ulcer_na', 'prior_tumor_na', 'psych_disturb_na',
    'pulm_moderate_na', 'pulm_severe_na', 'renal_issue_na',
    'rheum_issue_na',
]

X_cont = ['age_at_hct', 'donor_age']

X_int = [
    'comorbidity_score', 'hla_high_res_10', 'hla_high_res_6', 'hla_high_res_8',
    'hla_low_res_10', 'hla_low_res_6', 'hla_low_res_8', 'hla_match_a_high',
    'hla_match_a_low', 'hla_match_b_high', 'hla_match_b_low', 'hla_match_drb1_low',
    'hla_match_c_high', 'hla_match_c_low', 'hla_match_dqb1_high', 'hla_match_dqb1_low',
    'hla_match_drb1_high', 'hla_nmdp_6', 'karnofsky_score', 'year_hct',
]


In [6]:
def score(df, prds):
    return df.groupby('race_group', observed=True).apply(
        lambda x: concordance_index(x['efs_time'], -prds.loc[x.index], x['efs']), include_groups=False
    ).pipe(
        lambda x: float(x.mean() - x.std(ddof=0))
    )
def get_validation_splitter(validation_fraction, random_state=None):
    """Build a one-argument splitter that holds out a validation fraction.

    Parameters
    ----------
    validation_fraction : passed to ``train_test_split`` as ``test_size``.
    random_state : optional seed forwarded to ``train_test_split`` so the split
        can be made reproducible. The default ``None`` keeps the previous
        unseeded behaviour.

    Returns
    -------
    A callable ``f(x)`` returning ``train_test_split(x, ...)``.
    """
    def split(x):
        return train_test_split(
            x, test_size=validation_fraction, random_state=random_state
        )
    return split

def _predict_as_series(m, df, X):
    """Predict with model ``m`` on columns ``X`` of ``df``, indexed like ``df``."""
    return pd.Series(m.predict(df[X]), index=df.index)


# Settings handed to every sgml.CVModel that is fitted on the 'naf' column.
config = {
    'predict_func': _predict_as_series,
    'score_func': score,
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(),
    'return_train_scores': True,
    'y': 'naf',
}

# Seeded splitters: 5-fold CV, and a single shuffle split (only referenced by
# the commented-out adhoc runs below).
kf = KFold(n_splits=5, shuffle=True, random_state=123)
ss = ShuffleSplit(n_splits=1, random_state=123)

# sgml adapters wrapping each gradient-boosting regressor class.
lgb_adapter = sgml.LGBMAdapter(lgb.LGBMRegressor)
xgb_adapter = sgml.XGBAdapter(xgb.XGBRegressor)
cb_adapter = sgml.CBAdapter(cb.CatBoostRegressor)

# NAF: models trained on the 'naf' target (negated Nelson-Aalen cumulative hazard)

In [9]:
# LightGBM regressor on the 'naf' target, cross-validated with `kf`.
# NOTE(review): load_if_exists() presumably restores a previously saved run - verify in sgml.
lgb6 = sgml.CVModel('model', 'lgb6', kf, config, lgb_adapter).load_if_exists()
hparams = {
    'model_params': {
        'num_leaves': 15,
        'n_estimators': 3000,
        'colsample_bytree': 0.25,
        'learning_rate': 0.02,
    },
    'X_num': X_tri + X_cont + X_int + X_na + X_bool,
    'X_cat': X_nom,
    'cat': {'handle_unknown': 'use_encoded_value', 'unknown_value': -1},
    # 'validation_fraction': 0.1
}
# Single-split alternative: result = lgb6.adhoc(df_train, ss, hparams)
result = lgb6.cv(df_train, hparams)
# Mean of the reported validation and training scores.
# (per-fold detail: result['model_result'][0]['valid_result'].idxmin())
(np.mean(result['valid_scores']), np.mean(result['train_scores']))

(0.6710307291576871, 0.7719411070846158)

In [11]:
# Score all of lgb6's stored CV predictions at once.
# NOTE(review): cv_best_['prd'] is presumably the out-of-fold predictions - verify in sgml.
lgb6_prd = lgb6.cv_best_['prd'].sort_index()
score(df_train, lgb6_prd)

0.6749783679685392

In [16]:
# XGBoost regressor on the 'naf' target, cross-validated with `kf`.
xgb6 = sgml.CVModel('model', 'xgb6', kf, config, xgb_adapter).load_if_exists()
hparams = {
    'model_params': {
        'max_depth': 4,
        'colsample_bytree': 0.25,
        'subsample': 0.9,
        'n_estimators': 3000,
        'learning_rate': 0.02,
    },
    'X_num': X_tri + X_cont + X_int + X_na + X_bool,
    'X_cat': X_nom,
    'cat': {'handle_unknown': 'ignore'},
    # 'validation_fraction': 0.1,
}
# Single-split alternative: result = xgb6.adhoc(df_train, ss, hparams, device = 'cuda')
result = xgb6.cv(df_train, hparams, device='cuda')
# Mean of the reported validation scores.
# (per-fold detail: result['model_result'][0]['valid_result'].idxmin())
np.mean(result['valid_scores'])

0.66963522328283

In [18]:
# Score all of xgb6's stored CV predictions at once.
# NOTE(review): cv_best_['prd'] is presumably the out-of-fold predictions - verify in sgml.
xgb6_prd = xgb6.cv_best_['prd'].sort_index()
score(df_train, xgb6_prd)

0.674085593192686

In [22]:
# CatBoost regressor on the 'naf' target, cross-validated with `kf`.
cb6 = sgml.CVModel('model', 'cb6', kf, config, cb_adapter).load_if_exists()
hparams = {
    'model_params': {
        'max_depth': 6,
        'n_estimators': 3500,
        'learning_rate': 0.03,
    },
    'X_num': X_tri + X_cont + X_int + X_na + X_bool,
    'X_cat': X_nom,
    # 'validation_fraction': 0.1,
}
# Single-split alternative: result = cb6.adhoc(df_train, ss, hparams, task_type='GPU')
result = cb6.cv(df_train, hparams, task_type='GPU')
# Mean of the reported validation scores.
# (per-fold detail: result['model_result'][0]['valid_result'].idxmin())
np.mean(result['valid_scores'])

0.6691212050264922

In [23]:
# Score all of cb6's stored CV predictions at once.
# NOTE(review): cv_best_['prd'] is presumably the out-of-fold predictions - verify in sgml.
cb6_prd = cb6.cv_best_['prd'].sort_index()
score(df_train, cb6_prd)

0.6735768673796051

In [70]:
# Stack the CV predictions of the three 'naf' models; the result is obtained
# through sc.cache_result under the key 'phase10_stk'.
models = [lgb6, xgb6, cb6]
df_stk = sc.cache_result('phase10_stk', lambda: sgml.stack_cv(models))

In [73]:
# Score a fixed-weight linear blend of the stacked 'naf' model predictions.
blend_columns = ['lgb6', 'xgb6', 'cb6']
blend_weights = [0.5, 0.1, 0.4]
score(df_train, df_stk.sort_index()[blend_columns].dot(blend_weights))

0.6765512805195582

In [23]:
# Call train() on every model in `models` with df_train, adding the
# accelerator keyword that matches the model's name prefix.
for i in models:
    train_kwargs = {}
    if i.name.startswith('cb'):
        train_kwargs['task_type'] = 'GPU'
    elif i.name.startswith('xgb'):
        train_kwargs['device'] = 'cuda'
    i.train(df_train, **train_kwargs)

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

In [35]:
def transform_survival_probability(df, time_col='efs_time', event_col='efs'):
    """Return the Kaplan-Meier survival estimate at each row's duration.

    A KaplanMeierFitter is fitted on ``df[time_col]`` (durations) and
    ``df[event_col]`` (event indicator) and its survival function is then
    evaluated at every value of ``df[time_col]``.

    Parameters
    ----------
    df : frame-like object indexable by column name.
    time_col : name of the duration column (default ``'efs_time'``).
    event_col : name of the event-observed column (default ``'efs'``).

    Returns
    -------
    The ``.values`` of the evaluated survival function; one entry per row.
    """
    estimator = KaplanMeierFitter()
    estimator.fit(df[time_col], df[event_col])
    return estimator.survival_function_at_times(df[time_col]).values
# Add the 'kmf' regression target (output of transform_survival_probability) to df_train.
df_train['kmf'] = transform_survival_probability(df_train, 'efs_time', 'efs')

# Settings for the models fitted on the 'kmf' column; apart from 'y' the
# entries mirror the 'naf' configuration defined earlier.
config2 = {
    # Predict on the selected feature columns, keeping the frame's index.
    'predict_func': lambda m, df, X: pd.Series(m.predict(df[X]), index=df.index),
    'score_func': score,
    'validation_splitter': get_validation_splitter,
    'progress_callback': sgml.ProgressCallBack(),
    'return_train_scores': True,
    'y': 'kmf',
}

# Seeded splitters (same parameters as in the 'naf' section).
kf = KFold(n_splits=5, shuffle=True, random_state=123)
ss = ShuffleSplit(n_splits=1, random_state=123)

# Separate adapter instances for the 'kmf' models.
lgb_adapter2 = sgml.LGBMAdapter(lgb.LGBMRegressor)
xgb_adapter2 = sgml.XGBAdapter(xgb.XGBRegressor)
cb_adapter2 = sgml.CBAdapter(cb.CatBoostRegressor)

In [49]:
# LightGBM regressor on the 'kmf' target, cross-validated with `kf`.
lgb7 = sgml.CVModel('model', 'lgb7', kf, config2, lgb_adapter2).load_if_exists()
hparams = {
    'model_params': {
        'num_leaves': 15,
        'n_estimators': 3000,
        'colsample_bytree': 0.25,
        'learning_rate': 0.01,
    },
    'X_num': X_tri + X_cont + X_int + X_na + X_bool,
    'X_cat': X_nom,
    'cat': {'handle_unknown': 'use_encoded_value', 'unknown_value': -1},
    # 'validation_fraction': 0.1
}
# Single-split alternative: result = lgb7.adhoc(df_train, ss, hparams)
result = lgb7.cv(df_train, hparams)
# Mean of the reported validation and training scores.
# (per-fold detail: result['model_result'][0]['valid_result'].idxmin())
(np.mean(result['valid_scores']), np.mean(result['train_scores']))

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

(0.6688680707946435, 0.7337469601005966)

In [50]:
# Score all of lgb7's stored CV predictions at once.
# NOTE(review): cv_best_['prd'] is presumably the out-of-fold predictions - verify in sgml.
lgb7_prd = lgb7.cv_best_['prd'].sort_index()
score(df_train, lgb7_prd)

0.673101723413942

In [57]:
# XGBoost regressor on the 'kmf' target, cross-validated with `kf`.
# Uses xgb_adapter2, the adapter created alongside config2, so this section no
# longer depends on the adapter instance from the 'naf' section.
xgb7 = sgml.CVModel('model', 'xgb7', kf, config2, xgb_adapter2).load_if_exists()
hparams = {
    'model_params': {
        'max_depth': 5, 'colsample_bytree': 0.25, 'subsample': 0.9, 'n_estimators': 3000, 'learning_rate': 0.01
    },
    'X_num':  X_tri + X_cont + X_int + X_na + X_bool, 'X_cat': X_nom, 'cat': {'handle_unknown': 'ignore'},
    #'validation_fraction': 0.1,
}
# Single-split alternative: result = xgb7.adhoc(df_train, ss, hparams, device = 'cuda')
result = xgb7.cv(df_train, hparams, device = 'cuda')
# Mean of the reported validation scores.
# (per-fold detail: result['model_result'][0]['valid_result'].idxmin())
np.mean(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

Round:   0%|          | 0/3000 [00:00<?, ?it/s]

0.6675974664754063

In [58]:
# Score all of xgb7's stored CV predictions at once.
# NOTE(review): cv_best_['prd'] is presumably the out-of-fold predictions - verify in sgml.
xgb7_prd = xgb7.cv_best_['prd'].sort_index()
score(df_train, xgb7_prd)

0.6720638440371624

In [59]:
# CatBoost regressor on the 'kmf' target, cross-validated with `kf`.
# Uses cb_adapter2, the adapter created alongside config2, so this section no
# longer depends on the adapter instance from the 'naf' section.
cb7 = sgml.CVModel('model', 'cb7', kf, config2, cb_adapter2).load_if_exists()
hparams = {
    'model_params': {
        'max_depth': 7, 'n_estimators': 3500, 'learning_rate': 0.03
    },
    'X_num':  X_tri + X_cont + X_int + X_na + X_bool, 'X_cat': X_nom
    #'validation_fraction': 0.1,
}
# Single-split alternative: result = cb7.adhoc(df_train, ss, hparams, task_type='GPU')
result = cb7.cv(df_train, hparams, task_type = 'GPU')
# Mean of the reported validation scores.
# (per-fold detail: result['model_result'][0]['valid_result'].idxmin())
np.mean(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

0.6674576368479176

In [60]:
# Score all of cb7's stored CV predictions at once.
# NOTE(review): cv_best_['prd'] is presumably the out-of-fold predictions - verify in sgml.
cb7_prd = cb7.cv_best_['prd'].sort_index()
score(df_train, cb7_prd)

0.6718775695255039

In [61]:
# Stack the CV predictions of the three 'kmf' models; the result is obtained
# through sc.cache_result under the key 'phase10_stk_2'.
# Note: this rebinds `models` and `df_stk` from the 'naf' section.
models = [lgb7, xgb7, cb7]
df_stk = sc.cache_result('phase10_stk_2', lambda: sgml.stack_cv(models))

In [68]:
# Score a fixed-weight linear blend of the stacked 'kmf' model predictions.
blend_columns = ['lgb7', 'xgb7', 'cb7']
blend_weights = [0.5, 0.1, 0.4]
score(df_train, df_stk.sort_index()[blend_columns].dot(blend_weights))

0.6740629781109544